Package TEES :: Package Utils :: Package InteractionXML :: Module CopyParse
[hide private]

Source Code for Module TEES.Utils.InteractionXML.CopyParse

  1  try: 
  2      import xml.etree.cElementTree as ET 
  3  except ImportError: 
  4      import cElementTree as ET 
  5  import Utils.ElementTreeUtils as ETUtils 
  6  import sys 
  7  import CorpusElements 
  8  from optparse import OptionParser 
  9   
def _getOrCreateChild(parent, tag):
    """Return the first direct child of parent named tag, creating it if absent."""
    child = parent.find(tag)
    if child is None:
        child = ET.Element(tag)
        parent.append(child)
    return child


def _findByAttribute(elements, attribute, value):
    """Return the first element whose attribute equals value, or None."""
    for element in elements:
        # .get() yields None for a missing attribute instead of raising KeyError
        if element.get(attribute) == value:
            return element
    return None


def copyParse(input, source, output, parse, tokenization):
    """
    Copy parse and tokenization elements from a source corpus into an input corpus.

    Sentences of the two corpora are paired by the exact value of their
    "text" attribute. For each input sentence that has a source counterpart,
    the source parse element is appended under sentenceanalyses/parses unless
    a parse whose "parser" attribute equals `parse` is already there, and the
    source tokenization element is appended under sentenceanalyses/tokenizations
    unless a tokenization with the relevant tokenizer name is already there.
    Missing container elements are created on demand. Progress and warnings
    are written to stderr.

    @param input: the corpus to be updated (anything ETUtils.ETFromObj accepts)
    @param source: the corpus from which parses are read (passed to
                   CorpusElements.loadCorpus)
    @param output: file to which the updated input tree is written, or None to
                   skip writing
    @param parse: name of the parse, compared against the "parser" attribute
    @param tokenization: name of the tokenization; also used as the tokenizer
                         name when no parse element provides one
    @return: the (modified) ElementTree of the input corpus
    """
    print >> sys.stderr, "Loading input file", input
    inputTree = ETUtils.ETFromObj(input)
    inputRoot = inputTree.getroot()
    # Trailing comma: stay on the same line for whatever is printed next
    print >> sys.stderr, "Loading source:",
    sourceElements = CorpusElements.loadCorpus(source, parse, tokenization)

    # Index source sentences by their text. On duplicate texts the later
    # sentence replaces the earlier one, after a diagnostic message.
    sourceSentencesByText = {}
    for sentence in sourceElements.sentences:
        sentenceText = sentence.sentence.get("text")
        if sentenceText in sourceSentencesByText:
            print >> sys.stderr, "Duplicate text", sentence.sentence.get("id"), sourceSentencesByText[sentenceText].sentence.get("id")
        sourceSentencesByText[sentenceText] = sentence

    # Counters are [number of elements copied, number of input sentences seen]
    parsesCopied = [0, 0]
    tokenizationsCopied = [0, 0]
    for sentence in inputRoot.getiterator("sentence"):
        parsesCopied[1] += 1
        tokenizationsCopied[1] += 1
        sentenceText = sentence.get("text")
        if sentenceText not in sourceSentencesByText:
            print >> sys.stderr, "Warning, no text found for sentence", sentence.get("id")
            continue
        sourceSentence = sourceSentencesByText[sentenceText]

        # Create analyses and parses elements (if needed)
        targetAnalysesElement = _getOrCreateChild(sentence, "sentenceanalyses")
        targetParsesElement = _getOrCreateChild(targetAnalysesElement, "parses")
        # Check whether the parse already exists, copy it if it doesn't
        newParse = _findByAttribute(targetParsesElement.findall("parse"), "parser", parse)
        if newParse is None and sourceSentence.parseElement is not None:
            newParse = sourceSentence.parseElement
            targetParsesElement.append(newParse)
            parsesCopied[0] += 1

        # The tokenizer name comes from the parse now present in the target.
        # newParse may still be None here (no parse in either corpus), and a
        # parse may lack the attribute; in both cases use the requested
        # tokenization name instead of failing.
        tokenizerName = None
        if newParse is not None:
            tokenizerName = newParse.get("tokenizer")
        if tokenizerName is None:
            tokenizerName = tokenization

        # Create tokenizations element (if needed)
        targetTokenizationsElement = _getOrCreateChild(targetAnalysesElement, "tokenizations")
        # Check whether the tokenization already exists, copy it if it doesn't
        newTokenization = _findByAttribute(targetTokenizationsElement.findall("tokenization"), "tokenizer", tokenizerName)
        if newTokenization is None and sourceSentence.tokenizationElement is not None:
            targetTokenizationsElement.append(sourceSentence.tokenizationElement)
            tokenizationsCopied[0] += 1

    print >> sys.stderr, "Copied parse elements", parsesCopied
    print >> sys.stderr, "Copied tokenization elements", tokenizationsCopied

    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(inputTree, output)
    return inputTree

if __name__ == "__main__":
    # Command-line entry point: parse the options and run copyParse.
    print >> sys.stderr, "##### Copy Parse #####"
    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(usage="%prog [options]\nCopy parse and tokenization elements from a source corpus into an input corpus.")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in analysis format to which the parses are added", metavar="FILE")
    optparser.add_option("-s", "--source", default=None, dest="source", help="Corpus in analysis format from which the parses are copied", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file for the updated input corpus", metavar="FILE")
    optparser.add_option("-t", "--tokenization", default=None, dest="tokenization", help="Tokenization element name")
    optparser.add_option("-p", "--parse", default=None, dest="parse", help="Parse element name")
    (options, args) = optparser.parse_args()

    # Validate explicitly rather than with assert: assertions are removed when
    # Python runs with -O, and optparser.error() prints the usage text before
    # exiting instead of showing a bare traceback.
    for requiredOption in ("input", "source", "output"):
        if getattr(options, requiredOption) is None:
            optparser.error("missing required option --" + requiredOption)

    copyParse(options.input, options.source, options.output, options.parse, options.tokenization)