1 try:
2 import xml.etree.cElementTree as ET
3 except ImportError:
4 import cElementTree as ET
5 import Utils.ElementTreeUtils as ETUtils
6 import sys
7 import CorpusElements
8 from optparse import OptionParser
9
def _getOrCreateChild(parent, tag):
    """Return the first direct child of parent named tag, appending a new empty one if there is none."""
    child = parent.find(tag)
    # Compare against None explicitly: an Element without children is falsy.
    if child is None:
        child = ET.Element(tag)
        parent.append(child)
    return child


def copyParse(input, source, output, parse, tokenization):
    """
    Copy parse and tokenization elements from a source corpus into an input corpus.

    Sentences are paired by the exact value of their "text" attribute. For each
    "sentence" element of the input that has a counterpart in the source:

    - the source parse element is appended under sentenceanalyses/parses unless
      the input sentence already has a "parse" child whose "parser" attribute
      equals parse;
    - the source tokenization element is appended under
      sentenceanalyses/tokenizations unless the input sentence already has a
      "tokenization" child whose "tokenizer" attribute equals the tokenizer
      named by the parse in use.

    Missing "sentenceanalyses", "parses" and "tokenizations" containers are
    created on the input sentence as needed.

    Parameters:
    input -- passed to ETUtils.ETFromObj to obtain the tree to update
             (presumably a file name or an already loaded tree; confirm
             against ETUtils)
    source -- passed to CorpusElements.loadCorpus together with parse and
              tokenization
    output -- where the updated tree is written with ETUtils.write; nothing is
              written when None
    parse -- value of the "parser" attribute identifying the parse
    tokenization -- tokenization name given to the corpus loader; also used as
                    the tokenizer name to look for when no parse element is
                    available for a sentence

    Returns the updated input tree.

    Note: the source elements themselves (not copies) are appended, so the
    same element object is shared between the source corpus and the input tree.
    """
    print >> sys.stderr, "Loading input file", input
    inputTree = ETUtils.ETFromObj(input)
    inputRoot = inputTree.getroot()
    print >> sys.stderr, "Loading source:",
    sourceElements = CorpusElements.loadCorpus(source, parse, tokenization)

    # Index the source sentences by text. When several source sentences share
    # the same text, the last one wins and a notice is printed.
    sourceSentencesByText = {}
    for sentence in sourceElements.sentences:
        sentenceText = sentence.sentence.get("text")
        if sentenceText in sourceSentencesByText:
            print >> sys.stderr, "Duplicate text", sentence.sentence.get("id"), sourceSentencesByText[sentenceText].sentence.get("id")
        sourceSentencesByText[sentenceText] = sentence

    # Counters are [number copied, number of input sentences seen].
    parsesCopied = [0,0]
    tokenizationsCopied = [0,0]
    for sentence in inputRoot.getiterator("sentence"):
        parsesCopied[1] += 1
        tokenizationsCopied[1] += 1

        sentenceText = sentence.get("text")
        if sentenceText not in sourceSentencesByText:
            print >> sys.stderr, "Warning, no text found for sentence", sentence.get("id")
            continue
        sourceSentence = sourceSentencesByText[sentenceText]

        targetAnalysesElement = _getOrCreateChild(sentence, "sentenceanalyses")

        # Check whether the parse already exists in the target sentence
        targetParsesElement = _getOrCreateChild(targetAnalysesElement, "parses")
        newParse = None
        for parseElement in targetParsesElement.findall("parse"):
            if parseElement.get("parser") == parse:
                newParse = parseElement
                break
        # Copy the parse if it does not
        if newParse is None and sourceSentence.parseElement is not None:
            targetParsesElement.append(sourceSentence.parseElement)
            parsesCopied[0] += 1
            # Remember the copied parse: its tokenizer is needed below. Leaving
            # newParse as None here used to crash the tokenization lookup.
            newParse = sourceSentence.parseElement

        # Work out which tokenizer to look for. Prefer the one the parse refers
        # to; without a parse fall back to the requested tokenization name.
        tokenizerName = None
        if newParse is not None:
            tokenizerName = newParse.get("tokenizer")
        if tokenizerName is None:
            tokenizerName = tokenization

        # Check whether the tokenization already exists in the target sentence
        targetTokenizationsElement = _getOrCreateChild(targetAnalysesElement, "tokenizations")
        newTokenization = None
        if tokenizerName is not None:
            for tokenizationElement in targetTokenizationsElement.findall("tokenization"):
                # get() rather than attrib[...]: an element without a
                # "tokenizer" attribute simply does not match.
                if tokenizationElement.get("tokenizer") == tokenizerName:
                    newTokenization = tokenizationElement
                    break
        # Copy the tokenization if it does not
        if newTokenization is None and sourceSentence.tokenizationElement is not None:
            targetTokenizationsElement.append(sourceSentence.tokenizationElement)
            tokenizationsCopied[0] += 1

    print >> sys.stderr, "Copied parse elements", parsesCopied
    print >> sys.stderr, "Copied tokenization elements", tokenizationsCopied

    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(inputTree, output)
    return inputTree
79
if __name__=="__main__":
    # Command line entry point: parse the options, check that the required
    # ones were given and run copyParse on them.
    print >> sys.stderr, "##### Copy Parse #####"

    # Psyco is optional: use it when it can be imported, otherwise carry on
    # without it.
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(usage="%prog [options]\nCopy parse and tokenization elements from a source corpus to the sentences with the same text in an input corpus.")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in analysis format to which the parses are added", metavar="FILE")
    optparser.add_option("-s", "--source", default=None, dest="source", help="Corpus in analysis format from which the parses are copied", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="File where the updated input corpus is written", metavar="FILE")
    optparser.add_option("-t", "--tokenization", default=None, dest="tokenization", help="Tokenization element name")
    optparser.add_option("-p", "--parse", default=None, dest="parse", help="Parse element name")
    (options, args) = optparser.parse_args()

    # Report missing required options through the parser instead of assert:
    # asserts are removed under -O and give no usage message.
    for requiredOption in ("input", "source", "output"):
        if getattr(options, requiredOption) is None:
            optparser.error("option --" + requiredOption + " is required")

    copyParse(options.input, options.source, options.output, options.parse, options.tokenization)
101