1 try:
2 import xml.etree.cElementTree as ET
3 except ImportError:
4 import cElementTree as ET
5 import Utils.ElementTreeUtils as ETUtils
6 import sys
7 import CorpusElements
8 from optparse import OptionParser
9
if __name__=="__main__":
    # Compare two parsed corpora sentence by sentence: sentences are paired by
    # their "origId" attribute, then their token and dependency elements are
    # compared attribute-for-attribute, reporting any differences to stderr.
    print >> sys.stderr, "##### Compare Parse #####"

    # Psyco is an optional JIT accelerator (Python 2 only); missing it is fine.
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(usage="%prog [options]\nCompare the parses and tokenizations of two corpora.")
    optparser.add_option("-s", "--source", default=None, dest="source", help="Source corpus in analysis format", metavar="FILE")
    optparser.add_option("-r", "--target", default=None, dest="target", help="Target corpus in analysis format", metavar="FILE")
    optparser.add_option("-t", "--tokenization", default="split-McClosky", dest="tokenization", help="Tokenization element name")
    optparser.add_option("-p", "--parse", default="split-McClosky", dest="parse", help="Parse element name")
    (options, args) = optparser.parse_args()
    # Validate required options via optparser.error (exits with usage message);
    # a bare assert would be silently stripped when run with python -O.
    if options.source is None:
        optparser.error("No source file defined")
    if options.target is None:
        optparser.error("No target file defined")

    print >> sys.stderr, "Loading source:",
    sourceElements = CorpusElements.loadCorpus(options.source, options.parse, options.tokenization)
    print >> sys.stderr, "Loading target:",
    targetElements = CorpusElements.loadCorpus(options.target, options.parse, options.tokenization)

    print >> sys.stderr, "Mapping sentences"
    # Pair source and target sentences by their shared "origId" attribute.
    # NOTE(review): duplicate origIds within one corpus would silently
    # overwrite earlier entries here — assumed unique; verify if in doubt.
    origIdToSentences = {}
    for sourceSentence in sourceElements.sentences:
        origIdToSentences[sourceSentence.sentence.get("origId")] = [sourceSentence, None]
    for targetSentence in targetElements.sentences:
        origId = targetSentence.sentence.get("origId")
        # "in" replaces the deprecated dict.has_key()
        assert origId in origIdToSentences, origId
        origIdToSentences[origId][1] = targetSentence

    print >> sys.stderr, "Comparing sentences"
    count = 0
    for key in sorted(origIdToSentences.keys()):
        sourceSentence, targetSentence = origIdToSentences[key]
        # A source sentence with no matching target would otherwise crash
        # below with an opaque AttributeError on None.
        assert targetSentence is not None, "No target sentence for origId %s" % key
        sId = sourceSentence.sentence.get("origId")
        # zip truncates to the shorter list, which would hide missing
        # elements, so report count mismatches explicitly first.
        if len(sourceSentence.tokens) != len(targetSentence.tokens):
            print >> sys.stderr, sId + ": token count differs"
        for sourceToken, targetToken in zip(sourceSentence.tokens, targetSentence.tokens):
            if sourceToken.attrib != targetToken.attrib:
                print >> sys.stderr, sId + ": tok diff " + sourceToken.get("id") + "/" + targetToken.get("id")
        if len(sourceSentence.dependencies) != len(targetSentence.dependencies):
            print >> sys.stderr, sId + ": dependency count differs"
        for sourceDep, targetDep in zip(sourceSentence.dependencies, targetSentence.dependencies):
            if sourceDep.attrib != targetDep.attrib:
                print >> sys.stderr, sId + ": dep diff " + sourceDep.get("id") + "/" + targetDep.get("id")
        count += 1
    print >> sys.stderr, "Done, compared", count, "sentences"