1 try:
2 import xml.etree.cElementTree as ET
3 except ImportError:
4 import cElementTree as ET
5 import Utils.ElementTreeUtils as ETUtils
6 import sys
7 import CorpusElements
8 from optparse import OptionParser
9 from collections import defaultdict
10
def parseStats(input):
    """
    Print parse and tokenization statistics for a corpus to stderr.

    Every "sentence" element below the document root is inspected and the
    following counters are collected:

      sentence                          all sentences seen
      sentence-no-analyses              no "sentenceanalyses" child
      sentence-no-parses                analyses present, no "parses" child
      sentence-no-tokenizations         parses present, no "tokenizations" child
      parse:NAME                        parse elements per "parser" attribute
      parse:NAME(no penn)               ... with a missing/empty "pennstring"
      parse:NAME(no dependencies)       ... without any "dependency" child
      parse:NAME(no phrases)            ... without any "phrase" child
      tokenization:NAME                 tokenizations per "tokenizer" attribute
      tokenization:NAME(no tokens)      ... without any "token" child

    The counters are written to stderr sorted by key; nothing is returned.

    input -- the corpus, handed unchanged to ETUtils.ETFromObj (the
             command-line entry point of this file passes a file path).
    """
    # Stand-in name for elements lacking a "parser"/"tokenizer" attribute, so
    # that building the counter key cannot fail on None.
    unnamed = "(unnamed)"

    print >> sys.stderr, "Loading input file", input
    inputTree = ETUtils.ETFromObj(input)
    inputRoot = inputTree.getroot()
    counts = defaultdict(int)
    # getiterator() is the spelling understood by old ElementTree releases,
    # which the import fallback at the top of this file still caters for.
    for sentence in inputRoot.getiterator("sentence"):
        counts["sentence"] += 1
        analysesElement = sentence.find("sentenceanalyses")
        if analysesElement is None:
            counts["sentence-no-analyses"] += 1
            continue

        parsesElement = analysesElement.find("parses")
        if parsesElement is None:
            # Note: such a sentence is skipped entirely, so its tokenizations
            # (if any) are not counted either.
            counts["sentence-no-parses"] += 1
            continue

        for parseElement in parsesElement:
            parseKey = "parse:" + parseElement.get("parser", unnamed)
            counts[parseKey] += 1
            if parseElement.get("pennstring") in ["", None]:
                counts[parseKey + "(no penn)"] += 1
            # find() must be compared against None: an Element without
            # children is falsy, so a plain truth test would be wrong.
            if parseElement.find("dependency") is None:
                counts[parseKey + "(no dependencies)"] += 1
            if parseElement.find("phrase") is None:
                counts[parseKey + "(no phrases)"] += 1

        tokenizationsElement = analysesElement.find("tokenizations")
        if tokenizationsElement is None:
            counts["sentence-no-tokenizations"] += 1
            continue

        for tokenizationElement in tokenizationsElement:
            tokenizationKey = "tokenization:" + tokenizationElement.get("tokenizer", unnamed)
            counts[tokenizationKey] += 1
            if tokenizationElement.find("token") is None:
                counts[tokenizationKey + "(no tokens)"] += 1

    print >> sys.stderr, "Parse statistics for", input
    for key in sorted(counts):
        print >> sys.stderr, " ", key + ":", counts[key]
52
if __name__ == "__main__":
    # Command-line entry point: read the corpus given with -i/--input and
    # print its parse statistics to stderr.
    print >> sys.stderr, "##### Parse Statistics #####"

    # Psyco is optional: use it when it can be imported, otherwise carry on
    # without it.
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(usage="%prog [options]\nPrint parse and tokenization statistics for a corpus.")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in analysis format", metavar="FILE")
    (options, args) = optparser.parse_args()
    # Report a missing input as a usage error rather than with an assert,
    # which would vanish under "python -O" and only show a bare traceback.
    if options.input is None:
        optparser.error("no input file given (use -i/--input)")
    parseStats(options.input)
68