try:
    import xml.etree.cElementTree as ET
except ImportError:
    import cElementTree as ET
import Utils.ElementTreeUtils as ETUtils
import sys
import CorpusElements
from optparse import OptionParser
from collections import defaultdict

def parseStats(input):
    """Count the parses and tokenizations of each sentence in an analysis-format corpus and print statistics to stderr."""
    print >> sys.stderr, "Loading input file", input
    inputTree = ETUtils.ETFromObj(input)
    inputRoot = inputTree.getroot()
    counts = defaultdict(int)
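    # Walk every sentence element and record which analyses it carries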
    for sentence in inputRoot.getiterator("sentence"):
        counts["sentence"] += 1
        analysesElement = sentence.find("sentenceanalyses")
        if analysesElement is None:
            counts["sentence-no-analyses"] += 1
            continue

        parsesElement = analysesElement.find("parses")
        if parsesElement is None:
            counts["sentence-no-parses"] += 1
            continue

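        # Tally parses per parser and flag those missing a Penn tree string, dependencies or phrases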
        for parseElement in parsesElement:
            parserName = parseElement.get("parser")
            counts["parse:"+parserName] += 1
            if parseElement.get("pennstring") in ["", None]:
                counts["parse:"+parserName+"(no penn)"] += 1
            if len(parseElement.findall("dependency")) == 0:
                counts["parse:"+parserName+"(no dependencies)"] += 1
            if len(parseElement.findall("phrase")) == 0:
                counts["parse:"+parserName+"(no phrases)"] += 1

        tokenizationsElement = analysesElement.find("tokenizations")
        if tokenizationsElement is None:
            counts["sentence-no-tokenizations"] += 1
            continue

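        # Tally tokenizations per tokenizer and flag those that contain no tokens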
        for tokenizationElement in tokenizationsElement:
            tokenizerName = tokenizationElement.get("tokenizer")
            counts["tokenization:"+tokenizerName] += 1
            if len(tokenizationElement.findall("token")) == 0:
                counts["tokenization:"+tokenizerName+"(no tokens)"] += 1

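    # Print all collected counts to stderr in alphabetical order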
    print >> sys.stderr, "Parse statistics for", input
    for key in sorted(counts.keys()):
        print >> sys.stderr, " ", key + ":", counts[key]

if __name__ == "__main__":
    print >> sys.stderr, "##### Parse Statistics #####"

    # Psyco is an optional Python 2 JIT optimizer; use it if it is installed
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(usage="%prog [options]\nPrint parse and tokenization statistics for a corpus.")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in analysis format", metavar="FILE")
    (options, args) = optparser.parse_args()
    assert(options.input != None)
    parseStats(options.input)
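
# Example invocation (file names here are illustrative, not from the original source):
#   python ParseStatistics.py -i corpus-with-analyses.xml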