Package TEES :: Package Utils :: Package InteractionXML :: Module ParseStats
[hide private]

Source Code for Module TEES.Utils.InteractionXML.ParseStats

 1  try: 
 2      import xml.etree.cElementTree as ET 
 3  except ImportError: 
 4      import cElementTree as ET 
 5  import Utils.ElementTreeUtils as ETUtils 
 6  import sys 
 7  import CorpusElements 
 8  from optparse import OptionParser 
 9  from collections import defaultdict 
10   
def parseStats(input):
    """
    Count the parse and tokenization elements of a corpus and print the
    totals to stderr.

    Every "sentence" element under the document root is counted. For each
    sentence that has a "sentenceanalyses" child, its parses and its
    tokenizations are tallied independently of each other, so a sentence
    without a "parses" element still contributes its tokenization counts.

    @param input: the corpus, handed unchanged to ETUtils.ETFromObj; the
                  returned object must provide getroot().
                  NOTE(review): presumably a file name or an already loaded
                  ElementTree -- confirm against ETUtils.
    @return: the defaultdict(int) mapping each statistic name to its count
    """
    print >> sys.stderr, "Loading input file", input
    inputRoot = ETUtils.ETFromObj(input).getroot()
    counts = defaultdict(int)
    for sentence in inputRoot.getiterator("sentence"):
        counts["sentence"] += 1
        analysesElement = sentence.find("sentenceanalyses")
        # Compare with "is None": an Element without children is falsy, so
        # a plain truth test would misreport an empty analyses element.
        if analysesElement is None:
            counts["sentence-no-analyses"] += 1
            continue
        _countParses(analysesElement.find("parses"), counts)
        _countTokenizations(analysesElement.find("tokenizations"), counts)

    print >> sys.stderr, "Parse statistics for", input
    for key in sorted(counts):
        print >> sys.stderr, " ", key + ":", counts[key]
    return counts


def _countParses(parsesElement, counts):
    """Tally one sentence's "parses" element (None when absent) into counts."""
    if parsesElement is None:
        counts["sentence-no-parses"] += 1
        return
    for parseElement in parsesElement:
        # A parse without a "parser" attribute is grouped under "unknown"
        # instead of failing on string + None.
        prefix = "parse:" + parseElement.get("parser", "unknown")
        counts[prefix] += 1
        # A missing attribute and an empty string both mean "no penn tree"
        if not parseElement.get("pennstring"):
            counts[prefix + "(no penn)"] += 1
        if parseElement.find("dependency") is None:
            counts[prefix + "(no dependencies)"] += 1
        if parseElement.find("phrase") is None:
            counts[prefix + "(no phrases)"] += 1


def _countTokenizations(tokenizationsElement, counts):
    """Tally one sentence's "tokenizations" element (None when absent) into counts."""
    if tokenizationsElement is None:
        counts["sentence-no-tokenizations"] += 1
        return
    for tokenizationElement in tokenizationsElement:
        # Same fallback as for parses when the "tokenizer" attribute is missing
        prefix = "tokenization:" + tokenizationElement.get("tokenizer", "unknown")
        counts[prefix] += 1
        if tokenizationElement.find("token") is None:
            counts[prefix + "(no tokens)"] += 1

if __name__=="__main__":
    print >> sys.stderr, "##### Parse Statistics #####"
    # Psyco is optional: enable it when installed, otherwise carry on without it
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(usage="%prog [options]\nPrint parse and tokenization statistics for a corpus.")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in analysis format", metavar="FILE")
    (options, args) = optparser.parse_args()
    # Report a missing input as a usage error; an assert would be stripped
    # under "python -O" and would only show a bare traceback otherwise.
    if options.input is None:
        optparser.error("no input file given (use -i/--input)")
    parseStats(options.input)