TEES :: Utils :: FindHeads
Source code listing for module TEES.Utils.FindHeads.

 1  import sys, os 
 2  thisPath = os.path.dirname(os.path.abspath(__file__)) 
 3  sys.path.append(os.path.abspath(os.path.join(thisPath,"../.."))) 
 4  import Utils.ElementTreeUtils as ETUtils 
 5  import sys, os 
 6  sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/..") 
 7  import Core.SentenceGraph as SentenceGraph 
 8   
def findHeads(input, parse, tokenization=None, output=None, removeExisting=True, iterate=False):
    """
    Determine head token offsets for the entities of an interaction XML corpus.

    SentenceGraph calculates a head offset for any entity that is missing one,
    so this function (optionally) clears the existing headOffset attributes and
    then builds the sentence graphs, which fill the offsets back in.

    @param input: corpus in interaction XML format (file name, open file or ElementTree)
    @param parse: name of the parse element used for determining heads
    @param tokenization: name of the tokenization element (None = default for the parse)
    @param output: optional output file for the processed corpus
    @param removeExisting: if True, delete existing headOffset attributes first
    @param iterate: if True, process the corpus sentence-block by sentence-block
                    (lower memory use); in this mode output is written by the iterator
    @return: the processed corpus as an ElementTree, or None when iterate=True
    """
    if iterate:
        from Utils.ProgressCounter import ProgressCounter
        import InteractionXML.SentenceElements as SentenceElements
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        print >> sys.stderr, "Removing existing head offsets"
        removeCount = 0
        counter = ProgressCounter(None, "Find heads")
        counter.showMilliseconds = True
        for sentences in SentenceElements.getCorpusIterator(input, output, parse, tokenization):
            for sentence in sentences:
                if removeExisting:
                    for e in sentence.sentence.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
                # Constructing the graph recalculates any missing head offsets
                graph = SentenceGraph.SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
                graph.mapInteractions(sentence.entities, sentence.interactions)
            counter.update(len(sentences), "Finding heads ("+sentences[-1].sentence.get("id")+"): ")
        print >> sys.stderr, "Removed head offsets from", removeCount, "entities"
        # The iterator writes its own output; there is no single in-memory tree
        # to return in this mode. (The original code fell through to
        # "return xml" with xml unbound, raising NameError when iterate=True.)
        xml = None
    else:
        # Bug fix: the original parsed the input twice (ETFromObj was called
        # both here and again inside the removeExisting branch).
        xml = ETUtils.ETFromObj(input)
        if removeExisting:
            print >> sys.stderr, "Removing existing head offsets"
            removeCount = 0
            for d in xml.getroot().findall("document"):
                for s in d.findall("sentence"):
                    for e in s.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
            print >> sys.stderr, "Removed head offsets from", removeCount, "entities"

        # SentenceGraph automatically calculates head offsets and adds them to
        # entities if they are missing
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        corpusElements = SentenceGraph.loadCorpus(xml, parse, tokenization)

        # Make sure every parse gets head scores
        for sentence in corpusElements.sentences:
            if sentence.sentenceGraph == None:
                continue
            if sentence.sentenceGraph.tokenHeadScores == None:
                sentence.sentenceGraph.getTokenHeadScores()

        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
    return xml
if __name__=="__main__":
    import sys
    print >> sys.stderr, "##### Calculating entity head token offsets #####"

    from optparse import OptionParser
    # Psyco speeds up pure-Python loops when it is available; fall back silently
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    # Command-line interface for recalculating head offsets in a corpus file
    cmdParser = OptionParser(usage="%prog [options]\nRecalculate head token offsets.")
    addOption = cmdParser.add_option
    addOption("-i", "--input", default=None, dest="input", help="Corpus in interaction xml format", metavar="FILE")
    addOption("-o", "--output", default=None, dest="output", help="Output file in interaction xml format.")
    addOption("-p", "--parse", default=None, dest="parse", help="Parse element name for calculating head offsets")
    addOption("-t", "--tokenization", default=None, dest="tokenization", help="Tokenization element name for calculating head offsets")
    addOption("-r", "--iterate", default=False, action="store_true", dest="iterate", help="")
    opts, extraArgs = cmdParser.parse_args()

    findHeads(input=opts.input, output=opts.output, parse=opts.parse, tokenization=opts.tokenization, iterate=opts.iterate)