1 import sys, os
2 thisPath = os.path.dirname(os.path.abspath(__file__))
3 sys.path.append(os.path.abspath(os.path.join(thisPath,"../..")))
4 import Utils.ElementTreeUtils as ETUtils
5 import sys, os
6 sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/..")
7 import Core.SentenceGraph as SentenceGraph
8
def findHeads(input, parse, tokenization=None, output=None, removeExisting=True, iterate=False):
    """
    Determine the head token offset for each entity in an interaction XML corpus.

    input: corpus in interaction XML format (filename, ElementTree or root element)
    parse: name of the parse element used for determining head offsets
    tokenization: name of the tokenization element (None = the parse's default)
    output: optional output target; if given, the processed corpus is written there
    removeExisting: if True, pre-existing headOffset attributes are removed first
    iterate: if True, process the corpus with a streaming iterator (lower memory
             use) instead of loading it fully into memory
    Returns the processed XML tree (non-iterating mode), or the output target
    (iterating mode, where writing is handled by the corpus iterator).
    """
    if iterate:
        # Streaming mode: the corpus iterator yields blocks of sentences so the
        # whole XML tree is never held in memory at once.
        from Utils.ProgressCounter import ProgressCounter
        import InteractionXML.SentenceElements as SentenceElements
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        removeCount = 0
        if removeExisting:  # only announce removal when it is actually requested
            print >> sys.stderr, "Removing existing head offsets"
        counter = ProgressCounter(None, "Find heads")
        counter.showMilliseconds = True
        for sentences in SentenceElements.getCorpusIterator(input, output, parse, tokenization):
            for sentence in sentences:
                if removeExisting:
                    for e in sentence.sentence.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
                # Building the sentence graph and mapping the interactions onto
                # it determines the entity head offsets as a side effect.
                graph = SentenceGraph.SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
                graph.mapInteractions(sentence.entities, sentence.interactions)
            counter.update(len(sentences), "Finding heads ("+sentences[-1].sentence.get("id")+"): ")
        if removeExisting:
            print >> sys.stderr, "Removed head offsets from", removeCount, "entities"
        # NOTE(review): getCorpusIterator is given 'output', so it presumably
        # writes the processed corpus itself -- return the output target.
        return output
    else:
        # In-memory mode: load the whole corpus, then process every sentence.
        xml = ETUtils.ETFromObj(input)
        if removeExisting:
            print >> sys.stderr, "Removing existing head offsets"
            removeCount = 0
            for d in xml.getroot().findall("document"):
                for s in d.findall("sentence"):
                    for e in s.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
            print >> sys.stderr, "Removed head offsets from", removeCount, "entities"

        # loadCorpus builds the sentence graphs; ensure head scores are
        # calculated for every sentence that has a graph.
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        corpusElements = SentenceGraph.loadCorpus(xml, parse, tokenization)
        for sentence in corpusElements.sentences:
            if sentence.sentenceGraph == None:  # sentence could not be parsed
                continue
            if sentence.sentenceGraph.tokenHeadScores == None:
                sentence.sentenceGraph.getTokenHeadScores()

        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
        return xml
61
if __name__=="__main__":
    # Command-line entry point: recalculate entity head token offsets.
    print >> sys.stderr, "##### Calculating entity head token offsets #####"

    from optparse import OptionParser

    # Psyco is an optional JIT compiler for old Pythons; use it when available.
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(usage="%prog [options]\nRecalculate head token offsets.")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in interaction xml format", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file in interaction xml format.")
    optparser.add_option("-p", "--parse", default=None, dest="parse", help="Parse element name for calculating head offsets")
    optparser.add_option("-t", "--tokenization", default=None, dest="tokenization", help="Tokenization element name for calculating head offsets")
    optparser.add_option("-r", "--iterate", default=False, action="store_true", dest="iterate", help="Iterate the corpus sentence-by-sentence to save memory")
    (options, args) = optparser.parse_args()

    # Fail fast with a usage message instead of an obscure error downstream.
    if options.input == None:
        optparser.error("No input file defined (-i)")

    findHeads(input=options.input, output=options.output, parse=options.parse, tokenization=options.tokenization, iterate=options.iterate)