1 import sys, os
2 sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/..")
3 import Core.SentenceGraph as SentenceGraph
4 from Utils.ProgressCounter import ProgressCounter
5 from FindHeads import findHeads
6 import Utils.ElementTreeUtils as ETUtils
7 import Utils.InteractionXML.CorpusElements
8 import Utils.Range as Range
9 import Utils.Libraries.PorterStemmer as PorterStemmer
10
12 """
13 Returns a dictionary of "entity type"->"entity text"->"count"
14 """
15 corpus = ETUtils.ETFromObj(corpus)
16 trigDict = {}
17 for entity in corpus.getroot().getiterator("entity"):
18 if entity.get("isName") == "True":
19 continue
20 eType = entity.get("type")
21 if not trigDict.has_key(eType):
22 trigDict[eType] = {}
23 eText = entity.get("text")
24 eText = PorterStemmer.stem(eText)
25 if not trigDict[eType].has_key(eText):
26 trigDict[eType][eText] = 0
27 trigDict[eType][eText] += 1
28 return trigDict
29
31 """
32 Converts a dictionary of "entity type"->"entity text"->"count"
33 to "entity text"->"entity type"->"(count, fraction)"
34 """
35 distDict = {}
36 eTypes = trigDict.keys()
37 for eType in trigDict.keys():
38 for string in trigDict[eType].keys():
39 if not distDict.has_key(string):
40 distDict[string] = {}
41 for e in eTypes:
42 distDict[string][e] = [0, None]
43 distDict[string][eType] = [trigDict[eType][string], None]
44
45 for string in distDict.keys():
46 count = 0.0
47 for eType in distDict[string].keys():
48 count += distDict[string][eType][0]
49 for eType in distDict[string].keys():
50 distDict[string][eType][1] = distDict[string][eType][0] / count
51 return distDict
52
    # NOTE(review): this function's "def" header is missing from this chunk
    # (extraction artifact). The body tallies observed head strings per
    # entity type from a corpus argument.
    corpus = ETUtils.ETFromObj(corpus)
    headDict = {}
    headDict["None"] = {}  # bucket for tokens that are not any entity's head
    for sentence in corpus.getiterator("sentence"):
        # All headOffset attribute strings seen in this sentence, used below
        # to recognize tokens that never act as a head.
        headOffsetStrings = set()
        for entity in sentence.findall("entity"):
            eType = entity.get("type")
            if not headDict.has_key(eType):
                headDict[eType] = {}
            eText = entity.get("text")
            headOffset = entity.get("headOffset")
            headOffsetStrings.add(headOffset)
            headOffset = Range.charOffsetToSingleTuple(headOffset)
            charOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
            if headOffset == charOffset:
                # Head spans the whole entity: count the full entity text.
                if not headDict[eType].has_key(eText): headDict[eType][eText] = 0
                headDict[eType][eText] += 1
            else:
                # Head is a sub-span of the entity.
                # NOTE(review): "sentenceText" is undefined in this scope, so
                # this branch would raise NameError; also the slice indices are
                # entity-relative, which looks like it should index eText
                # instead — confirm against the full original file.
                headText = sentenceText[headOffset[0]-charOffset[0]:headOffset[1]-charOffset[0]+1]
                if not headDict[eType].has_key(headText): headDict[eType][headText] = 0
                headDict[eType][headText] += 1
        # Count tokens whose character offset is not any entity's head offset.
        # NOTE(review): "tokens" is also undefined here — presumably meant to
        # come from the sentence's tokenization; confirm against the full file.
        for token in tokens:
            if not token.get("charOffset") in headOffsetStrings:
                headText = token.get("text")
                if not headDict["None"].has_key(headText): headDict["None"][headText] = 0
                headDict["None"][headText] += 1

    return headDict
82
85
def removeHeads(corpus):
    """
    Remove every existing "headOffset" attribute from entity elements
    (the "REMOVE" pass).

    @param corpus: corpus file name or ElementTree-compatible object
    @return: [addedCount, removedCount] == [0, number of attributes removed],
             matching the count-pair convention of the other passes
    """
    sys.stderr.write("Removing existing head offsets\n")
    removeCount = 0
    xml = ETUtils.ETFromObj(corpus)
    for document in xml.getroot().findall("document"):
        for sentence in document.findall("sentence"):
            for entity in sentence.findall("entity"):
                if entity.get("headOffset") != None:
                    removeCount += 1
                    del entity.attrib["headOffset"]
    sys.stderr.write("Removed head offsets from " + str(removeCount) + " entities\n")
    return [0, removeCount]
98
def findHeads(corpus, stringsFrom, methods, parse, tokenization):
    """
    Assign head token offsets to entities by running the requested passes
    in order. (Note: the original listing had a stray diff "-" fused to this
    def line; removed.)

    @param corpus: corpus to annotate (file name or ElementTree-compatible)
    @param stringsFrom: corpus used as the trigger-string dictionary ("DICT")
    @param methods: ordered list drawn from "REMOVE", "SYNTAX", "DICT"
    @param parse: parse element name
    @param tokenization: tokenization element name
    """
    for method in methods:
        assert method in ["REMOVE", "SYNTAX", "DICT"]
    corpus = ETUtils.ETFromObj(corpus)
    counts = {}
    for method in methods:
        sys.stderr.write(method + " pass\n")
        if method == "REMOVE":
            counts[method] = removeHeads(corpus)
        elif method == "DICT":
            counts[method] = findHeadsDictionary(corpus, stringsFrom, parse, tokenization)
        elif method == "SYNTAX":
            counts[method] = findHeadsSyntactic(corpus, parse, tokenization)
        sys.stderr.write(method + " pass added " + str(counts[method][0]) + " and removed " + str(counts[method][1]) + " heads\n")

    sys.stderr.write("Summary (pass/added/removed):\n")
    for method in methods:
        sys.stderr.write("  " + method + " / " + str(counts[method][0]) + " / " + str(counts[method][1]) + "\n")
117
130
132 print "Extracting triggers from", stringsFrom
133 trigDict = getTriggers(stringsFrom)
134 print "Determining trigger distribution"
135 distDict = getDistribution(trigDict)
136 allStrings = sorted(distDict.keys())
137 print "Determining heads for", corpus
138 corpusElements = Utils.InteractionXML.CorpusElements.loadCorpus(corpus, parse, tokenization, removeIntersentenceInteractions=False, removeNameInfo=False)
139 cases = {}
140 counts = [0,0]
141 for sentence in corpusElements.sentences:
142
143 sText = sentence.sentence.get("text")
144
145 for entity in sentence.entities:
146 if entity.get("headOffset") != None:
147 continue
148 if entity.get("isName") == "True":
149 continue
150
151
152 eText = entity.get("text")
153 eType = entity.get("type")
154 eOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
155 wsSplits = eText.split()
156 if len(wsSplits) == 1 and eText.find("-") == -1:
157 continue
158 else:
159 candidates = []
160
161 for wsTuple in mapSplits(wsSplits, eText, eOffset):
162 if not distDict.has_key(wsTuple[1]):
163 candidates.append( ((-1, -1), wsTuple[2], wsTuple[0], wsTuple[1]) )
164 else:
165 assert distDict[wsTuple[1]].has_key(eType), (distDict[wsTuple[0]], wsTuple[0], eText)
166 candidates.append( (tuple(distDict[wsTuple[1]][eType]), wsTuple[2], wsTuple[0], wsTuple[1]) )
167
168 for candidate in candidates[:]:
169 hyphenSplits = candidate[2].split("-")
170 if len(hyphenSplits) > 1:
171
172 for hyphenTuple in mapSplits(hyphenSplits, eText, candidate[1]):
173 if not distDict.has_key(hyphenTuple[1]):
174 candidates.append( ((-1, -1), hyphenTuple[2], hyphenTuple[0], hyphenTuple[1]) )
175 else:
176 candidates.append( (tuple(distDict[hyphenTuple[1]][eType]), hyphenTuple[2], hyphenTuple[0], hyphenTuple[1]) )
177
178 candidates.sort(reverse=True)
179
180 if candidates[0][0][0] in [-1, 0]:
181 print "Substring matching", candidates, "for entity", entity.get("id")
182 for i in range(len(candidates)):
183 candidate = candidates[i]
184 cText = candidate[2]
185 for string in allStrings:
186 subStringPos = cText.find(string)
187 if subStringPos != -1:
188 print " Substring match", string, cText,
189 score = tuple(distDict[string][eType])
190 if score > candidate[0]:
191 print score, candidate[0], "Substring selected"
192 subStringCoords = [candidate[1][0] + subStringPos, len(string)]
193 candidate = (score, subStringCoords, candidate[2], ">"+string+"<")
194 else:
195 print score, candidate[0]
196 candidates[i] = candidate
197
198 candidates.sort(reverse=True)
199 if candidates[0][0][0] not in [-1, 0]:
200 candidateOffset = (candidates[0][1][0] + eOffset[0], candidates[0][1][0] + candidates[0][1][1] + eOffset[0])
201 entity.set("headOffset", str(candidateOffset[0]) + "-" + str(candidateOffset[1]-1))
202 entity.set("headMethod", "Dict")
203 entity.set("headString", sText[candidateOffset[0]:candidateOffset[1]])
204 counts[0] += 1
205
206 for i in range(len(candidates)):
207 c = candidates[i]
208 candidates[i] = (tuple(c[0]), c[2], c[3])
209 case = (eType, eText, tuple(candidates))
210 if not cases.has_key(case):
211 cases[case] = 0
212 cases[case] += 1
213 print entity.get("id"), eType + ": '" + eText + "'", candidates
214
215
216
217
218 print "Cases"
219 for case in sorted(cases.keys()):
220 print case, cases[case]
221
222 return counts
223
225 """
226 Determine the head token for a named entity or trigger. The head token is the token closest
227 to the root for the subtree of the dependency parse spanned by the text of the element.
228
229 @param entityElement: a semantic node (trigger or named entity)
230 @type entityElement: cElementTree.Element
231 @param verbose: Print selected head tokens on screen
232 @param verbose: boolean
233 """
234 counts = [0,0]
235 sentences = [x for x in corpus.getiterator("sentence")]
236 counter = ProgressCounter(len(sentences), "SYNTAX")
237 for sentence in sentences:
238 counter.update()
239 tokElement = ETUtils.getElementByAttrib(sentence, "sentenceanalyses/tokenizations/tokenization", {"tokenizer":tokenization})
240 parseElement = ETUtils.getElementByAttrib(sentence, "sentenceanalyses/parses/parse", {"parser":parse})
241 if tokElement == None or parseElement == None:
242 print >> sys.stderr, "Warning, sentence", sentence.get("id"), "missing parse or tokenization"
243 tokens = tokElement.findall("token")
244 tokenHeadScores = getTokenHeadScores(tokens, parseElement.findall("dependency"), sentenceId=sentence.get("id"))
245 for entity in sentence.findall("entity"):
246 if entity.get("headOffset") == None:
247 headToken = getEntityHeadToken(entity, tokens, tokenHeadScores)
248
249 entity.set("headOffset", headToken.get("charOffset"))
250 entity.set("headMethod", "Syntax")
251 entity.set("headString", headToken.get("text"))
252 counts[0] += 1
253 return counts
254
def getEntityHeadToken(entity, tokens, tokenHeadScores):
    """
    Determine the head token for a named entity or trigger: the token of the
    entity's span that is closest to the root of the dependency parse.

    @param entity: a semantic node (trigger or named entity) element
    @param tokens: token elements of the sentence
    @param tokenHeadScores: token->score mapping from getTokenHeadScores
    @return: the selected token element (asserts that one exists)
    """
    # Prefer an explicit headOffset; fall back to the entity's own span.
    if entity.get("headOffset") != None:
        charOffsets = Range.charOffsetToTuples(entity.get("headOffset"))
    elif entity.get("charOffset") != "":
        charOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
    else:
        charOffsets = []
    # Collect every token overlapping the entity's character span(s).
    headTokens = []
    for token in tokens:
        tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
        for offset in charOffsets:
            if Range.overlap(offset, tokenOffset):
                headTokens.append(token)
    if len(headTokens)==1: # only one token within the span
        selectedHeadToken = headTokens[0]
    else: # several candidates: pick the one closest to the parse root
        selectedHeadToken = findHeadToken(headTokens, tokenHeadScores)
    # BUG FIX: was "entityElement.get(...)" — an undefined name, so a failing
    # assert raised NameError instead of reporting the entity id.
    assert selectedHeadToken != None, entity.get("id")
    return selectedHeadToken
278
280 """
281 Select the candidate token that is closest to the root of the subtree of the depencdeny parse
282 to which the candidate tokens belong to. See getTokenHeadScores method for the algorithm.
283
284 @param candidateTokens: the list of syntactic tokens from which the head token is selected
285 @type candidateTokens: list of cElementTree.Element objects
286 """
287 if len(candidateTokens) == 0:
288 return None
289
290 highestScore = -9999999
291 bestTokens = []
292 for token in candidateTokens:
293 if tokenHeadScores[token] > highestScore:
294 highestScore = tokenHeadScores[token]
295 for token in candidateTokens:
296 if tokenHeadScores[token] == highestScore:
297 bestTokens.append(token)
298 return bestTokens[-1]
299
301 """
302 A head token is chosen using a heuristic that prefers tokens closer to the
303 root of the dependency parse. In a list of candidate tokens, the one with
304 the highest score is the head token. The return value of this method
305 is a dictionary that maps token elements to their scores.
306 """
307 tokenHeadScores = {}
308
309
310 for token in tokens:
311 tokenHeadScores[token] = 0
312 for dependency in dependencies:
313 if dependency.get("t1") == token.get("id") or dependency.get("t2") == token.get("id"):
314 tokenHeadScores[token] = 1
315 break
316
317
318 for token in tokens:
319 tokenText = token.get("text")
320 if tokenText == "\\" or tokenText == "/" or tokenText == "-":
321 tokenHeadScores[token] = -1
322
323
324
325
326
327 depTypesToInclude = ["prep", "nn", "det", "hyphen", "num", "amod", "nmod", "appos", "measure", "dep", "partmod"]
328
329 modifiedScores = True
330 loopCount = 0
331 while modifiedScores == True:
332 if loopCount > 20:
333 print >> sys.stderr, "Warning, possible loop in parse for sentence", sentenceId
334 break
335 modifiedScores = False
336 for token1 in tokens:
337 for token2 in tokens:
338 for dep in dependencies:
339 if dep.get("t1") == token1.get("id") and dep.get("t2") == token2.get("id") and (dep.get("type") in depTypesToInclude):
340
341
342 if tokenHeadScores[token1] <= tokenHeadScores[token2]:
343 tokenHeadScores[token1] = tokenHeadScores[token2] + 1
344 modifiedScores = True
345 loopCount += 1
346 return tokenHeadScores
347
348 if __name__=="__main__":
349 import sys
350 print >> sys.stderr, "##### Calculating entity head token offsets #####"
351
352 from optparse import OptionParser
353
354 try:
355 import psyco
356 psyco.full()
357 print >> sys.stderr, "Found Psyco, using"
358 except ImportError:
359 print >> sys.stderr, "Psyco not installed"
360
361 optparser = OptionParser(usage="%prog [options]\nRecalculate head token offsets.")
362 optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in interaction xml format", metavar="FILE")
363 optparser.add_option("-o", "--output", default=None, dest="output", help="Output file in interaction xml format.")
364 optparser.add_option("-d", "--dictionary", default=None, dest="dictionary", help="Corpus file to use as dictionary of entity strings.")
365 optparser.add_option("-m", "--methods", default=None, dest="methods", help="")
366 optparser.add_option("-p", "--parse", default="split-McClosky", dest="parse", help="Parse element name for calculating head offsets")
367 optparser.add_option("-t", "--tokenization", default="split-McClosky", dest="tokenization", help="Tokenization element name for calculating head offsets")
368 (options, args) = optparser.parse_args()
369
370 print >> sys.stderr, "Loading corpus"
371 corpus = ETUtils.ETFromObj(options.input)
372 print >> sys.stderr, "Finding heads"
373 findHeads(corpus, options.dictionary, ["REMOVE", "DICT", "SYNTAX"], options.parse, options.tokenization)
374
375 if options.output != None:
376 print >> sys.stderr, "Writing corpus"
377 ETUtils.write(corpus, options.output)
378