1 import sys, os
2 from collections import defaultdict
3 extraPath = os.path.dirname(os.path.abspath(__file__))+"/../.."
4 sys.path.append(extraPath)
5 from Utils.ProgressCounter import ProgressCounter
6 try:
7 import xml.etree.cElementTree as ET
8 except ImportError:
9 import cElementTree as ET
10 import Utils.ElementTreeUtils as ETUtils
11 import Utils.Range as Range
12
# NOTE(review): the enclosing "def" line is missing from this listing; the call
# "fixIndices(toKeep, tokens)" below implies a signature of (phrases, tokens).
# Indentation reconstructed from control flow — the placement of "break" at the
# end-offset test is inferred; confirm against the full file.
# Re-aligns each phrase element's token-index attributes ("begin"/"end") with
# the token list, using character offsets as the ground truth.
fixCount = 0      # phrases whose begin/end had to be corrected
phraseCount = 0   # total phrases examined
for phrase in phrases:
    fixed = False
    phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
    phraseBegin = int(phrase.get("begin"))  # token index where the phrase starts
    phraseEnd = int(phrase.get("end"))      # token index where the phrase ends
    for i in range(len(tokens)):
        token = tokens[i]
        tokOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
        if tokOffset[0] == phraseOffset[0]:
            # token starting where the phrase starts: repair "begin" if stale
            if phraseBegin != i:
                phrase.set("begin", str(i))
                fixed = True
        if tokOffset[1] == phraseOffset[1]:
            # token ending where the phrase ends: repair "end" if stale,
            # then stop scanning tokens for this phrase
            if phraseEnd != i:
                phrase.set("end", str(i))
                fixed = True
            break
    if fixed:
        fixCount += 1
    phraseCount += 1
# NOTE(review): fixCount/phraseCount are tallied but not visibly returned or
# printed in this excerpt — confirm their use against the full file.
36
37
# NOTE(review): missing "def" header in this listing; the body reads "parse",
# "tokens" and "filter" and returns the kept phrases, so the signature is
# presumably (parse, tokens, filter=None) — confirm against the full file.
# Collects the <phrase> elements of a parse, dropping those without a character
# offset and, when a type filter is given, those whose type is not allowed;
# the survivors get their token indices re-aligned before being returned.
phrases = parse.findall("phrase")
toKeep = []
for phrase in phrases:
    # a phrase without a character offset cannot be mapped onto tokens
    if phrase.get("charOffset") == None:
        continue
    # "filter" (shadows the builtin) is an optional set of allowed phrase types
    if filter != None and phrase.get("type") not in filter:
        continue
    toKeep.append(phrase)
fixIndices(toKeep, tokens)  # repair begin/end token indices in place
return toKeep
49
# NOTE(review): missing "def" header; the body reads "entities", "phrases" and
# "phraseDict" — confirm the exact signature against the full file.
# Partitions phrases by whether their character offset coincides exactly with a
# named entity: coinciding phrases are dropped (and their phraseDict entry is
# deleted so later offset lookups cannot match them); the rest are returned.
neOffsets = set()
for entity in entities:
    if entity.get("isName") != "True":
        continue
    # offsets covered exactly by named entities (string "start-end" form)
    neOffsets.add(entity.get("charOffset"))
phrasesToKeep = []
for phrase in phrases:
    phraseOffset = phrase.get("charOffset")
    if phraseOffset in neOffsets:
        # phrase coincides with a named entity: also remove it from the
        # offset-keyed dictionary
        phraseOffsetTuple = Range.charOffsetToSingleTuple(phraseOffset)
        if phraseOffsetTuple in phraseDict:
            del phraseDict[phraseOffsetTuple]
    else:
        phrasesToKeep.append(phrase)

return phrasesToKeep
67
# NOTE(review): missing "def" header; groups "phrases" by character offset.
# Returns {offset tuple: [phrase, ...]} — several phrases may share an offset.
phraseDict = {}

for phrase in phrases:
    phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
    if not phraseDict.has_key(phraseOffset):  # Python 2 dict idiom
        phraseDict[phraseOffset] = []
    phraseDict[phraseOffset].append(phrase)
return phraseDict
77
# NOTE(review): missing "def" header; takes an iterable of phrase elements.
# Tallies phrases by their "type" attribute and returns {type: count}.
counts = {}
for phrase in phrases:
    pType = phrase.get("type")
    if pType not in counts:
        counts[pType] = 0
    counts[pType] += 1
return counts
86
# NOTE(review): missing "def" header; call sites elsewhere in this file use
# makePhrase(type, offset, begin, end).
# Builds a new <phrase> element with the given type, token span
# (begin/end token indices) and character offset.
e = ET.Element("phrase")
e.set("type", type)  # "type" shadows the builtin; name comes from the (missing) signature
e.set("begin", str(begin))
e.set("end", str(end))
e.set("charOffset", str(offset[0])+"-"+str(offset[1]))  # canonical "start-end" string
return e
94
# NOTE(review): missing "def" header; the body reads "phrases", "tokens",
# "filter" and "phraseDict" — confirm the signature against the full file.
# For each (optionally type-filtered) phrase, scans the phrase's tokens and,
# at every preposition (Penn Treebank POS "IN") that has a preceding token,
# derives a truncated sub-phrase of type "<type>-IN" ending just before the
# preposition. Newly seen offsets are also registered in phraseDict.
newPhrases = []
for phrase in phrases:
    if filter != None and phrase.get("type") not in filter:
        continue
    phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
    phraseBegin = int(phrase.get("begin"))
    phraseEnd = int(phrase.get("end"))
    prevToken = None
    tokCount = 0
    for token in tokens[phraseBegin:phraseEnd+1]:
        if token.get("POS") == "IN" and prevToken != None:
            # cut the phrase off at the token preceding this preposition
            newPhraseOffset = (phraseOffset[0], Range.charOffsetToSingleTuple(prevToken.get("charOffset"))[-1])
            newPhrase = makePhrase(phrase.get("type") + "-IN",
                                   newPhraseOffset,
                                   phraseBegin,
                                   phraseBegin + tokCount-1)
            if not phraseDict.has_key(newPhraseOffset):
                # only the first phrase created for a given offset is kept
                newPhrases.append(newPhrase)
                phraseDict[newPhraseOffset] = [newPhrase]
        prevToken = token
        tokCount += 1
return newPhrases
119
# NOTE(review): missing "def" header; mirrors the "-IN" sub-phrase builder
# above — confirm the signature against the full file.
# For each (optionally type-filtered) phrase immediately preceded by a
# determiner token (Penn Treebank POS "DT"), derives an extended phrase of
# type "DT-<type>" that also covers the determiner. Newly seen offsets are
# registered in phraseDict.
newPhrases = []
for phrase in phrases:
    if filter != None and phrase.get("type") not in filter:
        continue
    phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
    phraseBegin = int(phrase.get("begin"))
    phraseEnd = int(phrase.get("end"))
    if phraseBegin > 0 and tokens[phraseBegin-1].get("POS") == "DT":
        # extend the span one token to the left to include the determiner
        newPhraseOffset = (Range.charOffsetToSingleTuple(tokens[phraseBegin-1].get("charOffset"))[0], phraseOffset[1])
        newPhrase = makePhrase("DT-" + phrase.get("type"),
                               newPhraseOffset,
                               phraseBegin - 1,
                               phraseEnd)
        if not phraseDict.has_key(newPhraseOffset):
            # only the first phrase created for a given offset is kept
            newPhrases.append(newPhrase)
            phraseDict[newPhraseOffset] = [newPhrase]
return newPhrases
139
# NOTE(review): missing "def" header; the body reads "tokens", "includePOS"
# and "phraseDict" — confirm the signature against the full file.
# Creates a single-token phrase of type "TOK-t<POS>" for every token whose POS
# tag is in includePOS and whose offset is not yet covered by another phrase.
newPhrases = []
for i in range(len(tokens)):
    token = tokens[i]
    tokPOS = token.get("POS")
    if tokPOS in includePOS:
        tokOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
        if not phraseDict.has_key(tokOffset):
            newPhrase = makePhrase("TOK-t" + tokPOS, tokOffset, i, i)
            newPhrases.append(newPhrase)
            phraseDict[tokOffset] = [newPhrase]
return newPhrases
152
170
171
# NOTE(review): missing "def" header; call sites in this file use
# getMatchingPhrases(entity, phraseOffsets, phraseDict).
# Returns all phrases whose character span lies between the entity's full span
# ("charOffset", the widest acceptable match) and its alternative span
# ("altOffset", the narrowest): maxOffset contains the phrase, and the phrase
# contains minOffset. Named entities are never matched.
matches = []
if entity.get("isName") == "True":
    return []
maxOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
minOffset = entity.get("altOffset")
if minOffset != None:
    minOffset = Range.charOffsetToSingleTuple(minOffset)
else:
    # no alternative offset: the phrase must both contain and be contained
    # by the entity's own span
    minOffset = maxOffset
for phraseOffset in phraseOffsets:
    if Range.contains(maxOffset, phraseOffset) and Range.contains(phraseOffset, minOffset):
        matches.extend(phraseDict[phraseOffset])
return matches
186
197
# NOTE(review): missing "def" header; the body reads "entities" and
# "phraseDict" — confirm the signature against the full file.
# Maps each non-name entity to its best-matching phrase: a unique match is
# taken as-is; ties are broken by selectBestMatch (defined elsewhere in this
# file, outside this excerpt). Returns {phrase element: [entity, ...]}.
phraseOffsets = phraseDict.keys()
phraseToEntity = {}
for entity in entities:
    if entity.get("isName") == "True":
        continue  # named entities are not mapped to phrases
    matches = getMatchingPhrases(entity, phraseOffsets, phraseDict)
    if len(matches) == 1:
        bestMatch = matches[0]
    elif len(matches) == 0:
        bestMatch = None
    else:
        bestMatch = selectBestMatch(entity, matches)
    if bestMatch != None:
        if not phraseToEntity.has_key(bestMatch):
            phraseToEntity[bestMatch] = []
        phraseToEntity[bestMatch].append(entity)
return phraseToEntity
216
228
# NOTE(review): missing "def" header; the call below is
# processCorpus(options.input, options.parser), so the signature is presumably
# (input, parserName). Indentation reconstructed from control flow — confirm
# against the full file.
# Loads an interaction-XML corpus, builds phrases for every parsed sentence
# and prints a per-entity phrase-matching report plus aggregate statistics.
print >> sys.stderr, "Loading corpus file", input  # "input" shadows the builtin
corpusRoot = ETUtils.ETFromObj(input).getroot()
documents = corpusRoot.findall("document")

counts = defaultdict(int)
# matchByType[type] = [available phrases of this type, matched phrases of this type]
matchByType = defaultdict(lambda : [0,0])
filteredMatchByType = defaultdict(lambda : [0,0])
# phrase types considered useful matches; "filter" shadows the builtin
filter = set(["NP", "TOK-tIN", "WHADVP", "WHNP", "TOK-tWP$", "TOK-tPRP$", "NP-IN"])

for document in documents:
    for sentence in document.findall("sentence"):
        entities = sentence.findall("entity")
        parse = ETUtils.getElementByAttrib(sentence.find("sentenceanalyses"), "parse", {"parser":parserName})
        if parse == None:
            continue  # sentence has no parse from the requested parser
        tokenization = ETUtils.getElementByAttrib(sentence.find("sentenceanalyses"), "tokenization", {"tokenizer":parse.get("tokenizer")})
        # makePhrases, getNECounts and selectBestMatch are defined elsewhere
        # in this file (outside this excerpt)
        phrases, phraseDict = makePhrases(parse, tokenization, entities)
        phraseOffsets = phraseDict.keys()

        phraseNECounts = getNECounts(phrases, entities)

        # tally available phrases per type (slot [0] of the count pairs)
        for value in phraseDict.values():
            counts["phrases"] += len(value)
            for phrase in value:
                matchByType[phrase.get("type")][0] += 1
                if phrase.get("type") in filter:
                    filteredMatchByType[phrase.get("type")][0] += 1
                    counts["phrases-filtered"] += 1
                if phrase.get("type").find("NP") != -1:
                    # NP-like phrases additionally broken down by how many
                    # named entities they contain
                    matchByType[phrase.get("type")+"_NE"+str(phraseNECounts[phrase])][0] += 1
        counts["tokens"] += len(tokenization.findall("token"))

        # coreference role per entity id, from Coref interactions
        corefType = {}
        for interaction in sentence.findall("interaction"):
            if interaction.get("type") == "Coref":
                corefType[interaction.get("e1")] = "Anaphora"
                corefType[interaction.get("e2")] = "Antecedent"

        for entity in entities:
            if entity.get("isName") == "True":
                continue  # only non-name entities are matched to phrases
            counts["entity"] += 1
            print "entity", entity.get("id")
            print ETUtils.toStr(entity)
            matches = getMatchingPhrases(entity, phraseOffsets, phraseDict)
            count = 0          # all phrase matches for this entity
            filteredCount = 0  # matches whose type is in "filter"
            for phrase in matches:
                cType = "UNKNOWN"
                if corefType.has_key(entity.get("id")):
                    cType = corefType[entity.get("id")]
                print " match", count, ETUtils.toStr(phrase), "NE" + str(phraseNECounts[phrase]), "ctype:" + cType, "ent:" + ETUtils.toStr(entity)
                count += 1
                # slot [1] of the count pairs: matched phrases
                matchByType[phrase.get("type")][1] += 1
                matchByType[phrase.get("type")+"_"+cType][1] += 1
                matchByType[phrase.get("type")+"_"+cType+"_NE"+str(phraseNECounts[phrase])][1] += 1
                if phrase.get("type") in filter:
                    filteredCount += 1
                    filteredMatchByType[phrase.get("type")][1] += 1

            if count == 0:
                print " NO MATCH", ETUtils.toStr(entity)
                counts["no-match"] += 1
            else:
                counts["match"] += 1

            # report how an ambiguous (multi-phrase) match would be resolved
            if len(matches) > 1:
                bestMatch = selectBestMatch(entity, matches)
                print " MULTIMATCH("+ entity.get("charOffset")+","+str(entity.get("altOffset")) + ")", ", ".join([x.get("type") + "_" + x.get("charOffset") for x in matches]), "SEL(" + bestMatch.get("type") + "_" + bestMatch.get("charOffset") + ")"

            if filteredCount == 0: counts["no-match-filtered"] += 1
            else: counts["match-filtered"] += 1
print "Match"
for key in sorted(matchByType.keys()):
    print " ", key, " ", matchByType[key]
print "Filtered", filteredMatchByType
print "Counts", counts
324
if __name__ == "__main__":
    # Command-line entry point: parse the options and run processCorpus.
    import sys
    print >> sys.stderr, "##### Split elements with merged types #####"

    from optparse import OptionParser

    # Optional psyco JIT speed-up (Python 2 era); absence is only reported.
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    cmdParser = OptionParser(usage="%prog [options]\n")
    cmdParser.add_option("-i", "--input", default=None, dest="input", help="Corpus in interaction xml format", metavar="FILE")
    cmdParser.add_option("-p", "--parser", default="split-McClosky", dest="parser", help="")
    options, extraArgs = cmdParser.parse_args()

    # an input corpus is mandatory
    if options.input is None:
        print >> sys.stderr, "Error, input file not defined."
        cmdParser.print_help()
        sys.exit(1)

    processCorpus(options.input, options.parser)
349