1 """
2 Edge Examples
3 """
4 __version__ = "$Revision: 1.13 $"
5
6 import sys, os
7 thisPath = os.path.dirname(os.path.abspath(__file__))
8 sys.path.append(os.path.abspath(os.path.join(thisPath,"..")))
9 from ExampleBuilders.ExampleBuilder import ExampleBuilder
10 from Core.IdSet import IdSet
11 import Core.ExampleUtils as ExampleUtils
12 from FeatureBuilders.MultiEdgeFeatureBuilder import MultiEdgeFeatureBuilder
13 from FeatureBuilders.TriggerFeatureBuilder import TriggerFeatureBuilder
14
15 from Core.SimpleGraph import Graph
16 from Utils.ProgressCounter import ProgressCounter
17 import Utils.Libraries.combine as combine
18 import Utils.ElementTreeUtils as ETUtils
19 import gzip
20 import types
21
23
24
def combinations(iterable, r):
    """
    Yield every r-length combination of the items of iterable as a tuple.

    Items are combined by position, and the tuples are produced in
    lexicographic order of those positions. Nothing is yielded when r is
    larger than the number of items.

    combinations('ABCD', 2) --> AB AC AD BC BD CD
    combinations(range(4), 3) --> 012 013 023 123
    """
    pool = tuple(iterable)
    n = len(pool)
    if r > n:
        return
    # The positions are updated in place below, so a real list is required
    # (a lazy range object does not support item assignment).
    indices = list(range(r))
    yield tuple(pool[i] for i in indices)
    while True:
        # Look for the rightmost position that can still be advanced.
        for i in reversed(range(r)):
            if indices[i] != i + n - r:
                break
        else:
            # Every position holds its maximum value: nothing left to emit.
            return
        indices[i] += 1
        # Reset all positions to the right of i to the smallest valid values.
        for j in range(i + 1, r):
            indices[j] = indices[j - 1] + 1
        yield tuple(pool[i] for i in indices)
41
def compareInteractionPrecedence(e1, e2):
    """
    Comparison function for ordering interaction length tuples.

    e1/e2 = (interaction, pathdist, lindist, tok2pos)

    The tuples are compared on path distance first, then on linear
    distance and finally on the position of the second token. The
    interaction itself (index 0) never takes part in the comparison.

    Returns 1 if e1 sorts after e2, -1 if it sorts before, and 0 when all
    three values are equal.
    """
    for field in (1, 2, 3):
        if e1[field] > e2[field]:
            return 1
        if e1[field] < e2[field]:
            return -1
    return 0
62
63
65 """
66 This example builder makes unmerging examples, i.e. examples describing
67 potential events.
68 """
69
def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
    """
    Set up the id sets, style parameters and feature builders.

    style      -- style definition, interpreted by self.getParameters
    length     -- stored as self.pathLengths; must be None
    types      -- stored as self.types; a new empty list is used when
                  omitted, so instances never share one list
    featureSet -- feature id set; IdSet() is created when None
    classSet   -- class id set; IdSet(1) is created when None. In both
                  cases the class name "neg" must map to id 1.
    """
    if featureSet is None:
        featureSet = IdSet()
    if classSet is None:
        classSet = IdSet(1)
    # Make the lookup outside the assert statement so that the call still
    # happens when assertions are disabled (python -O).
    # NOTE(review): getId presumably registers unseen names; confirm
    # against Core.IdSet.
    negativeClassId = classSet.getId("neg")
    assert negativeClassId == 1

    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)

    self.styles = self._setDefaultParameters(["trigger_features","typed","directed","no_linear","entities","genia_limits",
        "noAnnType", "noMasking", "maxFeatures", "no_merge", "disable_entity_features",
        "disable_single_element_features", "disable_ngram_features", "disable_path_edge_features"])
    self.styles = self.getParameters(style)

    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    self.multiEdgeFeatureBuilder.noAnnType = self.styles["noAnnType"]
    self.multiEdgeFeatureBuilder.maskNamedEntities = not self.styles["noMasking"]
    self.multiEdgeFeatureBuilder.maximum = self.styles["maxFeatures"]

    # Path length limits are not supported by this builder.
    self.pathLengths = length
    assert self.pathLengths is None
    self.types = [] if types is None else types

    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    self.triggerFeatureBuilder.useNonNameEntities = True
98
99
100
def getInteractionEdgeLengths(self, sentenceGraph, paths):
    """
    Return dependency and linear length of all interaction edges
    (measured between the two tokens).

    The result is a dictionary that maps each interaction of
    sentenceGraph to the tuple (interaction, pathLength, linLength, t2Pos):

    pathLength -- length of the shortest path that paths.getPaths returns
                  for the two head tokens, or 999999 when both entities
                  share one head token or no path is returned
    linLength  -- absolute difference of the two token positions
    t2Pos      -- position of the e2 head token in sentenceGraph.tokens
    """
    tokens = sentenceGraph.tokens
    interactionLengths = {}
    for interaction in sentenceGraph.interactions:
        # Head tokens of the two entities that the interaction connects
        e1 = sentenceGraph.entitiesById[interaction.get("e1")]
        e2 = sentenceGraph.entitiesById[interaction.get("e2")]
        t1 = sentenceGraph.entityHeadTokenByEntity[e1]
        t2 = sentenceGraph.entityHeadTokenByEntity[e2]

        # Dependency distance; 999999 stands for "no path"
        pathLength = 999999
        if t1 != t2:
            tokenPaths = paths.getPaths(t1, t2)
            if len(tokenPaths) > 0:
                pathLength = min(len(x) for x in tokenPaths)

        # Linear distance; the scan stops once both tokens have been seen
        t1Pos = -1
        t2Pos = -1
        for i, token in enumerate(tokens):
            if token == t1:
                t1Pos = i
                if t2Pos != -1:
                    break
            if token == t2:
                t2Pos = i
                if t1Pos != -1:
                    break
        linLength = abs(t1Pos - t2Pos)
        interactionLengths[interaction] = (interaction, pathLength, linLength, t2Pos)
    return interactionLengths
135
def eventIsGold(self, entity, arguments, sentenceGraph, goldGraph, goldEntitiesByOffset):
    """
    Tell whether a candidate event also exists in the gold annotation.

    entity               -- trigger entity of the candidate event
    arguments            -- interactions used as the event's arguments
    sentenceGraph        -- graph that the candidate comes from
    goldGraph            -- graph holding the gold annotation
    goldEntitiesByOffset -- gold entities grouped by "headOffset"

    The candidate is gold if some gold entity with the same head offset
    and type has the same number of outgoing interactions, the same
    count of each interaction type, and, for every argument, a gold
    interaction of that type whose target entity has the same head offset
    and type as the argument's target.

    Returns True or False. False is also returned when no gold entities
    are known for the head offset.
    """
    offset = entity.get("headOffset")
    if offset not in goldEntitiesByOffset:
        return False
    eType = entity.get("type")

    # The candidate's argument type counts are the same for every gold entity
    argTypeCounts = {}
    for argument in arguments:
        argType = argument.get("type")
        argTypeCounts[argType] = argTypeCounts.get(argType, 0) + 1

    for goldEntity in goldEntitiesByOffset[offset]:
        # The trigger types must agree
        if goldEntity.get("type") != eType:
            continue
        goldEntityId = goldEntity.get("id")

        # Collect the gold event's arguments
        goldInteractions = []
        for goldInteraction in goldGraph.interactions:
            if goldInteraction.get("e1") == goldEntityId:
                goldInteractions.append(goldInteraction)

        # Same number of arguments...
        if len(goldInteractions) != len(arguments):
            continue

        # ...and the same number of arguments of each type
        goldTypeCounts = {}
        for goldInteraction in goldInteractions:
            goldType = goldInteraction.get("type")
            goldTypeCounts[goldType] = goldTypeCounts.get(goldType, 0) + 1
        if argTypeCounts != goldTypeCounts:
            continue

        # Every argument needs a gold counterpart with an equivalent target
        allArgumentsFound = True
        for argument in arguments:
            e2Entity = sentenceGraph.entitiesById[argument.get("e2")]
            e2Offset = e2Entity.get("headOffset")
            e2Type = e2Entity.get("type")
            argType = argument.get("type")

            found = False
            for goldInteraction in goldInteractions:
                if goldInteraction.get("type") == argType:
                    goldE2Entity = goldGraph.entitiesById[goldInteraction.get("e2")]
                    if goldE2Entity.get("headOffset") == e2Offset and goldE2Entity.get("type") == e2Type:
                        found = True
                        break
            if not found:
                allArgumentsFound = False
                break

        # One matching gold event is enough
        if allArgumentsFound:
            return True

    return False
205
def getArgumentCombinations(self, eType, interactions, entityId=None):
    """
    Return the candidate argument sets for an event of type eType.

    eType        -- type of the trigger entity
    interactions -- outgoing interactions of the trigger entity
    entityId     -- not used by this method

    "Binding":  every combination of one to ten "Theme" interactions,
                as tuples.
    "Process":  the empty argument list plus one single-element list per
                "Participant" interaction.
    other:      each theme on its own, and each theme together with the
                secondary arguments that the event type allows, as
                produced by combine.combine.
    """
    if eType == "Binding":
        themes = [x for x in interactions if x.get("type") == "Theme"]
        combs = []
        # Limit the combination size to ten themes
        for size in range(1, min(len(themes), 10) + 1):
            combs.extend(combinations(themes, size))
        return combs

    if eType == "Process":
        # A process may have no arguments at all
        argCombinations = [[]]
        for interaction in interactions:
            if interaction.get("type") == "Participant":
                argCombinations.append([interaction])
        return argCombinations

    themes = []
    causes = []
    siteArgs = []
    contextGenes = []
    sideChains = []
    # Nothing is ever collected here: only the five types listed below
    # are accepted, so location arguments are skipped like any other type.
    locTargets = []
    argumentsByType = {"Theme": themes, "Cause": causes, "SiteArg": siteArgs,
                       "Contextgene": contextGenes, "Sidechain": sideChains}
    for interaction in interactions:
        bucket = argumentsByType.get(interaction.get("type"))
        if bucket is not None:
            bucket.append(interaction)

    # Drop the secondary arguments that the event type cannot have
    if eType.find("egulation") == -1 and eType != "Catalysis":
        causes = []
    if eType != "Glycosylation":
        sideChains = []
    if eType not in ["Acetylation", "Methylation"]:
        contextGenes = []
    if eType == "Catalysis":
        siteArgs = []

    themeAloneCombinations = [[theme] for theme in themes]

    return combine.combine(themes, causes) \
        + combine.combine(themes, siteArgs) \
        + combine.combine(themes, sideChains) \
        + combine.combine(themes, contextGenes) \
        + combine.combine(themes, siteArgs, sideChains) \
        + combine.combine(themes, siteArgs, contextGenes) \
        + combine.combine(themes, locTargets) \
        + themeAloneCombinations
285
286
287
288
289
290
291
292
293
295
296
297
298
299
300
def sortInteractionsById(self, interactions):
    """
    Return the interactions as a new list in numerical id order.

    The sort key is the integer that follows the last ".i" of each
    interaction's "id" attribute. The interaction objects themselves are
    never compared; interactions with equal numbers keep their input order.
    """
    def idNumber(interaction):
        return int(interaction.get("id").split(".i")[-1])

    return sorted(interactions, key=idNumber)
306
308 """
309 Build examples for a single sentence. Returns a list of examples.
310 See Core/ExampleUtils for example format.
311 """
312 self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
313 self.triggerFeatureBuilder.initSentence(sentenceGraph)
314
315
316 exampleIndex = 0
317
318
319
320 undirected = sentenceGraph.dependencyGraph.toUndirected()
321 paths = undirected
322
323
324 self.interactionLenghts = self.getInteractionEdgeLengths(sentenceGraph, paths)
325
326
327 tokenByOffset = {}
328 for i in range(len(sentenceGraph.tokens)):
329 token = sentenceGraph.tokens[i]
330 if goldGraph != None:
331 goldToken = goldGraph.tokens[i]
332 assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")
333 tokenByOffset[token.get("charOffset")] = token.get("id")
334
335
336 goldEntitiesByOffset = {}
337 if goldGraph != None:
338 for entity in goldGraph.entities:
339 offset = entity.get("headOffset")
340 assert offset != None
341 if not goldEntitiesByOffset.has_key(offset):
342 goldEntitiesByOffset[offset] = []
343 goldEntitiesByOffset[offset].append(entity)
344
345
346
347
348
349
350
351
352
353
354 if self.styles["no_merge"]:
355 mergeInput = False
356 entities = sentenceGraph.entities
357 else:
358 mergeInput = True
359 sentenceGraph.mergeInteractionGraph(True)
360 entities = sentenceGraph.mergedEntities
361 self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))
362
363 exampleIndex = 0
364 for entity in entities:
365 eType = entity.get("type")
366 assert eType != None, entity.attrib
367 eType = str(eType)
368
369
370
371
372
373
374
375 interactions = [x[2] for x in sentenceGraph.getOutInteractions(entity, mergeInput)]
376 interactions = self.sortInteractionsById(interactions)
377 argCombinations = self.getArgumentCombinations(eType, interactions, entity.get("id"))
378
379
380 assert argCombinations != None, (entity.get("id"), entity.get("type"))
381 for argCombination in argCombinations:
382 if eType != "Process":
383 assert len(argCombination) > 0, eType + ": " + str(argCombinations)
384
385 if goldGraph != None:
386 isGoldEvent = self.eventIsGold(entity, argCombination, sentenceGraph, goldGraph, goldEntitiesByOffset)
387
388
389 else:
390 isGoldEvent = False
391
392 if isGoldEvent:
393
394 category = eType
395 if category.find("egulation") != -1:
396 category = "All_regulation"
397 elif category != "Binding":
398 category = "Other"
399 else:
400 category = "neg"
401
402 features = {}
403
404 argString = ""
405 for arg in argCombination:
406 argString += "," + arg.get("id")
407 extra = {"xtype":"um","e":entity.get("id"),"i":argString[1:],"etype":eType,"class":category}
408 assert type(extra["etype"]) == types.StringType, extra
409 self.exampleStats.addExample(category)
410 example = self.buildExample(sentenceGraph, paths, entity, argCombination, interactions)
411 example[0] = sentenceGraph.getSentenceId()+".x"+str(exampleIndex)
412 example[1] = self.classSet.getId(category)
413 example[3] = extra
414
415 ExampleUtils.appendExamples([example], outfile)
416 exampleIndex += 1
417
418
419 return exampleIndex
420
def buildExample(self, sentenceGraph, paths, eventEntity, argCombination, allInteractions):
    """
    Build the features of one candidate event.

    sentenceGraph   -- graph of the sentence being processed
    paths           -- object whose getPaths gives token-to-token paths
    eventEntity     -- trigger entity of the candidate event
    argCombination  -- interactions used as arguments of the candidate
    allInteractions -- all outgoing interactions of the trigger; those
                       not in argCombination are treated as context

    A new feature dictionary is created and bound to self.features for
    the duration of the call. Reads self.interactionLenghts for "Binding"
    triggers.

    Returns the list [None, None, features, None]; the remaining slots
    are left for the caller to fill in.
    """
    features = {}
    self.features = features

    self.buildInterArgumentBagOfWords(argCombination, sentenceGraph)

    eventEntityType = eventEntity.get("type")
    isBinding = eventEntityType == "Binding"
    if isBinding:
        # Rank the outgoing interactions by path distance, linear distance
        # and second-token position. The rank is added to the theme tags.
        rankedLengths = sorted(
            [self.interactionLenghts[interaction] for interaction in allInteractions],
            key=lambda lengths: (lengths[1], lengths[2], lengths[3]))
        interactionIndex = {}
        for rank, lengths in enumerate(rankedLengths):
            interactionIndex[lengths[0]] = rank

    # Features of the trigger token itself
    eventToken = sentenceGraph.entityHeadTokenByEntity[eventEntity]
    self.triggerFeatureBuilder.setFeatureVector(self.features)
    self.triggerFeatureBuilder.tag = "trg_"
    self.triggerFeatureBuilder.buildFeatures(eventToken)
    self.triggerFeatureBuilder.tag = None

    # Features of the interactions chosen as arguments
    argThemeCount = 0
    argCauseCount = 0
    argCounts = {}
    for arg in argCombination:
        argType = arg.get("type")
        if argType == "Theme":
            argThemeCount += 1
            self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, "argTheme")
            if isBinding:
                rankedTag = "argTheme" + str(interactionIndex[arg])
                self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, rankedTag)
        elif argType == "Cause":
            argCauseCount += 1
            self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, "argCause")
        else:
            if argType not in argCounts:
                argCounts[argType] = 0
            self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, "arg" + argType)
            argCounts[argType] += 1

    # Features of the remaining (context) interactions. Every context
    # interaction that is not a Theme is counted and tagged as a cause.
    contextThemeCount = 0
    contextCauseCount = 0
    for interaction in allInteractions:
        if interaction in argCombination:
            continue
        if interaction.get("type") == "Theme":
            contextThemeCount += 1
            self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, interaction, "conTheme")
            if isBinding:
                rankedTag = "conTheme" + str(interactionIndex[interaction])
                self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, interaction, rankedTag)
        else:
            contextCauseCount += 1
            self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, interaction, "conCause")

    # Count features, each as a numeric value and as an indicator
    self.setFeature("argCount", len(argCombination))
    self.setFeature("argCount_" + str(len(argCombination)), 1)
    self.setFeature("interactionCount", len(allInteractions))
    self.setFeature("interactionCount_" + str(len(allInteractions)), 1)

    self.setFeature("argThemeCount", argThemeCount)
    self.setFeature("argThemeCount_" + str(argThemeCount), 1)
    self.setFeature("argCauseCount", argCauseCount)
    self.setFeature("argCauseCount_" + str(argCauseCount), 1)
    for key in sorted(argCounts.keys()):
        self.setFeature("arg" + key + "Count", argCounts[key])
        self.setFeature("arg" + key + "Count_" + str(argCounts[key]), 1)

    self.setFeature("interactionThemeCount", contextThemeCount)
    self.setFeature("interactionThemeCount_" + str(contextThemeCount), 1)
    self.setFeature("interactionCauseCount", contextCauseCount)
    self.setFeature("interactionCauseCount_" + str(contextCauseCount), 1)

    # Detach the trigger feature builder from this example
    self.triggerFeatureBuilder.tag = ""
    self.triggerFeatureBuilder.setFeatureVector(None)

    return [None, None, features, None]
517
def buildArgumentFeatures(self, sentenceGraph, paths, features, eventToken, arg, tag):
    """
    Add the features that describe one interaction of an event.

    sentenceGraph -- graph of the sentence being processed
    paths         -- passed on to buildEdgeFeatures
    features      -- feature dictionary, passed on to buildEdgeFeatures
    eventToken    -- head token of the event trigger
    arg           -- the interaction; its "e2" entity is the target
    tag           -- prefix for the names of the generated features

    Builds edge features between the trigger token and the target's head
    token, trigger features for the target token, and indicator features
    for the target's kind and type.
    """
    argEntity = sentenceGraph.entitiesById[arg.get("e2")]
    argToken = sentenceGraph.entityHeadTokenByEntity[argEntity]
    self.buildEdgeFeatures(sentenceGraph, paths, features, eventToken, argToken, tag)

    self.triggerFeatureBuilder.tag = tag + "trg_"
    self.triggerFeatureBuilder.buildFeatures(argToken)

    if argEntity.get("isName") == "True":
        self.setFeature(tag + "Protein", 1)
    else:
        # A target that is not a name is treated as a nested event
        self.setFeature(tag + "Event", 1)
        self.setFeature("nestingEvent", 1)
    self.setFeature(tag + "_" + argEntity.get("type"), 1)
530
def buildEdgeFeatures(self, sentenceGraph, paths, features, eventToken, argToken, tag):
    """
    Add path-based features for the edge from eventToken to argToken.

    The multi-edge feature builder is pointed at the given feature
    dictionary, with tag + "_" as its tag, for the duration of the call
    and is detached again before returning. Which feature groups are
    generated is controlled by the "disable_..." entries of self.styles.
    """
    edgeBuilder = self.multiEdgeFeatureBuilder
    styles = self.styles

    edgeBuilder.tag = tag + "_"
    edgeBuilder.setFeatureVector(features, None, None, False)

    self.setFeature(tag + "_present", 1)

    # Use the first available path; fall back to a direct two-token path
    # when both tokens are the same or no path is returned.
    candidatePaths = paths.getPaths(eventToken, argToken)
    if eventToken == argToken or len(candidatePaths) == 0:
        path = [eventToken, argToken]
    else:
        path = candidatePaths[0]

    if not styles["disable_entity_features"]:
        edgeBuilder.buildEntityFeatures(sentenceGraph)
    edgeBuilder.buildPathLengthFeatures(path)

    if not styles["disable_single_element_features"]:
        edgeBuilder.buildSingleElementFeatures(path, sentenceGraph)
    if not styles["disable_ngram_features"]:
        for gramLength in (2, 3, 4):
            edgeBuilder.buildPathGrams(gramLength, path, sentenceGraph)
    if not styles["disable_path_edge_features"]:
        edgeBuilder.buildPathEdgeFeatures(path, sentenceGraph)

    # Detach the builder from this feature dictionary
    edgeBuilder.setFeatureVector(None, None, None, False)
    edgeBuilder.tag = ""
562
def buildInterArgumentBagOfWords(self, arguments, sentenceGraph):
    """
    Add bag-of-words features for the tokens between the arguments.

    arguments     -- interactions whose "e2" entities are the arguments
    sentenceGraph -- graph of the sentence being processed

    Does nothing for fewer than two arguments. Otherwise the distance
    between the leftmost and rightmost argument head tokens is added as a
    feature, followed by one feature per distinct text of the tokens
    strictly between them that are neither entity heads nor names. Extra
    features mark "/" and "-", and the case of a single distinct text.
    """
    if len(arguments) < 2:
        return

    indexByToken = {}
    for i, token in enumerate(sentenceGraph.tokens):
        indexByToken[token] = i

    # Positions of the argument head tokens
    argTokenIndices = set()
    for arg in arguments:
        argEntity = sentenceGraph.entitiesById[arg.get("e2")]
        argToken = sentenceGraph.entityHeadTokenByEntity[argEntity]
        argTokenIndices.add(indexByToken[argToken])
    minIndex = min(argTokenIndices)
    maxIndex = max(argTokenIndices)
    self.setFeature("argBoWRange", (maxIndex - minIndex))
    self.setFeature("argBoWRange_" + str(maxIndex - minIndex), 1)

    # Distinct texts of the plain tokens between the outermost arguments
    bow = set()
    for i in range(minIndex + 1, maxIndex):
        token = sentenceGraph.tokens[i]
        if len(sentenceGraph.tokenIsEntityHead[token]) == 0 and not sentenceGraph.tokenIsName[token]:
            bow.add(token.get("text"))
    # Sorted so that the features are always generated in the same order
    bow = sorted(bow)
    for word in bow:
        self.setFeature("argBoW_" + word, 1)
        if word in ["/", "-"]:
            self.setFeature("argBoW_slashOrHyphen", 1)
    if len(bow) == 1:
        self.setFeature("argBoWonly_" + bow[0], 1)
        if bow[0] in ["/", "-"]:
            self.setFeature("argBoWonly_slashOrHyphen", 1)
594