1 """
2 Main class for representing a sentence
3 """
4 __version__ = "$Revision: 1.40 $"
5
6
7 from SimpleGraph import Graph
8 import sys, os
9 sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/..")
10 import Utils.Range as Range
11 import types
12 import copy
13
14
15
def loadCorpus(corpus, parse, tokenization=None, removeNameInfo=False, removeIntersentenceInteractionsFromCorpusElements=True):
    """
    Load an entire corpus through CorpusElements and add SentenceGraph-objects
    to its SentenceElements-objects.

    @param corpus: corpus file name or an already-parsed ElementTree-style object
    @param parse: name of the parse to load
    @param tokenization: name of the tokenization to load (None = the parse's default)
    @param removeNameInfo: passed through to CorpusElements
    @param removeIntersentenceInteractionsFromCorpusElements: passed through to CorpusElements
    @return: the CorpusElements object, with sentenceGraph set on each sentence
    """
    # Local imports avoid import cycles and keep module load light
    import Utils.ElementTreeUtils as ETUtils
    import sys
    from Utils.ProgressCounter import ProgressCounter
    from Utils.InteractionXML.CorpusElements import CorpusElements

    if type(corpus) == types.StringType:
        print >> sys.stderr, "Loading corpus file", corpus
    corpusTree = ETUtils.ETFromObj(corpus)
    corpusRoot = corpusTree.getroot()

    corpusElements = CorpusElements(corpusRoot, parse, tokenization, tree=corpusTree, removeNameInfo=removeNameInfo, removeIntersentenceInteractions=removeIntersentenceInteractionsFromCorpusElements)
    print >> sys.stderr, str(len(corpusElements.documentsById)) + " documents, " + str(len(corpusElements.sentencesById)) + " sentences"

    duplicateInteractionEdgesRemoved = 0
    counter = ProgressCounter(len(corpusElements.sentences), "Make sentence graphs")
    counter.showMilliseconds = True
    for sentence in corpusElements.sentences[:]:
        counter.update(1, "Making sentence graphs ("+sentence.sentence.get("id")+"): ")
        # A sentence without a parse cannot get a graph
        if len(sentence.tokens) == 0 or len(sentence.dependencies) == 0:
            sentence.sentenceGraph = None
            continue
        # Promote pair-elements to interactions, unless explicitly marked
        # as non-interacting (interaction="False")
        for pair in sentence.pairs:
            isInteraction = pair.get("interaction")
            if isInteraction == "True" or isInteraction == None:
                sentence.interactions.append(pair)
                if pair.get("type") == None:
                    pair.set("type", "undefined")
        graph = SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
        graph.mapInteractions(sentence.entities, sentence.interactions)
        graph.interSentenceInteractions = sentence.interSentenceInteractions
        duplicateInteractionEdgesRemoved += graph.duplicateInteractionEdgesRemoved
        sentence.sentenceGraph = graph
        graph.parseElement = sentence.parseElement
    print >> sys.stderr, "Skipped", duplicateInteractionEdgesRemoved, "duplicate interaction edges in SentenceGraphs"
    return corpusElements
72
def getCorpusIterator(input, output, parse, tokenization=None, removeNameInfo=False, removeIntersentenceInteractions=True):
    """
    Iterate over a corpus document by document, yielding for each document the
    list of its SentenceElements (with .sentenceGraph set, or None when the
    sentence has no parse). If output != None, each document element is also
    written back through ETWriter as it completes.

    @param input: corpus file name or object accepted by ETIteratorFromObj
    @param output: output file name, or None for no writing
    @param parse: name of the parse to load
    @param tokenization: name of the tokenization to load (None = the parse's default)
    @param removeNameInfo: unused here; kept for signature compatibility with loadCorpus
    @param removeIntersentenceInteractions: passed through to SentenceElements
    """
    import Utils.ElementTreeUtils as ETUtils
    from Utils.InteractionXML.SentenceElements import SentenceElements

    if output != None:
        etWriter = ETUtils.ETWriter(output)
    for eTuple in ETUtils.ETIteratorFromObj(input, ("start", "end")):
        element = eTuple[1]
        if eTuple[0] in ["end", "memory"] and element.tag == "document":
            sentences = []
            for sentenceElement in element.findall("sentence"):
                sentence = SentenceElements(sentenceElement, parse, tokenization, removeIntersentenceInteractions=removeIntersentenceInteractions)
                if len(sentence.tokens) == 0 or len(sentence.dependencies) == 0:
                    # no parse -> no graph for this sentence
                    sentence.sentenceGraph = None
                else:
                    graph = SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
                    graph.mapInteractions(sentence.entities, sentence.interactions)
                    graph.interSentenceInteractions = sentence.interSentenceInteractions
                    sentence.sentenceGraph = graph
                    graph.parseElement = sentence.parseElement
                sentences.append(sentence)
            yield sentences
            if output != None:
                etWriter.write(element)
        elif element.tag == "corpus" and output != None:
            if eTuple[0] == "start":
                etWriter.begin(element)
            else:
                etWriter.end(element)
        # free memory for elements that are fully processed
        if eTuple[0] == "end" and element.tag in ["document", "corpus"]:
            element.clear()
    if output != None:
        etWriter.close()
111
113 """
114 The main purpose of SentenceGraph is to connect the syntactic dependency
115 parse (a graph where dependencies are edges and tokens are nodes) to the
116 semantic interactions (which form a graph where interactions are edges
117 and entities are nodes). Additionally, SentenceGraph provides several
118 dictionaries that e.g. map element ids to their corresponding elements.
119 """
120 - def __init__(self, sentenceElement, tokenElements, dependencyElements):
121 """
122 Creates the syntactic graph part of the SentenceGraph. The semantic graph
123 can be added with mapInteractions.
124
125 @param sentenceElement: interaction-XML sentence-element
126 @type sentenceElement: cElementTree.Element
127 @param tokenElements: interaction-XML syntactic token elements
128 @type tokenElements: list of cElementTree.Element objects
129 @param dependencyElements: interacton-XML syntactic dependency elements
130 @type dependencyElements: list of cElementTree.Element objects
131 """
132 self.sentenceElement = sentenceElement
133 self.tokens = tokenElements
134 self.dependencies = dependencyElements
135
136
137
138
139
140 self.dependencyGraph = Graph()
141 self.interactions = None
142 self.entities = None
143 self.interactionGraph = None
144 self.entityGraph = None
145 self.duplicateInteractionEdgesRemoved = 0
146 self.tokenHeadScores = None
147
148 self.mergedEntities = None
149 self.mergedEntityToDuplicates = None
150 self.mergedEntityGraph = None
151
152 self.tokensById = {}
153 for token in self.tokens:
154 self.tokensById[token.get("id")] = token
155
156 self.dependencyGraph.addNodes(self.tokens)
157
158
159 for dependency in self.dependencies:
160
161 self.dependencyGraph.addEdge(self.tokensById[dependency.get("t1")],\
162 self.tokensById[dependency.get("t2")],\
163 dependency)
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
181 return self.sentenceElement.get("id")
182
183 - def makeEntityGraph(self, entities, interactions, entityToDuplicates=None):
184 graph = Graph()
185 graph.addNodes(entities)
186
187 interactionMap = {}
188 for interaction in interactions:
189 e1 = self.entitiesById[interaction.get("e1")]
190 e2 = self.entitiesById[interaction.get("e2")]
191 if e1 not in interactionMap:
192 interactionMap[e1] = {}
193 if e2 not in interactionMap[e1]:
194 interactionMap[e1][e2] = []
195 interactionMap[e1][e2].append(interaction)
196 if entityToDuplicates == None:
197 entityToDuplicates = {}
198 for e in entities:
199 entityToDuplicates[e] = []
200
201 for e1 in entities:
202 for e2 in entities:
203 interactionTypes = set()
204 for d1 in [e1] + entityToDuplicates[e1]:
205 for d2 in [e2] + entityToDuplicates[e2]:
206 if d1 in interactionMap and d2 in interactionMap[d1]:
207 for interaction in interactionMap[d1][d2]:
208 if interaction.get("type") not in interactionTypes:
209 graph.addEdge(e1, e2, interaction)
210 interactionTypes.add(interaction.get("type"))
211 return graph
212
213
215 """
216 Return a list of interaction-elements which represent directed
217 interactions from entity1 to entity2.
218
219 @param entity1: a semantic node (trigger or named entity)
220 @type entity1: cElementTree.Element
221 @param entity2: a semantic node (trigger or named entity)
222 @type entity2: cElementTree.Element
223 """
224 if merged:
225
226 if self.mergedEntityToDuplicates == None:
227 self.mergeInteractionGraph(True)
228 if self.mergedEntityGraph == None:
229 self.mergedEntityGraph = self.makeEntityGraph(self.mergedEntities, self.interactions, self.mergedEntityToDuplicates)
230 return self.mergedEntityGraph.getEdges(entity1, entity2)
231 else:
232 if self.entityGraph == None:
233 self.entityGraph = self.makeEntityGraph(self.entities, self.interactions)
234 return self.entityGraph.getEdges(entity1, entity2)
235
237 if merged:
238
239
240 if self.mergedEntityToDuplicates == None:
241 self.mergeInteractionGraph(True)
242 if self.mergedEntityGraph == None:
243 self.mergedEntityGraph = self.makeEntityGraph(self.mergedEntities, self.interactions, self.mergedEntityToDuplicates)
244 return self.mergedEntityGraph.getOutEdges(entity)
245 else:
246 if self.entityGraph == None:
247 self.entityGraph = self.makeEntityGraph(self.entities, self.interactions)
248 return self.entityGraph.getOutEdges(entity)
249
250
251
252
253
254
255
256 - def mapInteractions(self, entityElements, interactionElements, verbose=False):
257 """
258 Maps the semantic interactions to the syntactic graph.
259
260 Syntactic dependencies are defined between tokens. Semantic edges (interactions)
261 are defined between annotated entities. To utilize the correlation of the dependency
262 parse with the semantic interactions, the graphs must be aligned by mapping the
263 interaction graph's nodes (entities) to the syntactic graph's nodes (tokens). This
264 is done by determining the head tokens of the entities.
265
266 @param entityElements: the semantic nodes (triggers and named entities)
267 @type entityElements: list of cElementTree.Element objects
268 @param interactionElements: the semantic edges (e.g. Cause and Theme for GENIA)
269 @type interactionElements: list of cElementTree.Element objects
270 @param verbose: Print selected head tokens on screen
271 @param verbose: boolean
272 """
273 self.interactions = interactionElements
274 self.entities = entityElements
275
276 for entity in self.entities[:]:
277 if entity.get("charOffset") == "":
278 self.entities.remove(entity)
279
280
281
282
283
284 self.interactionGraph = Graph()
285 self.interactionGraph.addNodes(self.tokens)
286
287
288
289 self.entitiesByToken = {}
290 self.entitiesById = {}
291 self.entityHeadTokenByEntity = {}
292 for entity in self.entities[:]:
293 headToken = self.mapEntity(entity, verbose)
294 if headToken != None:
295 self.entityHeadTokenByEntity[entity] = headToken
296 self.entitiesById[entity.get("id")] = entity
297 else:
298 self.entities.remove(entity)
299 self._markNamedEntities()
300
301 for interaction in self.interactions:
302 if not self.entitiesById.has_key(interaction.get("e1")):
303 continue
304 if not self.entitiesById.has_key(interaction.get("e2")):
305 continue
306 token1 = self.entityHeadTokenByEntity[self.entitiesById[interaction.get("e1")]]
307 token2 = self.entityHeadTokenByEntity[self.entitiesById[interaction.get("e2")]]
308
309
310
311
312
313
314
315
316
317
318
319
320
321 found = False
322 edges = self.interactionGraph.getEdges(token1, token2)
323 for edge in edges:
324 if edge[2].get("type") == interaction.get("type"):
325 found = True
326 break
327 if not found:
328 self.interactionGraph.addEdge(token1, token2, interaction)
329 else:
330
331 self.duplicateInteractionEdgesRemoved += 1
332
333 - def mapEntity(self, entityElement, verbose=False):
334 """
335 Determine the head token for a named entity or trigger. The head token is the token closest
336 to the root for the subtree of the dependency parse spanned by the text of the element.
337
338 @param entityElement: a semantic node (trigger or named entity)
339 @type entityElement: cElementTree.Element
340 @param verbose: Print selected head tokens on screen
341 @param verbose: boolean
342 """
343 headOffset = None
344 if entityElement.get("headOffset") != None:
345 headOffset = Range.charOffsetToSingleTuple(entityElement.get("headOffset"))
346 if entityElement.get("charOffset") != "":
347 charOffsets = Range.charOffsetToTuples(entityElement.get("charOffset"))
348 else:
349 charOffsets = []
350
351
352 headTokens = []
353 for token in self.tokens:
354
355 tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
356 if headOffset != None and entityElement.get("type") != "Binding":
357
358
359
360
361 if Range.overlap(headOffset,tokenOffset):
362 headTokens.append(token)
363 else:
364 for offset in charOffsets:
365 if Range.overlap(offset,tokenOffset):
366 headTokens.append(token)
367 if len(headTokens)==1:
368 token = headTokens[0]
369 else:
370 selHead = None
371 if entityElement.get("type") == "Binding":
372 for t in headTokens:
373 compText = t.get("text").lower()
374 if compText.find("bind") != -1 or compText.find("complex") != -1:
375 selHead = t
376
377 entityElement.set("headOffset", selHead.get("charOffset"))
378 break
379 if selHead == None:
380 token = self.findHeadToken(headTokens)
381 else:
382 token = selHead
383 if verbose:
384 print >> sys.stderr, "Selected head:", token.get("id"), token.get("text")
385
386 if token != None:
387
388 if entityElement.get("headOffset") == None or entityElement.get("headOffset") != token.get("charOffset"):
389 entityElement.set("headOffset", token.get("charOffset"))
390 if not self.entitiesByToken.has_key(token):
391 self.entitiesByToken[token] = []
392 self.entitiesByToken[token].append(entityElement)
393 else:
394 print >> sys.stderr, "Warning, no tokens for entity", entityElement.get("id")
395 return token
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
450 """
451 Select the candidate token that is closest to the root of the subtree of the depencdeny parse
452 to which the candidate tokens belong to. See getTokenHeadScores method for the algorithm.
453
454 @param candidateTokens: the list of syntactic tokens from which the head token is selected
455 @type candidateTokens: list of cElementTree.Element objects
456 """
457 tokenHeadScores = self.getTokenHeadScores()
458
459
460
461
462
463 if len(candidateTokens) == 0:
464 return None
465
466 highestScore = -9999999
467 bestTokens = []
468 for token in candidateTokens:
469 if tokenHeadScores[token] > highestScore:
470 highestScore = tokenHeadScores[token]
471 for token in candidateTokens:
472 if tokenHeadScores[token] == highestScore:
473 bestTokens.append(token)
474
475
476
477
478 return bestTokens[-1]
479
481 """
482 A head token is chosen using a heuristic that prefers tokens closer to the
483 root of the dependency parse. In a list of candidate tokens, the one with
484 the highest score is the head token. The return value of this method
485 is a dictionary that maps token elements to their scores.
486 """
487
488 if self.tokenHeadScores != None:
489 return self.tokenHeadScores
490 else:
491 self.tokenHeadScores = {}
492
493
494 for token in self.tokens:
495 self.tokenHeadScores[token] = 0
496 for dependency in self.dependencies:
497 if dependency.get("t1") == token.get("id") or dependency.get("t2") == token.get("id"):
498 self.tokenHeadScores[token] = 1
499 break
500
501
502 for token in self.tokens:
503 tokenText = token.get("text")
504 if tokenText == "\\" or tokenText == "/" or tokenText == "-":
505 self.tokenHeadScores[token] = -1
506
507
508
509
510
511 depTypesToInclude = ["prep", "nn", "det", "hyphen", "num", "amod", "nmod", "appos", "measure", "dep", "partmod"]
512
513 modifiedScores = True
514 loopCount = 0
515 while modifiedScores == True:
516 if loopCount > 20:
517 print >> sys.stderr, "Warning, possible loop in parse for sentence", self.getSentenceId()
518 break
519 modifiedScores = False
520 for token1 in self.tokens:
521 for token2 in self.tokens:
522 for dep in self.dependencies:
523 if dep.get("t1") == token1.get("id") and dep.get("t2") == token2.get("id") and (dep.get("type") in depTypesToInclude):
524
525
526 if self.tokenHeadScores[token1] <= self.tokenHeadScores[token2]:
527 self.tokenHeadScores[token1] = self.tokenHeadScores[token2] + 1
528 modifiedScores = True
529
530
531
532
533
534 loopCount += 1
535
536
537 for token in self.tokens:
538 token.set("headScore", str(self.tokenHeadScores[token]))
539
540 return self.tokenHeadScores
541
543 """
544 This method is used to define which tokens belong to _named_ entities.
545 Named entities are sometimes masked when testing learning of interactions, to
546 prevent the system making a trivial decision based on commonly interacting names.
547 """
548 self.tokenIsName = {}
549 self.tokenIsEntity = {}
550 self.tokenIsEntityHead = {}
551
552 for token in self.tokens:
553 self.tokenIsName[token] = False
554 self.tokenIsEntity[token] = False
555 self.tokenIsEntityHead[token] = []
556 for entity in self.entities:
557 entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
558 entityHeadOffset = Range.charOffsetToSingleTuple(entity.get("headOffset"))
559 for token in self.tokens:
560 tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
561 for entityOffset in entityOffsets:
562 if Range.overlap(entityOffset, tokenOffset):
563 self.tokenIsEntity[token] = True
564 if entity.get("isName") != None:
565 if entity.get("isName") == "True":
566 self.tokenIsName[token] = True
567 else:
568 entity.set("isName", "True")
569 self.tokenIsName[token] = True
570 if Range.overlap(entityHeadOffset, tokenOffset):
571 self.tokenIsEntityHead[token].append(entity)
572
573 - def getTokenText(self, token):
574 """
575 Returns the text of a token, and masks it if the token is the head token
576 of a named entity.
577
578 @param token: interaction-XML syntactic token.
579 @type token: cElementTree.Element
580 """
581 if self.tokenIsName[token]:
582 return "NAMED_ENT"
583 else:
584 return token.get("text")
585
587 c = SentenceGraph(self.sentenceElement, self.tokens, self.dependencies)
588 namedEntities = []
589 for entity in self.entities:
590 if entity.get("isName") == "True":
591 namedEntities.append(entity)
592 c.mapInteractions(namedEntities, [])
593 return c
594
596 """
597 For merging duplicate entities
598
599 keepDuplicates - allows calling the function with no effect, so that the same code
600 can be used for merged and unmerged cases
601 """
602 self.mergedEntities = []
603 self.mergedEntityToDuplicates = {}
604
605
606 if not merge:
607
608 for entity in self.entities:
609 mergedIds[entity] = entity.get("id")
610 self.mergedEntities.append(entity)
611 self.mergedEntityToDuplicates[entity] = []
612 return
613
614 removeEntities = [False] * len(self.entities)
615 entitiesToKeep = []
616 for i in range(len(self.entities)):
617 if removeEntities[i]:
618 continue
619 self.mergedEntities.append(self.entities[i])
620
621 self.mergedEntityToDuplicates[self.entities[i]] = []
622 if self.entities[i].get("isName") == "True":
623 continue
624 for j in range(i+1, len(self.entities)):
625
626
627
628
629 if self.entities[i].get("type") == self.entities[j].get("type") and \
630 self.entities[i].get("charOffset") == self.entities[j].get("charOffset"):
631 removeEntities[j] = True
632
633 self.mergedEntityToDuplicates[self.entities[i]].append(self.entities[j])
634
635