1 """
2 Main class for representing a sentence
3 """
4 __version__ = "$Revision: 1.40 $"
5
6
7 from SimpleGraph import Graph
8 import sys, os
9 sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/..")
10 import Utils.Range as Range
11 import types
12 import copy
13
14
15
def loadCorpus(corpus, parse, tokenization=None, removeNameInfo=False, removeIntersentenceInteractionsFromCorpusElements=True):
    """
    Load an entire corpus through CorpusElements and add SentenceGraph-objects
    to its SentenceElements-objects.

    @param corpus: corpus file name or an already-parsed ElementTree-style object
    @param parse: name of the parse to load
    @param tokenization: name of the tokenization to load (None = the parse's default)
    @param removeNameInfo: passed through to CorpusElements
    @param removeIntersentenceInteractionsFromCorpusElements: passed through to CorpusElements
    @return: the CorpusElements object, with sentenceGraph set on each sentence
    """
    # Local imports avoid import cycles and keep module load light
    import Utils.ElementTreeUtils as ETUtils
    import sys
    from Utils.ProgressCounter import ProgressCounter
    from Utils.InteractionXML.CorpusElements import CorpusElements

    if type(corpus) == types.StringType:
        print >> sys.stderr, "Loading corpus file", corpus
    corpusTree = ETUtils.ETFromObj(corpus)
    corpusRoot = corpusTree.getroot()

    corpusElements = CorpusElements(corpusRoot, parse, tokenization, tree=corpusTree, removeNameInfo=removeNameInfo, removeIntersentenceInteractions=removeIntersentenceInteractionsFromCorpusElements)
    print >> sys.stderr, str(len(corpusElements.documentsById)) + " documents, " + str(len(corpusElements.sentencesById)) + " sentences"

    duplicateInteractionEdgesRemoved = 0
    counter = ProgressCounter(len(corpusElements.sentences), "Make sentence graphs")
    counter.showMilliseconds = True
    for sentence in corpusElements.sentences[:]:
        counter.update(1, "Making sentence graphs ("+sentence.sentence.get("id")+"): ")
        # A sentence without a parse cannot get a graph
        if len(sentence.tokens) == 0 or len(sentence.dependencies) == 0:
            sentence.sentenceGraph = None
            continue
        # Promote pair-elements to interactions, unless explicitly marked
        # as non-interacting (interaction="False")
        for pair in sentence.pairs:
            isInteraction = pair.get("interaction")
            if isInteraction == "True" or isInteraction == None:
                sentence.interactions.append(pair)
                if pair.get("type") == None:
                    pair.set("type", "undefined")
        graph = SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
        graph.mapInteractions(sentence.entities, sentence.interactions)
        graph.interSentenceInteractions = sentence.interSentenceInteractions
        duplicateInteractionEdgesRemoved += graph.duplicateInteractionEdgesRemoved
        sentence.sentenceGraph = graph
        graph.parseElement = sentence.parseElement
    print >> sys.stderr, "Skipped", duplicateInteractionEdgesRemoved, "duplicate interaction edges in SentenceGraphs"
    return corpusElements
72
def getCorpusIterator(input, output, parse, tokenization=None, removeNameInfo=False, removeIntersentenceInteractions=True):
    """
    Iterate over a corpus document by document, yielding for each document the
    list of its SentenceElements (with .sentenceGraph set, or None when the
    sentence has no parse). If output != None, each document element is also
    written back through ETWriter as it completes.

    @param input: corpus file name or object accepted by ETIteratorFromObj
    @param output: output file name, or None for no writing
    @param parse: name of the parse to load
    @param tokenization: name of the tokenization to load (None = the parse's default)
    @param removeNameInfo: unused here; kept for signature compatibility with loadCorpus
    @param removeIntersentenceInteractions: passed through to SentenceElements
    """
    import Utils.ElementTreeUtils as ETUtils
    from Utils.InteractionXML.SentenceElements import SentenceElements

    if output != None:
        etWriter = ETUtils.ETWriter(output)
    for eTuple in ETUtils.ETIteratorFromObj(input, ("start", "end")):
        element = eTuple[1]
        if eTuple[0] in ["end", "memory"] and element.tag == "document":
            sentences = []
            for sentenceElement in element.findall("sentence"):
                sentence = SentenceElements(sentenceElement, parse, tokenization, removeIntersentenceInteractions=removeIntersentenceInteractions)
                if len(sentence.tokens) == 0 or len(sentence.dependencies) == 0:
                    # no parse -> no graph for this sentence
                    sentence.sentenceGraph = None
                else:
                    graph = SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
                    graph.mapInteractions(sentence.entities, sentence.interactions)
                    graph.interSentenceInteractions = sentence.interSentenceInteractions
                    sentence.sentenceGraph = graph
                    graph.parseElement = sentence.parseElement
                sentences.append(sentence)
            yield sentences
            if output != None:
                etWriter.write(element)
        elif element.tag == "corpus" and output != None:
            if eTuple[0] == "start":
                etWriter.begin(element)
            else:
                etWriter.end(element)
        # free memory for elements that are fully processed
        if eTuple[0] == "end" and element.tag in ["document", "corpus"]:
            element.clear()
    if output != None:
        etWriter.close()
111
113 """
114 The main purpose of SentenceGraph is to connect the syntactic dependency
115 parse (a graph where dependencies are edges and tokens are nodes) to the
116 semantic interactions (which form a graph where interactions are edges
117 and entities are nodes). Additionally, SentenceGraph provides several
118 dictionaries that e.g. map element ids to their corresponding elements.
119 """
120 - def __init__(self, sentenceElement, tokenElements, dependencyElements):
121 """
122 Creates the syntactic graph part of the SentenceGraph. The semantic graph
123 can be added with mapInteractions.
124
125 @param sentenceElement: interaction-XML sentence-element
126 @type sentenceElement: cElementTree.Element
127 @param tokenElements: interaction-XML syntactic token elements
128 @type tokenElements: list of cElementTree.Element objects
129 @param dependencyElements: interacton-XML syntactic dependency elements
130 @type dependencyElements: list of cElementTree.Element objects
131 """
132 self.sentenceElement = sentenceElement
133 self.tokens = tokenElements
134 self.dependencies = dependencyElements
135
136
137
138
139
140 self.dependencyGraph = Graph()
141 self.interactions = None
142 self.entities = None
143 self.interactionGraph = None
144 self.entityGraph = None
145 self.duplicateInteractionEdgesRemoved = 0
146 self.tokenHeadScores = None
147
148 self.mergedEntities = None
149 self.mergedEntityToDuplicates = None
150 self.mergedEntityGraph = None
151
152 self.tokensById = {}
153 for token in self.tokens:
154 self.tokensById[token.get("id")] = token
155
156 self.dependencyGraph.addNodes(self.tokens)
157
158
159 for dependency in self.dependencies:
160
161 self.dependencyGraph.addEdge(self.tokensById[dependency.get("t1")],\
162 self.tokensById[dependency.get("t2")],\
163 dependency)
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
181 return self.sentenceElement.get("id")
182
183 - def makeEntityGraph(self, entities, interactions, entityToDuplicates=None):
184 graph = Graph()
185 graph.addNodes(entities)
186
187 interactionMap = {}
188 for interaction in interactions:
189 e1 = self.entitiesById[interaction.get("e1")]
190 e2 = self.entitiesById[interaction.get("e2")]
191 if e1 not in interactionMap:
192 interactionMap[e1] = {}
193 if e2 not in interactionMap[e1]:
194 interactionMap[e1][e2] = []
195 interactionMap[e1][e2].append(interaction)
196 if entityToDuplicates == None:
197 entityToDuplicates = {}
198 for e in entities:
199 entityToDuplicates[e] = []
200
201 for e1 in entities:
202 for e2 in entities:
203 interactionTypes = set()
204 for d1 in [e1] + entityToDuplicates[e1]:
205 for d2 in [e2] + entityToDuplicates[e2]:
206 if d1 in interactionMap and d2 in interactionMap[d1]:
207 for interaction in interactionMap[d1][d2]:
208 if interaction.get("type") not in interactionTypes:
209 graph.addEdge(e1, e2, interaction)
210 interactionTypes.add(interaction.get("type"))
211 return graph
212
213
215 """
216 Return a list of interaction-elements which represent directed
217 interactions from entity1 to entity2.
218
219 @param entity1: a semantic node (trigger or named entity)
220 @type entity1: cElementTree.Element
221 @param entity2: a semantic node (trigger or named entity)
222 @type entity2: cElementTree.Element
223 """
224 if merged:
225
226 if self.mergedEntityToDuplicates == None:
227 self.mergeInteractionGraph(True)
228 if self.mergedEntityGraph == None:
229 self.mergedEntityGraph = self.makeEntityGraph(self.mergedEntities, self.interactions, self.mergedEntityToDuplicates)
230 return self.mergedEntityGraph.getEdges(entity1, entity2)
231 else:
232 if self.entityGraph == None:
233 self.entityGraph = self.makeEntityGraph(self.entities, self.interactions)
234 return self.entityGraph.getEdges(entity1, entity2)
235
237 if merged:
238
239
240 if self.mergedEntityToDuplicates == None:
241 self.mergeInteractionGraph(True)
242 if self.mergedEntityGraph == None:
243 self.mergedEntityGraph = self.makeEntityGraph(self.mergedEntities, self.interactions, self.mergedEntityToDuplicates)
244 return self.mergedEntityGraph.getOutEdges(entity)
245 else:
246 if self.entityGraph == None:
247 self.entityGraph = self.makeEntityGraph(self.entities, self.interactions)
248 return self.entityGraph.getOutEdges(entity)
249
250
251
252
253
254
255
256 - def mapInteractions(self, entityElements, interactionElements, verbose=False):
257 """
258 Maps the semantic interactions to the syntactic graph.
259
260 Syntactic dependencies are defined between tokens. Semantic edges (interactions)
261 are defined between annotated entities. To utilize the correlation of the dependency
262 parse with the semantic interactions, the graphs must be aligned by mapping the
263 interaction graph's nodes (entities) to the syntactic graph's nodes (tokens). This
264 is done by determining the head tokens of the entities.
265
266 @param entityElements: the semantic nodes (triggers and named entities)
267 @type entityElements: list of cElementTree.Element objects
268 @param interactionElements: the semantic edges (e.g. Cause and Theme for GENIA)
269 @type interactionElements: list of cElementTree.Element objects
270 @param verbose: Print selected head tokens on screen
271 @param verbose: boolean
272 """
273 self.interactions = interactionElements
274 self.entities = entityElements
275
276 for entity in self.entities[:]:
277 if entity.get("charOffset") == "":
278 self.entities.remove(entity)
279
280
281
282
283
284 self.interactionGraph = Graph()
285 self.interactionGraph.addNodes(self.tokens)
286
287
288
289 self.entitiesByToken = {}
290 self.entitiesById = {}
291 self.entityHeadTokenByEntity = {}
292 for entity in self.entities[:]:
293 headToken = self.mapEntity(entity, verbose)
294 if headToken != None:
295 self.entityHeadTokenByEntity[entity] = headToken
296 self.entitiesById[entity.get("id")] = entity
297 else:
298 self.entities.remove(entity)
299 self._markNamedEntities()
300
301 for interaction in self.interactions:
302 if not self.entitiesById.has_key(interaction.get("e1")):
303 continue
304 if not self.entitiesById.has_key(interaction.get("e2")):
305 continue
306 token1 = self.entityHeadTokenByEntity[self.entitiesById[interaction.get("e1")]]
307 token2 = self.entityHeadTokenByEntity[self.entitiesById[interaction.get("e2")]]
308
309
310
311
312
313
314
315
316
317
318
319
320
321 found = False
322 edges = self.interactionGraph.getEdges(token1, token2)
323 for edge in edges:
324 if edge[2].get("type") == interaction.get("type"):
325 found = True
326 break
327 if not found:
328 self.interactionGraph.addEdge(token1, token2, interaction)
329 else:
330
331 self.duplicateInteractionEdgesRemoved += 1
332
333 - def mapEntity(self, entityElement, verbose=False):
334 """
335 Determine the head token for a named entity or trigger. The head token is the token closest
336 to the root for the subtree of the dependency parse spanned by the text of the element.
337
338 @param entityElement: a semantic node (trigger or named entity)
339 @type entityElement: cElementTree.Element
340 @param verbose: Print selected head tokens on screen
341 @param verbose: boolean
342 """
343 headOffset = None
344 if entityElement.get("headOffset") != None:
345 headOffset = Range.charOffsetToSingleTuple(entityElement.get("headOffset"))
346 if entityElement.get("charOffset") != "":
347 charOffsets = Range.charOffsetToTuples(entityElement.get("charOffset"))
348 else:
349 charOffsets = []
350
351
352 headTokens = []
353 for token in self.tokens:
354
355 tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
356 if headOffset != None and entityElement.get("type") != "Binding":
357
358
359
360
361 if Range.overlap(headOffset,tokenOffset):
362 headTokens.append(token)
363 else:
364 for offset in charOffsets:
365 if Range.overlap(offset,tokenOffset):
366 headTokens.append(token)
367 if len(headTokens)==1:
368 token = headTokens[0]
369 else:
370 selHead = None
371 if entityElement.get("type") == "Binding":
372 for t in headTokens:
373 compText = t.get("text").lower()
374 if compText.find("bind") != -1 or compText.find("complex") != -1:
375 selHead = t
376
377 entityElement.set("headOffset", selHead.get("charOffset"))
378 break
379 if selHead == None:
380 token = self.findHeadToken(headTokens)
381 else:
382 token = selHead
383 if verbose:
384 print >> sys.stderr, "Selected head:", token.get("id"), token.get("text")
385
386 if token != None:
387
388 if entityElement.get("headOffset") == None or entityElement.get("headOffset") != token.get("charOffset"):
389 entityElement.set("headOffset", token.get("charOffset"))
390 if not self.entitiesByToken.has_key(token):
391 self.entitiesByToken[token] = []
392 self.entitiesByToken[token].append(entityElement)
393 else:
394 print >> sys.stderr, "Warning, no tokens for entity", entityElement.get("id")
395 return token
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
450 """
451 Select the candidate token that is closest to the root of the subtree of the depencdeny parse
452 to which the candidate tokens belong to. See getTokenHeadScores method for the algorithm.
453
454 @param candidateTokens: the list of syntactic tokens from which the head token is selected
455 @type candidateTokens: list of cElementTree.Element objects
456 """
457 tokenHeadScores = self.getTokenHeadScores()
458
459
460
461
462
463 if len(candidateTokens) == 0:
464 return None
465
466 highestScore = -9999999
467 bestTokens = []
468 for token in candidateTokens:
469 if tokenHeadScores[token] > highestScore:
470 highestScore = tokenHeadScores[token]
471 for token in candidateTokens:
472 if tokenHeadScores[token] == highestScore:
473 bestTokens.append(token)
474
475
476
477
478 return bestTokens[-1]
479
481 """
482 A head token is chosen using a heuristic that prefers tokens closer to the
483 root of the dependency parse. In a list of candidate tokens, the one with
484 the highest score is the head token. The return value of this method
485 is a dictionary that maps token elements to their scores.
486 """
487
488 if self.tokenHeadScores != None:
489 return self.tokenHeadScores
490 else:
491 self.tokenHeadScores = {}
492
493
494 for token in self.tokens:
495 self.tokenHeadScores[token] = 0
496 for dependency in self.dependencies:
497 if dependency.get("t1") == token.get("id") or dependency.get("t2") == token.get("id"):
498 self.tokenHeadScores[token] = 1
499 break
500
501
502 for token in self.tokens:
503 tokenText = token.get("text")
504 if tokenText == "\\" or tokenText == "/" or tokenText == "-":
505 self.tokenHeadScores[token] = -1
506
507
508
509
510
511 depTypesToInclude = ["prep", "nn", "det", "hyphen", "num", "amod", "nmod", "appos", "measure", "dep", "partmod"]
512
513 modifiedScores = True
514 loopCount = 0
515 while modifiedScores == True:
516 if loopCount > 20:
517 print >> sys.stderr, "Warning, possible loop in parse for sentence", self.getSentenceId()
518 break
519 modifiedScores = False
520 for token1 in self.tokens:
521 for token2 in self.tokens:
522 for dep in self.dependencies:
523 if dep.get("t1") == token1.get("id") and dep.get("t2") == token2.get("id") and (dep.get("type") in depTypesToInclude):
524
525
526 if self.tokenHeadScores[token1] <= self.tokenHeadScores[token2]:
527 self.tokenHeadScores[token1] = self.tokenHeadScores[token2] + 1
528 modifiedScores = True
529
530
531
532
533
534 loopCount += 1
535
536
537 for token in self.tokens:
538 token.set("headScore", str(self.tokenHeadScores[token]))
539
540 return self.tokenHeadScores
541
543 """
544 This method is used to define which tokens belong to _named_ entities.
545 Named entities are sometimes masked when testing learning of interactions, to
546 prevent the system making a trivial decision based on commonly interacting names.
547 """
548 self.tokenIsName = {}
549 self.tokenIsEntity = {}
550 self.tokenIsEntityHead = {}
551
552 for token in self.tokens:
553 self.tokenIsName[token] = False
554 self.tokenIsEntity[token] = False
555 self.tokenIsEntityHead[token] = []
556 for entity in self.entities:
557 entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
558 entityHeadOffset = Range.charOffsetToSingleTuple(entity.get("headOffset"))
559 for token in self.tokens:
560 tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
561 for entityOffset in entityOffsets:
562 if Range.overlap(entityOffset, tokenOffset):
563 self.tokenIsEntity[token] = True
564 if entity.get("isName") != None:
565 if entity.get("isName") == "True":
566 self.tokenIsName[token] = True
567 else:
568 entity.set("isName", "True")
569 self.tokenIsName[token] = True
570 if Range.overlap(entityHeadOffset, tokenOffset):
571 self.tokenIsEntityHead[token].append(entity)
572
573 - def getTokenText(self, token):
574 """
575 Returns the text of a token, and masks it if the token is the head token
576 of a named entity.
577
578 @param token: interaction-XML syntactic token.
579 @type token: cElementTree.Element
580 """
581 if self.tokenIsName[token]:
582 return "NAMED_ENT"
583 else:
584 return token.get("text")
585
587 c = SentenceGraph(self.sentenceElement, self.tokens, self.dependencies)
588 namedEntities = []
589 for entity in self.entities:
590 if entity.get("isName") == "True":
591 namedEntities.append(entity)
592 c.mapInteractions(namedEntities, [])
593 return c
594
596 """
597 For merging duplicate entities
598
599 keepDuplicates - allows calling the function with no effect, so that the same code
600 can be used for merged and unmerged cases
601 """
602 self.mergedEntities = []
603 self.mergedEntityToDuplicates = {}
604
605
606 if not merge:
607
608 for entity in self.entities:
609 mergedIds[entity] = entity.get("id")
610 self.mergedEntities.append(entity)
611 self.mergedEntityToDuplicates[entity] = []
612 return
613
614 removeEntities = [False] * len(self.entities)
615 entitiesToKeep = []
616 for i in range(len(self.entities)):
617 if removeEntities[i]:
618 continue
619 self.mergedEntities.append(self.entities[i])
620
621 self.mergedEntityToDuplicates[self.entities[i]] = []
622 if self.entities[i].get("isName") == "True":
623 continue
624 for j in range(i+1, len(self.entities)):
625
626
627
628
629 if self.entities[i].get("type") == self.entities[j].get("type") and \
630 self.entities[i].get("charOffset") == self.entities[j].get("charOffset"):
631 removeEntities[j] = True
632
633 self.mergedEntityToDuplicates[self.entities[i]].append(self.entities[j])
634
635