TEES.ExampleBuilders.UnmergingExampleBuilder

"""
This example builder makes unmerging examples, i.e. examples describing
potential events.
"""
#def __init__(self, style="trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures", length=None, types=[], featureSet=None, classSet=None):

def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
    """
    Set up the id sets, the style parameters and the feature builders.

    style: passed to self.getParameters() to produce self.styles.
    length: stored as self.pathLengths; must be None.
    types: stored as self.types. A new empty list is used when omitted,
        so instances never share one default list.
    featureSet: feature id set; a new IdSet() is created when None.
    classSet: class id set; a new IdSet(1) is created when None. The
        class name "neg" must map to id 1 in it.
    """
    if featureSet is None:
        featureSet = IdSet()
    if classSet is None:
        classSet = IdSet(1)
    # The lookup is done outside the assert so that it is still executed
    # when asserts are stripped (python -O).
    # NOTE(review): presumably getId() registers "neg" on first use -- confirm.
    negId = classSet.getId("neg")
    assert negId == 1

    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)

    self.styles = self._setDefaultParameters(["trigger_features","typed","directed","no_linear","entities","genia_limits",
        "noAnnType", "noMasking", "maxFeatures", "no_merge", "disable_entity_features",
        "disable_single_element_features", "disable_ngram_features", "disable_path_edge_features"])
    self.styles = self.getParameters(style)
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    self.multiEdgeFeatureBuilder.noAnnType = self.styles["noAnnType"]
    self.multiEdgeFeatureBuilder.maskNamedEntities = not self.styles["noMasking"]
    self.multiEdgeFeatureBuilder.maximum = self.styles["maxFeatures"]
    #self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
    self.pathLengths = length
    assert self.pathLengths is None
    self.types = [] if types is None else types

    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    self.triggerFeatureBuilder.useNonNameEntities = True

#self.outFile = open("exampleTempFile.txt","wt")

def getInteractionEdgeLengths(self, sentenceGraph, paths):
    """
    Measure every interaction edge of the sentence between the head
    tokens of its two entities.

    Returns a dictionary that maps each interaction to the tuple
    (interaction, dependency length, linear length, e2 token index).
    The dependency length is the length of the shortest path returned by
    paths.getPaths(), or 999999 when the head tokens are the same token
    or no path is returned. A token index is -1 when the head token is
    not found in sentenceGraph.tokens.
    """
    noPathLength = 999999 # more than any real path
    lengthsByInteraction = {}
    for edge in sentenceGraph.interactions:
        entity1 = sentenceGraph.entitiesById[edge.get("e1")]
        entity2 = sentenceGraph.entitiesById[edge.get("e2")]
        head1 = sentenceGraph.entityHeadTokenByEntity[entity1]
        head2 = sentenceGraph.entityHeadTokenByEntity[entity2]

        # Dependency distance
        depLength = noPathLength
        if head1 != head2:
            tokenPaths = paths.getPaths(head1, head2)
            if len(tokenPaths) > 0:
                depLength = min(len(tokenPath) for tokenPath in tokenPaths)

        # Linear distance: scan until both head tokens have been located
        pos1 = -1
        pos2 = -1
        for index, token in enumerate(sentenceGraph.tokens):
            if token == head1:
                pos1 = index
                if pos2 != -1:
                    break
            if token == head2:
                pos2 = index
                if pos1 != -1:
                    break

        lengthsByInteraction[edge] = (edge, depLength, abs(pos1 - pos2), pos2)
    return lengthsByInteraction

135

def eventIsGold(self, entity, arguments, sentenceGraph, goldGraph, goldEntitiesByOffset):
    """
    Tell whether a candidate event corresponds to an event in the gold data.

    entity: the candidate event's trigger entity.
    arguments: the interactions (edges) that make up the candidate.
    sentenceGraph: graph whose entitiesById resolves the "e2" ids of
        the arguments.
    goldGraph: graph providing the gold interactions and gold entitiesById.
    goldEntitiesByOffset: dictionary from head offset to a list of gold
        entities.

    Returns True when some gold entity with the same head offset and type
    has the same number of outgoing edges per edge type, and every
    argument has a gold edge of the same type whose target entity has the
    same head offset and type. Returns False otherwise, including when
    there are no gold entities for the offset.
    """
    offset = entity.get("headOffset")
    if offset not in goldEntitiesByOffset:
        return False
    eType = entity.get("type")

    # Count the candidate's edges per type (does not depend on the gold entity)
    argTypeCounts = {}
    for argument in arguments:
        argType = argument.get("type")
        argTypeCounts[argType] = argTypeCounts.get(argType, 0) + 1

    # Check all gold entities for a match
    for goldEntity in goldEntitiesByOffset[offset]:
        # The entity type must match
        if goldEntity.get("type") != eType:
            continue
        goldEntityId = goldEntity.get("id")

        # Collect the gold interactions
        goldInteractions = [x for x in goldGraph.interactions if x.get("e1") == goldEntityId]

        # Argument count rules
        if len(goldInteractions) != len(arguments): # total number of edges differs
            continue
        goldTypeCounts = {}
        for goldInteraction in goldInteractions:
            goldType = goldInteraction.get("type")
            goldTypeCounts[goldType] = goldTypeCounts.get(goldType, 0) + 1
        if argTypeCounts != goldTypeCounts: # edge counts per type must match
            continue

        # Exact argument matching.
        # NOTE(review): a gold edge is not consumed when matched, so two
        # arguments can be satisfied by the same gold edge.
        for argument in arguments:
            e2Entity = sentenceGraph.entitiesById[argument.get("e2")]
            e2Offset = e2Entity.get("headOffset")
            e2Type = e2Entity.get("type")
            argType = argument.get("type")

            found = False
            for goldInteraction in goldInteractions:
                if goldInteraction.get("type") != argType:
                    continue
                goldE2Entity = goldGraph.entitiesById[goldInteraction.get("e2")]
                if goldE2Entity.get("headOffset") == e2Offset and goldE2Entity.get("type") == e2Type:
                    found = True
                    break
            if not found: # this edge did not have a corresponding gold edge
                break
        else:
            # Every argument was matched: the event is in gold
            return True

    return False

205

def getArgumentCombinations(self, eType, interactions, entityId=None):
    """
    List the argument combinations that are tried for one event trigger.

    eType: type of the trigger entity.
    interactions: the outgoing interactions of the trigger.
    entityId: not used by the current code.

    Returns a list where each item is one candidate combination of
    interactions:
      - "Binding": every combination of 1 to 10 Theme interactions
        (tuples produced by combinations()).
      - "Process": the empty combination followed by each Participant
        interaction alone.
      - any other type: the results of combine.combine() for the themes
        and the argument groups allowed for the type, followed by each
        theme alone.
    """
    if eType == "Binding":
        # Making examples for only all-together/all-separate cases
        # doesn't work, since even gold data has several cases of
        # overlapping bindings with different numbers of arguments.
        # Only themes are used, causes are skipped.
        themes = [x for x in interactions if x.get("type") == "Theme"]
        # Looking at a2-normalize.pl reveals that there can be max 6 themes.
        # Based on training+devel data, four is maximum. Sizes above 10
        # are never generated.
        bindingCombinations = []
        for size in range(1, min(len(themes), 10) + 1):
            bindingCombinations.extend(combinations(themes, size))
        return bindingCombinations

    if eType == "Process": # For ID-task
        # A process can have 0 interactions
        return [[]] + [[x] for x in interactions if x.get("type") == "Participant"]

    # One of the regulation-types, or one of the simple types.
    # Interactions of any other type (including "AtLoc" and "ToLoc") are ignored.
    grouped = {"Theme": [], "Cause": [], "SiteArg": [], "Contextgene": [], "Sidechain": []}
    for interaction in interactions:
        group = grouped.get(interaction.get("type"))
        if group is not None:
            group.append(interaction)
    themes = grouped["Theme"]
    causes = grouped["Cause"]
    siteArgs = grouped["SiteArg"]
    contextGenes = grouped["Contextgene"]
    sideChains = grouped["Sidechain"]
    locTargets = [] # always empty, because location arguments are filtered out above

    # Limit arguments to event types that can have them
    if eType.find("egulation") == -1 and eType != "Catalysis":
        causes = []
    if eType != "Glycosylation":
        sideChains = []
    if eType not in ["Acetylation", "Methylation"]:
        contextGenes = []
    if eType == "Catalysis":
        siteArgs = []

    # Themes can always appear alone
    themeAloneCombinations = [[theme] for theme in themes]
    return combine.combine(themes, causes) \
        + combine.combine(themes, siteArgs) \
        + combine.combine(themes, sideChains) \
        + combine.combine(themes, contextGenes) \
        + combine.combine(themes, siteArgs, sideChains) \
        + combine.combine(themes, siteArgs, contextGenes) \
        + combine.combine(themes, locTargets) \
        + themeAloneCombinations

# The predicted value range is not used in the features the UnmergingExampleBuilder gets
# from the MultiEdgeFeatureBuilder
# def definePredictedValueRange(self, sentences, elementName):
#     self.multiEdgeFeatureBuilder.definePredictedValueRange(sentences, elementName)
#
# def getPredictedValueRange(self):
#     return self.multiEdgeFeatureBuilder.predictedRange

def sortInteractionsById(self, interactions):
    """
    Return the interactions as a new list, ordered by the integer that
    follows the last ".i" in each interaction's "id" attribute.

    The sort is stable and compares only that number, so interactions
    with equal numbers keep their input order instead of being compared
    to each other as objects.
    """
    # The order of the interactions affects the order of the unmerging examples, and this
    # affects performance. It's not clear whether this is what really happens, or whether
    # the order of the interactions has some effect on the consistency of the unmerging
    # features (it shouldn't). However, in case it does, this function is left here for now,
    # although it shouldn't be needed at all. In any case the impact is minimal, for GE
    # 53.22 vs 53.28 on the development set.
    return sorted(interactions, key=lambda interaction: int(interaction.get("id").split(".i")[-1]))

306

def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
    """
    Build the unmerging examples for a single sentence and append them
    to outfile with ExampleUtils.appendExamples().

    One example is made for each argument combination (see
    getArgumentCombinations) of each entity. When goldGraph is given, a
    combination that matches a gold event (see eventIsGold) gets the
    class "All_regulation", "Binding" or "Other" depending on the entity
    type; every other example gets the class "neg".

    Returns the number of examples written.
    See Core/ExampleUtils for example format.
    """
    self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
    self.triggerFeatureBuilder.initSentence(sentenceGraph)

    paths = sentenceGraph.dependencyGraph.toUndirected()

    # Get argument order. The attribute is read by buildExample, so its
    # (misspelled) name must stay as it is.
    self.interactionLenghts = self.getInteractionEdgeLengths(sentenceGraph, paths)

    goldEntitiesByOffset = {}
    if goldGraph is not None:
        # Check that the tokenizations match
        for i, token in enumerate(sentenceGraph.tokens):
            goldToken = goldGraph.tokens[i]
            assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")
        # Map gold entities to their head offsets
        for entity in goldGraph.entities:
            offset = entity.get("headOffset")
            assert offset is not None
            goldEntitiesByOffset.setdefault(offset, []).append(entity)

    # Generate examples based on the (optionally merged) entities
    if self.styles["no_merge"]:
        mergeInput = False
        entities = sentenceGraph.entities
    else:
        mergeInput = True
        sentenceGraph.mergeInteractionGraph(True)
        entities = sentenceGraph.mergedEntities
        self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))

    exampleIndex = 0
    for entity in entities:
        eType = entity.get("type")
        assert eType is not None, entity.attrib
        eType = str(eType)

        interactions = [x[2] for x in sentenceGraph.getOutInteractions(entity, mergeInput)]
        interactions = self.sortInteractionsById(interactions)
        argCombinations = self.getArgumentCombinations(eType, interactions, entity.get("id"))
        assert argCombinations is not None, (entity.get("id"), entity.get("type"))
        for argCombination in argCombinations:
            if eType != "Process":
                assert len(argCombination) > 0, eType + ": " + str(argCombinations)
            # Originally binary classification
            if goldGraph is not None:
                isGoldEvent = self.eventIsGold(entity, argCombination, sentenceGraph, goldGraph, goldEntitiesByOffset)
            else:
                isGoldEvent = False
            # Named (multi-)class
            if not isGoldEvent:
                category = "neg"
            elif eType.find("egulation") != -1:
                category = "All_regulation"
            elif eType != "Binding":
                category = "Other" #"simple6"
            else:
                category = eType

            argString = ",".join(arg.get("id") for arg in argCombination)
            extra = {"xtype":"um","e":entity.get("id"),"i":argString,"etype":eType,"class":category}
            assert isinstance(extra["etype"], str), extra
            self.exampleStats.addExample(category)
            example = self.buildExample(sentenceGraph, paths, entity, argCombination, interactions)
            example[0] = sentenceGraph.getSentenceId()+".x"+str(exampleIndex)
            example[1] = self.classSet.getId(category)
            example[3] = extra
            ExampleUtils.appendExamples([example], outfile)
            exampleIndex += 1

    return exampleIndex

420

def buildExample(self, sentenceGraph, paths, eventEntity, argCombination, allInteractions): #themeEntities, causeEntities=None):
    """
    Build the features of one unmerging example.

    eventEntity: the trigger entity of the potential event.
    argCombination: the interactions that form the potential event.
    allInteractions: all outgoing interactions of the trigger; those not
        in argCombination are added as context features.

    The features are stored through self.setFeature() and the feature
    builders into a new dictionary, which is also kept as self.features.

    Returns [None, None, features, None]; the caller fills in the rest.
    """
    # NOTE!!!! TODO
    # add also features for arguments present, but not in this combination

    features = {}
    self.features = features

    self.buildInterArgumentBagOfWords(argCombination, sentenceGraph)

    isBinding = eventEntity.get("type") == "Binding"
    if isBinding:
        # Rank all edges of the trigger; the rank is used in the Theme tags
        rankedLengths = [self.interactionLenghts[x] for x in allInteractions]
        rankedLengths.sort(compareInteractionPrecedence)
        rankByInteraction = {}
        for rank, lengthTuple in enumerate(rankedLengths):
            rankByInteraction[lengthTuple[0]] = rank

    eventToken = sentenceGraph.entityHeadTokenByEntity[eventEntity]
    self.triggerFeatureBuilder.setFeatureVector(self.features)
    self.triggerFeatureBuilder.tag = "trg_"
    self.triggerFeatureBuilder.buildFeatures(eventToken)
    self.triggerFeatureBuilder.tag = None

    def addEdgeFeatures(edge, tag):
        self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, edge, tag)

    def addCountFeatures(name, count):
        # The count as a value, and as a binary feature named after the count
        self.setFeature(name, count)
        self.setFeature(name + "_" + str(count), 1)

    # Current example's edge combination
    themesInArgs = 0
    causesInArgs = 0
    otherArgCounts = {}
    for arg in argCombination:
        argType = arg.get("type")
        if argType == "Theme":
            themesInArgs += 1
            addEdgeFeatures(arg, "argTheme")
            if isBinding:
                addEdgeFeatures(arg, "argTheme" + str(rankByInteraction[arg]))
        elif argType == "Cause":
            causesInArgs += 1
            addEdgeFeatures(arg, "argCause")
        else:
            addEdgeFeatures(arg, "arg" + argType)
            otherArgCounts[argType] = otherArgCounts.get(argType, 0) + 1

    # Edge group context: edges of the trigger that are not part of the
    # current combination. Every non-Theme edge is counted as a cause here.
    themesInContext = 0
    causesInContext = 0
    for interaction in allInteractions:
        if interaction in argCombination:
            continue
        if interaction.get("type") == "Theme":
            themesInContext += 1
            addEdgeFeatures(interaction, "conTheme")
            if isBinding:
                addEdgeFeatures(interaction, "conTheme" + str(rankByInteraction[interaction]))
        else:
            causesInContext += 1
            addEdgeFeatures(interaction, "conCause")

    addCountFeatures("argCount", len(argCombination))
    addCountFeatures("interactionCount", len(allInteractions))
    addCountFeatures("argThemeCount", themesInArgs)
    addCountFeatures("argCauseCount", causesInArgs)
    for argType in sorted(otherArgCounts.keys()):
        addCountFeatures("arg" + argType + "Count", otherArgCounts[argType])
    addCountFeatures("interactionThemeCount", themesInContext)
    addCountFeatures("interactionCauseCount", causesInContext)

    self.triggerFeatureBuilder.tag = ""
    self.triggerFeatureBuilder.setFeatureVector(None)

    # The remaining example fields are defined by the caller
    return [None, None, features, None]

Source Code for Module TEES.ExampleBuilders.UnmergingExampleBuilder