Package TEES :: Package ExampleBuilders :: Module EdgeExampleBuilder
[hide private]

Source Code for Module TEES.ExampleBuilders.EdgeExampleBuilder

  1  """ 
  2  Edge Examples 
  3  """ 
  4   
  5  import sys, os 
  6  thisPath = os.path.dirname(os.path.abspath(__file__)) 
  7  sys.path.append(os.path.abspath(os.path.join(thisPath,".."))) 
  8  from ExampleBuilders.ExampleBuilder import ExampleBuilder 
  9  from Core.IdSet import IdSet 
 10  import Core.ExampleUtils as ExampleUtils 
 11  from FeatureBuilders.MultiEdgeFeatureBuilder import MultiEdgeFeatureBuilder 
 12  from FeatureBuilders.TokenFeatureBuilder import TokenFeatureBuilder 
 13  from FeatureBuilders.BioInferOntologyFeatureBuilder import BioInferOntologyFeatureBuilder 
 14  from FeatureBuilders.NodalidaFeatureBuilder import NodalidaFeatureBuilder 
 15  from FeatureBuilders.BacteriaRenamingFeatureBuilder import BacteriaRenamingFeatureBuilder 
 16  from FeatureBuilders.RELFeatureBuilder import RELFeatureBuilder 
 17  from FeatureBuilders.DrugFeatureBuilder import DrugFeatureBuilder 
 18  from FeatureBuilders.EVEXFeatureBuilder import EVEXFeatureBuilder 
 19  from FeatureBuilders.GiulianoFeatureBuilder import GiulianoFeatureBuilder 
 20  #import Graph.networkx_v10rc1 as NX10 
 21  from Core.SimpleGraph import Graph 
 22  from FeatureBuilders.TriggerFeatureBuilder import TriggerFeatureBuilder 
 23  import Utils.Range as Range 
 24  from multiprocessing import Process 
 25   
 26  # For gold mapping 
 27  import Evaluators.EvaluateInteractionXML as EvaluateInteractionXML 
 28   
29 -class EdgeExampleBuilder(ExampleBuilder):
30 """ 31 This example builder makes edge examples, i.e. examples describing 32 the event arguments. 33 """
def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
    """
    Set up the id sets, the style parameters and the feature builders.

    style -- style definition handed to self.getParameters. When None, the
             "typed", "directed" and "headsOnly" styles are switched on.
    length -- path length restriction; must be None (asserted below).
    types -- stored as self.types; a new empty list is used when None.
    featureSet -- feature id set; a new IdSet() is created when None.
    classSet -- class id set; a new IdSet(1) is created when None. The set
                must map "neg" to 1, or hold exactly two ids with "neg" as -1.
    """
    if featureSet is None:
        featureSet = IdSet()
    if classSet is None:
        classSet = IdSet(1)
    # NOTE(review): getId may register "neg" in a fresh class set as a side
    # effect -- confirm against IdSet before touching this assertion.
    assert( classSet.getId("neg") == 1 or (len(classSet.Ids)== 2 and classSet.getId("neg") == -1) )

    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)

    # NOTE(review): "headsOnly", "graph_kernel" and "trigger_features" are listed
    # twice; kept as in the original since _setDefaultParameters is not visible here.
    self._setDefaultParameters([
        "typed", "directed", "headsOnly", "graph_kernel", "noAnnType", "noMasking", "maxFeatures",
        "genia_limits", "epi_limits", "id_limits", "rel_limits", "bb_limits", "bi_limits", "co_limits",
        "genia_task1", "ontology", "nodalida", "bacteria_renaming", "trigger_features", "rel_features",
        "ddi_features", "ddi_mtmx", "evex", "giuliano", "random", "themeOnly", "causeOnly", "no_path", "entities",
        "skip_extra_triggers", "headsOnly", "graph_kernel", "trigger_features", "no_task", "no_dependency",
        "disable_entity_features", "disable_terminus_features", "disable_single_element_features",
        "disable_ngram_features", "disable_path_edge_features", "no_linear", "subset", "binary", "pos_only",
        "entity_type", "filter_shortest_path", "maskTypeAsProtein"])
    self.styles = self.getParameters(style)
    if style is None: # no parameters given
        # The defaults go into the parsed parameters; the original indexed the
        # None argument itself, which raised a TypeError.
        self.styles["typed"] = self.styles["directed"] = self.styles["headsOnly"] = True

    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet, self.styles)
    # NOTE Temporarily re-enabling predicted range
    #self.multiEdgeFeatureBuilder.definePredictedValueRange([], None)
    if self.styles["graph_kernel"]:
        from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
        self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
    if self.styles["noAnnType"]:
        self.multiEdgeFeatureBuilder.noAnnType = True
    if self.styles["noMasking"]:
        self.multiEdgeFeatureBuilder.maskNamedEntities = False
    if self.styles["maxFeatures"]:
        self.multiEdgeFeatureBuilder.maximum = True
    if self.styles["genia_task1"]:
        self.multiEdgeFeatureBuilder.filterAnnTypes.add("Entity")
    self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
    if self.styles["ontology"]:
        self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
    if self.styles["nodalida"]:
        self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet)
    if self.styles["bacteria_renaming"]:
        self.bacteriaRenamingFeatureBuilder = BacteriaRenamingFeatureBuilder(self.featureSet)
    if self.styles["trigger_features"]:
        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet, self.styles)
        self.triggerFeatureBuilder.useNonNameEntities = True
        if self.styles["genia_task1"]:
            self.triggerFeatureBuilder.filterAnnTypes.add("Entity")
    if self.styles["rel_features"]:
        self.relFeatureBuilder = RELFeatureBuilder(featureSet)
    if self.styles["ddi_features"]:
        self.drugFeatureBuilder = DrugFeatureBuilder(featureSet)
    if self.styles["evex"]:
        self.evexFeatureBuilder = EVEXFeatureBuilder(featureSet)
    if self.styles["giuliano"]:
        self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
    self.pathLengths = length
    assert(self.pathLengths == None)
    # A fresh list per instance instead of one shared mutable default.
    self.types = [] if types is None else types
    if self.styles["random"]:
        from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
        self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
113
def definePredictedValueRange(self, sentences, elementName):
    """
    Forward the call, with both arguments unchanged, to the
    multi-edge feature builder. Nothing is returned.
    """
    edgeFeatureBuilder = self.multiEdgeFeatureBuilder
    edgeFeatureBuilder.definePredictedValueRange(sentences, elementName)
116
def getPredictedValueRange(self):
    """
    Return the predictedRange attribute of the multi-edge feature builder.
    """
    edgeFeatureBuilder = self.multiEdgeFeatureBuilder
    return edgeFeatureBuilder.predictedRange
119
def filterEdgesByType(self, edges, typesToInclude):
    """
    Keep the edges whose "type" attribute is listed in typesToInclude.

    An empty typesToInclude disables filtering: the edges argument itself
    is handed back. Otherwise a new list is returned, in the input order.
    """
    if not len(typesToInclude):
        return edges
    return [candidate for candidate in edges if candidate.get("type") in typesToInclude]
128
def getCategoryNameFromTokens(self, sentenceGraph, t1, t2, directed=True):
    """
    Example class for a token pair. Multiple overlapping edges create a merged type.

    The types of the interaction graph edges from t1 to t2 (and from t2 to
    t1 when directed is False) are de-duplicated, sorted and joined with
    "---". Without any type name the class is "neg".
    """
    interactionGraph = sentenceGraph.interactionGraph
    edges = interactionGraph.getEdges(t1, t2)
    if not directed:
        edges = edges + interactionGraph.getEdges(t2, t1)
    typeNames = sorted(set(edge[2].get("type") for edge in edges))
    # An empty type name contributes nothing to the merged name.
    mergedName = "---".join([typeName for typeName in typeNames if typeName != ""])
    if mergedName == "":
        return "neg"
    return mergedName
160
def getCategoryName(self, sentenceGraph, e1, e2, directed=True, duplicateEntities=None):
    """
    Example class for an entity pair. Multiple overlapping edges create a merged type.

    The interaction types between e1 and e2 (both directions when directed
    is False) are de-duplicated, sorted and joined with "---". The
    "causeOnly" and "themeOnly" styles restrict the names to "Cause" and
    "Theme" respectively. Without any remaining name the class is "neg".
    The duplicateEntities argument is not used.
    """
    interactions = sentenceGraph.getInteractions(e1, e2, True)
    if not directed:
        interactions = interactions + sentenceGraph.getInteractions(e2, e1, True)

    keptNames = []
    for typeName in sorted(set(interaction[2].get("type") for interaction in interactions)):
        if self.styles["causeOnly"] and typeName != "Cause":
            continue
        if self.styles["themeOnly"] and typeName != "Theme":
            continue
        # An empty type name contributes nothing to the merged name.
        if typeName != "":
            keptNames.append(typeName)
    if len(keptNames) == 0:
        return "neg"
    return "---".join(keptNames)
200
def isPotentialRELInteraction(self, e1, e2):
    """
    A REL edge candidate must lead from a "Protein" to an "Entity".
    """
    return e1.get("type") == "Protein" and e2.get("type") == "Entity"
206
def isPotentialBBInteraction(self, e1, e2, sentenceGraph):
    """
    BB edge candidates lead from a "Bacterium" to a location type or
    from a "Host" to a "HostPart". The sentenceGraph argument is not used.
    """
    # Note: "Environment" type is misspelled as "Environmental" in the BB-task documentation
    locationTypes = ["Host", "HostPart", "Geographical", "Environment", "Food", "Medical", "Soil", "Water"]
    sourceType = e1.get("type")
    targetType = e2.get("type")
    if sourceType == "Bacterium":
        return targetType in locationTypes
    if sourceType == "Host":
        return targetType == "HostPart"
    return False
216
def getBISuperType(self, eType):
    """
    Map a BI entity type to "ProteinEntity" or "GeneEntity".
    Types belonging to neither group map to None.
    """
    superTypeMembers = (
        ("ProteinEntity", ("GeneProduct", "Protein", "ProteinFamily", "PolymeraseComplex")),
        ("GeneEntity", ("Gene", "GeneFamily", "GeneComplex", "Regulon", "Site", "Promoter")),
    )
    for superType, members in superTypeMembers:
        if eType in members:
            return superType
    return None
224
def isPotentialBIInteraction(self, e1, e2, sentenceGraph, stats):
    """
    Decide whether an edge from e1 to e2 is possible in the BI task.

    e1, e2 -- entities; only their "type" attribute is read.
    sentenceGraph -- not used.
    stats -- object whose filter("bi_limits") is called once for every
             rejected pair.

    Returns True for an allowed type combination, otherwise False.
    An entity without a type is rejected like any other disallowed pair;
    the original built an unused debug tag from both types, which raised
    a TypeError for a missing type before any rule was evaluated.
    """
    e1Type = e1.get("type")
    e1SuperType = self.getBISuperType(e1Type)
    e2Type = e2.get("type")
    e2SuperType = self.getBISuperType(e2Type)
    geneOrProtein = ["GeneEntity", "ProteinEntity"]

    if e1Type == "Regulon" and e2SuperType in geneOrProtein:
        return True
    if e1SuperType == "ProteinEntity" and e2Type in ["Site", "Promoter", "Gene", "GeneComplex"]:
        return True
    if e1Type in ["Action", "Transcription", "Expression"]:
        return True
    if e1Type == "Site" and e2SuperType == "GeneEntity":
        return True
    if e1Type == "Promoter" and e2SuperType in geneOrProtein:
        return True
    if e1SuperType in geneOrProtein and e2SuperType in geneOrProtein:
        return True
    stats.filter("bi_limits")
    return False
251
def isPotentialEPIInteraction(self, e1, e2, sentenceGraph):
    """
    Decide whether an edge from e1 to e2 is possible in the EPI task.

    e1, e2 -- entities; only their "type" attribute is read.
    sentenceGraph -- not used.

    A "Catalysis" source may target anything except an "Entity". A
    "Protein" or "Entity" source never has outgoing edges. Every other
    source may only target a "Protein" or an "Entity".

    Always returns a bool; the unreachable trailing assertion of the
    original has been removed.
    """
    e1Type = e1.get("type")
    e2Type = e2.get("type")
    if e1Type == "Catalysis":
        return e2Type != "Entity"
    if e1Type in ["Protein", "Entity"]:
        return False
    return e2Type in ["Protein", "Entity"]
266
def isPotentialIDInteraction(self, e1, e2, sentenceGraph):
    """
    Decide whether an edge from e1 to e2 is possible in the ID task,
    based on the "type" attributes of both entities. The sentenceGraph
    argument is not used. A source type covered by none of the rules
    trips the final assertion.
    """
    coreTypes = ["Protein", "Regulon-operon", "Two-component-system", "Chemical", "Organism"]
    e1Type = e1.get("type")
    e2Type = e2.get("type")
    targetIsCore = e2Type in coreTypes

    # Core entities never have outgoing edges.
    if e1Type in coreTypes:
        return False
    if e1Type in ["Gene_expression", "Transcription"]:
        return e2Type in ["Protein", "Regulon-operon"]
    if e1Type in ["Protein_catabolism", "Phosphorylation"]:
        return e2Type == "Protein"
    if e1Type == "Localization":
        return targetIsCore or e2Type == "Entity"
    if e1Type in ["Binding", "Process"]:
        return targetIsCore
    # "r" left out, so both "Regulation" and "..._regulation" match.
    if "egulation" in e1Type:
        return e2Type != "Entity"
    if e1Type == "Entity":
        return targetIsCore
    assert False, (e1Type, e2Type)
305
def isPotentialCOInteraction(self, e1, e2, sentenceGraph):
    """
    Decide whether an edge from e1 to e2 is possible in the CO task.

    An "Exp" may always point to a "Protein". An "Exp" (e1, the anaphora)
    may point to another "Exp" (e2, the antecedent) only if the head token
    of e2 does not come after the head token of e1 in sentenceGraph.tokens.
    The assertion fires if the head token of e1 is not among the tokens.
    """
    sourceType = e1.get("type")
    targetType = e2.get("type")
    if sourceType != "Exp":
        return False
    if targetType == "Protein":
        return True
    if targetType != "Exp":
        return False

    headTokens = sentenceGraph.entityHeadTokenByEntity
    anaphoraToken = headTokens[e1]
    antecedentToken = headTokens[e2]
    antecedentSeen = False
    for token in sentenceGraph.tokens:
        if token == antecedentToken:
            antecedentSeen = True
        # Separate test, not an elif: both entities may share the head token.
        if token == anaphoraToken:
            return antecedentSeen
    assert False
324
def isPotentialGeniaInteraction(self, e1, e2):
    """
    Decide whether an edge from e1 to e2 is possible in the GENIA task,
    based on the "type" attributes of both entities. A source type
    covered by none of the rules trips the final assertion.
    """
    sourceType = e1.get("type")
    targetType = e2.get("type")
    # Proteins never have outgoing edges.
    if sourceType == "Protein":
        return False
    if sourceType in ["Entity", "Gene_expression", "Transcription", "Protein_catabolism", "Phosphorylation", "Binding"]:
        return targetType == "Protein"
    if sourceType == "Localization":
        return targetType in ["Protein", "Entity"]
    # "r" left out, so both "Regulation" and "..._regulation" match.
    if "egulation" in sourceType:
        return targetType != "Entity"
    assert False, (sourceType, targetType)
346
def getGoldCategoryName(self, goldGraph, entityToGold, e1, e2, directed=True):
    """
    Example class taken from the gold graph.

    entityToGold maps an entity to a sequence of gold entities. The first
    gold entity of e1 and of e2 are passed to getCategoryName together
    with goldGraph. If either entity has no gold entity, the class is "neg".
    """
    goldForE1 = entityToGold[e1]
    if len(goldForE1) == 0:
        return "neg"
    goldForE2 = entityToGold[e2]
    if len(goldForE2) == 0:
        return "neg"
    return self.getCategoryName(goldGraph, goldForE1[0], goldForE2[0], directed=directed)
352
def filterEdge(self, edge, edgeTypes):
    """
    Tell whether the type of an edge is one of the given edge types.

    edge -- sequence whose third item offers get("type").
    edgeTypes -- a single type, or a list/tuple of types; must not be None.

    Returns True when the "type" of the edge is among edgeTypes,
    otherwise False.

    isinstance replaces the comparison against types.ListType and
    types.TupleType: those names exist only in Python 2, and the exact
    type comparison wrapped list and tuple subclasses as if they were a
    single type.
    """
    assert edgeTypes is not None
    if not isinstance(edgeTypes, (list, tuple)):
        edgeTypes = [edgeTypes]
    return edge[2].get("type") in edgeTypes
362
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph = None):
    """
    Build examples for a single sentence and append them to outfile.
    See Core/ExampleUtils for example format.

    sentenceGraph -- the sentence to process; its interaction graph is merged
                     (mergeInteractionGraph(True)) before examples are generated.
    outfile -- passed to ExampleUtils.appendExamples for every built example.
    goldGraph -- optional gold sentence graph. When given, and the "entities"
                 style is on, the class of a directed example comes from it.

    Returns the number of examples written (not a list of examples).
    """
    exampleIndex = 0

    if self.styles["trigger_features"]:
        self.triggerFeatureBuilder.initSentence(sentenceGraph)
    if self.styles["evex"]:
        self.evexFeatureBuilder.initSentence(sentenceGraph)

    # Filter entities, if needed
    sentenceGraph.mergeInteractionGraph(True)
    entities = sentenceGraph.mergedEntities
    self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))

    # Connect to optional gold graph
    entityToGold = None
    if goldGraph is not None:
        entityToGold = EvaluateInteractionXML.mapEntities(entities, goldGraph.entities)

    paths = None
    if not self.styles["no_path"]:
        paths = sentenceGraph.dependencyGraph.toUndirected()
        if self.styles["filter_shortest_path"] is not None: # For DDI use filter_shortest_path=conj_and
            paths.resetAnalyses() # just in case
            paths.FloydWarshall(self.filterEdge, {"edgeTypes":self.styles["filter_shortest_path"]})

    # Generate examples based on interactions between entities or interactions between tokens
    if self.styles["entities"]:
        loopRange = len(entities)
    else:
        loopRange = len(sentenceGraph.tokens)
    for i in range(loopRange-1):
        for j in range(i+1,loopRange):
            eI = None
            eJ = None
            if self.styles["entities"]:
                eI = entities[i]
                eJ = entities[j]
                tI = sentenceGraph.entityHeadTokenByEntity[eI]
                tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                if eI.get("type") == "neg" or eJ.get("type") == "neg":
                    continue
                if self.styles["skip_extra_triggers"]:
                    if eI.get("source") is not None or eJ.get("source") is not None:
                        continue
            else:
                tI = sentenceGraph.tokens[i]
                tJ = sentenceGraph.tokens[j]
            # only consider paths between entities (NOTE! entities, not only named entities)
            if self.styles["headsOnly"]:
                if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                    continue

            if self.styles["directed"]:
                # forward example, then reverse example
                exampleIndex = self._buildDirectedExample(sentenceGraph, outfile, goldGraph, entityToGold, paths, tI, tJ, eI, eJ, exampleIndex)
                exampleIndex = self._buildDirectedExample(sentenceGraph, outfile, goldGraph, entityToGold, paths, tJ, tI, eJ, eI, exampleIndex)
            else:
                if self.styles["entities"]:
                    categoryName = self.getCategoryName(sentenceGraph, eI, eJ, directed=False)
                else:
                    categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, directed=False)
                self.exampleStats.beginExample(categoryName)
                forwardExample = self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ)
                if not self.styles["graph_kernel"]:
                    # An undirected example carries the features of both directions.
                    reverseExample = self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI)
                    forwardExample[2].update(reverseExample[2])
                ExampleUtils.appendExamples([forwardExample], outfile)
                exampleIndex += 1
                self.exampleStats.endExample()

    return exampleIndex

def _buildDirectedExample(self, sentenceGraph, outfile, goldGraph, entityToGold, paths, t1, t2, e1, e2, exampleIndex):
    """
    Build and write the directed example for the edge from t1/e1 to t2/e2,
    unless a limit style filters it out. Returns the updated example count.
    """
    if self.styles["entities"]:
        categoryName = self.getCategoryName(sentenceGraph, e1, e2, True)
        if goldGraph is not None:
            categoryName = self.getGoldCategoryName(goldGraph, entityToGold, e1, e2, True)
    else:
        categoryName = self.getCategoryNameFromTokens(sentenceGraph, t1, t2, True)
    self.exampleStats.beginExample(categoryName)
    if self._passesTaskLimits(sentenceGraph, categoryName, e1, e2):
        ExampleUtils.appendExamples([self.buildExample(t1, t2, paths, sentenceGraph, categoryName, exampleIndex, e1, e2)], outfile)
        exampleIndex += 1
    self.exampleStats.endExample()
    return exampleIndex

def _passesTaskLimits(self, sentenceGraph, categoryName, e1, e2):
    """
    Apply every enabled limit style to the edge candidate from e1 to e2 and
    record each rejection in self.exampleStats. Returns True if none rejects it.
    """
    stats = self.exampleStats
    makeExample = True
    if self.styles["genia_limits"] and not self.isPotentialGeniaInteraction(e1, e2):
        makeExample = False
        stats.filter("genia_limits")
    if self.styles["genia_task1"] and (e1.get("type") == "Entity" or e2.get("type") == "Entity"):
        makeExample = False
        stats.filter("genia_task1")
    if self.styles["rel_limits"] and not self.isPotentialRELInteraction(e1, e2):
        makeExample = False
        stats.filter("rel_limits")
    if self.styles["co_limits"] and not self.isPotentialCOInteraction(e1, e2, sentenceGraph):
        makeExample = False
        stats.filter("co_limits")
    if self.styles["bb_limits"] and not self.isPotentialBBInteraction(e1, e2, sentenceGraph):
        makeExample = False
        stats.filter("bb_limits")
        if categoryName != "neg":
            # Separate counter for positive examples lost to the limit.
            stats.filter("bb_limits(" + categoryName + ":" + e1.get("type") + "/" + e2.get("type") + ")")
    # isPotentialBIInteraction records its own "bi_limits" filter in the stats.
    if self.styles["bi_limits"] and not self.isPotentialBIInteraction(e1, e2, sentenceGraph, stats):
        makeExample = False
    if self.styles["epi_limits"] and not self.isPotentialEPIInteraction(e1, e2, sentenceGraph):
        makeExample = False
        stats.filter("epi_limits")
    if self.styles["id_limits"] and not self.isPotentialIDInteraction(e1, e2, sentenceGraph):
        makeExample = False
        stats.filter("id_limits")
    if self.styles["pos_only"] and categoryName == "neg":
        makeExample = False
        stats.filter("pos_only")
    return makeExample
558
559 - def buildExample(self, token1, token2, paths, sentenceGraph, categoryName, exampleIndex, entity1=None, entity2=None):
560 """ 561 Build a single directed example for the potential edge between token1 and token2 562 """ 563 # dummy return for speed testing 564 #return (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),1,{},{}) 565 566 # define features 567 features = {} 568 if True: #token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2): 569 #if token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2): 570 # path = paths[token1][token2] 571 #else: 572 # path = [token1, token2] 573 if not self.styles["no_path"]: 574 # directedPath reduces performance by 0.01 pp 575 #directedPath = sentenceGraph.dependencyGraph.getPaths(token1, token2) 576 #if len(directedPath) == 0: 577 # directedPath = sentenceGraph.dependencyGraph.getPaths(token2, token1) 578 # for dp in directedPath: 579 # dp.reverse() 580 #if len(directedPath) == 0: 581 # path = paths.getPaths(token1, token2) 582 #else: 583 # path = directedPath 584 585 path = paths.getPaths(token1, token2) 586 if len(path) > 0: 587 #if len(path) > 1: 588 # print len(path) 589 path = path[0] 590 pathExists = True 591 else: 592 path = [token1, token2] 593 pathExists = False 594 else: 595 path = [token1, token2] 596 pathExists = False 597 #print token1.get("id"), token2.get("id") 598 assert(self.pathLengths == None) 599 if self.pathLengths == None or len(path)-1 in self.pathLengths: 600 # if not "no_ontology" in self.styles: 601 # self.ontologyFeatureBuilder.setFeatureVector(features) 602 # self.ontologyFeatureBuilder.buildOntologyFeaturesForPath(sentenceGraph, path) 603 # self.ontologyFeatureBuilder.setFeatureVector(None) 604 if self.styles["trigger_features"]: # F 85.52 -> 85.55 605 self.triggerFeatureBuilder.setFeatureVector(features) 606 self.triggerFeatureBuilder.tag = "trg1_" 607 self.triggerFeatureBuilder.buildFeatures(token1) 608 self.triggerFeatureBuilder.tag = "trg2_" 609 self.triggerFeatureBuilder.buildFeatures(token2) 610 self.triggerFeatureBuilder.setFeatureVector(None) 611 # REL features 
612 if self.styles["rel_features"] and not self.styles["no_task"]: 613 self.relFeatureBuilder.setFeatureVector(features) 614 self.relFeatureBuilder.tag = "rel1_" 615 self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token1)) 616 self.relFeatureBuilder.tag = "rel2_" 617 self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token2)) 618 self.relFeatureBuilder.setFeatureVector(None) 619 if self.styles["bacteria_renaming"] and not self.styles["no_task"]: 620 self.bacteriaRenamingFeatureBuilder.setFeatureVector(features) 621 self.bacteriaRenamingFeatureBuilder.buildPairFeatures(entity1, entity2) 622 #self.bacteriaRenamingFeatureBuilder.buildSubstringFeatures(entity1, entity2) # decreases perf. 74.76 -> 72.41 623 self.bacteriaRenamingFeatureBuilder.setFeatureVector(None) 624 if self.styles["co_limits"] and not self.styles["no_task"]: 625 e1Offset = Range.charOffsetToSingleTuple(entity1.get("charOffset")) 626 e2Offset = Range.charOffsetToSingleTuple(entity2.get("charOffset")) 627 if Range.contains(e1Offset, e2Offset): 628 features[self.featureSet.getId("e1_contains_e2")] = 1 629 if entity2.get("isName") == "True": 630 features[self.featureSet.getId("e1_contains_e2name")] = 1 631 if Range.contains(e2Offset, e1Offset): 632 features[self.featureSet.getId("e2_contains_e1")] = 1 633 if entity1.get("isName") == "True": 634 features[self.featureSet.getId("e2_contains_e1name")] = 1 635 if self.styles["ddi_features"]: 636 self.drugFeatureBuilder.setFeatureVector(features) 637 self.drugFeatureBuilder.tag = "ddi_" 638 self.drugFeatureBuilder.buildPairFeatures(entity1, entity2) 639 if self.styles["ddi_mtmx"]: 640 self.drugFeatureBuilder.buildMTMXFeatures(entity1, entity2) 641 self.drugFeatureBuilder.setFeatureVector(None) 642 #if "graph_kernel" in self.styles or not "no_dependency" in self.styles: 643 # #print "Getting edges" 644 # if token1 != token2 and pathExists: 645 # #print "g1" 646 # edges = 
self.multiEdgeFeatureBuilder.getEdges(sentenceGraph.dependencyGraph, path) 647 # #print "g2" 648 # else: 649 # edges = None 650 if self.styles["graph_kernel"]: 651 self.graphKernelFeatureBuilder.setFeatureVector(features, entity1, entity2) 652 self.graphKernelFeatureBuilder.buildGraphKernelFeatures(sentenceGraph, path) 653 self.graphKernelFeatureBuilder.setFeatureVector(None) 654 if self.styles["entity_type"]: 655 e1Type = self.multiEdgeFeatureBuilder.getEntityType(entity1) 656 e2Type = self.multiEdgeFeatureBuilder.getEntityType(entity2) 657 features[self.featureSet.getId("e1_"+e1Type)] = 1 658 features[self.featureSet.getId("e2_"+e2Type)] = 1 659 features[self.featureSet.getId("distance_"+str(len(path)))] = 1 660 if not self.styles["no_dependency"]: 661 #print "Dep features" 662 self.multiEdgeFeatureBuilder.setFeatureVector(features, entity1, entity2) 663 #self.multiEdgeFeatureBuilder.buildStructureFeatures(sentenceGraph, paths) # remove for fast 664 if not self.styles["disable_entity_features"]: 665 self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph) 666 self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path) 667 if not self.styles["disable_terminus_features"]: 668 self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast 669 if not self.styles["disable_single_element_features"]: 670 self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, sentenceGraph) 671 if not self.styles["disable_ngram_features"]: 672 #print "NGrams" 673 self.multiEdgeFeatureBuilder.buildPathGrams(2, path, sentenceGraph) # remove for fast 674 self.multiEdgeFeatureBuilder.buildPathGrams(3, path, sentenceGraph) # remove for fast 675 self.multiEdgeFeatureBuilder.buildPathGrams(4, path, sentenceGraph) # remove for fast 676 #self.buildEdgeCombinations(path, edges, sentenceGraph, features) # remove for fast 677 #if edges != None: 678 # self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[0], edges[0][1]+edges[1][0], "t1", sentenceGraph) 
# remove for fast 679 # self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[-1], edges[len(path)-1][len(path)-2]+edges[len(path)-2][len(path)-1], "t2", sentenceGraph) # remove for fast 680 if not self.styles["disable_path_edge_features"]: 681 self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, sentenceGraph) 682 self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph) 683 self.multiEdgeFeatureBuilder.setFeatureVector(None) 684 if self.styles["nodalida"]: 685 self.nodalidaFeatureBuilder.setFeatureVector(features, entity1, entity2) 686 shortestPaths = self.nodalidaFeatureBuilder.buildShortestPaths(sentenceGraph.dependencyGraph, path) 687 print shortestPaths 688 if len(shortestPaths) > 0: 689 self.nodalidaFeatureBuilder.buildNGrams(shortestPaths, sentenceGraph) 690 self.nodalidaFeatureBuilder.setFeatureVector(None) 691 if not self.styles["no_linear"]: 692 self.tokenFeatureBuilder.setFeatureVector(features) 693 for i in range(len(sentenceGraph.tokens)): 694 if sentenceGraph.tokens[i] == token1: 695 token1Index = i 696 if sentenceGraph.tokens[i] == token2: 697 token2Index = i 698 linearPreTag = "linfw_" 699 if token1Index > token2Index: 700 token1Index, token2Index = token2Index, token1Index 701 linearPreTag = "linrv_" 702 self.tokenFeatureBuilder.buildLinearOrderFeatures(token1Index, sentenceGraph, 2, 2, preTag="linTok1") 703 self.tokenFeatureBuilder.buildLinearOrderFeatures(token2Index, sentenceGraph, 2, 2, preTag="linTok2") 704 # Before, middle, after 705 # self.tokenFeatureBuilder.buildTokenGrams(0, token1Index-1, sentenceGraph, "bf") 706 # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, "bw") 707 # self.tokenFeatureBuilder.buildTokenGrams(token2Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, "af") 708 # before-middle, middle, middle-after 709 # self.tokenFeatureBuilder.buildTokenGrams(0, token2Index-1, sentenceGraph, linearPreTag+"bf", max=2) 710 # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, 
token2Index-1, sentenceGraph, linearPreTag+"bw", max=2) 711 # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, linearPreTag+"af", max=2) 712 self.tokenFeatureBuilder.setFeatureVector(None) 713 if self.styles["random"]: 714 self.randomFeatureBuilder.setFeatureVector(features) 715 self.randomFeatureBuilder.buildRandomFeatures(100, 0.01) 716 self.randomFeatureBuilder.setFeatureVector(None) 717 if self.styles["genia_limits"] and not self.styles["no_task"]: 718 e1Type = entity1.get("type") 719 e2Type = entity2.get("type") 720 assert(entity1.get("isName") == "False") 721 if entity2.get("isName") == "True": 722 features[self.featureSet.getId("GENIA_target_protein")] = 1 723 else: 724 features[self.featureSet.getId("GENIA_nested_event")] = 1 725 if e1Type.find("egulation") != -1: # leave r out to avoid problems with capitalization 726 if entity2.get("isName") == "True": 727 features[self.featureSet.getId("GENIA_regulation_of_protein")] = 1 728 else: 729 features[self.featureSet.getId("GENIA_regulation_of_event")] = 1 730 if self.styles["bi_limits"]: 731 # Make features based on entity types 732 e1Type = entity1.get("type") 733 e2Type = entity2.get("type") 734 e1SuperType = str(self.getBISuperType(e1Type)) 735 e2SuperType = str(self.getBISuperType(e2Type)) 736 features[self.featureSet.getId("BI_e1_"+e1Type)] = 1 737 features[self.featureSet.getId("BI_e2_"+e2Type)] = 1 738 features[self.featureSet.getId("BI_e1sup_"+e1SuperType)] = 1 739 features[self.featureSet.getId("BI_e2sup_"+e2SuperType)] = 1 740 features[self.featureSet.getId("BI_e1e2_"+e1Type+"_"+e2Type)] = 1 741 features[self.featureSet.getId("BI_e1e2sup_"+e1SuperType+"_"+e2SuperType)] = 1 742 if self.styles["evex"]: 743 self.evexFeatureBuilder.setFeatureVector(features, entity1, entity2) 744 self.evexFeatureBuilder.buildEdgeFeatures(entity1, entity2, token1, token2, path, sentenceGraph) 745 self.evexFeatureBuilder.setFeatureVector(None) 746 if 
self.styles["giuliano"]: 747 self.giulianoFeatureBuilder.setFeatureVector(features, entity1, entity2) 748 self.giulianoFeatureBuilder.buildEdgeFeatures(entity1, entity2, token1, token2, path, sentenceGraph) 749 self.giulianoFeatureBuilder.setFeatureVector(None) 750 else: 751 features[self.featureSet.getId("always_negative")] = 1 752 if self.styles["subset"]: 753 features[self.featureSet.getId("out_of_scope")] = 1 754 else: 755 features[self.featureSet.getId("always_negative")] = 1 756 if self.styles["subset"]: 757 features[self.featureSet.getId("out_of_scope")] = 1 758 path = [token1, token2] 759 # define extra attributes 760 #if int(path[0].get("id").split("_")[-1]) < int(path[-1].get("id").split("_")[-1]): 761 if int(path[0].get("charOffset").split("-")[0]) < int(path[-1].get("charOffset").split("-")[0]): 762 #extra = {"xtype":"edge","type":"i","t1":path[0],"t2":path[-1]} 763 extra = {"xtype":"edge","type":"i","t1":path[0].get("id"),"t2":path[-1].get("id")} 764 extra["deprev"] = False 765 else: 766 #extra = {"xtype":"edge","type":"i","t1":path[-1],"t2":path[0]} 767 extra = {"xtype":"edge","type":"i","t1":path[-1].get("id"),"t2":path[0].get("id")} 768 extra["deprev"] = True 769 if entity1 != None: 770 #extra["e1"] = entity1 771 extra["e1"] = entity1.get("id") 772 if sentenceGraph.mergedEntityToDuplicates != None: 773 #extra["e1GoldIds"] = mergedEntityIds[entity1] 774 extra["e1DuplicateIds"] = ",".join([x.get("id") for x in sentenceGraph.mergedEntityToDuplicates[entity1]]) 775 if entity2 != None: 776 #extra["e2"] = entity2 777 extra["e2"] = entity2.get("id") 778 if sentenceGraph.mergedEntityToDuplicates != None: 779 extra["e2DuplicateIds"] = ",".join([x.get("id") for x in sentenceGraph.mergedEntityToDuplicates[entity2]]) 780 #extra["e2GoldIds"] = mergedEntityIds[entity2] 781 extra["categoryName"] = categoryName 782 if self.styles["bacteria_renaming"]: 783 if entity1.get("text") != None and entity1.get("text") != "": 784 extra["e1t"] = entity1.get("text").replace(" 
", "---").replace(":","-COL-") 785 if entity2.get("text") != None and entity2.get("text") != "": 786 extra["e2t"] = entity2.get("text").replace(" ", "---").replace(":","-COL-") 787 sentenceOrigId = sentenceGraph.sentenceElement.get("origId") 788 if sentenceOrigId != None: 789 extra["SOID"] = sentenceOrigId 790 # make example 791 if self.styles["binary"]: 792 if categoryName != "neg": 793 category = 1 794 else: 795 category = -1 796 categoryName = "i" 797 else: 798 category = self.classSet.getId(categoryName) 799 800 # NOTE: temporarily disable for replicating 110310 experiment 801 #features[self.featureSet.getId("extra_constant")] = 1 802 return (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)
803