1  """ 
  2  Edge Examples 
  3  """ 
  4  __version__ = "$Revision: 1.13 $" 
  5   
  6  import sys, os 
  7  thisPath = os.path.dirname(os.path.abspath(__file__)) 
  8  sys.path.append(os.path.abspath(os.path.join(thisPath,".."))) 
  9  from ExampleBuilders.ExampleBuilder import ExampleBuilder 
 10  from Core.IdSet import IdSet 
 11  import Core.ExampleUtils as ExampleUtils 
 12  from FeatureBuilders.MultiEdgeFeatureBuilder import MultiEdgeFeatureBuilder 
 13  from FeatureBuilders.TriggerFeatureBuilder import TriggerFeatureBuilder 
 14   
 15  from Core.SimpleGraph import Graph 
 16  from Utils.ProgressCounter import ProgressCounter 
 17  import Utils.Libraries.combine as combine 
 18  import Utils.ElementTreeUtils as ETUtils 
 19  import gzip 
 20  import types 
 21   
 23       
 24       
 25      pool = tuple(iterable) 
 26      n = len(pool) 
 27      if r > n: 
 28          return 
 29      indices = range(r) 
 30      yield tuple(pool[i] for i in indices) 
 31      while True: 
 32          for i in reversed(range(r)): 
 33              if indices[i] != i + n - r: 
 34                  break 
 35          else: 
 36              return 
 37          indices[i] += 1 
 38          for j in range(i+1, r): 
 39              indices[j] = indices[j-1] + 1 
 40          yield tuple(pool[i] for i in indices) 
  41   
 43      """ 
 44      e1/e2 = (interaction, pathdist, lindist, tok2pos) 
 45      """ 
 46      if e1[1] > e2[1]: 
 47          return 1 
 48      elif e1[1] < e2[1]: 
 49          return -1 
 50      else:  
 51          if e1[2] > e2[2]: 
 52              return 1 
 53          elif e1[2] < e2[2]: 
 54              return -1 
 55          else:  
 56              if e1[3] > e2[3]: 
 57                  return 1 
 58              elif e1[3] < e2[3]: 
 59                  return -1 
 60              else:  
 61                  return 0 
  62                   
 63   
 65      """ 
 66      This example builder makes unmerging examples, i.e. examples describing 
 67      potential events. 
 68      """ 
 69       
 70 -    def __init__(self, style=None, length=None, types=[], featureSet=None, classSet=None): 
  71           
 72           
 73          if featureSet == None: 
 74              featureSet = IdSet() 
 75          if classSet == None: 
 76              classSet = IdSet(1) 
 77          else: 
 78              classSet = classSet 
 79          assert( classSet.getId("neg") == 1 ) 
 80           
 81          ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet) 
 82           
 83          self.styles = self._setDefaultParameters(["trigger_features","typed","directed","no_linear","entities","genia_limits", 
 84              "noAnnType", "noMasking", "maxFeatures", "no_merge", "disable_entity_features",  
 85              "disable_single_element_features", "disable_ngram_features", "disable_path_edge_features"]) 
 86          self.styles = self.getParameters(style) 
 87          self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet) 
 88          self.multiEdgeFeatureBuilder.noAnnType = self.styles["noAnnType"] 
 89          self.multiEdgeFeatureBuilder.maskNamedEntities = not self.styles["noMasking"] 
 90          self.multiEdgeFeatureBuilder.maximum = self.styles["maxFeatures"] 
 91           
 92          self.pathLengths = length 
 93          assert(self.pathLengths == None) 
 94          self.types = types 
 95   
 96          self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet) 
 97          self.triggerFeatureBuilder.useNonNameEntities = True 
  98           
 99           
100       
102          """ 
103          Return dependency and linear length of all interaction edges 
104          (measured between the two tokens). 
105          """ 
106          interactionLengths = {} 
107          for interaction in sentenceGraph.interactions: 
108               
109              e1 = sentenceGraph.entitiesById[interaction.get("e1")] 
110              e2 = sentenceGraph.entitiesById[interaction.get("e2")] 
111              t1 = sentenceGraph.entityHeadTokenByEntity[e1] 
112              t2 = sentenceGraph.entityHeadTokenByEntity[e2] 
113               
114              if t1 != t2: 
115                  path = paths.getPaths(t1, t2) 
116              if t1 != t2 and len(path) > 0: 
117                  pathLength = min(len(x) for x in path)  
118              else:  
119                  pathLength = 999999  
120               
121              t1Pos = -1 
122              t2Pos = -1 
123              for i in range(len(sentenceGraph.tokens)): 
124                  if sentenceGraph.tokens[i] == t1: 
125                      t1Pos = i 
126                      if t2Pos != -1: 
127                          break 
128                  if sentenceGraph.tokens[i] == t2: 
129                      t2Pos = i 
130                      if t1Pos != -1: 
131                          break 
132              linLength = abs(t1Pos - t2Pos) 
133              interactionLengths[interaction] = (interaction, pathLength, linLength, t2Pos) 
134          return interactionLengths 
 135       
136 -    def eventIsGold(self, entity, arguments, sentenceGraph, goldGraph, goldEntitiesByOffset): 
 137          offset = entity.get("headOffset") 
138          if not goldEntitiesByOffset.has_key(offset): 
139              return False 
140          eType = entity.get("type") 
141          goldEntities = goldEntitiesByOffset[offset] 
142           
143           
144          for goldEntity in goldEntities: 
145              isGold = True 
146               
147               
148              if goldEntity.get("type") != eType: 
149                  isGold = False 
150                  continue 
151              goldEntityId = goldEntity.get("id") 
152               
153               
154              goldInteractions = [] 
155              for goldInteraction in goldGraph.interactions: 
156                  if goldInteraction.get("e1") == goldEntityId: 
157                      goldInteractions.append(goldInteraction) 
158               
159               
160              if len(goldInteractions) != len(arguments):  
161                  isGold = False 
162                  continue 
163               
164              argTypeCounts = {} 
165              for argument in arguments: 
166                  argType = argument.get("type") 
167                  if not argTypeCounts.has_key(argType): argTypeCounts[argType] = 0 
168                  argTypeCounts[argType] += 1 
169               
170              goldTypeCounts = {} 
171              for argument in goldInteractions: 
172                  argType = argument.get("type") 
173                  if not goldTypeCounts.has_key(argType): goldTypeCounts[argType] = 0 
174                  goldTypeCounts[argType] += 1 
175               
176              if argTypeCounts != goldTypeCounts: 
177                  isGold = False 
178                  continue 
179               
180               
181              for argument in arguments:  
182                  e1 = argument.get("e1") 
183                  e2 = argument.get("e2") 
184                  e2Entity = sentenceGraph.entitiesById[e2] 
185                  e2Offset = e2Entity.get("headOffset") 
186                  e2Type = e2Entity.get("type") 
187                  argType = argument.get("type") 
188                   
189                  found = False 
190                  for goldInteraction in goldInteractions: 
191                      if goldInteraction.get("type") == argType: 
192                          goldE2Entity = goldGraph.entitiesById[goldInteraction.get("e2")]  
193                          if goldE2Entity.get("headOffset") == e2Offset and goldE2Entity.get("type") == e2Type: 
194                              found = True 
195                              break 
196                  if found == False:  
197                      isGold = False 
198                      break 
199   
200               
201              if isGold: 
202                  break 
203           
204          return isGold 
 205       
207          combs = [] 
208          if eType == "Binding": 
209               
210               
211               
212               
213               
214               
215               
216               
217               
218              themes = [] 
219              for interaction in interactions: 
220                  if interaction.get("type") == "Theme": 
221                      themes.append(interaction) 
222                   
223              for i in range(len(themes)): 
224                   
225                   
226                  if i < 10:  
227                      for j in combinations(themes, i+1): 
228                          combs.append(j) 
229   
230   
231   
232              return combs 
233          elif eType == "Process":  
234              argCombinations = [] 
235              argCombinations.append([])  
236              for interaction in interactions: 
237                  if interaction.get("type") == "Participant": 
238                      argCombinations.append([interaction]) 
239              return argCombinations 
240          else:  
241              themes = [] 
242              causes = [] 
243              siteArgs = [] 
244              contextGenes = [] 
245              sideChains = [] 
246              locTargets = [] 
247              for interaction in interactions: 
248                  iType = interaction.get("type") 
249                   
250                  if iType not in ["Theme", "Cause", "SiteArg", "Contextgene", "Sidechain"]:  
251                      continue 
252                  if iType == "Theme": 
253                      themes.append(interaction) 
254                  elif iType == "Cause": 
255                      causes.append(interaction) 
256                  elif iType == "SiteArg": 
257                      siteArgs.append(interaction) 
258                  elif iType == "Contextgene": 
259                      contextGenes.append(interaction) 
260                  elif iType == "Sidechain": 
261                      sideChains.append(interaction) 
262                  elif iType in ["AtLoc", "ToLoc"]: 
263                      locTargets.append(iType) 
264                  else: 
265                      assert False, (iType, interaction.get("id")) 
266               
267              if eType.find("egulation") == -1 and eType != "Catalysis":  
268                  causes = [] 
269              if eType != "Glycosylation": sideChains = [] 
270              if eType not in ["Acetylation", "Methylation"]: contextGenes = [] 
271              if eType == "Catalysis": siteArgs = [] 
272               
273              themeAloneCombinations = [] 
274              for theme in themes: 
275                  themeAloneCombinations.append([theme]) 
276               
277              return combine.combine(themes, causes) \ 
278                     + combine.combine(themes, siteArgs) \ 
279                     + combine.combine(themes, sideChains) \ 
280                     + combine.combine(themes, contextGenes) \ 
281                     + combine.combine(themes, siteArgs, sideChains) \ 
282                     + combine.combine(themes, siteArgs, contextGenes) \ 
283                     + combine.combine(themes, locTargets) \ 
284                     + themeAloneCombinations 
 285   
286   
287   
288   
289   
290   
291   
292   
293       
295           
296           
297           
298           
299           
300           
301          pairs = [] 
302          for interaction in interactions: 
303              pairs.append( (int(interaction.get("id").split(".i")[-1]), interaction) ) 
304          pairs.sort() 
305          return [x[1] for x in pairs] 
 306       
308          """ 
309          Build examples for a single sentence. Returns a list of examples. 
310          See Core/ExampleUtils for example format. 
311          """ 
312          self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True) 
313          self.triggerFeatureBuilder.initSentence(sentenceGraph) 
314           
315           
316          exampleIndex = 0 
317           
318           
319           
320          undirected = sentenceGraph.dependencyGraph.toUndirected() 
321          paths = undirected 
322           
323           
324          self.interactionLenghts = self.getInteractionEdgeLengths(sentenceGraph, paths) 
325           
326           
327          tokenByOffset = {} 
328          for i in range(len(sentenceGraph.tokens)): 
329              token = sentenceGraph.tokens[i] 
330              if goldGraph != None:  
331                  goldToken = goldGraph.tokens[i] 
332                  assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset") 
333              tokenByOffset[token.get("charOffset")] = token.get("id") 
334           
335           
336          goldEntitiesByOffset = {} 
337          if goldGraph != None: 
338              for entity in goldGraph.entities: 
339                  offset = entity.get("headOffset") 
340                  assert offset != None 
341                  if not goldEntitiesByOffset.has_key(offset): 
342                      goldEntitiesByOffset[offset] = [] 
343                  goldEntitiesByOffset[offset].append(entity) 
344           
345           
346   
347   
348   
349   
350   
351   
352   
353   
354          if self.styles["no_merge"]: 
355              mergeInput = False 
356              entities = sentenceGraph.entities 
357          else: 
358              mergeInput = True 
359              sentenceGraph.mergeInteractionGraph(True) 
360              entities = sentenceGraph.mergedEntities 
361              self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities)) 
362           
363          exampleIndex = 0 
364          for entity in entities:  
365              eType = entity.get("type") 
366              assert eType != None, entity.attrib 
367              eType = str(eType) 
368               
369               
370               
371               
372               
373               
374               
375              interactions = [x[2] for x in sentenceGraph.getOutInteractions(entity, mergeInput)] 
376              interactions = self.sortInteractionsById(interactions) 
377              argCombinations = self.getArgumentCombinations(eType, interactions, entity.get("id")) 
378               
379               
380              assert argCombinations != None, (entity.get("id"), entity.get("type")) 
381              for argCombination in argCombinations: 
382                  if eType != "Process": 
383                      assert len(argCombination) > 0, eType + ": " + str(argCombinations) 
384                   
385                  if goldGraph != None: 
386                      isGoldEvent = self.eventIsGold(entity, argCombination, sentenceGraph, goldGraph, goldEntitiesByOffset) 
387                       
388                       
389                  else: 
390                      isGoldEvent = False 
391                   
392                  if isGoldEvent: 
393                       
394                      category = eType 
395                      if category.find("egulation") != -1: 
396                          category = "All_regulation" 
397                      elif category != "Binding": 
398                          category = "Other"  
399                  else: 
400                      category = "neg" 
401                       
402                  features = {} 
403                   
404                  argString = "" 
405                  for arg in argCombination: 
406                      argString += "," + arg.get("id") 
407                  extra = {"xtype":"um","e":entity.get("id"),"i":argString[1:],"etype":eType,"class":category} 
408                  assert type(extra["etype"]) == types.StringType, extra 
409                  self.exampleStats.addExample(category) 
410                  example = self.buildExample(sentenceGraph, paths, entity, argCombination, interactions) 
411                  example[0] = sentenceGraph.getSentenceId()+".x"+str(exampleIndex) 
412                  example[1] = self.classSet.getId(category) 
413                  example[3] = extra 
414                   
415                  ExampleUtils.appendExamples([example], outfile) 
416                  exampleIndex += 1 
417               
418           
419          return exampleIndex 
 420       
421 -    def buildExample(self, sentenceGraph, paths, eventEntity, argCombination, allInteractions):  
 422           
423           
424           
425          features = {} 
426          self.features = features 
427           
428          self.buildInterArgumentBagOfWords(argCombination, sentenceGraph) 
429           
430          eventEntityType = eventEntity.get("type") 
431          if eventEntityType == "Binding": 
432              interactionIndex = {} 
433              groupInteractionLengths = [] 
434              for interaction in allInteractions: 
435                  groupInteractionLengths.append(self.interactionLenghts[interaction]) 
436              groupInteractionLengths.sort(compareInteractionPrecedence) 
437               
438              for i in range(len(groupInteractionLengths)): 
439                  interactionIndex[groupInteractionLengths[i][0]] = i 
440           
441          eventToken = sentenceGraph.entityHeadTokenByEntity[eventEntity] 
442          self.triggerFeatureBuilder.setFeatureVector(self.features) 
443          self.triggerFeatureBuilder.tag = "trg_" 
444          self.triggerFeatureBuilder.buildFeatures(eventToken) 
445          self.triggerFeatureBuilder.tag = None 
446           
447           
448           
449          argThemeCount = 0 
450          argCauseCount = 0 
451          argCounts = {} 
452           
453          for arg in argCombination: 
454              if arg.get("type") == "Theme": 
455                  argThemeCount += 1 
456                  tag = "argTheme" 
457                  self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, tag) 
458                  if eventEntityType == "Binding": 
459                      tag += str(interactionIndex[arg]) 
460                      self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, tag) 
461              elif arg.get("type") == "Cause":  
462                  argCauseCount += 1 
463                  self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, "argCause") 
464              else: 
465                  argType = arg.get("type") 
466                  if argType not in argCounts: argCounts[argType] = 0 
467                  self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, "arg"+argType) 
468                  argCounts[argType] += 1 
469           
470           
471          contextThemeCount = 0 
472          contextCauseCount = 0 
473          for interaction in allInteractions: 
474              if interaction in argCombination:  
475                  continue 
476              if interaction.get("type") == "Theme": 
477                  contextThemeCount += 1 
478                  tag = "conTheme" 
479                  self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, interaction, tag) 
480                  if eventEntityType == "Binding": 
481                      tag += str(interactionIndex[interaction]) 
482                      self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, interaction, tag) 
483              else:  
484                  contextCauseCount += 1 
485                  self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, interaction, "conCause") 
486           
487          self.setFeature("argCount", len(argCombination)) 
488          self.setFeature("argCount_" + str(len(argCombination)), 1) 
489          self.setFeature("interactionCount", len(allInteractions)) 
490          self.setFeature("interactionCount_" + str(len(allInteractions)), 1) 
491           
492          self.setFeature("argThemeCount", argThemeCount) 
493          self.setFeature("argThemeCount_" + str(argThemeCount), 1) 
494          self.setFeature("argCauseCount", argCauseCount) 
495          self.setFeature("argCauseCount_" + str(argCauseCount), 1) 
496          for key in sorted(argCounts.keys()): 
497              self.setFeature("arg" + key + "Count", argCounts[key]) 
498              self.setFeature("arg" + key + "Count_" + str(argCounts[key]), 1) 
499               
500          self.setFeature("interactionThemeCount", contextThemeCount) 
501          self.setFeature("interactionThemeCount_" + str(contextThemeCount), 1) 
502          self.setFeature("interactionCauseCount", contextCauseCount) 
503          self.setFeature("interactionCauseCount_" + str(contextCauseCount), 1)         
504           
505          self.triggerFeatureBuilder.tag = "" 
506          self.triggerFeatureBuilder.setFeatureVector(None) 
507       
508           
509   
510   
511   
512   
513   
514   
515           
516          return [None,None,features,None] 
 517   
519          argEntity = sentenceGraph.entitiesById[arg.get("e2")] 
520          argToken = sentenceGraph.entityHeadTokenByEntity[argEntity] 
521          self.buildEdgeFeatures(sentenceGraph, paths, features, eventToken, argToken, tag) 
522          self.triggerFeatureBuilder.tag = tag + "trg_" 
523          self.triggerFeatureBuilder.buildFeatures(argToken) 
524          if argEntity.get("isName") == "True": 
525              self.setFeature(tag+"Protein", 1) 
526          else: 
527              self.setFeature(tag+"Event", 1) 
528              self.setFeature("nestingEvent", 1) 
529          self.setFeature(tag+"_"+argEntity.get("type"), 1) 
 530       
531 -    def buildEdgeFeatures(self, sentenceGraph, paths, features, eventToken, argToken, tag): 
 532           
533           
534          self.multiEdgeFeatureBuilder.tag = tag + "_" 
535          self.multiEdgeFeatureBuilder.setFeatureVector(features, None, None, False) 
536           
537          self.setFeature(tag+"_present", 1) 
538           
539          path = paths.getPaths(eventToken, argToken) 
540          if eventToken != argToken and len(path) > 0: 
541              path = path[0] 
542          else: 
543              path = [eventToken, argToken] 
544               
545           
546          if not self.styles["disable_entity_features"]: 
547              self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph) 
548          self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path) 
549           
550           
551          if not self.styles["disable_single_element_features"]: 
552              self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, sentenceGraph) 
553          if not self.styles["disable_ngram_features"]: 
554              self.multiEdgeFeatureBuilder.buildPathGrams(2, path, sentenceGraph)  
555              self.multiEdgeFeatureBuilder.buildPathGrams(3, path, sentenceGraph)  
556              self.multiEdgeFeatureBuilder.buildPathGrams(4, path, sentenceGraph)  
557          if not self.styles["disable_path_edge_features"]: 
558              self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, sentenceGraph) 
559           
560          self.multiEdgeFeatureBuilder.setFeatureVector(None, None, None, False) 
561          self.multiEdgeFeatureBuilder.tag = "" 
 562       
564          if len(arguments) < 2: 
565              return 
566   
567          indexByToken = {} 
568          for i in range(len(sentenceGraph.tokens)): 
569              indexByToken[sentenceGraph.tokens[i]] = i 
570           
571          argTokenIndices = set() 
572          for arg in arguments: 
573              argEntity = sentenceGraph.entitiesById[arg.get("e2")] 
574              argToken = sentenceGraph.entityHeadTokenByEntity[argEntity] 
575              argTokenIndices.add(indexByToken[argToken]) 
576          minIndex = min(argTokenIndices) 
577          maxIndex = max(argTokenIndices) 
578          self.setFeature("argBoWRange", (maxIndex-minIndex)) 
579          self.setFeature("argBoWRange_" + str(maxIndex-minIndex), 1) 
580          bow = set() 
581          for i in range(minIndex+1, maxIndex): 
582              token = sentenceGraph.tokens[i] 
583              if len(sentenceGraph.tokenIsEntityHead[token]) == 0 and not sentenceGraph.tokenIsName[token]: 
584                  bow.add(token.get("text")) 
585          bow = sorted(list(bow)) 
586          for word in bow: 
587              self.setFeature("argBoW_"+word, 1) 
588              if word in ["/", "-"]: 
589                  self.setFeature("argBoW_slashOrHyphen", 1) 
590          if len(bow) == 1: 
591              self.setFeature("argBoWonly_"+bow[0], 1) 
592              if bow[0] in ["/", "-"]: 
593                  self.setFeature("argBoWonly_slashOrHyphen", 1) 
 594