Package TEES :: Package Utils :: Package InteractionXML :: Module ParseGraph
[hide private]

Source Code for Module TEES.Utils.InteractionXML.ParseGraph

  1  """ 
  2    Program:    Classes for working with parse graphs 
  3    Date:       Jan. 31, 2008 
  4    Author:     Jari Bjorne 
  5   
  6    Description: This file contains classes and functions for working with 
  7    sentence parse graphs. A parse graph can be created from interaction xml. 
  8    The nodes can be modified and assigned attributes like weight, and the 
  9    result can be converted into an adjacency matrix. The parse graph can 
 10    also be used to generate different paths between its tokens. 
 11                   
 12    Status: All classes and methods should be working. 
 13   
  14    Dependencies: PorterStemmer (Utils/Libraries) 
  15                  Range.py (Utils) 
 16  """ 
 17   
 18  import Utils.Libraries.PorterStemmer as stemmer 
 19  import Utils.Range as Range 
 20  import sys 
 21           
class ParseGraphNode:
    """ 
    Represents a single node (either token or dependency) in the parse graph
    or a path generated by a ParseGraph-object.
    """
    def __init__(self, isDependency=False):
        """ Create an empty node; isDependency selects the dependency role
        (otherwise the node represents a token). All attributes are filled
        in later by the graph builder. """
        self.isDependency = isDependency
        # attributes filled in for token nodes
        self.pos = None         # part-of-speech tag string
        self.text = None        # surface text of the token
        self.id = None          # numeric id assigned by the graph builder
        self.charOffset = None  # (from, to) character offset pair
        self.dependencies = []  # every dependency node touching this token
        self.entities = []      # ids of named entities this token is part of
        # attributes filled in for dependency nodes
        self.dependencyType = None  # dependency label; added to adjacency matrix labels
        self.fro = None             # source token node of the dependency
        self.to = None              # target token node of the dependency

    def toString(self, showPos=False, highlight=False):
        """ Return a short printable label for the node: "-type-" for a
        dependency, "[text]" (or "[POS]" when showPos is set) for a token.
        When highlight is set the first and last characters are replaced
        by curly braces. """
        if self.isDependency:
            label = "-" + str(self.dependencyType) + "-"
        elif showPos:
            label = "[" + self.pos + "]"
        else:
            label = "[" + self.text + "]"
        if highlight:
            # swap the outermost bracket pair for braces
            label = "{" + label[1:-1] + "}"
        return label
53
class ParseGraph:
    """ 
    A ParseGraph-object consists of tokens and dependencies. A dependency connects
    two tokens with two edges (token->dependency->token, where -> = edge).
    """
    def __init__(self, tokenElements, dependencyElements, mergeDependencies=False):
        """ Build the token and dependency dictionaries from ElementTree-elements.

        tokenElements      -- interaction XML token elements ("id", "POS",
                              "text" and "charOffset" attributes)
        dependencyElements -- interaction XML dependency elements ("type",
                              "t1" and "t2" attributes)
        mergeDependencies  -- if True, parallel dependencies between the same
                              token pair are merged into a single node whose
                              dependencyType becomes a list of type strings
        """
        self.tokensById, self.dependenciesById = self.buildParseGraph(tokenElements, dependencyElements, mergeDependencies)

    ###########################################################################
    # Construction and Conversion
    ###########################################################################

    def buildParseGraph(self, tokenElements, dependencyElements, mergeDependencies=False):
        """ Returns dictionaries containing tokens and dependencies
        of the graph generated from ElementTree-elements.
        """
        tokensById = {}
        dependenciesById = {}
        for tokenElement in tokenElements:
            node = ParseGraphNode()
            # token ids are of the form "bt_12"; the numeric suffix is the key
            node.id = int(tokenElement.attrib["id"].split("_")[1])
            node.pos = tokenElement.attrib["POS"]
            node.text = tokenElement.attrib["text"]
            charFrom, charTo = tokenElement.attrib["charOffset"].split("-")
            node.charOffset = (int(charFrom), int(charTo))
            tokensById[node.id] = node

        # start dependency ids well clear of the token id range
        dependencyIndex = len(tokensById) + 99
        if mergeDependencies:
            dependenciesByFroAndTo = {}
        for dependencyElement in dependencyElements:
            dependency = ParseGraphNode(True)
            dependency.dependencyType = dependencyElement.attrib["type"]
            dependency.fro = tokensById[int(dependencyElement.attrib["t1"].split("_")[1])]
            dependency.to = tokensById[int(dependencyElement.attrib["t2"].split("_")[1])]

            if mergeDependencies:
                key = (dependency.fro.id, dependency.to.id)
                if key in dependenciesByFroAndTo:
                    # Another dependency already connects this token pair:
                    # collect the type strings into a list on the existing
                    # node instead of adding a parallel edge.
                    # BUGFIX: the original compared against types.ListType
                    # without ever importing the types module (NameError).
                    existing = dependenciesByFroAndTo[key]
                    if not isinstance(existing.dependencyType, list):
                        existing.dependencyType = [existing.dependencyType]
                    existing.dependencyType.append(dependency.dependencyType)
                else:
                    dependenciesByFroAndTo[key] = dependency
                    tokensById[dependency.fro.id].dependencies.append(dependency)
                    tokensById[dependency.to.id].dependencies.append(dependency)
                    dependency.id = dependencyIndex
                    assert dependency.id not in dependenciesById
                    dependenciesById[dependency.id] = dependency
            else:
                tokensById[dependency.fro.id].dependencies.append(dependency)
                tokensById[dependency.to.id].dependencies.append(dependency)
                dependency.id = dependencyIndex
                assert dependency.id not in dependenciesById
                dependenciesById[dependency.id] = dependency
            dependencyIndex += 1

        return tokensById, dependenciesById

    ###########################################################################
    # Marking and manipulation of special tokens
    ###########################################################################

    def markNamedEntities(self, entityElements):
        """ Marks tokens belonging to named entities.

        Appends each entity's id to the "entities" list of every token whose
        character offset overlaps one of the entity's spans, and returns the
        ids of the affected tokens.
        """
        namedEntityTokens = []
        for entityElement in entityElements:
            # an entity may consist of several comma-separated spans
            offsets = []
            for offsetString in entityElement.attrib["charOffset"].split(","):
                charFrom, charTo = offsetString.split("-")
                offsets.append((int(charFrom), int(charTo)))
            for tokenId, token in self.tokensById.items():
                for offset in offsets:
                    if Range.overlap(offset, token.charOffset):
                        token.entities.append(entityElement.attrib["id"])
                        namedEntityTokens.append(token.id)
        return namedEntityTokens

    def getNamedEntityTokenIds(self, namedEntityIds):
        """ Returns the ids of all tokens in specified named entities
        """
        tokenIds = []
        for tokenId, node in self.tokensById.items():
            # loop variable renamed from "id" (shadowed the builtin)
            for namedEntityId in namedEntityIds:
                if namedEntityId in node.entities:
                    tokenIds.append(node.id)
        return tokenIds

    def getTokenIdsByText(self, texts, lookInsideNamedEntities=False):
        """ Returns the ids of all tokens whose text attribute can be
        found in the list texts. Can be used f.e. detecting interaction
        words.
        """
        matchingTokens = []
        for node in self.tokensById.values():
            if node.entities and not lookInsideNamedEntities:
                continue
            if node.text.lower() in texts:
                matchingTokens.append(node.id)
            elif "-" in node.text:  # For cases like actin-activation
                tempText = node.text.rsplit("-", 1)[1]
                if tempText.lower() in texts:
                    matchingTokens.append(node.id)
        return matchingTokens

    def stemTokens(self):
        """ Adds a "stem" attribute (Porter stem of the text) to every token. """
        for token in self.tokensById.values():
            token.stem = stemmer.stem(token.text)

    ###########################################################################
    # NetworkX-based shortest path
    ###########################################################################

    def buildNXGraph(self):
        """ Initializes the graph structure used by NetworkX.
        Called automatically when needed.
        """
        import networkx as NX
        self.nXGraph = NX.Graph()
        for token in self.tokensById.values():
            self.nXGraph.add_node(token.id)
        for dependency in self.dependenciesById.values():
            self.nXGraph.add_node(dependency.id)
            # COMPAT FIX: pass the endpoints as two arguments; the old
            # single-tuple form add_edge((u, v)) is rejected by networkx >= 1.0,
            # while the two-argument form works in both old and new versions.
            self.nXGraph.add_edge(dependency.fro.id, dependency.id)
            self.nXGraph.add_edge(dependency.id, dependency.to.id)

    def buildShortestPathNX(self, startTokenId, endTokenId, directed=False):
        """ Build shortest path using NetworkX.

        Returns a list of ParseGraphNodes alternating token, dependency,
        token, ... or an empty list when no path exists. The "directed"
        parameter is kept for interface compatibility but is unused.
        """
        import networkx as NX
        if getattr(self, "nXGraph", None) is None:
            self.buildNXGraph()
        try:
            path = NX.shortest_path(self.nXGraph, startTokenId, endTokenId)
        except NX.NetworkXNoPath:  # modern networkx raises instead of returning False
            return []
        if not path:  # very old networkx returned False when no path exists
            return []
        # replace the alternating token/dependency ids with the node objects
        isTokenPhase = True
        for i in range(len(path)):
            if isTokenPhase:
                path[i] = self.tokensById[path[i]]
            else:
                path[i] = self.dependenciesById[path[i]]
            isTokenPhase = not isTokenPhase
        return path
203