1  """ 
  2    Program:    Classes for working with parse graphs 
  3    Date:       Jan. 31, 2008 
  4    Author:     Jari Bjorne 
  5   
  6    Description: This file contains classes and functions for working with 
  7    sentence parse graphs. A parse graph can be created from interaction xml. 
  8    The nodes can be modified and assigned attributes like weight, and the 
  9    result can be converted into an adjacency matrix. The parse graph can 
 10    also be used to generate different paths between its tokens. 
 11                   
 12    Status: All classes and methods should be working. 
 13   
 14    Dependencies: PorterStemmer (CommonUtils/Stemming) 
 15                  Range.py (CommonUtils) 
 16  """ 
 17   
 18  import Utils.Libraries.PorterStemmer as stemmer 
 19  import Utils.Range as Range 
 20  import sys 
 21           
 23      """  
 24      Represents a single node (either token or dependency) in the parse graph 
 25      or a path generated by a ParseGraph-object. 
 26      """ 
 28          self.isDependency = isDependency 
 29           
 30          self.pos = None 
 31          self.text = None 
 32          self.id = None 
 33          self.charOffset = None 
 34          self.dependencies = []  
 35          self.entities = []  
 36           
 37          self.dependencyType = None 
 38          self.fro = None  
 39          self.to = None 
  40       
 41 -    def toString(self, showPos=False, highlight=False): 
  42          string = "" 
 43          if self.isDependency: 
 44              string = "-" + str(self.dependencyType) + "-" 
 45          else: 
 46              if showPos: 
 47                  string = "[" + self.pos + "]" 
 48              else: 
 49                  string = "[" + self.text + "]" 
 50          if highlight: 
 51              string = "{" + string[1:-1] + "}" 
 52          return string 
   53   
 55      """ 
 56      A ParseGraph-object consists of tokens and dependencies. A dependency connects 
 57      two tokens with two edges (token->dependency->token, where -> = edge). 
 58      """ 
 59 -    def __init__(self, tokenElements, dependencyElements, mergeDependencies=False): 
  60          self.tokensById, self.dependenciesById = self.buildParseGraph(tokenElements, dependencyElements, mergeDependencies) 
  61       
 62       
 63       
 64       
 65   
 66 -    def buildParseGraph(self, tokenElements, dependencyElements, mergeDependencies=False): 
  67          """ Returns dictionaries containing tokens and dependencies 
 68          of the graph generated from ElementTree-elements. 
 69          """ 
 70          tokensById = {} 
 71          dependenciesById = {} 
 72          for tokenElement in tokenElements: 
 73              node = ParseGraphNode() 
 74              node.id = int(tokenElement.attrib["id"].split("_")[1]) 
 75              node.pos = tokenElement.attrib["POS"] 
 76              node.text = tokenElement.attrib["text"] 
 77              charFrom, charTo = tokenElement.attrib["charOffset"].split("-") 
 78              node.charOffset = (int(charFrom), int(charTo)) 
 79              tokensById[node.id] = node 
 80       
 81           
 82          dependencyIndex = len(tokensById) + 99 
 83          if mergeDependencies: 
 84              dependenciesByFroAndTo = {} 
 85          for dependencyElement in dependencyElements: 
 86              dependency = ParseGraphNode(True) 
 87              dependency.dependencyType = dependencyElement.attrib["type"] 
 88              dependency.fro = tokensById[int(dependencyElement.attrib["t1"].split("_")[1])] 
 89              dependency.to = tokensById[int(dependencyElement.attrib["t2"].split("_")[1])] 
 90               
 91              if mergeDependencies: 
 92                  key = (dependency.fro.id, dependency.to.id)  
 93                  if dependenciesByFroAndTo.has_key(key): 
 94                      if not type(dependenciesByFroAndTo[key].dependencyType) == types.ListType: 
 95                          dependenciesByFroAndTo[key].dependencyType = [dependenciesByFroAndTo[key].dependencyType] 
 96                      dependenciesByFroAndTo[key].dependencyType.append(dependency.dependencyType) 
 97                  else: 
 98                      dependenciesByFroAndTo[key] = dependency 
 99                      tokensById[dependency.fro.id].dependencies.append(dependency) 
100                      tokensById[dependency.to.id].dependencies.append(dependency) 
101                      dependency.id = dependencyIndex 
102                      assert( not dependenciesById.has_key(dependency.id) ) 
103                      dependenciesById[dependency.id] = dependency 
104              else: 
105                  tokensById[dependency.fro.id].dependencies.append(dependency) 
106                  tokensById[dependency.to.id].dependencies.append(dependency) 
107                   
108                   
109                  dependency.id = dependencyIndex  
110                  assert( not dependenciesById.has_key(dependency.id) ) 
111                  dependenciesById[dependency.id] = dependency 
112              dependencyIndex += 1 
113           
114          return tokensById, dependenciesById 
 115   
116       
117       
118       
119       
121          """ Marks tokens belonging to named entities 
122          """ 
123          namedEntityTokens = [] 
124          for entityElement in entityElements: 
125              offsets = [] 
126              offsetStrings = entityElement.attrib["charOffset"].split(",") 
127              for offsetString in offsetStrings: 
128                  charFrom, charTo = offsetString.split("-") 
129                  offset = (int(charFrom), int(charTo)) 
130                  offsets.append(offset) 
131              for k,v in self.tokensById.iteritems(): 
132                  for offset in offsets: 
133                      if Range.overlap(offset, v.charOffset): 
134                          v.entities.append(entityElement.attrib["id"]) 
135                          namedEntityTokens.append(v.id) 
136          return namedEntityTokens 
 137   
139          """ Returns the ids of all tokens in specified named entities 
140          """ 
141          tokenIds = [] 
142          for key, node in self.tokensById.iteritems(): 
143              for id in namedEntityIds: 
144                  if id in node.entities: 
145                      tokenIds.append(node.id) 
146          return tokenIds 
 147       
148 -    def getTokenIdsByText(self, texts, lookInsideNamedEntities=False): 
 149          """ Returns the ids of all tokens whose text attribute can be 
150          found in the list texts. Can be used f.e. detecting interaction 
151          words. 
152          """ 
153          matchingTokens = [] 
154          for node in self.tokensById.values(): 
155              if len(node.entities) > 0 and not lookInsideNamedEntities: 
156                  continue 
157              if node.text.lower() in texts: 
158                  matchingTokens.append(node.id) 
159              elif node.text.find("-") != -1:  
160                  tempText = node.text.rsplit("-",1)[1] 
161                  if tempText.lower() in texts: 
162                      matchingTokens.append(node.id) 
163          return matchingTokens 
 164       
166          for token in self.tokensById.values(): 
167              token.stem = stemmer.stem(token.text) 
 168       
169       
170       
171       
172       
174          """ Initializes the graph structure used by NetworkX. 
175          Called automatically when needed. 
176          """ 
177          import networkx as NX 
178          self.nXGraph = NX.Graph() 
179          for token in self.tokensById.values(): 
180              self.nXGraph.add_node(token.id) 
181          for dependency in self.dependenciesById.values():  
182              self.nXGraph.add_node(dependency.id) 
183              self.nXGraph.add_edge((dependency.fro.id,dependency.id)) 
184              self.nXGraph.add_edge((dependency.id,dependency.to.id)) 
 185   
187          """ Build shortest path using NetworkX 
188          """ 
189          import networkx as NX 
190          if (not hasattr(self,"nXGraph")) or self.nXGraph == None: 
191              self.buildNXGraph() 
192          path = NX.shortest_path(self.nXGraph, startTokenId, endTokenId) 
193          if path == False: 
194              return [] 
195          isTokenPhase = True 
196          for i in range(len(path)): 
197              if isTokenPhase: 
198                  path[i] = self.tokensById[path[i]] 
199              else: 
200                  path[i] = self.dependenciesById[path[i]] 
201              isTokenPhase = not isTokenPhase 
202          return path 
  203