
Source Code for Module TEES.ExampleBuilders.FeatureBuilders.GraphKernelFeatureBuilder

from FeatureBuilder import FeatureBuilder
import numpy
#from numpy import *
import numpy.linalg
import networkx as NX
import copy
import sys
sys.path.append("../..")
import Core.ExampleUtils as ExampleUtils

def getHexColor(red, green, blue):
    """ convert an (R, G, B) tuple to #RRGGBB """
    # '%02x' means zero-padded, 2-digit hex values
    hexcolor = '#%02x%02x%02x' % (int(red), int(green), int(blue))
    return hexcolor

def getColorFromBRGSpectrum(value, minVal=0.0, maxVal=1.0):
    # Map a value in [minVal, maxVal] to a blue -> red -> green gradient
    span = maxVal - minVal
    pos = value / span
    spanHalf = span / 2.0
    blue = max((spanHalf - value)/spanHalf, 0.0) * 255
    red = max((spanHalf - abs(value-spanHalf))/spanHalf, 0.0) * 255
    green = max((value-spanHalf)/spanHalf, 0.0) * 255
    return getHexColor(red, green, blue)

def adjacencyMatrixToHtml(matrix, labels, filename):
    from HtmlBuilder import HtmlBuilder
    h = HtmlBuilder()
    h.newPage("test","")

    h.header("Adjacency Matrix", 3)
    h.table(1)
    rows, columns = matrix.shape
    h.tableRow() # title row
    h.tableData(None, True) # corner cell
    for i in range(columns):
        h.tableData(None, False)
        h.span( str(i), "font-size:smaller;font-weight:bold" )
        h.closeElement() # tableData
    h.closeElement() # title row

    for i in range(rows):
        h.tableRow()
        h.tableData(None, False)
        h.span( str(i), "font-size:smaller;font-weight:bold" )
        h.closeElement() # tableData
        for j in range(columns):
            h.tableData(None, False)
            if matrix[i,j] != 0.0:
                style = "font-size:smaller;background-color:" + getColorFromBRGSpectrum(matrix[i,j]) #00FF00"
                h.span( str(matrix[i,j])[0:4], style )
            else:
                style = "font-size:smaller"
                h.span( "0", style )
            h.closeElement() # tableData
        h.closeElement() # tableRow

    h.closeElement() # table

    h.header("Legend", 4)
    h.table(1)
    h.tableRow()
    h.tableData(None, False)
    h.span( "0.0", "font-size:smaller" )
    h.closeElement() # tableData
    i = 0.1
    while i <= 1.0:
        h.tableData(None, False)
        h.span( str(i), "font-size:smaller;background-color:" + getColorFromBRGSpectrum(i) )
        h.closeElement() # tableData
        i += 0.1
    h.closeElement() # tableRow
    h.closeElement() # table

    if labels != None:
        h.header("Labels", 3)
        for i in range(len(labels)):
            string = str(i) + ": "
            first = True
            for label in labels[i]:
                if not first:
                    string += ", "
                string += label
                first = False
            h.span(string)
            h.lineBreak()

    h.write(filename)

class GraphKernelFeatureBuilder(FeatureBuilder):
    def __init__(self, featureSet):
        FeatureBuilder.__init__(self, featureSet)

    def buildGraphKernelFeatures(self, sentenceGraph, path):
        edgeList = []
        depGraph = sentenceGraph.dependencyGraph
        pt = path
        for i in range(1, len(path)):
            edgeList.extend(depGraph.getEdges(pt[i], pt[i-1]))
            edgeList.extend(depGraph.getEdges(pt[i-1], pt[i]))
        edges = edgeList
        adjacencyMatrix, labels = self._buildAdjacencyMatrix(sentenceGraph, path, edges)
        node_count = 2*len(sentenceGraph.tokens) + len(sentenceGraph.dependencies)

        if sentenceGraph.sentenceElement.attrib["id"] == "LLL.d0.s0":
            adjacencyMatrixToHtml(adjacencyMatrix, labels, "LLL.d0.s0_adjacency_matrix.html")

        allPathsMatrix = self._prepareMatrix(adjacencyMatrix, node_count)
        self._matrixToFeatures(allPathsMatrix, labels)
        if sentenceGraph.sentenceElement.attrib["id"] == "LLL.d0.s0":
            adjacencyMatrixToHtml(allPathsMatrix, labels, "LLL.d0.s0_all_paths_matrix.html")
            commentLines = []
            commentLines.extend(self.featureSet.toStrings())
            example = ["example_"+self.entity1.attrib["id"]+"_"+self.entity2.attrib["id"], "unknown", self.features]
            ExampleUtils.writeExamples([example], "LLL.d0.s0_example.txt", commentLines)
        #sys.exit("Debug files created")

    def _matrixToFeatures(self, W, labels):
        """Linearizes the representation of the graph"""
        #proteins = set(["PROTEIN1", "PROTEIN2", "$$PROTEIN1", "$$PROTEIN2"])
        linear = {}
        for i in range(W.shape[0]):
            for j in range(W.shape[1]):
                if W[i,j] > 0.00001: #i != j and W[i,j] > 0.3: #0.00001:
                    for label1 in labels[i]:
                        if (not "punct" in labels[i]) and (not "punct" in labels[j]):
                            for label2 in labels[j]:
                                #if label1 in proteins or label2 in proteins:
                                label = label1+"_$_"+label2
                                self.features[self.featureSet.getId(label)] = W[i,j]

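    # Illustrative note (editorial sketch, not part of the original module,
    # with hypothetical labels): _matrixToFeatures turns every sufficiently
    # large cell of the all-paths matrix W into one feature per label pair.
    # For example, if W[i,j] == 0.45, labels[i] == set(["sp_txt_binds"]) and
    # labels[j] == set(["nsubj"]), the feature "sp_txt_binds_$_nsubj" gets
    # the value 0.45. Pairs where either node carries a "punct" label are
    # skipped.
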
    def _prepareMatrix(self, adjacencyMatrix, node_count, dtyp=numpy.float64):
        W = adjacencyMatrix * -1.0
        # W = adjacencyMatrix
        # for i in range(adjacencyMatrix.shape[0]):
        #     for j in range(adjacencyMatrix.shape[1]):
        #         adjacencyMatrix[i,j] *= -1.0
        W += numpy.mat(numpy.identity(node_count, dtype = dtyp))
        return numpy.linalg.inv(W) - numpy.mat(numpy.identity(node_count, dtype=dtyp))

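    # Editorial sketch (not part of the original module): with A the weighted
    # adjacency matrix, _prepareMatrix returns inv(I - A) - I, which equals
    # the Neumann series A + A^2 + A^3 + ... when the edge weights are small
    # enough for it to converge. Entry [i, j] therefore sums, over all walks
    # from node i to node j, the product of the edge weights along the walk.
    # Toy 3-node chain with 0.5-weighted edges:
    #   A = numpy.mat([[0.0, 0.5, 0.0],
    #                  [0.0, 0.0, 0.5],
    #                  [0.0, 0.0, 0.0]])
    #   numpy.linalg.inv(numpy.identity(3) - A) - numpy.identity(3)
    #   -> matrix([[ 0.  ,  0.5 ,  0.25],
    #              [ 0.  ,  0.  ,  0.5 ],
    #              [ 0.  ,  0.  ,  0.  ]])
    # The [0, 2] entry 0.25 = 0.5 * 0.5 is the weight of the two-step walk.
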
    def _getTokenId(self, tokenElement):
        """ Returns the position id of the token """
        return int(tokenElement.attrib["id"].split("_")[1])

    def _getTokenText(self, path, sentenceGraph, token):
        tokenText = sentenceGraph.getTokenText(token)
        if tokenText == "NAMED_ENT":
            if token == path[0]:
                tokenText = "NAMED_ENT_1"
            elif token == path[-1]:
                tokenText = "NAMED_ENT_2"
        return tokenText

    def _buildAdjacencyMatrix(self, sentenceGraph, path, edges, floattype=numpy.float64, directed=True, linearOrderWeight=0.9):
        """ Returns a Numpy-matrix
        """
        # For each token, 2 nodes are allocated. For each dependency, one node is allocated
        node_count = 2*len(sentenceGraph.tokens) + len(sentenceGraph.dependencies)
        # Make the adjacency matrix of the graph
        adjMatrix = numpy.mat(numpy.zeros((node_count,node_count), dtype = floattype))
        # A dictionary of labels is associated with each node
        labels = [set([]) for x in range(node_count)]
        # The word nodes have indices 0..2*len(tokens), the dependency nodes have the rest of the indices.
        dep_indices = range(2*len(sentenceGraph.tokens), node_count)

        # Calculate dependency weights
        weightByDependency = {}
        self._setAllDependencyWeights(sentenceGraph, weightByDependency, 0.3)
        self._setDependencyWeightsByPath(edges, weightByDependency, 0.9)
        self._reduceWeightByDistance(sentenceGraph, weightByDependency)

        # Build dependency types
        allEdges = edges #self._getEdgeList(edges)

        # For each dependency
        depEdgePairs = []
        depGraphEdges = sentenceGraph.dependencyGraph.edges #()
        for dependency in sentenceGraph.dependencies:
            for edge in depGraphEdges:
                if edge[2] == dependency:
                    depEdgePairs.append( (dependency, edge) )
                    depGraphEdges.remove(edge)

        for depPair, index in zip(depEdgePairs, dep_indices):
            dep = depPair[1]
            # Token1-dependency, and dependency-token2 weights are added
            adjMatrix[self._getTokenId(dep[0])-1, index] = weightByDependency[dep[2]]
            adjMatrix[index, self._getTokenId(dep[1])-1] = weightByDependency[dep[2]]
            # For undirected graphs, the links would also go the other way
            if not directed:
                adjMatrix[self._getTokenId(dep[1])-1, index] = weightByDependency[dep[2]]
                adjMatrix[index, self._getTokenId(dep[0])-1] = weightByDependency[dep[2]]

            # if type(dep.ppiType) == types.ListType:
            #     for i in dep.ppiType:
            #         labels[index].add(i)
            # else:
            #     labels[index].add(dep.ppiType)
            if dep in allEdges:
                labels[index].add("sp_" + dep[2].attrib["type"])
            else:
                labels[index].add(dep[2].attrib["type"])

        # Add the linear order of the sentence to the matrix
        for i in range(len(sentenceGraph.tokens), 2*len(sentenceGraph.tokens)-1):
            adjMatrix[i,i+1] = linearOrderWeight
            if not directed:
                adjMatrix[i+1,i] = linearOrderWeight

        # For each token
        #preTagByToken = self._addPositionTags(sentenceGraph, sentenceGraph.entitiesByToken[path[0]], sentenceGraph.entitiesByToken[path[-1]])
        preTagByToken = self._addPositionTags(sentenceGraph, [path[0]], [path[-1]])
        for node in sentenceGraph.tokens:
            index = self._getTokenId(node) - 1
            # use the same approach as in MultiEdgeFeatureBuilder
            features = self.getTokenFeatures(node, sentenceGraph)
            if "txt_NAMED_ENT" in features:
                if self.entity1 in sentenceGraph.tokenIsEntityHead[node]:
                    features.remove("txt_NAMED_ENT")
                    features.append("txt_NAMED_ENT_1")
                elif self.entity2 in sentenceGraph.tokenIsEntityHead[node]:
                    features.remove("txt_NAMED_ENT")
                    features.append("txt_NAMED_ENT_2")
            if "noAnnType" in features:
                features.remove("noAnnType")

            # apply labels
            if node in path: # shortest path
                for feature in features:
                    labels[index].add("sp_"+feature)
                #labels[index].add("sp_"+self._getTokenText(path, sentenceGraph, node))
                #labels[index].add("sp_"+node.attrib["POS"])
            else:
                for feature in features:
                    labels[index].add(feature)
                #labels[index].add(self._getTokenText(path, sentenceGraph, node))
                #labels[index].add(node.attrib["POS"])
            # for code in node.metamapCodes:
            #     labels[index].add(code)
            # if node.isPPIInteraction:
            #     labels[index].add("1Nt3R4Ct")
            if preTagByToken.has_key(node):
                preTag = preTagByToken[node]
                for feature in features:
                    labels[index].add(preTag+feature)
                #labels[len(sentenceGraph.tokens)+index].add(preTag+self._getTokenText(path, sentenceGraph, node))
                #labels[len(sentenceGraph.tokens)+index].add(preTag+node.attrib["POS"])
                # for code in node.metamapCodes:
                #     labels[len(tokensById)+index].add(preTag+code)
                # if node.isPPIInteraction:
                #     labels[len(tokensById)+index].add(preTag+"1Nt3R4Ct")

        return adjMatrix, labels

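    # Worked example of the node layout above (editorial note, not part of
    # the original module): for a sentence with 4 tokens and 3 dependencies,
    # node_count = 2*4 + 3 = 11. Indices 0..3 are the token nodes that carry
    # the token labels, indices 4..7 form a second chain of token nodes
    # connected left-to-right with linearOrderWeight, and indices 8..10 hold
    # one node per dependency, linked between its governor and dependent
    # token nodes.
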
    def _setAllDependencyWeights(self, sentenceGraph, weights, weight):
        """ All weights are set to the given value
        """
        for node in sentenceGraph.dependencies:
            weights[node] = weight

    def _getEdgeList(self, edgeDict):
        allEdges = []
        if edgeDict != None:
            keys1 = edgeDict.keys()
            keys1.sort()
            for k1 in keys1:
                keys2 = edgeDict[k1].keys()
                keys2.sort()
                for k2 in keys2:
                    allEdges.extend(edgeDict[k1][k2])
        return allEdges

    def _setDependencyWeightsByPath(self, edges, weights, weight):
        """ The weights of all dependencies in specified paths are set to the
        given value
        """
        allEdges = edges #self._getEdgeList(edges)

        for edge in allEdges:
            assert(weights.has_key(edge[2]))
            weights[edge[2]] = weight

    def _reduceWeightByDistance(self, sentenceGraph, weights, zeroDistanceThreshold=0.9, reduceFactor=0.5):
        """ Reduces the weight of dependencies based on their distance
        from the nearest dependency whose weight is >= the threshold.
        """
        undirected = sentenceGraph.dependencyGraph.toUndirected() #.to_undirected()
        edges = undirected.edges
        tempGraph = NX.Graph(directed=False)
        for edge in edges:
            tempGraph.add_edge(edge[0], edge[1])
        tokenDistanceDict = NX.all_pairs_shortest_path_length(tempGraph, cutoff=999)
        dependencyDistances = {}

        zeroDistanceEdges = []
        for edge in edges:
            if weights[edge[2]] >= zeroDistanceThreshold:
                zeroDistanceEdges.append(edge)
                dependencyDistances[edge[2]] = 0

        # Cannot reduce weight if no node is over threshold
        if len(zeroDistanceEdges) == 0:
            return

        # Calculate distances
        for edge in edges:
            if edge in zeroDistanceEdges:
                continue
            shortestDistance = 99
            for zeroDistanceEdge in zeroDistanceEdges:
                if tokenDistanceDict.has_key(edge[0]):
                    if tokenDistanceDict[edge[0]].has_key(zeroDistanceEdge[0]):
                        if tokenDistanceDict[ edge[0] ][ zeroDistanceEdge[0] ] < shortestDistance:
                            shortestDistance = tokenDistanceDict[ edge[0] ][ zeroDistanceEdge[0] ]
                    if tokenDistanceDict[edge[0]].has_key(zeroDistanceEdge[1]):
                        if tokenDistanceDict[ edge[0] ][ zeroDistanceEdge[1] ] < shortestDistance:
                            shortestDistance = tokenDistanceDict[ edge[0] ][ zeroDistanceEdge[1] ]
                if tokenDistanceDict.has_key(edge[1]):
                    if tokenDistanceDict[edge[1]].has_key(zeroDistanceEdge[0]):
                        if tokenDistanceDict[ edge[1] ][ zeroDistanceEdge[0] ] < shortestDistance:
                            shortestDistance = tokenDistanceDict[ edge[1] ][ zeroDistanceEdge[0] ]
                    if tokenDistanceDict[edge[1]].has_key(zeroDistanceEdge[1]):
                        if tokenDistanceDict[ edge[1] ][ zeroDistanceEdge[1] ] < shortestDistance:
                            shortestDistance = tokenDistanceDict[ edge[1] ][ zeroDistanceEdge[1] ]
            #assert(not dependencyDistances.has_key(edge[2]))
            dependencyDistances[edge[2]] = shortestDistance + 1

        # Reduce weight
        for dependency in sentenceGraph.dependencies:
            if not dependencyDistances.has_key(dependency):
                dependencyDistances[dependency] = 99
            weights[dependency] *= pow(reduceFactor, max(dependencyDistances[dependency] - 1, 0))

#    def setPPIPrefixForDependencies(self, sentenceGraph, weightByDependency, prefix, threshold):
#        """ Sets the dependencies ppiType to their dependencyType,
#        and adds a prefix if their weight is over a given threshold
#        """
#        for dependency in sentenceGraph.dependencies:
#            if weightByDependency[dependency] >= threshold:
#                dependency.to.isOnShortestPath = True
#                dependency.fro.isOnShortestPath = True
#                if type(dependency.dependencyType) == types.ListType:
#                    dependency.ppiType = []
#                    for i in range(len(dependency.dependencyType)):
#                        dependency.ppiType.append(prefix + dependency.dependencyType[i])
#                else:
#                    dependency.ppiType = prefix + dependency.dependencyType
#            else:
#                if type(dependency.dependencyType) == types.ListType:
#                    dependency.ppiType = []
#                    for i in range(len(dependency.dependencyType)):
#                        dependency.ppiType.append(dependency.dependencyType[i])
#                else:
#                    dependency.ppiType = dependency.dependencyType

    def _addPositionTags(self, sentenceGraph, entity1Tokens, entity2Tokens):
        """ Sets a prefix to the tokens ppiText based on their linear
        order in the sentence.
        """
        entity1TokenIds = []
        for token in entity1Tokens:
            entity1TokenIds.append(self._getTokenId(token))
        entity2TokenIds = []
        for token in entity2Tokens:
            entity2TokenIds.append(self._getTokenId(token))
        entity1FirstTokenId = min(entity1TokenIds)
        entity2LastTokenId = max(entity2TokenIds)

        preTagByToken = {}
        for token in sentenceGraph.tokens:
            pretag = "$$"
            tokenId = self._getTokenId(token)
            if not (tokenId in entity1TokenIds or tokenId in entity2TokenIds):
                if tokenId < entity1FirstTokenId:
                    pretag = "$B$"
                elif tokenId > entity2LastTokenId:
                    pretag = "$A$"
                preTagByToken[token] = pretag

        return preTagByToken

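The snippet below is an editorial sketch, not part of TEES: it reproduces the core graph-kernel computation (the all-paths matrix of _prepareMatrix and the label-pair linearization of _matrixToFeatures) with plain NumPy on a hypothetical three-node graph, so the kind of feature values produced above can be checked by hand. The node labels and edge weights are made up for illustration.

import numpy

# Hypothetical toy graph: three nodes with one label each and two 0.9-weighted edges.
labels = [set(["sp_txt_protA"]), set(["sp_nsubj"]), set(["sp_txt_binds"])]
adjacency = numpy.mat([[0.0, 0.9, 0.0],
                       [0.0, 0.0, 0.9],
                       [0.0, 0.0, 0.0]])

# All-paths matrix, as in _prepareMatrix: inv(I - A) - I.
identity = numpy.mat(numpy.identity(3))
allPaths = numpy.linalg.inv(identity - adjacency) - identity

# Linearization, as in _matrixToFeatures: one feature per label pair.
features = {}
for i in range(allPaths.shape[0]):
    for j in range(allPaths.shape[1]):
        if allPaths[i, j] > 0.00001:
            for label1 in labels[i]:
                for label2 in labels[j]:
                    features[label1 + "_$_" + label2] = allPaths[i, j]

print(features)
# Prints (up to floating-point formatting):
# {'sp_txt_protA_$_sp_nsubj': 0.9, 'sp_nsubj_$_sp_txt_binds': 0.9,
#  'sp_txt_protA_$_sp_txt_binds': 0.81}

The 0.81 value for the indirect pair is the product of the two 0.9 edge weights along the only walk connecting those nodes, which is exactly what the all-paths matrix accumulates for every node pair.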