
Source Code for Module TEES.ExampleBuilders.EntityExampleBuilder

  1  """ 
  2  Trigger examples 
  3  """ 
  4  __version__ = "$Revision: 1.34 $" 
  5   
  6  import sys, os 
  7  thisPath = os.path.dirname(os.path.abspath(__file__)) 
  8  sys.path.append(os.path.abspath(os.path.join(thisPath,".."))) 
  9  from ExampleBuilder import ExampleBuilder 
 10  import Utils.Libraries.PorterStemmer as PorterStemmer 
 11  from Core.IdSet import IdSet 
 12  import Core.ExampleUtils as ExampleUtils 
 13  #from Core.Gazetteer import Gazetteer 
 14  from FeatureBuilders.RELFeatureBuilder import RELFeatureBuilder 
 15  from FeatureBuilders.WordNetFeatureBuilder import WordNetFeatureBuilder 
 16  from FeatureBuilders.GiulianoFeatureBuilder import GiulianoFeatureBuilder 
 17  import PhraseTriggerExampleBuilder 
 18  import Utils.InteractionXML.ResolveEPITriggerTypes 
 19   
class EntityExampleBuilder(ExampleBuilder):
    def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None, skiplist=None):
        if classSet == None:
            classSet = IdSet(1)
        assert( classSet.getId("neg") == 1 )
        if featureSet == None:
            featureSet = IdSet()

        ExampleBuilder.__init__(self, classSet, featureSet)
        #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
        if gazetteerFileName != None:
            self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
            print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
        else:
            print >> sys.stderr, "No gazetteer loaded"
            self.gazetteer = None
        self._setDefaultParameters(["rel_features", "wordnet", "bb_features", "giuliano",
                                    "epi_merge_negated", "limit_merged_types", "genia_task1",
                                    "build_for_nameless", "pos_only", "all_tokens",
                                    "names", "pos_pairs", "linear_ngrams", "phospho"])
        self.styles = self.getParameters(style)
#        if "selftrain_group" in self.styles:
#            self.selfTrainGroups = set()
#            if "selftrain_group-1" in self.styles:
#                self.selfTrainGroups.add("-1")
#            if "selftrain_group0" in self.styles:
#                self.selfTrainGroups.add("0")
#            if "selftrain_group1" in self.styles:
#                self.selfTrainGroups.add("1")
#            if "selftrain_group2" in self.styles:
#                self.selfTrainGroups.add("2")
#            if "selftrain_group3" in self.styles:
#                self.selfTrainGroups.add("3")
#            print >> sys.stderr, "Self-train-groups:", self.selfTrainGroups

        self.skiplist = set()
        if skiplist != None:
            f = open(skiplist, "rt")
            for line in f.readlines():
                self.skiplist.add(line.strip())
            f.close()

        if self.styles["rel_features"]:
            self.relFeatureBuilder = RELFeatureBuilder(featureSet)
        if self.styles["wordnet"]:
            self.wordNetFeatureBuilder = WordNetFeatureBuilder(featureSet)
        if self.styles["bb_features"]:
            self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens()
            #self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens(PhraseTriggerExampleBuilder.getBacteriaNames())
        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)

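    # A minimal construction sketch (illustrative, not part of the original
    # module). Style flags are the parameter names registered above via
    # _setDefaultParameters; in TEES they are typically passed as a
    # colon-separated string (an assumption here) and parsed by
    # ExampleBuilder.getParameters into the self.styles dict:
    #
    #   builder = EntityExampleBuilder(style="wordnet:linear_ngrams")
    #   builder.styles["wordnet"]       # evaluates true
    #   builder.styles["rel_features"]  # evaluates false (not requested)
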
    def getMergedEntityType(self, entities):
        """
        If a single token belongs to multiple entities of different types,
        a new, composite type is defined. This type is the alphabetically
        ordered types of these entities joined with '---'.
        """
        types = set()
        entityIds = set()
        for entity in entities:
            if entity.get("isName") == "True" and self.styles["all_tokens"]:
                continue
            if entity.get("type") == "Entity" and self.styles["genia_task1"]:
                continue
            if self.styles["epi_merge_negated"]:
                types.add(Utils.InteractionXML.ResolveEPITriggerTypes.getEPIBaseType(entity.get("type")))
                entityIds.add(entity.get("id"))
            else:
                types.add(entity.get("type"))
                entityIds.add(entity.get("id"))
        types = list(types)
        types.sort()
        typeString = ""
        for type in types:
            #if type == "Protein" and "all_tokens" in self.styles:
            #    continue
            if typeString != "":
                typeString += "---"
            typeString += type

        if typeString == "":
            return "neg", None

        idString = "/".join(sorted(list(entityIds)))

        if self.styles["limit_merged_types"]:
            if typeString.find("---") != -1:
                if typeString == "Gene_expression---Positive_regulation":
                    return typeString, idString
                else:
                    return typeString.split("---")[0], idString # ids partially incorrect
            else:
                return typeString, idString
        return typeString, idString

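    # Illustration of the merge convention above (a standalone sketch, not
    # called by this module): a token heading both a Gene_expression and a
    # Positive_regulation entity gets the sorted, '---'-joined composite type.
    #
    #   >>> "---".join(sorted(set(["Positive_regulation", "Gene_expression"])))
    #   'Gene_expression---Positive_regulation'
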
    def getTokenFeatures(self, token, sentenceGraph):
        """
        Returns a list of features based on the attributes of a token.
        These can be used to define more complex features.
        """
        # These features are cached when this method is first called
        # for a token.
        if self.tokenFeatures.has_key(token):
            return self.tokenFeatures[token], self.tokenFeatureWeights[token]
        tokTxt = sentenceGraph.getTokenText(token)
        features = {}
        features["_txt_"+tokTxt] = 1
        features["_POS_"+token.get("POS")] = 1
        if sentenceGraph.tokenIsName[token] and not self.styles["names"]:
            features["_isName"] = 1
            for entity in sentenceGraph.tokenIsEntityHead[token]:
                if entity.get("isName") == "True":
                    features["_annType_"+entity.get("type")] = 1
#        # Filip's gazetteer based features (can be used separately from exclude_gazetteer)
#        if "gazetteer_features" in self.styles:
#            tokTxtLower = tokTxt.lower()
#            if "stem_gazetteer" in self.styles:
#                tokTxtLower = PorterStemmer.stem(tokTxtLower)
#            if self.gazetteer and tokTxtLower in self.gazetteer:
#                for label,weight in self.gazetteer[tokTxtLower].items():
#                    features["_knownLabel_"+label]=weight # 1 performs slightly worse
        ## BANNER features
        #if sentenceGraph.entityHintsByToken.has_key(token):
        #    features["BANNER-entity"] = 1
        # Wordnet features
        #if "wordnet" in self.styles:
        #    for wordNetFeature in self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, token.get("POS")):
        #        features["_WN_"+wordNetFeature] = 1
        self.tokenFeatures[token] = sorted(features.keys())
        self.tokenFeatureWeights[token] = features
        return self.tokenFeatures[token], self.tokenFeatureWeights[token]

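    # Shape of the cached return value (hypothetical token text "binds" with
    # POS tag "VBZ"):
    #
    #   tokenFeatures       -> ["_POS_VBZ", "_txt_binds"]        (sorted names)
    #   tokenFeatureWeights -> {"_txt_binds": 1, "_POS_VBZ": 1}  (name -> weight)
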
    def buildLinearOrderFeatures(self, sentenceGraph, index, tag, features):
        """
        Linear features are built by marking token features with a tag
        that defines their relative position in the linear order.
        """
        tag = "linear_" + tag
        tokenFeatures, tokenFeatureWeights = self.getTokenFeatures(sentenceGraph.tokens[index], sentenceGraph)
        for tokenFeature in tokenFeatures:
            features[self.featureSet.getId(tag+tokenFeature)] = tokenFeatureWeights[tokenFeature]

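    # Example of the resulting feature names (illustrative): for the token one
    # position to the left (tag "-1") with token features "_txt_binds" and
    # "_POS_VBZ", the registered names are "linear_-1_txt_binds" and
    # "linear_-1_POS_VBZ".
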
    def buildLinearNGram(self, i, j, sentenceGraph, features):
        ngram = "ngram"
        for index in range(i, j+1):
            ngram += "_" + sentenceGraph.getTokenText(sentenceGraph.tokens[index]).lower()
        features[self.featureSet.getId(ngram)] = 1

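    # Example (illustrative tokens): for sentence tokens ["NF-kappaB", "binds"]
    # and the bigram call buildLinearNGram(0, 1, ...), the single registered
    # feature name is "ngram_nf-kappab_binds".
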
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
        """
        Build one example for each token of the sentence.
        """
        if sentenceGraph.sentenceElement.get("origId") in self.skiplist:
            print >> sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get("origId")
            return 0 #[]

        #examples = []
        exampleIndex = 0

        self.tokenFeatures = {}
        self.tokenFeatureWeights = {}

        namedEntityHeadTokens = []
        if not self.styles["names"]:
            namedEntityCount = 0
            for entity in sentenceGraph.entities:
                if entity.get("isName") == "True": # known data which can be used for features
                    namedEntityCount += 1
            namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
            # NOTE!!! This will change the number of examples and omit
            # all triggers (positive and negative) from sentences which
            # have no NEs, possibly giving a too-optimistic performance
            # value. Such sentences can still have triggers from intersentence
            # interactions, but as such events cannot be recovered anyway,
            # looking for these triggers would be pointless.
            if namedEntityCount == 0 and not self.styles["build_for_nameless"]: # no names, no need for triggers
                return 0 #[]

            if self.styles["pos_pairs"]:
                namedEntityHeadTokens = self.getNamedEntityHeadTokens(sentenceGraph)
        else:
            for key in sentenceGraph.tokenIsName.keys():
                sentenceGraph.tokenIsName[key] = False

        bagOfWords = {}
        for token in sentenceGraph.tokens:
            text = "bow_" + token.get("text")
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
            if sentenceGraph.tokenIsName[token]:
                text = "ne_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1
        bowFeatures = {}
        for k in sorted(bagOfWords.keys()):
            bowFeatures[self.featureSet.getId(k)] = bagOfWords[k]

        self.inEdgesByToken = {}
        self.outEdgesByToken = {}
        self.edgeSetByToken = {}
        for token in sentenceGraph.tokens:
            #inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True)
            #fixedInEdges = []
            #for edge in inEdges:
            #    fixedInEdges.append( (edge[0], edge[1], edge[2]["element"]) )
            #inEdges = fixedInEdges
            inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
            #inEdges.sort(compareDependencyEdgesById)
            self.inEdgesByToken[token] = inEdges
            #outEdges = sentenceGraph.dependencyGraph.out_edges(token, data=True)
            #fixedOutEdges = []
            #for edge in outEdges:
            #    fixedOutEdges.append( (edge[0], edge[1], edge[2]["element"]) )
            #outEdges = fixedOutEdges
            outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
            #outEdges.sort(compareDependencyEdgesById)
            self.outEdgesByToken[token] = outEdges
            self.edgeSetByToken[token] = set(inEdges + outEdges)

        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]

            # CLASS
            if len(sentenceGraph.tokenIsEntityHead[token]) > 0:
                categoryName, entityIds = self.getMergedEntityType(sentenceGraph.tokenIsEntityHead[token])
            else:
                categoryName, entityIds = "neg", None
            self.exampleStats.beginExample(categoryName)

            # Recognize only non-named entities (i.e. interaction words)
            if sentenceGraph.tokenIsName[token] and not self.styles["names"] and not self.styles["all_tokens"]:
                self.exampleStats.filter("name")
                self.exampleStats.endExample()
                continue
#            if "selftrain_limits" in self.styles:
#                # any predicted entity not part of the self-training set causes example to be rejected
#                filtered = False
#                for entity in sentenceGraph.tokenIsEntityHead[token]:
#                    if entity.get("selftrain") == "False":
#                        self.exampleStats.filter("selftrain_limits")
#                        self.exampleStats.endExample()
#                        filtered = True
#                        break
#                if filtered:
#                    continue
#            if "selftrain_group" in self.styles:
#                # any predicted entity not part of the self-training set causes example to be rejected
#                filtered = False
#                for entity in sentenceGraph.tokenIsEntityHead[token]:
#                    if entity.get("selftraingroup") not in self.selfTrainGroups:
#                        self.exampleStats.filter("selftrain_group")
#                        self.exampleStats.endExample()
#                        filtered = True
#                        break
#                if filtered:
#                    continue
            if self.styles["pos_only"] and categoryName == "neg":
                self.exampleStats.filter("pos_only")
                self.exampleStats.endExample()
                continue

            category = self.classSet.getId(categoryName)
            if category == None:
                self.exampleStats.filter("undefined_class")
                self.exampleStats.endExample()
                continue

            tokenText = token.get("text").lower()
#            if "stem_gazetteer" in self.styles:
#                tokenText = PorterStemmer.stem(tokenText)
#            if ("exclude_gazetteer" in self.styles) and self.gazetteer and tokenText not in self.gazetteer:
#                features = {}
#                features[self.featureSet.getId("exclude_gazetteer")] = 1
#                extra = {"xtype":"token","t":token.get("id"),"excluded":"True"}
#                if entityIds != None:
#                    extra["goldIds"] = entityIds
#                #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
#                ExampleUtils.appendExamples([(sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)], outfile)
#                exampleIndex += 1
#                continue

            # FEATURES
            features = {}

            if not self.styles["names"]:
                features[self.featureSet.getId(namedEntityCountFeature)] = 1
            #for k,v in bagOfWords.iteritems():
            #    features[self.featureSet.getId(k)] = v
            # pre-calculated bow features
            features.update(bowFeatures)

#            for j in range(len(sentenceGraph.tokens)):
#                text = "bow_" + sentenceGraph.tokens[j].get("text")
#                if j < i:
#                    features[self.featureSet.getId("bf_" + text)] = 1
#                elif j > i:
#                    features[self.featureSet.getId("af_" + text)] = 1

            # Main features
            text = token.get("text")
            features[self.featureSet.getId("txt_"+text)] = 1
            features[self.featureSet.getId("POS_"+token.get("POS"))] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_"+stem)] = 1
            features[self.featureSet.getId("nonstem_"+text[len(stem):])] = 1

            # Normalized versions of the string (if same as non-normalized, overlap without effect)
            normalizedText = text.replace("-","").replace("/","").replace(",","").replace("\\","").replace(" ","").lower()
            if normalizedText == "bound": # should be for all irregular verbs
                normalizedText = "bind"
            features[self.featureSet.getId("txt_"+normalizedText)] = 1
            norStem = PorterStemmer.stem(normalizedText)
            features[self.featureSet.getId("stem_"+norStem)] = 1
            features[self.featureSet.getId("nonstem_"+normalizedText[len(norStem):])] = 1

            ## Subspan features
            #textLower = text.lower()
            #for i in range(1, len(textLower)):
            #    features[self.featureSet.getId("subspanbegin"+str(i)+"_"+textLower[0:i])] = 1
            #    features[self.featureSet.getId("subspanend"+str(i)+"_"+textLower[-i:])] = 1

            # Substring features
            for string in text.split("-"):
                stringLower = string.lower()
                features[self.featureSet.getId("substring_"+stringLower)] = 1
                features[self.featureSet.getId("substringstem_"+PorterStemmer.stem(stringLower))] = 1

            # Linear order features
            for index in [-3,-2,-1,1,2,3]:
                if i + index > 0 and i + index < len(sentenceGraph.tokens):
                    self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)

            # Linear n-grams
            if self.styles["linear_ngrams"]:
                self.buildLinearNGram(max(0, i-1), i, sentenceGraph, features)
                self.buildLinearNGram(max(0, i-2), i, sentenceGraph, features)

            if self.styles["phospho"]:
                if text.find("hospho") != -1:
                    features[self.featureSet.getId("phospho_found")] = 1
                features[self.featureSet.getId("begin_"+text[0:2].lower())] = 1
                features[self.featureSet.getId("begin_"+text[0:3].lower())] = 1

            if self.styles["bb_features"]:
                if text.lower() in self.bacteriaTokens:
                    features[self.featureSet.getId("lpsnBacToken")] = 1

            # Content
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j in range(len(text)):
                if j > 0 and text[j].isalpha() and text[j].isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1
                # numbers and special characters
                if text[j].isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j-1] == "-":
                        features[self.featureSet.getId("has_hyphenated_digit")] = 1
                elif text[j] == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif text[j] == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif text[j] == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1
                # duplets
                if j > 0:
                    features[self.featureSet.getId("dt_"+text[j-1:j+1].lower())] = 1
                # triplets
                if j > 1:
                    features[self.featureSet.getId("tt_"+text[j-2:j+1].lower())] = 1
                # quadruplets (don't work, slight decrease (0.5 pp) on f-score)
                #if j > 2:
                #    features[self.featureSet.getId("qt_"+text[j-3:j+1].lower())] = 1

            # Attached edges (hanging in- and out-edges)
            t1InEdges = self.inEdgesByToken[token]
            for edge in t1InEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HIn_"+edgeType)] = 1
                features[self.featureSet.getId("t1HIn_"+edge[0].get("POS"))] = 1
                features[self.featureSet.getId("t1HIn_"+edgeType+"_"+edge[0].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[0])
                features[self.featureSet.getId("t1HIn_"+tokenText)] = 1
                features[self.featureSet.getId("t1HIn_"+edgeType+"_"+tokenText)] = 1
                tokenStem = PorterStemmer.stem(tokenText)
                features[self.featureSet.getId("t1HIn_"+tokenStem)] = 1
                features[self.featureSet.getId("t1HIn_"+edgeType+"_"+tokenStem)] = 1
                features[self.featureSet.getId("t1HIn_"+norStem+"_"+edgeType+"_"+tokenStem)] = 1
            t1OutEdges = self.outEdgesByToken[token]
            for edge in t1OutEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HOut_"+edgeType)] = 1
                features[self.featureSet.getId("t1HOut_"+edge[1].get("POS"))] = 1
                features[self.featureSet.getId("t1HOut_"+edgeType+"_"+edge[1].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[1])
                features[self.featureSet.getId("t1HOut_"+tokenText)] = 1
                features[self.featureSet.getId("t1HOut_"+edgeType+"_"+tokenText)] = 1
                tokenStem = PorterStemmer.stem(tokenText)
                features[self.featureSet.getId("t1HOut_"+tokenStem)] = 1
                features[self.featureSet.getId("t1HOut_"+edgeType+"_"+tokenStem)] = 1
                features[self.featureSet.getId("t1HOut_"+norStem+"_"+edgeType+"_"+tokenStem)] = 1

            # REL features
            if self.styles["rel_features"]:
                self.relFeatureBuilder.setFeatureVector(features)
                self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, i)
                self.relFeatureBuilder.setFeatureVector(None)

            #self.wordNetFeatureBuilder.getTokenFeatures("show", "VBP")
            #tokTxt = token.get("text")
            #tokPOS = token.get("POS")
            #wordNetFeatures = []
            #wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
            #self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
            if self.styles["wordnet"]:
                tokTxt = token.get("text")
                tokPOS = token.get("POS")
                wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
                for wordNetFeature in wordNetFeatures:
                    #print wordNetFeature,
                    features[self.featureSet.getId("WN_"+wordNetFeature)] = 1
                #print

            if self.styles["giuliano"]:
                self.giulianoFeatureBuilder.setFeatureVector(features)
                self.giulianoFeatureBuilder.buildTriggerFeatures(token, sentenceGraph)
                self.giulianoFeatureBuilder.setFeatureVector(None)

            extra = {"xtype":"token","t":token.get("id")}
            if self.styles["bb_features"]:
                extra["trigex"] = "bb" # Request trigger extension in ExampleWriter
            if self.styles["epi_merge_negated"]:
                extra["unmergeneg"] = "epi" # Request trigger type unmerging
            if entityIds != None:
                extra["goldIds"] = entityIds # The entities to which this example corresponds
            #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )

            # chains
            self.buildChains(token, sentenceGraph, features)

            if self.styles["pos_pairs"]:
                self.buildPOSPairs(token, namedEntityHeadTokens, features)

            example = (sentenceGraph.getSentenceId()+".x"+str(exampleIndex), category, features, extra)
            ExampleUtils.appendExamples([example], outfile)
            exampleIndex += 1
            self.exampleStats.endExample()
        #return examples
        return exampleIndex

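    # Each example appended above is a 4-tuple in the format
    #
    #   ("<sentenceId>.x<exampleIndex>",            # example id
    #    category,                                  # class id from self.classSet
    #    {featureId: weight, ...},                  # feature vector
    #    {"xtype": "token", "t": "<tokenId>", ...}) # extra attributes
    #
    # (ids shown as placeholders; "goldIds", "trigex" and "unmergeneg" appear
    # in the extra dict only when the corresponding styles are active).
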
    def buildChains(self, token, sentenceGraph, features, depthLeft=3, chain="", visited=None):
        if depthLeft == 0:
            return
        strDepthLeft = "dist_" + str(depthLeft)

        if visited == None:
            visited = set()

        inEdges = self.inEdgesByToken[token]
        outEdges = self.outEdgesByToken[token]
        edgeSet = visited.union(self.edgeSetByToken[token])
        for edge in inEdges:
            if not edge in visited:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("dep_"+strDepthLeft+edgeType)] = 1

                nextToken = edge[0]
                tokenFeatures, tokenWeights = self.getTokenFeatures(nextToken, sentenceGraph)
                for tokenFeature in tokenFeatures:
                    features[self.featureSet.getId(strDepthLeft + tokenFeature)] = tokenWeights[tokenFeature]
#                for entity in sentenceGraph.tokenIsEntityHead[nextToken]:
#                    if entity.get("isName") == "True":
#                        features[self.featureSet.getId("name_dist_"+strDepthLeft)] = 1
#                        features[self.featureSet.getId("name_dist_"+strDepthLeft+entity.get("type"))] = 1
#                features[self.featureSet.getId("POS_dist_"+strDepthLeft+nextToken.get("POS"))] = 1
#                tokenText = sentenceGraph.getTokenText(nextToken)
#                features[self.featureSet.getId("text_dist_"+strDepthLeft+tokenText)] = 1

                if sentenceGraph.tokenIsName[nextToken] and not self.styles["names"]:
                    features[self.featureSet.getId("name_chain_dist_"+strDepthLeft+chain+"-frw_"+edgeType)] = 1
                features[self.featureSet.getId("chain_dist_"+strDepthLeft+chain+"-frw_"+edgeType)] = 1
                self.buildChains(nextToken,sentenceGraph,features,depthLeft-1,chain+"-frw_"+edgeType,edgeSet)

        for edge in outEdges:
            if not edge in visited:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("dep_dist_"+strDepthLeft+edgeType)] = 1

                nextToken = edge[1]
                tokenFeatures, tokenWeights = self.getTokenFeatures(nextToken, sentenceGraph)
                for tokenFeature in tokenFeatures:
                    features[self.featureSet.getId(strDepthLeft + tokenFeature)] = tokenWeights[tokenFeature]
#                for entity in sentenceGraph.tokenIsEntityHead[nextToken]:
#                    if entity.get("isName") == "True":
#                        features[self.featureSet.getId("name_dist_"+strDepthLeft)] = 1
#                        features[self.featureSet.getId("name_dist_"+strDepthLeft+entity.get("type"))] = 1
#                features[self.featureSet.getId("POS_dist_"+strDepthLeft+nextToken.get("POS"))] = 1
#                tokenText = sentenceGraph.getTokenText(nextToken)
#                features[self.featureSet.getId("text_dist_"+strDepthLeft+tokenText)] = 1
                if sentenceGraph.tokenIsName[nextToken] and not self.styles["names"]:
                    features[self.featureSet.getId("name_chain_dist_"+strDepthLeft+chain+"-rev_"+edgeType)] = 1

                features[self.featureSet.getId("chain_dist_"+strDepthLeft+chain+"-rev_"+edgeType)] = 1
                self.buildChains(nextToken,sentenceGraph,features,depthLeft-1,chain+"-rev_"+edgeType,edgeSet)

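    # Chain feature names grow by one "-frw_<depType>" or "-rev_<depType>"
    # segment per dependency hop (illustrative dependency types):
    #
    #   depth 3, first hop along an incoming "nsubj": "chain_dist_dist_3-frw_nsubj"
    #   depth 2, then an outgoing "dobj":             "chain_dist_dist_2-frw_nsubj-rev_dobj"
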
    def getNamedEntityHeadTokens(self, sentenceGraph):
        headTokens = []
        for entity in sentenceGraph.entities:
            if entity.get("isName") == "True": # known data which can be used for features
                headTokens.append(sentenceGraph.entityHeadTokenByEntity[entity])
        return headTokens

    def buildPOSPairs(self, token, namedEntityHeadTokens, features):
        tokenPOS = token.get("POS")
        assert tokenPOS != None
        for headToken in namedEntityHeadTokens:
            headPOS = headToken.get("POS")
            features[self.featureSet.getId("POS_pair_NE_"+tokenPOS+"-"+headPOS)] = 1
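
# A hypothetical end-to-end sketch (the real entry point is the TEES pipeline,
# which builds SentenceGraph objects from Interaction XML and manages the
# example statistics; the names below are assumptions for illustration only):
#
#   builder = EntityExampleBuilder(style="linear_ngrams:pos_pairs")
#   outfile = open("trigger-examples", "wt")
#   for sentenceGraph in sentenceGraphs: # from TEES preprocessing
#       builder.buildExamplesFromGraph(sentenceGraph, outfile)
#   outfile.close()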