TEES.ExampleBuilders.ModifierExampleBuilder

def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
    """
    Set up the id sets, the speculation word lists, the optional gazetteer
    and the style parameters of the example builder.

    style: style definition passed to self.getParameters
    classSet: id set for class names; a new IdSet(1) is created when None
    featureSet: id set for feature names; a new IdSet() is created when None
    gazetteerFileName: file to load with Gazetteer.loadGztr; when None no
        gazetteer is used (self.gazetteer is None)
    """
    if classSet is None:
        classSet = IdSet(1)
    # The lookup is done outside the assert statement on purpose: asserts
    # are stripped under "python -O", and the "neg" lookup (which presumably
    # also registers the class in a fresh IdSet) must still happen then.
    negativeClassId = classSet.getId("neg")
    assert negativeClassId == 1
    if featureSet is None:
        featureSet = IdSet()

    # speculationWords is a module-level name; it is only read here
    self.specWords, self.specWordStems = readWords(speculationWords)

    ExampleBuilder.__init__(self, classSet, featureSet)
    if gazetteerFileName is not None:
        self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
        sys.stderr.write("Loaded gazetteer from %s\n" % (gazetteerFileName,))
    else:
        self.gazetteer = None
    self._setDefaultParameters({"classification":"multiclass", "speculation_words":True}, {"classification":("multiclass", "speculation", "negation")})
    self.styles = self.getParameters(style)

87

def getMergedEntityType(self, entities):
    """
    If a single token belongs to multiple entities of different types,
    a new, composite type is defined. This type is the alphabetically
    ordered types of these entities joined with '---'.

    entities: iterable of objects whose get("type") returns the type string
    Returns the composite type string ("" when there are no entities).
    """
    uniqueTypes = set(entity.get("type") for entity in entities)
    # An empty type string contributes neither text nor a separator.
    return "---".join(typeName for typeName in sorted(uniqueTypes) if typeName != "")

105

def getTokenFeatures(self, token, sentenceGraph):
    """
    Returns a dictionary of features (name -> weight) based on the
    attributes of a token. These can be used to define more complex
    features.

    The result is cached in self.tokenFeatures, so repeated calls for the
    same token return the same dictionary object.
    """
    # These features are cached when this method is first called
    # for a token.
    if token in self.tokenFeatures:
        return self.tokenFeatures[token]

    tokTxt = sentenceGraph.getTokenText(token)
    features = {}
    features["_txt_" + tokTxt] = 1
    features["_POS_" + token.get("POS")] = 1

    if self.styles["speculation_words"]:
        if tokTxt in self.specWords:
            features["_spec"] = 1
            features["_spec_" + tokTxt] = 1
        tokStem = PorterStemmer.stem(tokTxt)
        if tokStem in self.specWordStems:
            features["_spec_stem"] = 1
            features["_spec_stem_" + tokStem] = 1

    if sentenceGraph.tokenIsName[token]:
        features["_isName"] = 1
        # Only named entities headed by this token contribute their type
        for entity in sentenceGraph.tokenIsEntityHead[token]:
            if entity.get("isName") == "True":
                features["_annType_" + entity.get("type")] = 1

    # NOTE: gazetteer features ("_knownLabel_" + label, weighted by the
    # gazetteer entry) are disabled, so self.gazetteer is not consulted here.

    self.tokenFeatures[token] = features
    return features

138

def buildLinearOrderFeatures(self,sentenceGraph,index,tag,features):
    """
    Linear features are built by marking token features with a tag
    that defines their relative position in the linear order.

    sentenceGraph: sentence whose tokens list is indexed
    index: position in sentenceGraph.tokens of the token to describe
    tag: position label; each feature name is prefixed with "linear_" + tag
    features: dictionary (feature id -> weight) that is updated in place
    """
    prefix = "linear_" + tag
    tokenFeatures = self.getTokenFeatures(sentenceGraph.tokens[index], sentenceGraph)
    # items() works on both Python 2 and Python 3 (iteritems() does not)
    for featureName, weight in tokenFeatures.items():
        features[self.featureSet.getId(prefix + featureName)] = weight

147

def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
    """
    Build one example for each non-named entity of the sentence and
    append it to outfile with ExampleUtils.appendExamples.

    sentenceGraph: the sentence to build examples from
    outfile: destination handed to ExampleUtils.appendExamples
    goldGraph: optional gold-standard sentence; in the "speculation" and
        "negation" classification modes the class is then taken from the
        first gold entity mapped to each entity
    Returns the number of examples built for this sentence.
    """
    exampleIndex = 0
    featureId = self.featureSet.getId
    tokens = sentenceGraph.tokens

    # Per-sentence cache filled by getTokenFeatures
    self.tokenFeatures = {}

    if goldGraph is not None:
        entityToGold = EvaluateInteractionXML.mapEntities(sentenceGraph.entities, goldGraph.entities)

    # Sentence-level counts; both are known data which can be used for features
    namedEntityCount = 0
    entityCount = 0
    for entity in sentenceGraph.entities:
        if entity.get("isName") == "True":
            namedEntityCount += 1
        else:
            entityCount += 1
    namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
    entityCountFeature = "entityCount_" + str(entityCount)

    # Sentence-level bag of words
    bagOfWords = {}
    for token in tokens:
        text = "bow_" + token.get("text")
        bagOfWords[text] = bagOfWords.get(text, 0) + 1
        if sentenceGraph.tokenIsName[token]:
            text = "ne_" + text
            bagOfWords[text] = bagOfWords.get(text, 0) + 1
        # NOTE(review): this tests the size of the whole tokenIsEntityHead
        # mapping, not the entry of the current token. It is kept as is,
        # because changing it would change the generated feature set.
        if len(sentenceGraph.tokenIsEntityHead) > 0:
            text = "ge_" + text
            bagOfWords[text] = bagOfWords.get(text, 0) + 1

        text = token.get("text")
        if self.styles["speculation_words"] and text in self.specWords:
            specKey = "spec_bow_" + text
            bagOfWords[specKey] = bagOfWords.get(specKey, 0) + 1
            bagOfWords["spec_sentence"] = 1

    # Pre-calculate the bag-of-words features once; they are shared by
    # every example of the sentence
    bowFeatures = {}
    for name, count in bagOfWords.items():
        bowFeatures[featureId(name)] = count

    # Dependency edges per token, used below and by buildChains
    self.inEdgesByToken = {}
    self.outEdgesByToken = {}
    self.edgeSetByToken = {}
    dependencyGraph = sentenceGraph.dependencyGraph
    for token in tokens:
        inEdges = dependencyGraph.getInEdges(token)
        self.inEdgesByToken[token] = inEdges
        outEdges = dependencyGraph.getOutEdges(token)
        self.outEdgesByToken[token] = outEdges
        self.edgeSetByToken[token] = set(inEdges + outEdges)

    for entity in sentenceGraph.entities:
        token = sentenceGraph.entityHeadTokenByEntity[entity]
        # Recognize only non-named entities (i.e. interaction words)
        if entity.get("isName") == "True":
            continue

        # CLASS
        classification = self.styles["classification"]
        if classification == "multiclass":
            task3Type = "multiclass"
            modifiers = [modifierName for modifierName in ("negation", "speculation") if entity.get(modifierName) == "True"]
            # "neg" is the negative class (no modifier at all)
            categoryName = "---".join(modifiers) or "neg"
            category = self.classSet.getId(categoryName)
        elif classification in ("speculation", "negation"):
            # Binary modes: the mode name is both the entity attribute
            # and the positive class name; class id 1 is the negative class
            task3Type = classification
            if entity.get(classification) == "True":
                category = self.classSet.getId(classification)
            else:
                category = 1
            if goldGraph is not None:
                # With gold data the class comes from the first mapped gold entity
                goldEntities = entityToGold[entity]
                if len(goldEntities) > 0 and goldEntities[0].get(classification) == "True":
                    category = self.classSet.getId(classification)
                else:
                    category = 1
            categoryName = self.classSet.getName(category)
        self.exampleStats.beginExample(categoryName)

        # FEATURES
        features = {}

        # ENTITY TYPE
        # NOTE: the merged entity type is deliberately not used as a feature.
        # According to the original author's note, the code used for the
        # published experiments passed a single entity to
        # getMergedEntityType (which expects a list), so the type was
        # always empty and never took part in classification. It stays
        # unused so that feature ids remain consistent with those models.

        features[featureId(namedEntityCountFeature)] = 1
        features[featureId(entityCountFeature)] = 1
        features.update(bowFeatures)

        # Main features
        text = token.get("text")
        features[featureId("txt_" + text)] = 1
        features[featureId("POS_" + token.get("POS"))] = 1
        stem = PorterStemmer.stem(text)
        features[featureId("stem_" + stem)] = 1
        features[featureId("nonstem_" + text[len(stem):])] = 1

        if self.styles["speculation_words"]:
            if text in self.specWords:
                features[featureId("ent_spec")] = 1
            if stem in self.specWordStems:
                features[featureId("ent_spec_stem")] = 1

        # Linear order features
        # Position of the head token; falls back to the last position when
        # no token compares equal.
        tokenIndex = len(tokens) - 1
        for candidateIndex, candidate in enumerate(tokens):
            if token == candidate:
                tokenIndex = candidateIndex
                break
        for offset in [-3, -2, -1, 1, 2, 3]:
            # NOTE(review): the strict lower bound means the first token of
            # the sentence (position 0) is never used as context. Kept as
            # is, because changing it would change the generated features.
            if 0 < tokenIndex + offset < len(tokens):
                self.buildLinearOrderFeatures(sentenceGraph, tokenIndex + offset, str(offset), features)

        # Content
        if tokenIndex > 0 and text[0].isalpha() and text[0].isupper():
            features[featureId("upper_case_start")] = 1
        for j, char in enumerate(text):
            if j > 0 and char.isalpha() and char.isupper():
                features[featureId("upper_case_middle")] = 1
            # numbers and special characters
            if char.isdigit():
                features[featureId("has_digits")] = 1
                if j > 0 and text[j-1] == "-":
                    features[featureId("has_hyphenated_digit")] = 1
            elif char == "-":
                features[featureId("has_hyphen")] = 1
            elif char == "/":
                features[featureId("has_fslash")] = 1
            elif char == "\\":
                features[featureId("has_bslash")] = 1
            # duplets
            if j > 0:
                features[featureId("dt_" + text[j-1:j+1].lower())] = 1
            # triplets
            if j > 1:
                features[featureId("tt_" + text[j-2:j+1].lower())] = 1

        # Attached edges (Hanging in and out edges). For an in-edge the
        # neighbouring token is edge[0], for an out-edge it is edge[1];
        # edge[2] carries the dependency type.
        attachedEdges = (
            ("t1HIn_", self.inEdgesByToken[token], 0),
            ("t1HOut_", self.outEdgesByToken[token], 1),
        )
        for prefix, edges, neighbourPosition in attachedEdges:
            for edge in edges:
                edgeType = edge[2].get("type")
                neighbour = edge[neighbourPosition]
                neighbourPOS = neighbour.get("POS")
                features[featureId(prefix + edgeType)] = 1
                features[featureId(prefix + neighbourPOS)] = 1
                features[featureId(prefix + edgeType + "_" + neighbourPOS)] = 1
                tokenText = sentenceGraph.getTokenText(neighbour)
                features[featureId(prefix + tokenText)] = 1
                features[featureId(prefix + edgeType + "_" + tokenText)] = 1

        self.buildChains(token, sentenceGraph, features)

        extra = {"xtype":"task3","t3type":task3Type,"t":token.get("id"),"entity":entity.get("id")}
        example = (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)
        # Examples are written out one at a time instead of being collected
        ExampleUtils.appendExamples([example], outfile)
        exampleIndex += 1
        self.exampleStats.endExample()
    return exampleIndex

366

def buildChains(self,token,sentenceGraph,features,depthLeft=3,chain="",visited=None):
    """
    Recursively add features describing the dependency edges and tokens
    reachable from a token.

    token: token whose attached edges are followed
    sentenceGraph: sentence passed on to getTokenFeatures
    features: dictionary (feature id -> weight) that is updated in place
    depthLeft: number of edges that may still be followed; 0 stops the
        recursion
    chain: edge path walked so far, as a string of "-frw_<type>" (edge
        followed to its edge[0] token) and "-rev_<type>" (edge followed to
        its edge[1] token) steps
    visited: set of edges that must not be followed again (None = empty)

    Relies on self.inEdgesByToken, self.outEdgesByToken and
    self.edgeSetByToken having an entry for every token reached.
    """
    if depthLeft == 0:
        return
    strDepthLeft = "dist_" + str(depthLeft)

    if visited is None:
        visited = set()

    # The edges attached to this token become "visited" only for the
    # recursive calls; at this level they are all still followed.
    edgeSet = visited.union(self.edgeSetByToken[token])

    # (edges, position of the next token in the edge, dependency feature
    # prefix, direction tag). NOTE: the two directions use different
    # dependency prefixes ("dep_" vs. "dep_dist_"); this is kept exactly
    # as is, because renaming would change the generated feature set.
    directions = (
        (self.inEdgesByToken[token], 0, "dep_", "-frw_"),
        (self.outEdgesByToken[token], 1, "dep_dist_", "-rev_"),
    )
    for edges, nextTokenPosition, depPrefix, directionTag in directions:
        for edge in edges:
            if edge in visited:
                continue
            edgeType = edge[2].get("type")
            features[self.featureSet.getId(depPrefix + strDepthLeft + edgeType)] = 1

            nextToken = edge[nextTokenPosition]
            for tokenFeature, w in self.getTokenFeatures(nextToken, sentenceGraph).items():
                features[self.featureSet.getId(strDepthLeft + tokenFeature)] = w

            extendedChain = chain + directionTag + edgeType
            features[self.featureSet.getId("chain_dist_" + strDepthLeft + extendedChain)] = 1
            self.buildChains(nextToken, sentenceGraph, features, depthLeft - 1, extendedChain, edgeSet)

Source Code for Module TEES.ExampleBuilders.ModifierExampleBuilder