Package TEES :: Package ExampleBuilders :: Module ModifierExampleBuilder
[hide private]

Source Code for Module TEES.ExampleBuilders.ModifierExampleBuilder

  1  """ 
  2  Speculation and negation examples 
  3  """ 
  4  __version__ = "$Revision: 1.12 $" 
  5   
  6  import sys, os 
  7  import types 
  8  thisPath = os.path.dirname(os.path.abspath(__file__)) 
  9  sys.path.append(os.path.abspath(os.path.join(thisPath,".."))) 
 10  from ExampleBuilder import ExampleBuilder 
 11  import Utils.Libraries.PorterStemmer as PorterStemmer 
 12  from Core.IdSet import IdSet 
 13  import Core.ExampleUtils as ExampleUtils 
 14  #from Core.Gazetteer import Gazetteer 
 15  # For gold mapping 
 16  import Evaluators.EvaluateInteractionXML as EvaluateInteractionXML 
 17   
 18  # A list of speculation related words manually picked from the BioNLP'09 GE corpus 
 19  speculationWords = [ 
 20      'account', 'aim', 'almost', 'analysed', 'analyses', 'analysis', 'analyzed', 'appear',  
 21      'appeared', 'appears', 'argue', 'artifact', 'ascertain', 'asked', 'assayed',  
 22      'assessed', 'assumes', 'believed', 'can', 'candidates', 'clarify', 'clear',  
 23      'conclude', 'confirm', 'considered', 'could', 'define', 'delineate', 'determine',  
 24      'determined', 'elucidate', 'elucidating', 'establish', 'evaluate', 'evaluated',  
 25      'evaluates', 'evidence', 'examine', 'examined', 'explore', 'findings', 'hypothesis',  
 26      'hypothesize', 'hypothesized', 'idea', 'identification', 'implicated', 'implicates',  
 27      'implications', 'importance', 'important', 'indicate', 'indicated', 'indicators',  
 28      'information', 'insights', 'investigate', 'investigated', 'investigation', 'isolate',  
 29      'known', 'likely', 'may', 'measured', 'might', 'monitored', 'most', 'must',  
 30      'objective', 'obscure', 'observations', 'observed', 'partially', 'partly',  
 31      'performed', 'perhaps', 'play', 'plays', 'possible', 'postulated', 'potent',  
 32      'potential', 'potentially', 'probably', 'propose', 'proposed', 'putative',  
 33      'quantitated', 'reexamined', 'reported', 'revealed', 'role', 'screened', 'seemed',  
 34      'seems', 'shown', 'significantly', 'since', 'sought', 'studied', 'studies', 'study',  
 35      'suggest', 'suggested', 'suggesting', 'suggests', 'support', 'suspect', 'tested',  
 36      'thought', 'unclear', 'undefined', 'understand', 'unknown', 'whether'] 
 37   
def readWords(words):
    """
    Build lookup sets for a collection of words and for their stems.

    words -- either a string, which is interpreted as the name of a text
             file with one word per line, or an iterable of word strings
             (e.g. a list).

    Returns a tuple (wordSet, stemSet): wordSet contains the words as
    given, stemSet the result of PorterStemmer.stem for each of them.

    Raises IOError (OSError) if a word file cannot be opened.
    """
    # types.StringTypes exists only on Python 2; fall back to str elsewhere.
    stringTypes = getattr(types, "StringTypes", (str,))
    if isinstance(words, stringTypes):
        wordSet = set()
        f = open(words)
        try:
            for line in f:
                word = line.strip()
                # A blank line would otherwise register the empty string
                # as a word.
                if word:
                    wordSet.add(word)
        finally:
            # Release the file handle even if reading fails.
            f.close()
    else: # assume it's a list
        wordSet = set(words)
    stemSet = set()
    for word in wordSet:
        stemSet.add(PorterStemmer.stem(word))
    return wordSet, stemSet

#def compareDependencyEdgesById(dep1, dep2):
#    """
#    Dependency edges are sorted, so that the program behaves consistently
#    on the same data between different runs.
#    """
#    id1 = dep1[2].get("id")
#    id2 = dep2[2].get("id")
#    if id1 > id2:
#        return 1
#    elif id1 == id2:
#        return 0
#    else: # id1 < id2
#        return -1

66 -class ModifierExampleBuilder(ExampleBuilder):
67 - def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
68 global speculationWords 69 70 if classSet == None: 71 classSet = IdSet(1) 72 assert( classSet.getId("neg") == 1 ) 73 if featureSet == None: 74 featureSet = IdSet() 75 76 self.specWords, self.specWordStems = readWords(speculationWords) 77 78 ExampleBuilder.__init__(self, classSet, featureSet) 79 #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train" 80 if gazetteerFileName!=None: 81 self.gazetteer=Gazetteer.loadGztr(gazetteerFileName) 82 print >> sys.stderr, "Loaded gazetteer from",gazetteerFileName 83 else: 84 self.gazetteer=None 85 self._setDefaultParameters({"classification":"multiclass", "speculation_words":True}, {"classification":("multiclass", "speculation", "negation")}) 86 self.styles = self.getParameters(style)
87
88 - def getMergedEntityType(self, entities):
89 """ 90 If a single token belongs to multiple entities of different types, 91 a new, composite type is defined. This type is the alphabetically 92 ordered types of these entities joined with '---'. 93 """ 94 types = set() 95 for entity in entities: 96 types.add(entity.get("type")) 97 types = list(types) 98 types.sort() 99 typeString = "" 100 for type in types: 101 if typeString != "": 102 typeString += "---" 103 typeString += type 104 return typeString
105
106 - def getTokenFeatures(self, token, sentenceGraph):
107 """ 108 Returns a list of features based on the attributes of a token. 109 These can be used to define more complex features. 110 """ 111 # These features are cached when this method is first called 112 # for a token. 113 if self.tokenFeatures.has_key(token): 114 return self.tokenFeatures[token] 115 tokTxt=sentenceGraph.getTokenText(token) 116 features = {} 117 features["_txt_"+tokTxt]=1 118 features["_POS_"+token.get("POS")]=1 119 if self.styles["speculation_words"]: 120 if tokTxt in self.specWords: 121 features["_spec"]=1 122 features["_spec_"+tokTxt]=1 123 tokStem = PorterStemmer.stem(tokTxt) 124 if tokStem in self.specWordStems: 125 features["_spec_stem"]=1 126 features["_spec_stem_"+tokStem]=1 127 if sentenceGraph.tokenIsName[token]: 128 features["_isName"]=1 129 for entity in sentenceGraph.tokenIsEntityHead[token]: 130 if entity.get("isName") == "True": 131 features["_annType_"+entity.get("type")]=1 132 if self.gazetteer and tokTxt.lower() in self.gazetteer: 133 for label,weight in self.gazetteer[tokTxt.lower()].items(): 134 pass 135 #features["_knownLabel_"+label]=weight 136 self.tokenFeatures[token] = features 137 return features
138
139 - def buildLinearOrderFeatures(self,sentenceGraph,index,tag,features):
140 """ 141 Linear features are built by marking token features with a tag 142 that defines their relative position in the linear order. 143 """ 144 tag = "linear_"+tag 145 for tokenFeature,w in self.getTokenFeatures(sentenceGraph.tokens[index], sentenceGraph).iteritems(): 146 features[self.featureSet.getId(tag+tokenFeature)] = w
147
148 - def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
149 """ 150 Build one example for each token of the sentence 151 """ 152 examples = [] 153 exampleIndex = 0 154 155 self.tokenFeatures = {} 156 157 if goldGraph != None: 158 entityToGold = EvaluateInteractionXML.mapEntities(sentenceGraph.entities, goldGraph.entities) 159 160 namedEntityCount = 0 161 entityCount = 0 162 for entity in sentenceGraph.entities: 163 if entity.get("isName") == "True": # known data which can be used for features 164 namedEntityCount += 1 165 else: # known data which can be used for features 166 entityCount += 1 167 namedEntityCountFeature = "nameCount_" + str(namedEntityCount) 168 entityCountFeature = "entityCount_" + str(entityCount) 169 170 bagOfWords = {} 171 for token in sentenceGraph.tokens: 172 text = "bow_" + token.get("text") 173 if not bagOfWords.has_key(text): 174 bagOfWords[text] = 0 175 bagOfWords[text] += 1 176 if sentenceGraph.tokenIsName[token]: 177 text = "ne_" + text 178 if not bagOfWords.has_key(text): 179 bagOfWords[text] = 0 180 bagOfWords[text] += 1 181 if len(sentenceGraph.tokenIsEntityHead) > 0: 182 text = "ge_" + text 183 if not bagOfWords.has_key(text): 184 bagOfWords[text] = 0 185 bagOfWords[text] += 1 186 187 text = token.get("text") 188 if self.styles["speculation_words"] and text in self.specWords: 189 if not bagOfWords.has_key("spec_bow_"+text): 190 bagOfWords["spec_bow_"+text] = 0 191 bagOfWords["spec_bow_"+text] += 1 192 bagOfWords["spec_sentence"] = 1 193 194 bowFeatures = {} 195 for k,v in bagOfWords.iteritems(): 196 bowFeatures[self.featureSet.getId(k)] = v 197 198 self.inEdgesByToken = {} 199 self.outEdgesByToken = {} 200 self.edgeSetByToken = {} 201 for token in sentenceGraph.tokens: 202 inEdges = sentenceGraph.dependencyGraph.getInEdges(token) 203 self.inEdgesByToken[token] = inEdges 204 outEdges = sentenceGraph.dependencyGraph.getOutEdges(token) 205 self.outEdgesByToken[token] = outEdges 206 self.edgeSetByToken[token] = set(inEdges + outEdges) 207 208 for entity in sentenceGraph.entities: 209 #token = 
sentenceGraph.tokens[i] 210 token = sentenceGraph.entityHeadTokenByEntity[entity] 211 # Recognize only non-named entities (i.e. interaction words) 212 if entity.get("isName") == "True": 213 continue 214 215 # CLASS 216 if self.styles["classification"] == "multiclass": 217 task3Type = "multiclass" 218 categoryName = "" 219 if entity.get("negation") == "True": 220 categoryName += "negation" 221 if entity.get("speculation") == "True": 222 if categoryName != "": 223 categoryName += "---" 224 categoryName += "speculation" 225 if categoryName == "": 226 categoryName = "neg" 227 category = self.classSet.getId(categoryName) 228 elif self.styles["classification"] == "speculation": 229 task3Type = "speculation" 230 if entity.get("speculation") == "True": 231 category = self.classSet.getId("speculation") 232 else: 233 category = 1 234 if goldGraph != None: 235 if len(entityToGold[entity]) > 0 and entityToGold[entity][0].get("speculation") == "True": 236 category = self.classSet.getId("speculation") 237 else: 238 category = 1 239 categoryName = self.classSet.getName(category) 240 elif self.styles["classification"] == "negation": 241 task3Type = "negation" 242 if entity.get("negation") == "True": 243 category = self.classSet.getId("negation") 244 else: 245 category = 1 246 if goldGraph != None: 247 if len(entityToGold[entity]) > 0 and entityToGold[entity][0].get("negation") == "True": 248 category = self.classSet.getId("negation") 249 else: 250 category = 1 251 categoryName = self.classSet.getName(category) 252 self.exampleStats.beginExample(categoryName) 253 254 # FEATURES 255 features = {} 256 257 # ENTITY TYPE 258 #entityType = self.classSet.getId(self.getMergedEntityType(entity)) 259 #del self.classSet.Ids[self.getMergedEntityType(entity)] 260 #IF LOCAL 261 # There's a mistake here. The entityType should be the string, not 262 # the id of the type. But there's also another issue. getMergedEntityType 263 # expects a list, not an item. 
Therefore the type is always empty -> 264 # types don't get used in classification. But this is the code used in 265 # the publication, so it will now be published as is, and fixed in a later 266 # release. 267 # 268 # Besides, using the classSet here generates an unneeded 269 # additional class, that shows up in evaluations etc. However, to be 270 # able to publish the exact models used for the publication experiments, 271 # this can't be fixed so it breaks feature id consistency. Therefore I'll 272 # now just remove the redundant class id from the classSet. 273 #ENDIF 274 #features[self.featureSet.getId(entityType)] = 1 275 276 features[self.featureSet.getId(namedEntityCountFeature)] = 1 277 features[self.featureSet.getId(entityCountFeature)] = 1 278 #for k,v in bagOfWords.iteritems(): 279 # features[self.featureSet.getId(k)] = v 280 # pre-calculate bow _features_ 281 features.update(bowFeatures) 282 283 # for j in range(len(sentenceGraph.tokens)): 284 # text = "bow_" + sentenceGraph.tokens[j].get("text") 285 # if j < i: 286 # features[self.featureSet.getId("bf_" + text)] = 1 287 # elif j > i: 288 # features[self.featureSet.getId("af_" + text)] = 1 289 290 # Main features 291 text = token.get("text") 292 features[self.featureSet.getId("txt_"+text)] = 1 293 features[self.featureSet.getId("POS_"+token.get("POS"))] = 1 294 stem = PorterStemmer.stem(text) 295 features[self.featureSet.getId("stem_"+stem)] = 1 296 features[self.featureSet.getId("nonstem_"+text[len(stem):])] = 1 297 298 if self.styles["speculation_words"]: 299 if text in self.specWords: 300 features[self.featureSet.getId("ent_spec")] = 1 301 if stem in self.specWordStems: 302 features[self.featureSet.getId("ent_spec_stem")] = 1 303 304 # Linear order features 305 for i in range(len(sentenceGraph.tokens)): 306 if token == sentenceGraph.tokens[i]: 307 break 308 for index in [-3,-2,-1,1,2,3]: 309 if i + index > 0 and i + index < len(sentenceGraph.tokens): 310 self.buildLinearOrderFeatures(sentenceGraph, i 
+ index, str(index), features) 311 312 # Content 313 if i > 0 and text[0].isalpha() and text[0].isupper(): 314 features[self.featureSet.getId("upper_case_start")] = 1 315 for j in range(len(text)): 316 if j > 0 and text[j].isalpha() and text[j].isupper(): 317 features[self.featureSet.getId("upper_case_middle")] = 1 318 # numbers and special characters 319 if text[j].isdigit(): 320 features[self.featureSet.getId("has_digits")] = 1 321 if j > 0 and text[j-1] == "-": 322 features[self.featureSet.getId("has_hyphenated_digit")] = 1 323 elif text[j] == "-": 324 features[self.featureSet.getId("has_hyphen")] = 1 325 elif text[j] == "/": 326 features[self.featureSet.getId("has_fslash")] = 1 327 elif text[j] == "\\": 328 features[self.featureSet.getId("has_bslash")] = 1 329 # duplets 330 if j > 0: 331 features[self.featureSet.getId("dt_"+text[j-1:j+1].lower())] = 1 332 # triplets 333 if j > 1: 334 features[self.featureSet.getId("tt_"+text[j-2:j+1].lower())] = 1 335 336 # Attached edges (Hanging in and out edges) 337 t1InEdges = self.inEdgesByToken[token] 338 for edge in t1InEdges: 339 edgeType = edge[2].get("type") 340 features[self.featureSet.getId("t1HIn_"+edgeType)] = 1 341 features[self.featureSet.getId("t1HIn_"+edge[0].get("POS"))] = 1 342 features[self.featureSet.getId("t1HIn_"+edgeType+"_"+edge[0].get("POS"))] = 1 343 tokenText = sentenceGraph.getTokenText(edge[0]) 344 features[self.featureSet.getId("t1HIn_"+tokenText)] = 1 345 features[self.featureSet.getId("t1HIn_"+edgeType+"_"+tokenText)] = 1 346 t1OutEdges = self.outEdgesByToken[token] 347 for edge in t1OutEdges: 348 edgeType = edge[2].get("type") 349 features[self.featureSet.getId("t1HOut_"+edgeType)] = 1 350 features[self.featureSet.getId("t1HOut_"+edge[1].get("POS"))] = 1 351 features[self.featureSet.getId("t1HOut_"+edgeType+"_"+edge[1].get("POS"))] = 1 352 tokenText = sentenceGraph.getTokenText(edge[1]) 353 features[self.featureSet.getId("t1HOut_"+tokenText)] = 1 354 
features[self.featureSet.getId("t1HOut_"+edgeType+"_"+tokenText)] = 1 355 356 self.buildChains(token, sentenceGraph, features) 357 358 extra = {"xtype":"task3","t3type":task3Type,"t":token.get("id"),"entity":entity.get("id")} 359 #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) ) 360 example = (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) 361 ExampleUtils.appendExamples([example], outfile) 362 exampleIndex += 1 363 self.exampleStats.endExample() 364 #return examples 365 return exampleIndex
366
367 - def buildChains(self,token,sentenceGraph,features,depthLeft=3,chain="",visited=None):
368 if depthLeft == 0: 369 return 370 strDepthLeft = "dist_" + str(depthLeft) 371 372 if visited == None: 373 visited = set() 374 375 inEdges = self.inEdgesByToken[token] 376 outEdges = self.outEdgesByToken[token] 377 edgeSet = visited.union(self.edgeSetByToken[token]) 378 for edge in inEdges: 379 if not edge in visited: 380 edgeType = edge[2].get("type") 381 features[self.featureSet.getId("dep_"+strDepthLeft+edgeType)] = 1 382 383 nextToken = edge[0] 384 for tokenFeature,w in self.getTokenFeatures(nextToken, sentenceGraph).iteritems(): 385 features[self.featureSet.getId(strDepthLeft + tokenFeature)] = w 386 387 features[self.featureSet.getId("chain_dist_"+strDepthLeft+chain+"-frw_"+edgeType)] = 1 388 self.buildChains(nextToken,sentenceGraph,features,depthLeft-1,chain+"-frw_"+edgeType,edgeSet) 389 390 for edge in outEdges: 391 if not edge in visited: 392 edgeType = edge[2].get("type") 393 features[self.featureSet.getId("dep_dist_"+strDepthLeft+edgeType)] = 1 394 395 nextToken = edge[1] 396 for tokenFeature,w in self.getTokenFeatures(nextToken, sentenceGraph).iteritems(): 397 features[self.featureSet.getId(strDepthLeft + tokenFeature)] = w 398 399 features[self.featureSet.getId("chain_dist_"+strDepthLeft+chain+"-rev_"+edgeType)] = 1 400 self.buildChains(nextToken,sentenceGraph,features,depthLeft-1,chain+"-rev_"+edgeType,edgeSet)
401