1 """
2 Trigger examples
3 """
4 __version__ = "$Revision: 1.34 $"
5
6 import sys, os
7 thisPath = os.path.dirname(os.path.abspath(__file__))
8 sys.path.append(os.path.abspath(os.path.join(thisPath,"..")))
9 from ExampleBuilder import ExampleBuilder
10 import Utils.Libraries.PorterStemmer as PorterStemmer
11 from Core.IdSet import IdSet
12 import Core.ExampleUtils as ExampleUtils
13
14 from FeatureBuilders.RELFeatureBuilder import RELFeatureBuilder
15 from FeatureBuilders.WordNetFeatureBuilder import WordNetFeatureBuilder
16 from FeatureBuilders.GiulianoFeatureBuilder import GiulianoFeatureBuilder
17 import PhraseTriggerExampleBuilder
import Utils.InteractionXML.ResolveEPITriggerTypes

class EntityExampleBuilder(ExampleBuilder):
    def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None, skiplist=None):
        if classSet == None:
            classSet = IdSet(1)
        assert( classSet.getId("neg") == 1 )
        if featureSet == None:
            featureSet = IdSet()

        ExampleBuilder.__init__(self, classSet, featureSet)

        if gazetteerFileName != None:
            # NOTE: a Gazetteer module providing loadGztr() is assumed to be
            # importable; it is not imported in this file.
            self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
            print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
        else:
            print >> sys.stderr, "No gazetteer loaded"
            self.gazetteer = None
        self._setDefaultParameters(["rel_features", "wordnet", "bb_features", "giuliano",
                                    "epi_merge_negated", "limit_merged_types", "genia_task1",
                                    "build_for_nameless", "pos_only", "all_tokens",
                                    "names", "pos_pairs", "linear_ngrams", "phospho"])
        self.styles = self.getParameters(style)
        self.skiplist = set()
        if skiplist != None:
            f = open(skiplist, "rt")
            for line in f.readlines():
                self.skiplist.add(line.strip())
            f.close()

        if self.styles["rel_features"]:
            self.relFeatureBuilder = RELFeatureBuilder(featureSet)
        if self.styles["wordnet"]:
            self.wordNetFeatureBuilder = WordNetFeatureBuilder(featureSet)
        if self.styles["bb_features"]:
            self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens()
        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)

73 """
74 If a single token belongs to multiple entities of different types,
75 a new, composite type is defined. This type is the alphabetically
76 ordered types of these entities joined with '---'.
77 """
        types = set()
        entityIds = set()
        for entity in entities:
            if entity.get("isName") == "True" and self.styles["all_tokens"]:
                continue
            if entity.get("type") == "Entity" and self.styles["genia_task1"]:
                continue
            if self.styles["epi_merge_negated"]:
                types.add(Utils.InteractionXML.ResolveEPITriggerTypes.getEPIBaseType(entity.get("type")))
                entityIds.add(entity.get("id"))
            else:
                types.add(entity.get("type"))
                entityIds.add(entity.get("id"))
        types = list(types)
        types.sort()
        typeString = ""
        for type in types:
            if typeString != "":
                typeString += "---"
            typeString += type

        if typeString == "":
            return "neg", None

        idString = "/".join(sorted(list(entityIds)))

        if self.styles["limit_merged_types"]:
            if typeString.find("---") != -1:
                if typeString == "Gene_expression---Positive_regulation":
                    return typeString, idString
                else:
                    return typeString.split("---")[0], idString
            else:
                return typeString, idString
        return typeString, idString

117 """
118 Returns a list of features based on the attributes of a token.
119 These can be used to define more complex features.
120 """
        # Features are cached per token the first time this method is called.
        if self.tokenFeatures.has_key(token):
            return self.tokenFeatures[token], self.tokenFeatureWeights[token]
        tokTxt = sentenceGraph.getTokenText(token)
        features = {}
        features["_txt_"+tokTxt] = 1
        features["_POS_"+token.get("POS")] = 1
        if sentenceGraph.tokenIsName[token] and not self.styles["names"]:
            features["_isName"] = 1
            for entity in sentenceGraph.tokenIsEntityHead[token]:
                if entity.get("isName") == "True":
                    features["_annType_"+entity.get("type")] = 1

        self.tokenFeatures[token] = sorted(features.keys())
        self.tokenFeatureWeights[token] = features
        return self.tokenFeatures[token], self.tokenFeatureWeights[token]

154 """
155 Linear features are built by marking token features with a tag
156 that defines their relative position in the linear order.
157 """
158 tag = "linear_"+tag
159 tokenFeatures, tokenFeatureWeights = self.getTokenFeatures(sentenceGraph.tokens[index], sentenceGraph)
160 for tokenFeature in tokenFeatures:
161 features[self.featureSet.getId(tag+tokenFeature)] = tokenFeatureWeights[tokenFeature]
162
    def buildLinearNGram(self, i, j, sentenceGraph, features):
        # Lower-cased word n-gram over the token span i..j, e.g. "ngram_binds_to"
        # (illustrative value).
        ngram = "ngram"
        for index in range(i, j+1):
            ngram += "_" + sentenceGraph.getTokenText(sentenceGraph.tokens[index]).lower()
        features[self.featureSet.getId(ngram)] = 1

170 """
171 Build one example for each token of the sentence
172 """
        if sentenceGraph.sentenceElement.get("origId") in self.skiplist:
            print >> sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get("origId")
            return 0

        exampleIndex = 0

        self.tokenFeatures = {}
        self.tokenFeatureWeights = {}

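        # Unless the "names" style is set, named entities are counted and the count
        # becomes a feature shared by every example built from this sentence.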
        namedEntityHeadTokens = []
        if not self.styles["names"]:
            namedEntityCount = 0
            for entity in sentenceGraph.entities:
                if entity.get("isName") == "True":
                    namedEntityCount += 1
            namedEntityCountFeature = "nameCount_" + str(namedEntityCount)

            if namedEntityCount == 0 and not self.styles["build_for_nameless"]:
                return 0

            if self.styles["pos_pairs"]:
                namedEntityHeadTokens = self.getNamedEntityHeadTokens(sentenceGraph)
        else:
            for key in sentenceGraph.tokenIsName.keys():
                sentenceGraph.tokenIsName[key] = False

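        # Sentence-level bag-of-words: every surface form is counted, and words
        # inside named entities are additionally counted with an "ne_" prefix.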
        bagOfWords = {}
        for token in sentenceGraph.tokens:
            text = "bow_" + token.get("text")
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
            if sentenceGraph.tokenIsName[token]:
                text = "ne_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1
        bowFeatures = {}
        for k in sorted(bagOfWords.keys()):
            bowFeatures[self.featureSet.getId(k)] = bagOfWords[k]

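        # Cache the incoming and outgoing dependency edges of every token; they are
        # used both for the head-token features below and by buildChains().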
        self.inEdgesByToken = {}
        self.outEdgesByToken = {}
        self.edgeSetByToken = {}
        for token in sentenceGraph.tokens:
            inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
            self.inEdgesByToken[token] = inEdges
            outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
            self.outEdgesByToken[token] = outEdges
            self.edgeSetByToken[token] = set(inEdges + outEdges)

        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]

            # Determine the gold class of this token from the entities it heads.
            if len(sentenceGraph.tokenIsEntityHead[token]) > 0:
                categoryName, entityIds = self.getMergedEntityType(sentenceGraph.tokenIsEntityHead[token])
            else:
                categoryName, entityIds = "neg", None
            self.exampleStats.beginExample(categoryName)

            # Known name heads are not used as trigger candidates unless the
            # "names" or "all_tokens" style is set.
            if sentenceGraph.tokenIsName[token] and not self.styles["names"] and not self.styles["all_tokens"]:
                self.exampleStats.filter("name")
                self.exampleStats.endExample()
                continue

279 if self.styles["pos_only"] and categoryName == "neg":
280 self.exampleStats.filter("pos_only")
281 self.exampleStats.endExample()
282 continue
283
284 category = self.classSet.getId(categoryName)
285 if category == None:
286 self.exampleStats.filter("undefined_class")
287 self.exampleStats.endExample()
288 continue
289
            tokenText = token.get("text").lower()

            features = {}

            if not self.styles["names"]:
                features[self.featureSet.getId(namedEntityCountFeature)] = 1

            features.update(bowFeatures)

322 text = token.get("text")
323 features[self.featureSet.getId("txt_"+text)] = 1
324 features[self.featureSet.getId("POS_"+token.get("POS"))] = 1
325 stem = PorterStemmer.stem(text)
326 features[self.featureSet.getId("stem_"+stem)] = 1
327 features[self.featureSet.getId("nonstem_"+text[len(stem):])] = 1
328
329
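            # Normalized form of the token: punctuation removed and lower-cased, so
            # variants such as "up-regulate" and "upregulate" share features.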
            normalizedText = text.replace("-","").replace("/","").replace(",","").replace("\\","").replace(" ","").lower()
            if normalizedText == "bound":
                normalizedText = "bind"
            features[self.featureSet.getId("txt_"+normalizedText)] = 1
            norStem = PorterStemmer.stem(normalizedText)
            features[self.featureSet.getId("stem_"+norStem)] = 1
            features[self.featureSet.getId("nonstem_"+normalizedText[len(norStem):])] = 1

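            # Features for the hyphen-separated parts of the token, e.g. a token
            # "IL-2" (hypothetical) contributes the substrings "il" and "2".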
            for string in text.split("-"):
                stringLower = string.lower()
                features[self.featureSet.getId("substring_"+stringLower)] = 1
                features[self.featureSet.getId("substringstem_"+PorterStemmer.stem(stringLower))] = 1

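            # Linear context: features of up to three neighbouring tokens on each
            # side, tagged with their signed offset from the current token.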
            for index in [-3,-2,-1,1,2,3]:
                if i + index > 0 and i + index < len(sentenceGraph.tokens):
                    self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)

356 if self.styles["linear_ngrams"]:
357 self.buildLinearNGram(max(0, i-1), i, sentenceGraph, features)
358 self.buildLinearNGram(max(0, i-2), i, sentenceGraph, features)
359
360 if self.styles["phospho"]:
361 if text.find("hospho") != -1:
362 features[self.featureSet.getId("phospho_found")] = 1
363 features[self.featureSet.getId("begin_"+text[0:2].lower())] = 1
364 features[self.featureSet.getId("begin_"+text[0:3].lower())] = 1
365
366 if self.styles["bb_features"]:
367 if text.lower() in self.bacteriaTokens:
368 features[self.featureSet.getId("lpsnBacToken")] = 1
369
370
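            # Character-level features: capitalisation, digits, hyphens, slashes and
            # lower-cased character bigrams ("dt_") and trigrams ("tt_").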
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j in range(len(text)):
                if j > 0 and text[j].isalpha() and text[j].isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1

                if text[j].isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j-1] == "-":
                        features[self.featureSet.getId("has_hyphenated_digit")] = 1
                elif text[j] == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif text[j] == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif text[j] == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1

                if j > 0:
                    features[self.featureSet.getId("dt_"+text[j-1:j+1].lower())] = 1
                if j > 1:
                    features[self.featureSet.getId("tt_"+text[j-2:j+1].lower())] = 1

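            # Features from the dependency edges attached to the token ("t1HIn_" /
            # "t1HOut_"): edge type, neighbour POS, neighbour text and stem.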
            t1InEdges = self.inEdgesByToken[token]
            for edge in t1InEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HIn_"+edgeType)] = 1
                features[self.featureSet.getId("t1HIn_"+edge[0].get("POS"))] = 1
                features[self.featureSet.getId("t1HIn_"+edgeType+"_"+edge[0].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[0])
                features[self.featureSet.getId("t1HIn_"+tokenText)] = 1
                features[self.featureSet.getId("t1HIn_"+edgeType+"_"+tokenText)] = 1
                tokenStem = PorterStemmer.stem(tokenText)
                features[self.featureSet.getId("t1HIn_"+tokenStem)] = 1
                features[self.featureSet.getId("t1HIn_"+edgeType+"_"+tokenStem)] = 1
                features[self.featureSet.getId("t1HIn_"+norStem+"_"+edgeType+"_"+tokenStem)] = 1
            t1OutEdges = self.outEdgesByToken[token]
            for edge in t1OutEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HOut_"+edgeType)] = 1
                features[self.featureSet.getId("t1HOut_"+edge[1].get("POS"))] = 1
                features[self.featureSet.getId("t1HOut_"+edgeType+"_"+edge[1].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[1])
                features[self.featureSet.getId("t1HOut_"+tokenText)] = 1
                features[self.featureSet.getId("t1HOut_"+edgeType+"_"+tokenText)] = 1
                tokenStem = PorterStemmer.stem(tokenText)
                features[self.featureSet.getId("t1HOut_"+tokenStem)] = 1
                features[self.featureSet.getId("t1HOut_"+edgeType+"_"+tokenStem)] = 1
                features[self.featureSet.getId("t1HOut_"+norStem+"_"+edgeType+"_"+tokenStem)] = 1

426 if self.styles["rel_features"]:
427 self.relFeatureBuilder.setFeatureVector(features)
428 self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, i)
429 self.relFeatureBuilder.setFeatureVector(None)
430
431
432
433
434
435
436
437 if self.styles["wordnet"]:
438 tokTxt = token.get("text")
439 tokPOS = token.get("POS")
440 wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
441 for wordNetFeature in wordNetFeatures:
442
443 features[self.featureSet.getId("WN_"+wordNetFeature)] = 1
444
445
446 if self.styles["giuliano"]:
447 self.giulianoFeatureBuilder.setFeatureVector(features)
448 self.giulianoFeatureBuilder.buildTriggerFeatures(token, sentenceGraph)
449 self.giulianoFeatureBuilder.setFeatureVector(None)
450
451 extra = {"xtype":"token","t":token.get("id")}
452 if self.styles["bb_features"]:
453 extra["trigex"] = "bb"
454 if self.styles["epi_merge_negated"]:
455 extra["unmergeneg"] = "epi"
456 if entityIds != None:
457 extra["goldIds"] = entityIds
458
459
460
461 self.buildChains(token, sentenceGraph, features)
462
463 if self.styles["pos_pairs"]:
464 self.buildPOSPairs(token, namedEntityHeadTokens, features)
465
            example = (sentenceGraph.getSentenceId()+".x"+str(exampleIndex), category, features, extra)
            ExampleUtils.appendExamples([example], outfile)
            exampleIndex += 1
            self.exampleStats.endExample()

        return exampleIndex

    def buildChains(self, token, sentenceGraph, features, depthLeft=3, chain="", visited=None):
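        # Recursively walks the dependency graph up to depthLeft steps away from the
        # token, emitting features that combine the remaining depth, the chain of
        # traversed edge types (-frw_/-rev_ direction markers) and the features of
        # the tokens reached. Already-visited edges are skipped.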
        if depthLeft == 0:
            return
        strDepthLeft = "dist_" + str(depthLeft)

        if visited == None:
            visited = set()

        inEdges = self.inEdgesByToken[token]
        outEdges = self.outEdgesByToken[token]
        edgeSet = visited.union(self.edgeSetByToken[token])
        for edge in inEdges:
            if not edge in visited:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("dep_"+strDepthLeft+edgeType)] = 1

                nextToken = edge[0]
                tokenFeatures, tokenWeights = self.getTokenFeatures(nextToken, sentenceGraph)
                for tokenFeature in tokenFeatures:
                    features[self.featureSet.getId(strDepthLeft + tokenFeature)] = tokenWeights[tokenFeature]

                if sentenceGraph.tokenIsName[nextToken] and not self.styles["names"]:
                    features[self.featureSet.getId("name_chain_dist_"+strDepthLeft+chain+"-frw_"+edgeType)] = 1
                features[self.featureSet.getId("chain_dist_"+strDepthLeft+chain+"-frw_"+edgeType)] = 1
                self.buildChains(nextToken, sentenceGraph, features, depthLeft-1, chain+"-frw_"+edgeType, edgeSet)

        for edge in outEdges:
            if not edge in visited:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("dep_dist_"+strDepthLeft+edgeType)] = 1

                nextToken = edge[1]
                tokenFeatures, tokenWeights = self.getTokenFeatures(nextToken, sentenceGraph)
                for tokenFeature in tokenFeatures:
                    features[self.featureSet.getId(strDepthLeft + tokenFeature)] = tokenWeights[tokenFeature]

                if sentenceGraph.tokenIsName[nextToken] and not self.styles["names"]:
                    features[self.featureSet.getId("name_chain_dist_"+strDepthLeft+chain+"-rev_"+edgeType)] = 1
                features[self.featureSet.getId("chain_dist_"+strDepthLeft+chain+"-rev_"+edgeType)] = 1
                self.buildChains(nextToken, sentenceGraph, features, depthLeft-1, chain+"-rev_"+edgeType, edgeSet)

    def getNamedEntityHeadTokens(self, sentenceGraph):
        headTokens = []
        for entity in sentenceGraph.entities:
            if entity.get("isName") == "True":
                headTokens.append(sentenceGraph.entityHeadTokenByEntity[entity])
        return headTokens

    def buildPOSPairs(self, token, namedEntityHeadTokens, features):
        tokenPOS = token.get("POS")
        assert tokenPOS != None
        for headToken in namedEntityHeadTokens:
            headPOS = headToken.get("POS")
            features[self.featureSet.getId("POS_pair_NE_"+tokenPOS+"-"+headPOS)] = 1
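
# Minimal usage sketch (hypothetical driver code; the real pipeline elsewhere in the
# toolkit constructs the builder and iterates over parsed sentences):
#
#   builder = EntityExampleBuilder(style="rel_features:wordnet")
#   for sentenceGraph in sentenceGraphs:   # assumed iterable of SentenceGraph objects
#       builder.buildExamplesFromGraph(sentenceGraph, outfile)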