1 """
2 Speculation and negation examples
3 """
4 __version__ = "$Revision: 1.12 $"
5
6 import sys, os
7 import types
8 thisPath = os.path.dirname(os.path.abspath(__file__))
9 sys.path.append(os.path.abspath(os.path.join(thisPath,"..")))
10 from ExampleBuilder import ExampleBuilder
11 import Utils.Libraries.PorterStemmer as PorterStemmer
12 from Core.IdSet import IdSet
13 import Core.ExampleUtils as ExampleUtils
14
15
16 import Evaluators.EvaluateInteractionXML as EvaluateInteractionXML
17
18
# Cue words regarded as signs of speculation. __init__ passes this list to
# readWords() to obtain the exact-match and stemmed lookup sets behind the
# "_spec*" / "ent_spec*" features. Grouped by initial letter; order is
# alphabetical.
speculationWords = [
    "account", "aim", "almost", "analysed", "analyses", "analysis",
    "analyzed", "appear", "appeared", "appears", "argue", "artifact",
    "ascertain", "asked", "assayed", "assessed", "assumes",
    "believed",
    "can", "candidates", "clarify", "clear", "conclude", "confirm",
    "considered", "could",
    "define", "delineate", "determine", "determined",
    "elucidate", "elucidating", "establish", "evaluate", "evaluated",
    "evaluates", "evidence", "examine", "examined", "explore",
    "findings",
    "hypothesis", "hypothesize", "hypothesized",
    "idea", "identification", "implicated", "implicates", "implications",
    "importance", "important", "indicate", "indicated", "indicators",
    "information", "insights", "investigate", "investigated",
    "investigation", "isolate",
    "known",
    "likely",
    "may", "measured", "might", "monitored", "most", "must",
    "objective", "obscure", "observations", "observed",
    "partially", "partly", "performed", "perhaps", "play", "plays",
    "possible", "postulated", "potent", "potential", "potentially",
    "probably", "propose", "proposed", "putative",
    "quantitated",
    "reexamined", "reported", "revealed", "role",
    "screened", "seemed", "seems", "shown", "significantly", "since",
    "sought", "studied", "studies", "study", "suggest", "suggested",
    "suggesting", "suggests", "support", "suspect",
    "tested", "thought",
    "unclear", "undefined", "understand", "unknown",
    "whether",
]
37
def readWords(words):
    """
    Build the exact-match and stemmed lookup sets for a list of words.

    words -- either an iterable of words, or a string naming a text file
             that holds one word per line (surrounding whitespace is
             stripped, blank lines are skipped)

    Returns a tuple (wordSet, stemSet): the set of words and the set of
    their PorterStemmer.stem() results.
    """
    # types.StringTypes exists only on Python 2; fall back to str elsewhere.
    stringTypes = getattr(types, "StringTypes", (str,))
    if isinstance(words, stringTypes):
        # A string argument is the name of the word file. The file is
        # closed by the with-block even if reading fails.
        with open(words) as f:
            wordSet = set(line.strip() for line in f if line.strip())
    else:
        wordSet = set(words)
    stemSet = set(PorterStemmer.stem(word) for word in wordSet)
    return wordSet, stemSet
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
    """
    Prepare the id sets, the speculation word lists, the optional
    gazetteer and the style parameters of the builder.

    style             -- parameter definition handed to self.getParameters()
    classSet          -- IdSet of example classes; IdSet(1) is created when
                         omitted. Its id for "neg" must be 1.
    featureSet        -- IdSet of feature names; IdSet() is created when
                         omitted
    gazetteerFileName -- if given, a gazetteer is loaded from this file,
                         otherwise self.gazetteer is None
    """
    if classSet is None:
        classSet = IdSet(1)
    # The lookup is kept outside the assert so that it still runs when
    # asserts are stripped (python -O); presumably getId() also registers
    # the name in a fresh IdSet -- TODO confirm. Other code in this file
    # uses the literal 1 as the id of the negative class.
    negId = classSet.getId("neg")
    assert negId == 1
    if featureSet is None:
        featureSet = IdSet()

    self.specWords, self.specWordStems = readWords(speculationWords)

    ExampleBuilder.__init__(self, classSet, featureSet)

    if gazetteerFileName is not None:
        # NOTE(review): no import of Gazetteer is visible among this file's
        # imports; confirm the name is in scope before relying on this path.
        self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
        sys.stderr.write("Loaded gazetteer from " + str(gazetteerFileName) + "\n")
    else:
        self.gazetteer = None
    self._setDefaultParameters({"classification":"multiclass", "speculation_words":True}, {"classification":("multiclass", "speculation", "negation")})
    self.styles = self.getParameters(style)
87
89 """
90 If a single token belongs to multiple entities of different types,
91 a new, composite type is defined. This type is the alphabetically
92 ordered types of these entities joined with '---'.
93 """
94 types = set()
95 for entity in entities:
96 types.add(entity.get("type"))
97 types = list(types)
98 types.sort()
99 typeString = ""
100 for type in types:
101 if typeString != "":
102 typeString += "---"
103 typeString += type
104 return typeString
105
def getTokenFeatures(self, token, sentenceGraph):
    """
    Returns a dictionary (feature name -> 1) based on the attributes of a
    token. These can be used to define more complex features.

    token         -- token element; its "POS" attribute is read
    sentenceGraph -- provides getTokenText(), tokenIsName and
                     tokenIsEntityHead for the token

    The result is cached in self.tokenFeatures, so repeated calls for the
    same token return the same dictionary object.
    """
    if token in self.tokenFeatures:
        return self.tokenFeatures[token]

    tokTxt = sentenceGraph.getTokenText(token)
    features = {}
    features["_txt_" + tokTxt] = 1
    features["_POS_" + token.get("POS")] = 1

    if self.styles["speculation_words"]:
        if tokTxt in self.specWords:
            features["_spec"] = 1
            features["_spec_" + tokTxt] = 1
        # The stem is checked independently of the exact-text match, so
        # inflected forms of a listed word are still marked.
        tokStem = PorterStemmer.stem(tokTxt)
        if tokStem in self.specWordStems:
            features["_spec_stem"] = 1
            features["_spec_stem_" + tokStem] = 1

    if sentenceGraph.tokenIsName[token]:
        features["_isName"] = 1
        for entity in sentenceGraph.tokenIsEntityHead[token]:
            if entity.get("isName") == "True":
                features["_annType_" + entity.get("type")] = 1

    # self.gazetteer (set up in __init__) is not turned into features here:
    # the former loop over the token's gazetteer entry had an empty body.

    self.tokenFeatures[token] = features
    return features
138
def buildLinearOrderFeatures(self, sentenceGraph, index, tag, features):
    """
    Linear features are built by marking token features with a tag
    that defines their relative position in the linear order.

    sentenceGraph -- sentence whose tokens list is indexed
    index         -- position in sentenceGraph.tokens of the token to describe
    tag           -- string identifying the relative position; each feature
                     name becomes "linear_" + tag + <token feature name>
    features      -- dict (feature id -> value) that is updated in place
    """
    prefix = "linear_" + tag
    tokenFeatures = self.getTokenFeatures(sentenceGraph.tokens[index], sentenceGraph)
    # items() instead of iteritems(): works on both Python 2 and 3
    for name, weight in tokenFeatures.items():
        features[self.featureSet.getId(prefix + name)] = weight
147
149 """
150 Build one example for each token of the sentence
151 """
152 examples = []
153 exampleIndex = 0
154
155 self.tokenFeatures = {}
156
157 if goldGraph != None:
158 entityToGold = EvaluateInteractionXML.mapEntities(sentenceGraph.entities, goldGraph.entities)
159
160 namedEntityCount = 0
161 entityCount = 0
162 for entity in sentenceGraph.entities:
163 if entity.get("isName") == "True":
164 namedEntityCount += 1
165 else:
166 entityCount += 1
167 namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
168 entityCountFeature = "entityCount_" + str(entityCount)
169
170 bagOfWords = {}
171 for token in sentenceGraph.tokens:
172 text = "bow_" + token.get("text")
173 if not bagOfWords.has_key(text):
174 bagOfWords[text] = 0
175 bagOfWords[text] += 1
176 if sentenceGraph.tokenIsName[token]:
177 text = "ne_" + text
178 if not bagOfWords.has_key(text):
179 bagOfWords[text] = 0
180 bagOfWords[text] += 1
181 if len(sentenceGraph.tokenIsEntityHead) > 0:
182 text = "ge_" + text
183 if not bagOfWords.has_key(text):
184 bagOfWords[text] = 0
185 bagOfWords[text] += 1
186
187 text = token.get("text")
188 if self.styles["speculation_words"] and text in self.specWords:
189 if not bagOfWords.has_key("spec_bow_"+text):
190 bagOfWords["spec_bow_"+text] = 0
191 bagOfWords["spec_bow_"+text] += 1
192 bagOfWords["spec_sentence"] = 1
193
194 bowFeatures = {}
195 for k,v in bagOfWords.iteritems():
196 bowFeatures[self.featureSet.getId(k)] = v
197
198 self.inEdgesByToken = {}
199 self.outEdgesByToken = {}
200 self.edgeSetByToken = {}
201 for token in sentenceGraph.tokens:
202 inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
203 self.inEdgesByToken[token] = inEdges
204 outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
205 self.outEdgesByToken[token] = outEdges
206 self.edgeSetByToken[token] = set(inEdges + outEdges)
207
208 for entity in sentenceGraph.entities:
209
210 token = sentenceGraph.entityHeadTokenByEntity[entity]
211
212 if entity.get("isName") == "True":
213 continue
214
215
216 if self.styles["classification"] == "multiclass":
217 task3Type = "multiclass"
218 categoryName = ""
219 if entity.get("negation") == "True":
220 categoryName += "negation"
221 if entity.get("speculation") == "True":
222 if categoryName != "":
223 categoryName += "---"
224 categoryName += "speculation"
225 if categoryName == "":
226 categoryName = "neg"
227 category = self.classSet.getId(categoryName)
228 elif self.styles["classification"] == "speculation":
229 task3Type = "speculation"
230 if entity.get("speculation") == "True":
231 category = self.classSet.getId("speculation")
232 else:
233 category = 1
234 if goldGraph != None:
235 if len(entityToGold[entity]) > 0 and entityToGold[entity][0].get("speculation") == "True":
236 category = self.classSet.getId("speculation")
237 else:
238 category = 1
239 categoryName = self.classSet.getName(category)
240 elif self.styles["classification"] == "negation":
241 task3Type = "negation"
242 if entity.get("negation") == "True":
243 category = self.classSet.getId("negation")
244 else:
245 category = 1
246 if goldGraph != None:
247 if len(entityToGold[entity]) > 0 and entityToGold[entity][0].get("negation") == "True":
248 category = self.classSet.getId("negation")
249 else:
250 category = 1
251 categoryName = self.classSet.getName(category)
252 self.exampleStats.beginExample(categoryName)
253
254
255 features = {}
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276 features[self.featureSet.getId(namedEntityCountFeature)] = 1
277 features[self.featureSet.getId(entityCountFeature)] = 1
278
279
280
281 features.update(bowFeatures)
282
283
284
285
286
287
288
289
290
291 text = token.get("text")
292 features[self.featureSet.getId("txt_"+text)] = 1
293 features[self.featureSet.getId("POS_"+token.get("POS"))] = 1
294 stem = PorterStemmer.stem(text)
295 features[self.featureSet.getId("stem_"+stem)] = 1
296 features[self.featureSet.getId("nonstem_"+text[len(stem):])] = 1
297
298 if self.styles["speculation_words"]:
299 if text in self.specWords:
300 features[self.featureSet.getId("ent_spec")] = 1
301 if stem in self.specWordStems:
302 features[self.featureSet.getId("ent_spec_stem")] = 1
303
304
305 for i in range(len(sentenceGraph.tokens)):
306 if token == sentenceGraph.tokens[i]:
307 break
308 for index in [-3,-2,-1,1,2,3]:
309 if i + index > 0 and i + index < len(sentenceGraph.tokens):
310 self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)
311
312
313 if i > 0 and text[0].isalpha() and text[0].isupper():
314 features[self.featureSet.getId("upper_case_start")] = 1
315 for j in range(len(text)):
316 if j > 0 and text[j].isalpha() and text[j].isupper():
317 features[self.featureSet.getId("upper_case_middle")] = 1
318
319 if text[j].isdigit():
320 features[self.featureSet.getId("has_digits")] = 1
321 if j > 0 and text[j-1] == "-":
322 features[self.featureSet.getId("has_hyphenated_digit")] = 1
323 elif text[j] == "-":
324 features[self.featureSet.getId("has_hyphen")] = 1
325 elif text[j] == "/":
326 features[self.featureSet.getId("has_fslash")] = 1
327 elif text[j] == "\\":
328 features[self.featureSet.getId("has_bslash")] = 1
329
330 if j > 0:
331 features[self.featureSet.getId("dt_"+text[j-1:j+1].lower())] = 1
332
333 if j > 1:
334 features[self.featureSet.getId("tt_"+text[j-2:j+1].lower())] = 1
335
336
337 t1InEdges = self.inEdgesByToken[token]
338 for edge in t1InEdges:
339 edgeType = edge[2].get("type")
340 features[self.featureSet.getId("t1HIn_"+edgeType)] = 1
341 features[self.featureSet.getId("t1HIn_"+edge[0].get("POS"))] = 1
342 features[self.featureSet.getId("t1HIn_"+edgeType+"_"+edge[0].get("POS"))] = 1
343 tokenText = sentenceGraph.getTokenText(edge[0])
344 features[self.featureSet.getId("t1HIn_"+tokenText)] = 1
345 features[self.featureSet.getId("t1HIn_"+edgeType+"_"+tokenText)] = 1
346 t1OutEdges = self.outEdgesByToken[token]
347 for edge in t1OutEdges:
348 edgeType = edge[2].get("type")
349 features[self.featureSet.getId("t1HOut_"+edgeType)] = 1
350 features[self.featureSet.getId("t1HOut_"+edge[1].get("POS"))] = 1
351 features[self.featureSet.getId("t1HOut_"+edgeType+"_"+edge[1].get("POS"))] = 1
352 tokenText = sentenceGraph.getTokenText(edge[1])
353 features[self.featureSet.getId("t1HOut_"+tokenText)] = 1
354 features[self.featureSet.getId("t1HOut_"+edgeType+"_"+tokenText)] = 1
355
356 self.buildChains(token, sentenceGraph, features)
357
358 extra = {"xtype":"task3","t3type":task3Type,"t":token.get("id"),"entity":entity.get("id")}
359
360 example = (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)
361 ExampleUtils.appendExamples([example], outfile)
362 exampleIndex += 1
363 self.exampleStats.endExample()
364
365 return exampleIndex
366
def buildChains(self,token,sentenceGraph,features,depthLeft=3,chain="",visited=None):
    """
    Recursively add features for the dependency edges, and the tokens at
    their far ends, that can be reached from token in at most depthLeft
    steps.

    token         -- token whose in- and out-edges are expanded
    sentenceGraph -- passed through to self.getTokenFeatures()
    features      -- dict (feature id -> value) that is updated in place
    depthLeft     -- number of edges that may still be followed; 0 stops
    chain         -- "-frw_<type>" / "-rev_<type>" steps taken so far
    visited       -- set of edges that must not be followed again

    Reads self.inEdgesByToken, self.outEdgesByToken and
    self.edgeSetByToken for every token it reaches. Returns None.
    """
    if depthLeft == 0:
        return
    strDepthLeft = "dist_" + str(depthLeft)

    if visited is None:
        visited = set()

    inEdges = self.inEdgesByToken[token]
    outEdges = self.outEdgesByToken[token]
    # Deeper steps must not walk back over any edge touching this token.
    # Membership below is tested against `visited`, not `edgeSet`, so all
    # of this token's own unvisited edges are still expanded.
    edgeSet = visited.union(self.edgeSetByToken[token])
    getId = self.featureSet.getId

    # (edges, index of the far-end token in the edge tuple,
    #  prefix of the edge-type feature, chain step tag)
    # NOTE(review): the edge-type prefixes differ ("dep_" vs "dep_dist_")
    # although strDepthLeft already starts with "dist_". They are kept
    # as they are because feature names define the feature space of
    # existing models.
    directions = (
        (inEdges, 0, "dep_", "-frw_"),
        (outEdges, 1, "dep_dist_", "-rev_"),
    )
    for edges, farEnd, depPrefix, stepTag in directions:
        for edge in edges:
            if edge in visited:
                continue
            edgeType = edge[2].get("type")
            features[getId(depPrefix + strDepthLeft + edgeType)] = 1

            nextToken = edge[farEnd]
            for tokenFeature, w in self.getTokenFeatures(nextToken, sentenceGraph).items():
                features[getId(strDepthLeft + tokenFeature)] = w

            step = stepTag + edgeType
            features[getId("chain_dist_" + strDepthLeft + chain + step)] = 1
            self.buildChains(nextToken, sentenceGraph, features, depthLeft - 1, chain + step, edgeSet)
401