1 """
2 Shortest path features
3 """
4 __version__ = "$Revision: 1.30 $"
5
6 from FeatureBuilder import FeatureBuilder
7 import Utils.Libraries.PorterStemmer as PorterStemmer
8
9 import Utils.Libraries.combine as combine
10
12 """
13 This feature builder generates features describing a pair of word tokens connected by one or more
14 dependencies. Most of the features it produces are built on the shortest undirected path of
15 dependencies between the two tokens.
16 """
def __init__(self, featureSet, style=None):
    """
    Initialize the builder with a feature id set.

    @type featureSet: IdSet
    @param featureSet: feature ids
    """
    FeatureBuilder.__init__(self, featureSet, style=style)
    # Optional helper builder for ontology-based features; attached externally.
    self.ontologyFeatureBuilder = None
    # When True, annotated-type features are presumably suppressed -- TODO confirm.
    self.noAnnType = False
    # [min, max] of observed prediction confidences, filled in later
    # (see definePredictedValueRange); None until defined.
    self.predictedRange = None
27
# NOTE(review): the enclosing "def" line(s) for this fragment are missing from
# this copy of the file. The bare return below and the normalization chain
# after it may belong to two separate edge-type accessors, or to one method
# with an early return -- confirm against the original revision before
# restoring the header(s). The code reads like it takes a dependency element
# ("edge") and returns its (possibly normalized) type string.
return edge.get("type")

# Collapse fine-grained dependency labels into coarse classes
# (subj / obj / prep / nn) so equivalent grammatical relations share features;
# unrecognized labels pass through unchanged.
eType = edge.get("type")
if eType == "subj" or eType.startswith("nsubj") or eType.startswith("csubj"):
    return "subj"
elif eType in ["obj", "dobj", "iobj", "pobj"]:
    return "obj"
elif eType == "agent" or eType == "prepc" or eType.startswith("prep_"):
    return "prep"
elif eType == "appos":
    # appositions are treated like noun-noun modifiers
    return "nn"
else:
    return eType
43
# NOTE(review): missing "def" header in this copy; appears to take
# (sentences, elementName). Scans all elements named elementName in the given
# sentences and records the global [min, max] of their prediction confidence
# scores in self.predictedRange, which buildPredictedValueFeatures later uses
# for scaling scores into [0, 1].
self.predictedRange = [None,None]
for sentence in sentences:
    targetElements = sentence.findall(elementName)
    for element in targetElements:
        predictions = element.get("predictions")
        if predictions != None and predictions != "":
            # "predictions" is a comma-separated list of "class:score" pairs.
            predictions = predictions.split(",")
            for p in predictions:
                splits = p.split(":")
                value = float(splits[1])  # the score part
                if self.predictedRange[0] == None or self.predictedRange[0] > value:
                    self.predictedRange[0] = value
                if self.predictedRange[1] == None or self.predictedRange[1] < value:
                    self.predictedRange[1] = value
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
def setFeatureVector(self, features=None, entity1=None, entity2=None, resetCache=True):
    """
    When the feature builder builds features, they are put to this feature vector.

    @type features: dictionary
    @param features: a reference to the feature vector
    @type entity1: cElementTree.Element
    @param entity1: an entity used by trigger or edge feature builders
    @type entity2: cElementTree.Element
    @param entity2: an entity used by trigger or edge feature builders
    @type resetCache: boolean
    @param resetCache: Some intermediate features are cached to speed up example
    generation. This cache should be cleared when moving to another example.
    """
    self.entity1 = entity1
    self.entity2 = entity2
    self.features = features
    # Keep an attached ontology feature builder writing into the same vector.
    # (idiom fix: identity comparison with None instead of "!= None")
    if self.ontologyFeatureBuilder is not None:
        self.ontologyFeatureBuilder.setFeatureVector(features)
    if resetCache:
        # Per-example caches; must not leak state between examples.
        self.tokenFeatures = {}
        self.edgeCache = {}
        self.depPathCache = {}
103
# NOTE(review): missing "def" header in this copy; appears to take
# (element, tag) where "tag" prefixes the generated feature names
# (callers below pass "e1" / "e2").
"""
Edge examples are usually predicted on top of predicted entities. The entities' confidence scores
can be used as features for edge detection. For these features to be used, the model must also have
been trained on data that contains prediction confidence scores.
"""
predictions = element.get("predictions")
if predictions != None and predictions != "":
    # "predictions" is a comma-separated list of "class:score" pairs.
    predictions = predictions.split(",")
    for p in predictions:
        splits = p.split(":")
        if self.predictedRange[0] == None or self.predictedRange[1] == None:
            # No observed score range; fall back to full strength.
            value = 1.0
        else:
            # Scale the raw score linearly into [0, 1] over the range
            # recorded by definePredictedValueRange.
            value = float(splits[1])
            value -= self.predictedRange[0]
            value /= (self.predictedRange[1] - self.predictedRange[0])
            assert(value >= 0 and value <= 1)
        self.setFeature(tag + "_strength_"+splits[0], value)
else:
    # No prediction scores (e.g. gold entities): full strength for the type.
    self.setFeature(tag + "_strength_" + str(element.get("type")), 1.0)
127
# NOTE(review): missing "def" header in this copy; appears to take
# (sentenceGraph) and operate on self.entity1/self.entity2 as set by
# setFeatureVector.
"""
Build features for the two entities of the current example. These features are labeled as "e1" or "e2",
so entity order is meaningful.
"""
for token in sentenceGraph.tokens:
    if token not in sentenceGraph.entitiesByToken:
        continue
    entities = sentenceGraph.entitiesByToken[token]
    if self.entity1 in entities:
        tokenFeatures = self.getTokenFeatures(token, sentenceGraph)
        for feature in tokenFeatures:
            self.setFeature("e1_"+feature, 1)
    if self.entity2 in entities:
        tokenFeatures = self.getTokenFeatures(token, sentenceGraph)
        for feature in tokenFeatures:
            self.setFeature("e2_"+feature, 1)
if self.entity1 != None and self.entity2 != None:
    # Build a combined indicator such as "e1_Entity_e2_InteractionWord".
    # The e1 parts carry a trailing underscore on purpose, so the two
    # halves concatenate cleanly into one feature name.
    entityCombination = ""
    if self.entity1.get("isName") != None:
        if self.entity1.get("isName") == "True":
            entityCombination += "e1_Entity_"
        else:
            entityCombination += "e1_InteractionWord_"
            # Confidence-strength features only for predicted (non-name)
            # entities, and only when a score range has been defined.
            if self.predictedRange != None:
                self.buildPredictedValueFeatures(self.entity1, "e1")
    else:
        entityCombination += "e1_Entity_"
    if self.entity2.get("isName") != None:
        if self.entity2.get("isName") == "True":
            entityCombination += "e2_Entity"
        else:
            entityCombination += "e2_InteractionWord"
            if self.predictedRange != None:
                self.buildPredictedValueFeatures(self.entity2, "e2")
    else:
        entityCombination += "e2_Entity"
    self.setFeature(entityCombination, 1)
    self.setFeature("eTypes_"+self.getEntityType(self.entity1)+"_"+self.getEntityType(self.entity2), 1)
    # Both entities sharing the same head token is a degenerate "self loop".
    if sentenceGraph.entityHeadTokenByEntity[self.entity1] == sentenceGraph.entityHeadTokenByEntity[self.entity2]:
        self.setFeature("selfLoop", 1)
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
# NOTE(review): missing "def" header in this copy; appears to take (pathTokens).
"""
Simple numeric features about the length of the path
"""
self.setFeature("len_tokens_"+str(len(pathTokens)), 1)  # binary indicator per exact length
self.setFeature("len", len(pathTokens))  # raw numeric length
302
# NOTE(review): missing "def" header in this copy; appears to take
# (sentenceGraph). Counts the annotated entity types over all tokens of the
# sentence and emits one count feature per type.
# NOTE(review): dict.has_key is Python 2 only; under Python 3 this must become
# "if text not in textCounts" (left unchanged here).
textCounts = {}
for token in sentenceGraph.tokens:
    texts = self.getTokenAnnotatedType(token, sentenceGraph)
    for text in texts:
        if not textCounts.has_key(text):
            textCounts[text] = 0
        textCounts[text] += 1
# Sorted iteration keeps feature generation order deterministic.
for key in sorted(textCounts.keys()):
    self.setFeature("count_"+key, textCounts[key])
315
# NOTE(review): missing "def" header in this copy; appears to take
# (pathTokens, sentenceGraph).
"""
Token features for the first and last tokens of the path
"""
for feature in self.getTokenFeatures(pathTokens[0], sentenceGraph):
    self.setFeature("tokTerm1_"+feature, 1)
for feature in self.getTokenFeatures(pathTokens[-1], sentenceGraph):
    self.setFeature("tokTerm2_"+feature, 1)
324
325
326
327
328
329
331
332
# NOTE(review): missing "def" header in this copy (called below as
# buildWalkPaths(pathTokens, walks, sentenceGraph); "walks" is not used in the
# visible lines). Concatenates the annotated types of all path tokens except
# the last into one "tokenPath..." indicator feature.
internalTypes = ""
for token in pathTokens[0:-1]:
    annTypes = self.getTokenAnnotatedType(token, sentenceGraph)
    for annType in annTypes:
        internalTypes += "_" + annType
    internalTypes += "__"  # token boundary marker inside the feature name
self.setFeature("tokenPath"+internalTypes, 1)
340
341
342
343
344
345
346
# NOTE(review): missing "def" header in this copy; appears to take
# (length, pathTokens, sentenceGraph).
"""
Goes through all the possible walks and builds features for subsections
of "length" edges.
"""
# Annotated types of the two terminus tokens, combined with each edge n-gram
# at the end of the inner loop.
t1 = self.getTokenAnnotatedType(pathTokens[0], sentenceGraph)
t2 = self.getTokenAnnotatedType(pathTokens[-1], sentenceGraph)
# All directed walks realizing the undirected token path.
walks = sentenceGraph.dependencyGraph.getWalks(pathTokens)
self.buildWalkPaths(pathTokens, walks, sentenceGraph)
# dirGrams[j] accumulates one direction letter per path step of walk j:
# "F" when the edge points forward along the path, "R" when it points back.
dirGrams = []
for walk in walks:
    dirGrams.append("")
for i in range(len(pathTokens)-1):
    for j in range(len(walks)):
        if walks[j][i][0] == pathTokens[i]:
            dirGrams[j] += "F"
        else:
            assert walks[j][i][1] == pathTokens[i]
            dirGrams[j] += "R"
        if i >= length-1:
            # The most recent "length" direction letters name this n-gram.
            styleGram = dirGrams[j][i-(length-1):i+1]
            edgeGram = "depGram_" + styleGram
            # Token features for the interior tokens of the n-gram window.
            # NOTE(review): self.maximum is not set anywhere in the visible
            # code -- presumably a style flag set elsewhere; confirm.
            for token in pathTokens[i-(length-1)+1:i+1]:
                for feature in self.getTokenFeatures(token, sentenceGraph, annotatedType=(self.maximum == True)):
                    self.setFeature("tok_"+styleGram+feature, 1)
            # Positional and concatenated edge-type features for the window.
            position = 0
            tokenTypeGram = ""  # NOTE(review): assigned but never used
            for edge in walks[j][i-(length-1):i+1]:
                self.setFeature("dep_"+styleGram+str(position)+"_"+self.getEdgeType(edge[2]), 1)
                position += 1
                edgeGram += "_" + self.getEdgeType(edge[2])
            self.setFeature(edgeGram, 1)
            # Edge n-gram bracketed by the terminus annotated types.
            for type1 in t1:
                for type2 in t2:
                    self.setFeature(type1+"_"+edgeGram+"_"+type2, 1)
# One feature for the full direction string of each walk.
for dirGram in dirGrams:
    self.setFeature("edge_directions_"+dirGram, 1)
394
def addType(self, token, sentenceGraph, prefix="annType_"):
    # NOTE(review): the body of this method is missing from this copy of the
    # file (only blank lines remain); restore it from the original revision.
    # From the call sites below it emits annotated-type features for "token"
    # under the given feature-name prefix.
# NOTE(review): missing "def" header in this copy; appears to take
# (pathTokens, sentenceGraph) and build features for every dependency edge
# along the shortest path.
edgeList = []
depGraph = sentenceGraph.dependencyGraph
pt = pathTokens
# Collect edges in both directions between each pair of consecutive tokens.
for i in range(1, len(pathTokens)):
    edgeList.extend(depGraph.getEdges(pt[i], pt[i-1]))
    edgeList.extend(depGraph.getEdges(pt[i-1], pt[i]))
for edge in edgeList:
    depType = self.getEdgeType(edge[2])
    self.setFeature("dep_"+depType, 1)
    # Governor token (edge[0]) surface/POS/annotated-type features.
    self.setFeature("txt_"+sentenceGraph.getTokenText(edge[0]), 1)
    self.setFeature("POS_"+edge[0].get("POS"), 1)
    self.addType(edge[0], sentenceGraph, prefix="annType_")
    # Dependent token (edge[1]) surface/POS/annotated-type features.
    self.setFeature("txt_"+sentenceGraph.getTokenText(edge[1]), 1)
    self.setFeature("POS_"+edge[1].get("POS"), 1)
    self.addType(edge[1], sentenceGraph, prefix="annType_")
    # Governor/dependent pair features.
    gText = sentenceGraph.getTokenText(edge[0])
    dText = sentenceGraph.getTokenText(edge[1])
    gPOS = edge[0].get("POS")
    dPOS = edge[1].get("POS")
    # NOTE(review): the scalar gAT/dAT defaults below are immediately shadowed
    # by the loop variables; the loops iterate gATs/dATs, which are only bound
    # when the respective token is an entity head -- for non-entity tokens
    # this looks like a NameError (or stale values from a previous iteration).
    # Confirm intended structure against the original revision.
    gAT = "noAnnType"
    dAT = "noAnnType"
    if sentenceGraph.tokenIsEntityHead[edge[0]] != None:
        gATs = self.getTokenAnnotatedType(edge[0], sentenceGraph)
    if sentenceGraph.tokenIsEntityHead[edge[1]] != None:
        dATs = self.getTokenAnnotatedType(edge[1], sentenceGraph)
    self.setFeature("gov_"+gText+"_"+dText, 1)
    self.setFeature("gov_"+gPOS+"_"+dPOS, 1)
    for gAT in gATs:
        for dAT in dATs:
            self.setFeature("gov_"+gAT+"_"+dAT, 1)
    # NOTE(review): dAT here is whatever the previous nested loop left bound.
    for gAT in gATs:
        self.setFeature("triple_"+gAT+"_"+depType+"_"+dAT, 1)
453
# NOTE(review): missing "def" header in this copy; appears to take
# (pathTokens, sentenceGraph).
depGraph = sentenceGraph.dependencyGraph
pt = pathTokens
# Direction-sensitive single-dependency features along the path.
for i in range(1,len(pathTokens)):
    for edge in depGraph.getEdges(pt[i], pt[i-1]):
        depType = self.getEdgeType(edge[2])
        # NOTE(review): naming is asymmetric with the reverse case below
        # ("dep_<type>Forward_" vs "dep_Reverse_<type>"); presumably
        # historical -- changing it would invalidate trained models.
        self.setFeature("dep_"+depType+"Forward_", 1)
    for edge in depGraph.getEdges(pt[i-1], pt[i]):
        depType = self.getEdgeType(edge[2])
        self.setFeature("dep_Reverse_"+depType, 1)
# Features for tokens strictly inside the path (both termini excluded).
for i in range(1,len(pathTokens)-1):
    self.setFeature("internalPOS_"+pathTokens[i].get("POS"), 1)
    self.setFeature("internalTxt_"+sentenceGraph.getTokenText(pathTokens[i]), 1)
# Dependencies fully internal to the path (not touching either terminus);
# direction is ignored here.
for i in range(2,len(pathTokens)-1):
    for edge in depGraph.getEdges(pt[i], pt[i-1]):
        self.setFeature("internalDep_"+self.getEdgeType(edge[2]), 1)
    for edge in depGraph.getEdges(pt[i-1], pt[i]):
        self.setFeature("internalDep_"+self.getEdgeType(edge[2]), 1)
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
537
538
# NOTE(review): missing "def" header in this copy; appears to take
# (token, ignoreEdges, prefix, sentenceGraph) and describe dependencies that
# "hang off" a terminus token without being part of the path itself
# (ignoreEdges presumably holds the path's own edges).
inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
for edge in inEdges:
    if edge in ignoreEdges:  # skip edges already covered by the path
        continue
    self.setFeature(prefix+"HangingIn_"+self.getEdgeType(edge[2]), 1)
    for feature in self.getTokenFeatures(edge[0], sentenceGraph):
        self.setFeature(prefix+"HangingIn_"+feature, 1)
outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
for edge in outEdges:
    if edge in ignoreEdges:
        continue
    self.setFeature(prefix+"HangingOut_"+self.getEdgeType(edge[2]), 1)
    for feature in self.getTokenFeatures(edge[1], sentenceGraph):
        self.setFeature(prefix+"HangingOut_"+feature, 1)
554