1 import sys
2 sys.path.append("..")
3 import Utils.Libraries.PorterStemmer as PorterStemmer
4 from Core.IdSet import IdSet
5 import Core.ExampleUtils as ExampleUtils
6
7 from FeatureBuilder import FeatureBuilder
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
def __init__(self, featureSet, style=None):
    """
    Initialise the shared FeatureBuilder state with the given
    feature set and style, then reset this builder's own options.
    """
    FeatureBuilder.__init__(self, featureSet, style)
    # Builder-specific options, all starting in their "off"/empty state.
    # useNonNameEntities gates the extra entity-type token features
    # produced when token features are computed.
    self.useNonNameEntities = False
    self.edgeTypesForFeatures = []
    self.noAnnType = False
29
31 """
32 If a single token belongs to multiple entities of different types,
33 a new, composite type is defined. This type is the alphabetically
34 ordered types of these entities joined with '---'.
35 """
36 types = set()
37 for entity in entities:
38 types.add(entity.get("type"))
39 types = list(types)
40 types.sort()
41 typeString = ""
42 for type in types:
43 if typeString != "":
44 typeString += "---"
45 typeString += type
46 return typeString
47
def getTokenFeatures(self, token, sentenceGraph):
    """
    Returns a list of features based on the attributes of a token.
    These can be used to define more complex features.

    The result is cached per token in self.tokenFeatures and
    self.tokenFeatureWeights, so a repeated call for the same token
    returns the objects built by the first call.

    Returns a pair (names, weights): names is the sorted list of
    feature name strings, weights is a dictionary mapping each of
    those names to its value (always 1 here).
    """
    # "in" instead of dict.has_key(): has_key() exists only in Python 2,
    # the membership operator works in both Python 2 and Python 3.
    if token in self.tokenFeatures:
        return self.tokenFeatures[token], self.tokenFeatureWeights[token]

    features = {}
    features["_txt_" + sentenceGraph.getTokenText(token)] = 1
    features["_POS_" + token.get("POS")] = 1
    if sentenceGraph.tokenIsName[token]:
        features["_isName"] = 1
        # Add the type of every entity headed by this token that is
        # itself marked as a name.
        for entity in sentenceGraph.tokenIsEntityHead[token]:
            if entity.get("isName") == "True":
                features["_annType_" + self.getEntityType(entity)] = 1

    if self.useNonNameEntities:
        # Optionally add the type of every entity headed by this token,
        # whether or not it is a name.
        for entity in sentenceGraph.tokenIsEntityHead[token]:
            features["_annType_" + self.getEntityType(entity)] = 1

    # Store the names sorted so that callers iterate them in a
    # deterministic order.
    self.tokenFeatures[token] = sorted(features)
    self.tokenFeatureWeights[token] = features
    return self.tokenFeatures[token], self.tokenFeatureWeights[token]
77
def buildLinearOrderFeatures(self, sentenceGraph, index, tag):
    """
    Re-emit the token features of the token at position index in
    sentenceGraph.tokens through self.setFeature, with every feature
    name prefixed by "linear_" + tag so that the name records the
    token's relative position in the linear order.
    """
    prefix = "linear_" + tag
    neighbour = sentenceGraph.tokens[index]
    names, weights = self.getTokenFeatures(neighbour, sentenceGraph)
    for name in names:
        self.setFeature(prefix + name, weights[name])
87
89 """
90 Build one example for each token of the sentence
91 """
92 self.sentenceGraph = sentenceGraph
93 self.tokenFeatures = {}
94 self.tokenFeatureWeights = {}
95
96
97 namedEntityCount = 0
98 for entity in sentenceGraph.entities:
99 if entity.get("isName") == "True":
100 namedEntityCount += 1
101 self.namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
102
103 bagOfWords = {}
104 for token in sentenceGraph.tokens:
105 text = "bow_" + token.get("text")
106 if not bagOfWords.has_key(text):
107 bagOfWords[text] = 0
108 bagOfWords[text] += 1
109 if sentenceGraph.tokenIsName[token]:
110 text = "ne_" + text
111 if not bagOfWords.has_key(text):
112 bagOfWords[text] = 0
113 bagOfWords[text] += 1
114 self.bowFeatures = {}
115 for k in sorted(bagOfWords.keys()):
116 self.bowFeatures[self.featureSet.getId(k)] = bagOfWords[k]
117
118 self.inEdgesByToken = {}
119 self.outEdgesByToken = {}
120 self.edgeSetByToken = {}
121 for token in sentenceGraph.tokens:
122 inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
123
124
125
126
127
128
129 self.inEdgesByToken[token] = inEdges
130
131 outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
132
133
134
135
136
137
138 self.outEdgesByToken[token] = outEdges
139 self.edgeSetByToken[token] = set(inEdges + outEdges)
140
142 sentenceGraph = self.sentenceGraph
143 tokenIndex = None
144 for i in range(len(self.sentenceGraph.tokens)):
145 if token == self.sentenceGraph.tokens[i]:
146 tokenIndex = i
147 break
148 assert tokenIndex != None
149 token = self.sentenceGraph.tokens[tokenIndex]
150
151
152 self.setFeature(self.namedEntityCountFeature, 1)
153
154
155
156
157
158
159
160
161
162
163
164 text = token.get("text")
165 self.setFeature("txt_"+text, 1)
166 self.setFeature("POS_"+token.get("POS"), 1)
167 stem = PorterStemmer.stem(text)
168 self.setFeature("stem_"+stem, 1)
169 self.setFeature("nonstem_"+text[len(stem):], 1)
170
171
172 if linear:
173 for index in [-3,-2,-1,1,2,3]:
174 if i + index > 0 and i + index < len(sentenceGraph.tokens):
175 self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index))
176
177
178 if i > 0 and text[0].isalpha() and text[0].isupper():
179 self.setFeature("upper_case_start", 1)
180 for j in range(len(text)):
181 if j > 0 and text[j].isalpha() and text[j].isupper():
182 self.setFeature("upper_case_middle", 1)
183
184 if text[j].isdigit():
185 self.setFeature("has_digits", 1)
186 if j > 0 and text[j-1] == "-":
187 self.setFeature("has_hyphenated_digit", 1)
188 elif text[j] == "-":
189 self.setFeature("has_hyphen", 1)
190 elif text[j] == "/":
191 self.setFeature("has_fslash", 1)
192 elif text[j] == "\\":
193 self.setFeature("has_bslash", 1)
194
195 if j > 0:
196 self.setFeature("dt_"+text[j-1:j+1].lower(), 1)
197
198 if j > 1:
199 self.setFeature("tt_"+text[j-2:j+1].lower(), 1)
200
201
202 if chains:
203 self.buildChains(token, sentenceGraph)
204
206
207 t1InEdges = self.inEdgesByToken[token]
208 for edge in t1InEdges:
209 edgeType = edge[2].get("type")
210 self.setFeature("t1HIn_"+edgeType, 1)
211 self.setFeature("t1HIn_"+edge[0].get("POS"), 1)
212 self.setFeature("t1HIn_"+edgeType+"_"+edge[0].get("POS"), 1)
213 tokenText = sentenceGraph.getTokenText(edge[0])
214 self.setFeature("t1HIn_"+tokenText, 1)
215 self.setFeature("t1HIn_"+edgeType+"_"+tokenText, 1)
216 t1OutEdges = self.outEdgesByToken[token]
217 for edge in t1OutEdges:
218 edgeType = edge[2].get("type")
219 self.setFeature("t1HOut_"+edgeType, 1)
220 self.setFeature("t1HOut_"+edge[1].get("POS"), 1)
221 self.setFeature("t1HOut_"+edgeType+"_"+edge[1].get("POS"), 1)
222 tokenText = sentenceGraph.getTokenText(edge[1])
223 self.setFeature("t1HOut_"+tokenText, 1)
224 self.setFeature("t1HOut_"+edgeType+"_"+tokenText, 1)
225
def buildChains(self, token, sentenceGraph, depthLeft=3, chain="", visited=None):
    """
    Recursively emit features describing the dependency edges and
    tokens reachable from token within depthLeft steps.

    For every not-yet-visited edge attached to token three kinds of
    features are set through self.setFeature: the edge type at this
    depth, the token features of the token at the other end of the
    edge (prefixed with the depth), and the chain of edge types
    followed so far. The walk then continues from that other token
    with depthLeft decreased by one.

    token         -- token whose attached edges are expanded
    sentenceGraph -- passed through to self.getTokenFeatures
    depthLeft     -- number of edges that may still be followed;
                     zero or a negative value does nothing
    chain         -- edge-type chain accumulated by the outer calls
    visited       -- set of edges that must not be followed again;
                     None starts with an empty set

    Relies on self.inEdgesByToken, self.outEdgesByToken and
    self.edgeSetByToken holding an entry for every token reached.
    """
    # "<= 0" rather than "== 0" so that a negative depth stops here
    # instead of running the traversal with a depth that never reaches 0.
    if depthLeft <= 0:
        return
    strDepthLeft = "dist_" + str(depthLeft)

    if visited is None:
        visited = set()

    # Deeper levels must skip everything already seen plus all edges
    # attached to this token. The current level still tests against
    # the unmodified "visited" so that it expands its own edges.
    edgeSet = visited.union(self.edgeSetByToken[token])

    # (edges, index of the neighbouring token in the edge tuple,
    #  prefix of the edge-type feature, direction tag in the chain).
    # In-edges are followed to edge[0], out-edges to edge[1].
    # NOTE: the two edge-type prefixes differ ("dep_" vs "dep_dist_");
    # they are kept exactly as they were because feature names are
    # identifiers that existing feature sets presumably depend on.
    directions = (
        (self.inEdgesByToken[token], 0, "dep_", "-frw_"),
        (self.outEdgesByToken[token], 1, "dep_dist_", "-rev_"),
    )
    for edges, neighbourPos, depPrefix, directionTag in directions:
        for edge in edges:
            if edge in visited:
                continue
            edgeType = edge[2].get("type")
            self.setFeature(depPrefix + strDepthLeft + edgeType, 1)

            nextToken = edge[neighbourPos]
            tokenFeatures, tokenWeights = self.getTokenFeatures(nextToken, sentenceGraph)
            for tokenFeature in tokenFeatures:
                self.setFeature(strDepthLeft + tokenFeature, tokenWeights[tokenFeature])

            nextChain = chain + directionTag + edgeType
            self.setFeature("chain_dist_" + strDepthLeft + nextChain, 1)
            self.buildChains(nextToken, sentenceGraph, depthLeft - 1, nextChain, edgeSet)
276