"""
Base class for FeatureBuilders
"""
__version__ = "$Revision: 1.14 $"
5
7 """
8 Multiple example builders might make use of the same features. A feature builder object can be used in
9 different example builders that require the same feature set.
10 """
def __init__(self, featureSet, style=None):
    """
    Create a feature builder with no feature vector and no entity pair selected.

    @type featureSet: IdSet
    @param featureSet: feature ids
    @param style: optional settings, stored as-is; when it is not None,
                  getEntityType looks up its "maskTypeAsProtein" entry
    """
    self.featureSet = featureSet
    self.features = None
    self.entity1 = None
    self.entity2 = None
    self.noAnnType = False
    self.filterAnnTypes = set()
    self.ontologyFeatureBuilder = None
    self.maximum = False
    self.style = style

    self.maskNamedEntities = True
    self.tag = ""
    # getTokenFeatures reads this cache; create it here so that the method
    # also works before a feature vector has been assigned.
    self.tokenFeatures = {}
28
31
# NOTE(review): the `def` line of this method is missing here, so its name and
# any parameter defaults cannot be confirmed. The body reads `features`,
# `entity1` and `entity2`, presumably parameters of the missing signature.
33 """
34 When the feature builder builds features, they are put to this feature vector.
35
36 @type features: dictionary
37 @param features: a reference to the feature vector
38 @type entity1: cElementTree.Element
39 @param entity1: an entity used by trigger or edge feature builders
40 @type entity2: cElementTree.Element
41 @param entity2: an entity used by trigger or edge feature builders
42 """
# Remember the target feature vector and the entity pair on the builder.
43 self.features = features
44 self.entity1 = entity1
45 self.entity2 = entity2
# Start a fresh token feature cache (read and filled by getTokenFeatures).
46 self.tokenFeatures = {}
47
# NOTE(review): the `def` line of this method is missing here, so its name and
# any parameter defaults cannot be confirmed. The body reads `name` and
# `value`, presumably parameters of the missing signature.
49 """
50 Add a feature to the feature vector. If the feature already exists, its current
51 value is replaced with the new value. All features are prefixed with FeatureBuilder.tag.
52
53 @type name: str
54 @type value: float
55 """
# The feature id is looked up in featureSet under the tag-prefixed name;
# assigning by that id overwrites any value already stored for it.
56 self.features[self.featureSet.getId(self.tag+name)] = value
57
# NOTE(review): the `def` line of this method is missing here, so its name
# cannot be confirmed. The body uses only `self`.
59 """
60 Some machine learning tasks require feature values to be normalized to range [0,1]. The range is
61 defined as the difference of the largest and smallest feature value in the current feature vector.
62 If this method is used, it should be called as the last step after generating all features.
63 """
64
# NOTE(review): the docstring above describes scaling by the range (largest
# minus smallest value), but the code below divides every value by the sum of
# the absolute values, using a divisor of 1.0 when that sum is zero.
65 total = 0.0
66 for v in self.features.values(): total += abs(v)
67 if total == 0.0:
68 total = 1.0
# iteritems() is the Python 2 dictionary API; only existing keys are reassigned.
69 for k,v in self.features.iteritems():
70 self.features[k] = float(v) / total
71
def getTokenFeatures(self, token, sentenceGraph, text=True, POS=True, annotatedType=True, stem=False, ontology=True):
    """
    Token features are features describing an isolated word token. These subfeatures are often merged into
    such features like n-grams. This method produces and caches a set of feature names for a token in
    the sentenceGraph sentence. The various flags can be used to choose which attributes will be included in the
    feature name list.

    The result is cached in self.tokenFeatures under a key built from the token id and the flag values,
    and the cached list object itself is returned on later calls with the same key.

    @type token: cElementTree.Element
    @param token: a word token
    @type sentenceGraph: SentenceGraph
    @param sentenceGraph: the sentence to which the token belongs
    @type text: boolean
    @param text: add a "txt_" feature for the token text
    @type POS: boolean
    @param POS: add "POS_" features for the token's POS attribute
    @type annotatedType: boolean
    @param annotatedType: add "annType_" features (ignored when self.noAnnType is set)
    @type stem: boolean
    @param stem: add a "stem_" feature produced with PorterStemmer
    @type ontology: boolean
    @param ontology: add the ontology parents of each annotated type when an ontologyFeatureBuilder is set
    @rtype: list
    @return: the feature names generated for the token
    """
    callId = token.get("id") + str(text) + str(POS) + str(annotatedType) + str(stem) + str(ontology)
    if callId in self.tokenFeatures:
        return self.tokenFeatures[callId]

    featureList = []
    if text:
        featureList.append("txt_" + sentenceGraph.getTokenText(token))
        # When names are not masked, also add the token's own text attribute.
        if (not self.maskNamedEntities) and sentenceGraph.tokenIsName[token]:
            featureList.append("txt_" + token.get("text"))
    if POS:
        pos = token.get("POS")
        # str.find never returns None, so the former "find(...) != None" test was
        # always true and a tag without "_" was added twice when self.maximum
        # was set. Only split tags that really are compound.
        if "_" in pos and self.maximum:
            for split in pos.split("_"):
                featureList.append("POS_" + split)
        featureList.append("POS_" + pos)
    if annotatedType and not self.noAnnType:
        annTypes = self.getTokenAnnotatedType(token, sentenceGraph)
        # The placeholder type is kept as a feature only in "maximum" mode.
        if "noAnnType" in annTypes and not self.maximum:
            annTypes.remove("noAnnType")
        for annType in annTypes:
            featureList.append("annType_" + annType)
        if ontology and (self.ontologyFeatureBuilder is not None):
            for annType in annTypes:
                featureList.extend(self.ontologyFeatureBuilder.getParents(annType))
    if stem:
        featureList.append("stem_" + PorterStemmer.stem(sentenceGraph.getTokenText(token)))

    self.tokenFeatures[callId] = featureList
    return featureList
120
# NOTE(review): the signature is restored from the call
# self.getEntityType(entity) in getTokenAnnotatedType; confirm against upstream.
def getEntityType(self, entity):
    """
    Return the type of an entity, optionally masked as "Protein".

    The entity's "type" attribute is returned unchanged unless self.style is set,
    has a non-empty "maskTypeAsProtein" entry, and that entry contains the type;
    in that case "Protein" is returned instead.

    @param entity: an object whose get("type") gives the entity type
    @return: "Protein" for masked types, otherwise the value of the "type" attribute
    """
    eType = entity.get("type")
    maskedTypes = None
    if self.style is not None and "maskTypeAsProtein" in self.style:
        maskedTypes = self.style["maskTypeAsProtein"]
    if maskedTypes and eType in maskedTypes:
        return "Protein"
    return eType
127
# NOTE(review): the signature is restored from the call
# self.getTokenAnnotatedType(token, sentenceGraph) in getTokenFeatures, and the
# nesting from the if/elif/else sequence; confirm against upstream.
def getTokenAnnotatedType(self, token, sentenceGraph):
    """
    Multiple entities may have the same head token. This returns a list of the types of all entities whose
    head token this token is. If the FeatureBuilder.maximum flag is set, the list is truncated to a length of
    two, otherwise to a length of one. This is done because when token features (to which the annotated type
    belongs to) are combined into other features, a large number of annotated type features can lead to an
    exponential increase in the number of features.

    Types that are None or listed in self.filterAnnTypes are skipped. When an entity pair is set and
    the maximum flag is off, the type of the first head entity equal to entity1 or entity2 is returned
    immediately as a one-element list. With the maximum flag on, such an entity also contributes an
    "e1_" or "e2_" prefixed type. The collected types are sorted before truncation.

    @param token: the token whose head entities are inspected
    @param sentenceGraph: object whose tokenIsEntityHead maps a token to its head entities
    @rtype: list
    @return: the selected types, or ["noAnnType"] when the token heads no entity or self.noAnnType is set
    """
    headEntities = sentenceGraph.tokenIsEntityHead[token]
    if len(headEntities) == 0 or self.noAnnType:
        return ["noAnnType"]

    noEntityPair = self.entity1 is None and self.entity2 is None
    annTypes = set()
    for entity in headEntities:
        eType = self.getEntityType(entity)
        if eType is None or eType in annTypes or eType in self.filterAnnTypes:
            continue
        if noEntityPair:
            annTypes.add(eType)
            continue
        if self.maximum:
            annTypes.add(eType)
        if self.entity1 == entity:
            if not self.maximum:
                return [eType]
            annTypes.add("e1_" + eType)
        elif self.entity2 == entity:
            if not self.maximum:
                return [eType]
            annTypes.add("e2_" + eType)
        else:
            annTypes.add(eType)

    # Sorting makes the truncation independent of set iteration order.
    limit = 2 if self.maximum else 1
    return sorted(annTypes)[:limit]
166
170
# Maps a POS tag to the name of its super type. Tags mapped to an empty string
# presumably have no super type; punctuation tags share "PUNCT".
posSuperTypes = {
    "CC": "",
    "CD": "",
    "DT": "",
    "EX": "",
    "FW": "",
    "IN": "",
    "JJ": "JJX",
    "JJR": "JJX",
    "JJS": "JJX",
    "LS": "",
    "MD": "",
    "NN": "NNX",
    "NNS": "NNX",
    "NNP": "NNX",
    "NNPS": "NNX",
    "PDT": "",
    "POS": "",
    "PRP": "PRPX",
    "PRP$": "PRPX",
    "RB": "RBX",
    "RBR": "RBX",
    "RBS": "RBX",
    "RP": "",
    "SYM": "",
    "TO": "",
    "UH": "",
    "VB": "VBX",
    "VBD": "VBX",
    "VBG": "VBX",
    "VBN": "VBX",
    "VBP": "VBX",
    "VBZ": "VBX",
    "WDT": "WX",
    "WP": "WX",
    "WP$": "WX",
    "WRB": "WX",

    ".": "PUNCT",
    ",": "PUNCT",
    ":": "PUNCT",
    ";": "PUNCT",
    "(": "PUNCT",
    ")": "PUNCT",
    # NOTE(review): this key was written as three consecutive double quotes,
    # which does not parse. "&quot;" (the XML-escaped double quote) is the
    # presumed original key - confirm against upstream.
    "&quot;": "PUNCT",
    "\"": "PUNCT",
}
217