Package TEES :: Package ExampleBuilders :: Package FeatureBuilders :: Module FeatureBuilder
[hide private]

Source Code for Module TEES.ExampleBuilders.FeatureBuilders.FeatureBuilder

  1  """ 
  2  Base class for FeatureBuilders 
  3  """ 
  4  __version__ = "$Revision: 1.14 $" 
  5   
class FeatureBuilder:
    """
    Multiple example builders might make use of the same features. A feature builder object can be used in
    different example builders that require the same feature set.
    """
    def __init__(self, featureSet, style=None):
        """
        @type featureSet: IdSet
        @param featureSet: feature ids
        @param style: optional mapping of style options; only the "maskTypeAsProtein"
                      key is read by this class (see getEntityType)
        """
        self.featureSet = featureSet # feature ids
        self.features = None # current feature vector
        self.entity1 = None # an entity node for which features are built
        self.entity2 = None # another entity node for pairwise examples such as edges
        self.noAnnType = False # do not use annotated entity types for building features
        self.filterAnnTypes = set() # ignore these entity types
        self.ontologyFeatureBuilder = None
        self.maximum = False # produce maximum number of features
        self.style = style

        self.maskNamedEntities = True # named entity text strings are replaced with NAMED_ENT
        self.tag = "" # a prefix that is added to each feature name
        # Per-feature-vector cache used by getTokenFeatures. Created here so that
        # getTokenFeatures also works before the first setFeatureVector call.
        self.tokenFeatures = {}

    def setTag(self, tag=""):
        """
        Set the prefix that setFeature adds to every feature name.

        @type tag: str
        """
        self.tag = tag

    def setFeatureVector(self, features, entity1=None, entity2=None):
        """
        When the feature builder builds features, they are put to this feature vector.
        Calling this method also clears the token feature cache.

        @type features: dictionary
        @param features: a reference to the feature vector
        @type entity1: cElementTree.Element
        @param entity1: an entity used by trigger or edge feature builders
        @type entity2: cElementTree.Element
        @param entity2: an entity used by trigger or edge feature builders
        """
        self.features = features
        self.entity1 = entity1
        self.entity2 = entity2
        # Cached token features depend on entity1/entity2, so they are dropped here.
        self.tokenFeatures = {}

    def setFeature(self, name, value=1):
        """
        Add a feature to the feature vector. If the feature already exists, its current
        value is replaced with the new value. All features are prefixed with FeatureBuilder.tag.

        @type name: str
        @type value: float
        """
        self.features[self.featureSet.getId(self.tag + name)] = value

    def normalizeFeatureVector(self):
        """
        Scale the current feature vector in place so that the absolute values of its
        features sum to one: every value is divided by the sum of the absolute values
        of all features. A vector whose values are all zero is left as zeros (the
        divisor falls back to 1.0). All values are converted to float.

        If this method is used, it should be called as the last step after generating all features.
        """
        total = sum((abs(value) for value in self.features.values()), 0.0)
        if total == 0.0:
            total = 1.0 # avoid division by zero for an all-zero vector
        # Iterate over a snapshot of the keys; only existing values are reassigned.
        for key in list(self.features):
            self.features[key] = float(self.features[key]) / total

    def getTokenFeatures(self, token, sentenceGraph, text=True, POS=True, annotatedType=True, stem=False, ontology=True):
        """
        Token features are features describing an isolated word token. These subfeatures are often merged into
        such features like n-grams. This method produces and caches a set of feature names for a token in
        the sentenceGraph sentence. The various flags can be used to choose which attributes will be included in the
        feature name list.

        The returned list is the cached object itself, so callers should not modify it.
        Ontology features are derived from the annotated types and are therefore only
        produced when annotated type features are enabled as well.

        @type token: cElementTree.Element
        @param token: a word token
        @type sentenceGraph: SentenceGraph
        @param sentenceGraph: the sentence to which the token belongs
        @type text: boolean
        @type POS: boolean
        @type annotatedType: boolean
        @type stem: boolean
        @type ontology: boolean
        @rtype: list of str
        """
        # The cache key combines the token id with every flag that affects the result.
        callId = token.get("id") + str(text) + str(POS) + str(annotatedType) + str(stem) + str(ontology)
        if callId in self.tokenFeatures:
            return self.tokenFeatures[callId]

        featureList = []
        if text:
            featureList.append("txt_" + sentenceGraph.getTokenText(token))
            if (not self.maskNamedEntities) and sentenceGraph.tokenIsName[token]:
                featureList.append("txt_" + token.get("text"))
        if POS:
            pos = token.get("POS")
            # Compound tags such as "NN_VB" also contribute their parts in maximum mode.
            # (str.find returns -1, never None, so the test must be a containment check;
            # otherwise a plain tag would be added twice.)
            if self.maximum and "_" in pos:
                for split in pos.split("_"):
                    featureList.append("POS_" + split)
            featureList.append("POS_" + pos)
        if annotatedType and not self.noAnnType:
            annTypes = self.getTokenAnnotatedType(token, sentenceGraph)
            if "noAnnType" in annTypes and not self.maximum:
                annTypes.remove("noAnnType")
            for annType in annTypes:
                featureList.append("annType_" + annType)
            if ontology and (self.ontologyFeatureBuilder is not None):
                for annType in annTypes:
                    featureList.extend(self.ontologyFeatureBuilder.getParents(annType))
        if stem:
            # NOTE(review): PorterStemmer is not imported in the visible part of this
            # module; confirm that it is in scope before calling with stem=True.
            featureList.append("stem_" + PorterStemmer.stem(sentenceGraph.getTokenText(token)))

        self.tokenFeatures[callId] = featureList
        return featureList

    def getEntityType(self, entity):
        """
        Return the "type" attribute of an entity, or "Protein" if the type is listed
        in the "maskTypeAsProtein" style option.

        @param entity: an object with a get("type") accessor
        """
        eType = entity.get("type")
        if self.style is not None and "maskTypeAsProtein" in self.style:
            maskedTypes = self.style["maskTypeAsProtein"]
            if maskedTypes and eType in maskedTypes:
                return "Protein"
        return eType

    def getTokenAnnotatedType(self, token, sentenceGraph):
        """
        Multiple entities may have the same head token. This returns a list of the types of all entities whose
        head token this token is. If the FeatureBuilder.maximum flag is set, the list is truncated to a length of
        two, otherwise to a length of one. This is done because when token features (to which the annotated type
        belongs to) are combined into other features, a large number of annotated type features can lead to an
        exponential increase in the number of features.

        When entity1 or entity2 is set and is one of the token's entities, its type is
        returned alone (or, in maximum mode, added with an "e1_"/"e2_" prefix). If the
        token heads no entity, or noAnnType is set, ["noAnnType"] is returned. The
        result can be empty when every type is filtered out by filterAnnTypes.

        @rtype: list of str
        """
        entities = sentenceGraph.tokenIsEntityHead[token]
        if len(entities) == 0 or self.noAnnType:
            return ["noAnnType"]

        noTargetEntities = self.entity1 is None and self.entity2 is None
        annTypes = set()
        for entity in entities:
            eType = self.getEntityType(entity)
            if eType is None or eType in annTypes or eType in self.filterAnnTypes:
                continue
            if noTargetEntities:
                annTypes.add(eType)
                continue
            if self.maximum:
                annTypes.add(eType)
            if self.entity1 == entity:
                if not self.maximum:
                    return [eType]
                annTypes.add("e1_" + eType)
            elif self.entity2 == entity:
                if not self.maximum:
                    return [eType]
                annTypes.add("e2_" + eType)
            else:
                annTypes.add(eType)

        # Sort for a deterministic choice before truncating.
        annTypes = sorted(annTypes)
        if self.maximum:
            return annTypes[0:2]
        return annTypes[0:1]

    def getPOSSuperType(self, pos):
        """
        Look up the coarse super type of a part-of-speech tag in the module-level
        posSuperTypes table. Raises KeyError for a tag that is not in the table.

        @type pos: str
        """
        return posSuperTypes[pos]
# Maps a Penn Treebank style part-of-speech tag to a coarse super type.
# An empty string means that the tag has no super type.
posSuperTypes = {
    "CC": "",       # Coordinating conjunction
    "CD": "",       # Cardinal number
    "DT": "",       # Determiner
    "EX": "",       # Existential there
    "FW": "",       # Foreign word
    "IN": "",       # Preposition or subordinating conjunction
    "JJ": "JJX",    # Adjective
    "JJR": "JJX",   # Adjective, comparative
    "JJS": "JJX",   # Adjective, superlative
    "LS": "",       # List item marker
    "MD": "",       # Modal
    "NN": "NNX",    # Noun, singular or mass
    "NNS": "NNX",   # Noun, plural
    "NNP": "NNX",   # Proper noun, singular
    "NNPS": "NNX",  # Proper noun, plural
    "PDT": "",      # Predeterminer
    "POS": "",      # Possessive ending
    "PRP": "PRPX",  # Personal pronoun
    "PRP$": "PRPX", # Possessive pronoun
    "RB": "RBX",    # Adverb
    "RBR": "RBX",   # Adverb, comparative
    "RBS": "RBX",   # Adverb, superlative
    "RP": "",       # Particle
    "SYM": "",      # Symbol
    "TO": "",       # to
    "UH": "",       # Interjection
    "VB": "VBX",    # Verb, base form
    "VBD": "VBX",   # Verb, past tense
    "VBG": "VBX",   # Verb, gerund or present participle
    "VBN": "VBX",   # Verb, past participle
    "VBP": "VBX",   # Verb, non-3rd person singular present
    "VBZ": "VBX",   # Verb, 3rd person singular present
    "WDT": "WX",    # Wh-determiner
    "WP": "WX",     # Wh-pronoun
    "WP$": "WX",    # Possessive wh-pronoun
    "WRB": "WX",    # Wh-adverb

    ".": "PUNCT",
    ",": "PUNCT",
    ":": "PUNCT",
    ";": "PUNCT",
    "(": "PUNCT",
    ")": "PUNCT",
    # NOTE(review): this key is garbled in the available listing (it appears as a
    # bare double quote); presumably it was the XML-escaped form of the double
    # quote -- confirm against the upstream source.
    "&quot;": "PUNCT",
    "\"": "PUNCT",
}