TEES.ExampleBuilders.FeatureBuilders.FeatureBuilder

7 """ 8 Multiple example builders might make use of the same features. A feature builder object can be used in 9 different example builders that require the same feature set. 10 """

def __init__(self, featureSet, style=None):
    """
    Create a feature builder that writes into a shared feature id set.

    @type featureSet: IdSet
    @param featureSet: feature ids
    @param style: optional style settings; read by getEntityType, which looks up
                  the "maskTypeAsProtein" key when style is not None
    """
    self.featureSet = featureSet  # feature ids
    self.features = None  # current feature vector
    self.entity1 = None  # an entity node for which features are built
    self.entity2 = None  # another entity node for pairwise examples such as edges
    self.noAnnType = False  # do not use annotated entity types for building features
    self.filterAnnTypes = set()  # ignore these entity types
    self.ontologyFeatureBuilder = None
    self.maximum = False  # produce maximum number of features
    self.style = style

    self.maskNamedEntities = True  # named entity text strings are replaced with NAMED_ENT
    self.tag = ""  # a prefix that is added to each feature name
    # Cache of token feature lists used by getTokenFeatures. It is created here
    # so that getTokenFeatures also works before setFeatureVector has been
    # called; setFeatureVector replaces it with a fresh dictionary.
    self.tokenFeatures = {}

28

def setTag(self, tag=""):
    """
    Set the prefix that setFeature prepends to every feature name.

    @type tag: str
    @param tag: the new prefix; the default empty string disables prefixing
    """
    self.tag = tag

31

def setFeatureVector(self, features, entity1=None, entity2=None):
    """
    Select the feature vector that subsequently built features are written to.

    @type features: dictionary
    @param features: a reference to the feature vector
    @type entity1: cElementTree.Element
    @param entity1: an entity used by trigger or edge feature builders
    @type entity2: cElementTree.Element
    @param entity2: an entity used by trigger or edge feature builders
    """
    # Start with an empty token feature cache for the new vector.
    self.tokenFeatures = dict()
    self.entity1, self.entity2 = entity1, entity2
    self.features = features

47

def setFeature(self, name, value=1):
    """
    Store a feature value in the current feature vector.

    The feature name is prefixed with FeatureBuilder.tag and mapped to an id
    through the feature set. An existing value for that id is overwritten.

    @type name: str
    @type value: float
    """
    prefixedName = self.tag + name
    featureId = self.featureSet.getId(prefixedName)
    self.features[featureId] = value

57

def normalizeFeatureVector(self):
    """
    Scale the current feature vector in place by the sum of its absolute values.

    Every value is divided by the sum of the absolute values of all features, so
    that afterwards the absolute values sum to one. When that sum is zero (empty
    vector or all values zero) the divisor is one, and the values only become floats.
    If this method is used, it should be called as the last step after generating
    all features.
    """
    features = self.features
    total = sum((abs(value) for value in features.values()), 0.0)
    if total == 0.0:
        total = 1.0  # avoid division by zero
    # Iterate over a snapshot of the keys: this works on Python 2 and 3 alike
    # (dict.iteritems() no longer exists in Python 3) and does not depend on
    # assigning into the dictionary while iterating over it.
    for key in list(features):
        features[key] = float(features[key]) / total

71

def getTokenFeatures(self, token, sentenceGraph, text=True, POS=True, annotatedType=True, stem=False, ontology=True):
    """
    Token features are features describing an isolated word token. These subfeatures are often merged into
    such features like n-grams. This method produces and caches a list of feature names for a token in
    the sentenceGraph sentence. The various flags can be used to choose which attributes will be included in the
    feature name list.

    The result is cached per token id and flag combination, and the cached list
    object itself is returned on later calls.

    @type token: cElementTree.Element
    @param token: a word token
    @type sentenceGraph: SentenceGraph
    @param sentenceGraph: the sentence to which the token belongs
    @type text: boolean
    @type POS: boolean
    @type annotatedType: boolean
    @type stem: boolean
    @type ontology: boolean
    @return: a list of feature name strings
    """
    callId = token.get("id") + str(text) + str(POS) + str(annotatedType) + str(stem) + str(ontology)
    if callId in self.tokenFeatures:
        return self.tokenFeatures[callId]

    featureList = []
    if text:
        featureList.append("txt_" + sentenceGraph.getTokenText(token))
        if (not self.maskNamedEntities) and sentenceGraph.tokenIsName[token]:
            featureList.append("txt_" + token.get("text"))
    if POS:
        pos = token.get("POS")
        # Only split compound tags. The previous test, pos.find("_") != None,
        # was always true (str.find returns -1 when nothing is found), so with
        # self.maximum set a tag without an underscore was added twice.
        if self.maximum and "_" in pos:
            for split in pos.split("_"):
                featureList.append("POS_" + split)
        featureList.append("POS_" + pos)
    if annotatedType and not self.noAnnType:
        annTypes = self.getTokenAnnotatedType(token, sentenceGraph)
        if "noAnnType" in annTypes and not self.maximum:
            annTypes.remove("noAnnType")
        for annType in annTypes:
            featureList.append("annType_" + annType)
        # Ontology parents are derived from the annotated types, so they are
        # only available inside this branch.
        if ontology and (self.ontologyFeatureBuilder is not None):
            for annType in annTypes:
                featureList.extend(self.ontologyFeatureBuilder.getParents(annType))
    if stem:
        featureList.append("stem_" + PorterStemmer.stem(sentenceGraph.getTokenText(token)))

    self.tokenFeatures[callId] = featureList
    return featureList

120

def getEntityType(self, entity):
    """
    Return the "type" attribute of an entity, optionally masked as "Protein".

    When the style has a non-empty "maskTypeAsProtein" entry that contains the
    entity's type, "Protein" is returned instead of the real type.

    @param entity: an object whose get("type") returns the entity type
    @return: the entity type, or "Protein" when the type is masked
    """
    entityType = entity.get("type")
    style = self.style
    if style is None or "maskTypeAsProtein" not in style:
        return entityType
    maskedTypes = style["maskTypeAsProtein"]
    if maskedTypes and entityType in maskedTypes:
        return "Protein"
    return entityType

127

def getTokenAnnotatedType(self, token, sentenceGraph):
    """
    Multiple entities may have the same head token. This returns a sorted list of the types of the
    entities whose head token this token is, truncated to two entries when the FeatureBuilder.maximum
    flag is set and to one entry otherwise. The truncation exists because token features (to which the
    annotated type belongs) are combined into other features, and many annotated type features can
    lead to an exponential increase in the number of features.

    Types that are None or listed in filterAnnTypes are skipped. When entity1 or entity2 is set and
    equals one of the head entities, that entity's type is returned alone (maximum unset) or added
    a second time with an "e1_" / "e2_" prefix (maximum set). If the token heads no entity, or
    noAnnType is set, the result is ["noAnnType"].

    @param token: the token to look up in sentenceGraph.tokenIsEntityHead
    @param sentenceGraph: the sentence to which the token belongs
    @return: a list of type name strings
    """
    headEntities = sentenceGraph.tokenIsEntityHead[token]
    if len(headEntities) == 0 or self.noAnnType:
        return ["noAnnType"]

    collected = set()
    for entity in headEntities:
        eType = self.getEntityType(entity)
        if eType == None or eType in collected or eType in self.filterAnnTypes:
            continue
        if self.entity1 == None and self.entity2 == None:
            # No target entities defined: every head entity counts equally.
            collected.add(eType)
            continue
        if self.maximum:
            collected.add(eType)
        if self.entity1 == entity:
            prefix = "e1_"
        elif self.entity2 == entity:
            prefix = "e2_"
        else:
            collected.add(eType)
            continue
        # The entity is one of the two target entities.
        if not self.maximum:
            return [eType]
        collected.add(prefix + eType)

    limit = 2 if self.maximum else 1
    return sorted(collected)[0:limit]

166

def getPOSSuperType(self, pos):
    """
    Return the entry stored for a part-of-speech tag in the module-level posSuperTypes table.

    @param pos: the key to look up in posSuperTypes
    @return: posSuperTypes[pos]
    """
    # posSuperTypes is defined outside this method; it is only read here, so
    # no "global" declaration is required.
    # NOTE(review): presumably it maps POS tags to coarser categories -- confirm.
    return posSuperTypes[pos]

Source Code for Module TEES.ExampleBuilders.FeatureBuilders.FeatureBuilder