1  """ 
  2  Base class for FeatureBuilders 
  3  """ 
  4  __version__ = "$Revision: 1.14 $" 
  5   
  7      """ 
  8      Multiple example builders might make use of the same features. A feature builder object can be used in 
  9      different example builders that require the same feature set. 
 10      """ 
 11 -    def __init__(self, featureSet, style=None): 
  12          """ 
 13          @type featureSet: IdSet 
 14          @param featureSet: feature ids 
 15          """ 
 16          self.featureSet = featureSet  
 17          self.features = None  
 18          self.entity1 = None  
 19          self.entity2 = None  
 20          self.noAnnType = False  
 21          self.filterAnnTypes = set()  
 22          self.ontologyFeatureBuilder = None 
 23          self.maximum = False  
 24          self.style = style 
 25           
 26          self.maskNamedEntities = True  
 27          self.tag = ""  
  28       
 31       
 33          """ 
 34          When the feature builder builds features, they are put to this feature vector. 
 35           
 36          @type features: dictionary 
 37          @param features: a reference to the feature vector 
 38          @type entity1: cElementTree.Element 
 39          @param entity1: an entity used by trigger or edge feature builders    
 40          @type entity2: cElementTree.Element 
 41          @param entity2: an entity used by trigger or edge feature builders    
 42          """ 
 43          self.features = features 
 44          self.entity1 = entity1 
 45          self.entity2 = entity2 
 46          self.tokenFeatures = {} 
  47           
 49          """ 
 50          Add a feature to the feature vector. If the feature already exists, its current 
 51          value is replaced with the new value. All features are prefixed with FeatureBuilder.tag. 
 52           
 53          @type name: str 
 54          @type value: float 
 55          """ 
 56          self.features[self.featureSet.getId(self.tag+name)] = value 
  57           
 59          """ 
        Some machine learning tasks require feature values to be normalized to range [0,1]. Each value in the
        current feature vector is divided by the sum of the absolute values of all features in that vector, so
        that the absolute values sum to one (non-negative values therefore end up in [0,1]). If the sum is zero,
        the values are left unchanged.
        If this method is used, it should be called as the last step after generating all features.
 63          """ 
 64           
 65          total = 0.0 
 66          for v in self.features.values(): total += abs(v) 
 67          if total == 0.0:  
 68              total = 1.0 
 69          for k,v in self.features.iteritems(): 
 70              self.features[k] = float(v) / total 
  71   
 72 -    def getTokenFeatures(self, token, sentenceGraph, text=True, POS=True, annotatedType=True, stem=False, ontology=True): 
  73          """ 
 74          Token features are features describing an isolated word token. These subfeatures are often merged into 
 75          such features like n-grams. This method produces and caches a set of feature names for a token in 
 76          the sentenceGraph sentence. The various flags can be used to choose which attributes will be included in the 
 77          feature name list. 
 78           
 79          @type token: cElementTree.Element 
 80          @param token: a word token  
 81          @type sentenceGraph: SentenceGraph 
 82          @param sentenceGraph: the sentence to which the token belongs 
 83          @type text: boolean 
 84          @type POS: boolean 
 85          @type annotatedType: boolean 
 86          @type stem: boolean 
 87          @type ontology: boolean          
 88          """ 
 89          callId = token.get("id") + str(text) + str(POS) + str(annotatedType) + str(stem) + str(ontology) 
 90          if self.tokenFeatures.has_key(callId): 
 91              return self.tokenFeatures[callId] 
 92           
 93          featureList = [] 
 94          if text: 
 95              featureList.append("txt_"+sentenceGraph.getTokenText(token)) 
 96              if (not self.maskNamedEntities) and sentenceGraph.tokenIsName[token]: 
 97                  featureList.append("txt_"+token.get("text")) 
 98          if POS: 
 99              pos = token.get("POS") 
100              if pos.find("_") != None and self.maximum: 
101                  for split in pos.split("_"): 
102                      featureList.append("POS_"+split) 
103              featureList.append("POS_"+pos) 
104               
105               
106          if annotatedType and not self.noAnnType: 
107              annTypes = self.getTokenAnnotatedType(token, sentenceGraph) 
108              if "noAnnType" in annTypes and not self.maximum: 
109                  annTypes.remove("noAnnType") 
110              for annType in annTypes: 
111                  featureList.append("annType_"+annType) 
112              if ontology and (self.ontologyFeatureBuilder != None): 
113                  for annType in annTypes: 
114                      featureList.extend(self.ontologyFeatureBuilder.getParents(annType)) 
115          if stem: 
116              featureList.append("stem_"+PorterStemmer.stem(sentenceGraph.getTokenText(token))) 
117           
118          self.tokenFeatures[callId] = featureList             
119          return featureList 
 120       
122          eType = entity.get("type") 
123          if self.style != None and "maskTypeAsProtein" in self.style and self.style["maskTypeAsProtein"] and eType in self.style["maskTypeAsProtein"]: 
124              return "Protein" 
125          else: 
126              return eType 
 127       
129          """ 
130          Multiple entities may have the same head token. This returns a list of the types of all entities whose 
131          head token this token is. If the FeatureBuilder.maximum flag is set, the list is truncated to a length of 
        two, otherwise to a length of one. This is done because when token features (to which the annotated type
        belongs) are combined into other features, a large number of annotated type features can lead to an
134          exponential increase in the number of features. 
135          """ 
136          if len(sentenceGraph.tokenIsEntityHead[token]) > 0 and not self.noAnnType: 
137              annTypes = set() 
138              for entity in sentenceGraph.tokenIsEntityHead[token]: 
139                  eType = self.getEntityType(entity) 
140                  if eType != None and not eType in annTypes and not eType in self.filterAnnTypes: 
141                      if self.entity1 == None and self.entity2 == None: 
142                          annTypes.add(eType) 
143                      else: 
144                          if self.maximum: 
145                              annTypes.add(eType) 
146                          if self.entity1 == entity: 
147                              if not self.maximum: 
148                                  return [eType] 
149                              else: 
150                                  annTypes.add("e1_"+eType) 
151                          elif self.entity2 == entity: 
152                              if not self.maximum: 
153                                  return [eType] 
154                              else: 
155                                                                  annTypes.add("e2_"+eType) 
156                          else: 
157                              annTypes.add(eType) 
158              annTypes = list(annTypes) 
159              annTypes.sort() 
160              if self.maximum: 
161                                  return annTypes[0:2] 
162              else: 
163                  return annTypes[0:1]  
164          else: 
165              return ["noAnnType"] 
 166       
 170   
# Lookup table from a part-of-speech tag to a coarser "super type" shared by related tags.
# The tags look like Penn Treebank tags (presumably; confirm against the tagger in use).
# An empty string is stored for tags that have no super type of their own.
posSuperTypes = {
    "CC": "",
    "CD": "",
    "DT": "",
    "EX": "",
    "FW": "",
    "IN": "",
    # adjectives
    "JJ": "JJX",
    "JJR": "JJX",
    "JJS": "JJX",
    "LS": "",
    "MD": "",
    # nouns
    "NN": "NNX",
    "NNS": "NNX",
    "NNP": "NNX",
    "NNPS": "NNX",
    "PDT": "",
    "POS": "",
    # pronouns
    "PRP": "PRPX",
    "PRP$": "PRPX",
    # adverbs
    "RB": "RBX",
    "RBR": "RBX",
    "RBS": "RBX",
    "RP": "",
    "SYM": "",
    "TO": "",
    "UH": "",
    # verbs
    "VB": "VBX",
    "VBD": "VBX",
    "VBG": "VBX",
    "VBN": "VBX",
    "VBP": "VBX",
    "VBZ": "VBX",
    # wh-words
    "WDT": "WX",
    "WP": "WX",
    "WP$": "WX",
    "WRB": "WX",

    # punctuation
    ".": "PUNCT",
    ",": "PUNCT",
    ":": "PUNCT",
    ";": "PUNCT",
    "(": "PUNCT",
    ")": "PUNCT",
    # NOTE(review): this entry was written with a bare double quote between the string
    # delimiters, which does not parse. Presumably the key was the HTML entity for a
    # double quote that got decoded somewhere along the way -- confirm against the
    # tagger output.
    "&quot;": "PUNCT",
    "\"": "PUNCT",
}
217