Package TEES :: Package ExampleBuilders :: Package FeatureBuilders :: Module TriggerFeatureBuilder
[hide private]

Source Code for Module TEES.ExampleBuilders.FeatureBuilders.TriggerFeatureBuilder

  1  import sys 
  2  sys.path.append("..") 
  3  import Utils.Libraries.PorterStemmer as PorterStemmer 
  4  from Core.IdSet import IdSet 
  5  import Core.ExampleUtils as ExampleUtils 
  6  #from Core.Gazetteer import Gazetteer 
  7  from FeatureBuilder import FeatureBuilder 
  8   
  9  #def compareDependencyEdgesById(dep1, dep2): 
 10  #    """ 
 11  #    Dependency edges are sorted, so that the program behaves consistently 
  11  #    on the same data between different runs. 
 13  #    """ 
 14  #    id1 = dep1[2].get("id") 
 15  #    id2 = dep2[2].get("id") 
 16  #    if id1 > id2: 
 17  #       return 1 
 18  #    elif id1 == id2: 
 19  #       return 0 
 20  #    else: # x<y 
 21  #       return -1 
 22   
class TriggerFeatureBuilder(FeatureBuilder):
    """
    Builds features that describe a single token of a sentence: its own
    text, POS tag and character content, its linear neighbours, and the
    dependency edges and chains that start from it.

    Usage: call initSentence() once per sentence, then buildFeatures()
    for each token of that sentence. Features are emitted through
    self.setFeature() and entity types are read through
    self.getEntityType(); both are presumably provided by the
    FeatureBuilder base class (not visible in this module).
    """

    def __init__(self, featureSet, style=None):
        """
        Initialize the base FeatureBuilder and the builder options.

        featureSet -- passed on to FeatureBuilder; initSentence() calls
                      its getId() method
        style      -- passed on to FeatureBuilder unchanged
        """
        FeatureBuilder.__init__(self, featureSet, style)
        self.noAnnType = False
        self.edgeTypesForFeatures = []
        # When True, getTokenFeatures() also adds annotation-type features
        # for entities that are not names ("Only for Unmerging!").
        self.useNonNameEntities = False

    def getMergedEntityType(self, entities):
        """
        If a single token belongs to multiple entities of different types,
        a new, composite type is defined. This type is the alphabetically
        ordered types of these entities joined with '---'.

        entities -- iterable of objects with a get("type") method
        Returns the composite type string ("" for no entities).
        """
        uniqueTypes = set()
        for entity in entities:
            uniqueTypes.add(entity.get("type"))
        # Empty type strings are left out, so they never produce a leading
        # or doubled separator.
        return "---".join([t for t in sorted(uniqueTypes) if t != ""])

    def getTokenFeatures(self, token, sentenceGraph):
        """
        Returns a list of features based on the attributes of a token.
        These can be used to define more complex features.

        Returns a tuple (names, weights): names is the sorted list of
        feature name suffixes, weights maps each name to its value.
        The result is cached per token until the next initSentence().
        """
        # These features are cached when this method is first called
        # for a token.
        if token in self.tokenFeatures:
            return self.tokenFeatures[token], self.tokenFeatureWeights[token]

        tokTxt = sentenceGraph.getTokenText(token)
        features = {}
        features["_txt_" + tokTxt] = 1
        features["_POS_" + token.get("POS")] = 1
        if sentenceGraph.tokenIsName[token]:
            features["_isName"] = 1
            for entity in sentenceGraph.tokenIsEntityHead[token]:
                if entity.get("isName") == "True":
                    features["_annType_" + self.getEntityType(entity)] = 1
        # Only for Unmerging!
        if self.useNonNameEntities:
            for entity in sentenceGraph.tokenIsEntityHead[token]:
                features["_annType_" + self.getEntityType(entity)] = 1

        # Names are sorted so that features are always emitted in the
        # same order.
        self.tokenFeatures[token] = sorted(features.keys())
        self.tokenFeatureWeights[token] = features
        return self.tokenFeatures[token], self.tokenFeatureWeights[token]

    def buildLinearOrderFeatures(self, sentenceGraph, index, tag):
        """
        Linear features are built by marking token features with a tag
        that defines their relative position in the linear order.

        index -- position in sentenceGraph.tokens of the neighbouring token
        tag   -- string describing the relative position (e.g. "-1")
        """
        tag = "linear_" + tag
        tokenFeatures, tokenFeatureWeights = self.getTokenFeatures(
            sentenceGraph.tokens[index], sentenceGraph)
        for tokenFeature in tokenFeatures:
            self.setFeature(tag + tokenFeature, tokenFeatureWeights[tokenFeature])

    def initSentence(self, sentenceGraph):
        """
        Prepare the per-sentence state used by buildFeatures(): resets the
        token feature cache and precomputes the name count feature, the
        bag-of-words features and the dependency edges of every token.
        """
        self.sentenceGraph = sentenceGraph
        self.tokenFeatures = {}
        self.tokenFeatureWeights = {}

        # Named entities are known data which can be used for features.
        namedEntityCount = 0
        for entity in sentenceGraph.entities:
            if entity.get("isName") == "True":
                namedEntityCount += 1
        self.namedEntityCountFeature = "nameCount_" + str(namedEntityCount)

        # Count token texts; tokens that are names are additionally
        # counted under an "ne_"-prefixed key.
        bagOfWords = {}
        for token in sentenceGraph.tokens:
            text = "bow_" + token.get("text")
            bagOfWords[text] = bagOfWords.get(text, 0) + 1
            if sentenceGraph.tokenIsName[token]:
                text = "ne_" + text
                bagOfWords[text] = bagOfWords.get(text, 0) + 1
        # Ids are requested in sorted key order so that the program
        # behaves consistently between runs.
        self.bowFeatures = {}
        for key in sorted(bagOfWords.keys()):
            self.bowFeatures[self.featureSet.getId(key)] = bagOfWords[key]

        # Dependency edges attached to each token, looked up once here
        # because buildChains() reads them repeatedly.
        self.inEdgesByToken = {}
        self.outEdgesByToken = {}
        self.edgeSetByToken = {}
        dependencyGraph = sentenceGraph.dependencyGraph
        for token in sentenceGraph.tokens:
            inEdges = dependencyGraph.getInEdges(token)
            outEdges = dependencyGraph.getOutEdges(token)
            self.inEdgesByToken[token] = inEdges
            self.outEdgesByToken[token] = outEdges
            self.edgeSetByToken[token] = set(inEdges + outEdges)

    def buildFeatures(self, token, linear=True, chains=True):
        """
        Build the features of one token of the sentence given to
        initSentence().

        token  -- a token of self.sentenceGraph.tokens
        linear -- if True, add features of the tokens up to three
                  positions before and after the token
        chains -- if True, add dependency chain features (buildChains)
        """
        sentenceGraph = self.sentenceGraph
        tokenIndex = None
        for i, candidate in enumerate(sentenceGraph.tokens):
            if token == candidate:
                tokenIndex = i
                break
        assert tokenIndex is not None, "token is not part of the current sentence"
        token = sentenceGraph.tokens[tokenIndex]

        self.setFeature(self.namedEntityCountFeature, 1)
        # Note: self.bowFeatures is computed by initSentence() but is not
        # added to the features here.

        # Main features
        text = token.get("text")
        self.setFeature("txt_" + text, 1)
        self.setFeature("POS_" + token.get("POS"), 1)
        stem = PorterStemmer.stem(text)
        self.setFeature("stem_" + stem, 1)
        self.setFeature("nonstem_" + text[len(stem):], 1)

        # Linear order features
        if linear:
            tokenCount = len(sentenceGraph.tokens)
            for offset in [-3, -2, -1, 1, 2, 3]:
                position = tokenIndex + offset
                # NOTE(review): the lower bound is exclusive, so the token
                # at index 0 is never used as linear context. This looks
                # like an off-by-one, but it is kept as is because changing
                # it would change the generated feature set.
                if 0 < position < tokenCount:
                    self.buildLinearOrderFeatures(sentenceGraph, position, str(offset))

        # Content
        # The first token of the sentence is skipped for the capital
        # start feature. text[:1] is used so that an empty text cannot
        # raise an IndexError.
        firstChar = text[:1]
        if tokenIndex > 0 and firstChar.isalpha() and firstChar.isupper():
            self.setFeature("upper_case_start", 1)
        for j, char in enumerate(text):
            if j > 0 and char.isalpha() and char.isupper():
                self.setFeature("upper_case_middle", 1)
            # numbers and special characters
            if char.isdigit():
                self.setFeature("has_digits", 1)
                if j > 0 and text[j - 1] == "-":
                    self.setFeature("has_hyphenated_digit", 1)
            elif char == "-":
                self.setFeature("has_hyphen", 1)
            elif char == "/":
                self.setFeature("has_fslash", 1)
            elif char == "\\":
                self.setFeature("has_bslash", 1)
            # duplets
            if j > 0:
                self.setFeature("dt_" + text[j - 1:j + 1].lower(), 1)
            # triplets
            if j > 1:
                self.setFeature("tt_" + text[j - 2:j + 1].lower(), 1)

        # chains
        if chains:
            self.buildChains(token, sentenceGraph)

    def _buildHangingEdgeFeatures(self, tag, edges, otherEnd, sentenceGraph):
        """
        Add type, POS and text features for each edge in edges.

        tag      -- feature name prefix ("t1HIn_" or "t1HOut_")
        otherEnd -- index within the edge tuple of the token at the other
                    end of the edge (0 for in-edges, 1 for out-edges)
        """
        for edge in edges:
            edgeType = edge[2].get("type")
            otherToken = edge[otherEnd]
            self.setFeature(tag + edgeType, 1)
            self.setFeature(tag + otherToken.get("POS"), 1)
            self.setFeature(tag + edgeType + "_" + otherToken.get("POS"), 1)
            tokenText = sentenceGraph.getTokenText(otherToken)
            self.setFeature(tag + tokenText, 1)
            self.setFeature(tag + edgeType + "_" + tokenText, 1)

    def buildAttachedEdgeFeatures(self, token, sentenceGraph):
        """
        Build features for the attached edges (hanging in and out edges)
        of the token, using the edges cached by initSentence().
        """
        self._buildHangingEdgeFeatures("t1HIn_", self.inEdgesByToken[token], 0, sentenceGraph)
        self._buildHangingEdgeFeatures("t1HOut_", self.outEdgesByToken[token], 1, sentenceGraph)

    def buildChains(self, token, sentenceGraph, depthLeft=3, chain="", visited=None):
        """
        Recursively build features for the dependency chains starting
        from the token.

        depthLeft -- number of edges that may still be followed; the
                     recursion stops when it reaches 0
        chain     -- string describing the edges followed so far
        visited   -- set of edges that must not be followed again
                     (None on the initial call)
        """
        if depthLeft == 0:
            return
        strDepthLeft = "dist_" + str(depthLeft)

        if visited is None:
            visited = set()

        inEdges = self.inEdgesByToken[token]
        outEdges = self.outEdgesByToken[token]
        # Edges of this token are excluded one level deeper, not at this
        # level, so membership is tested against "visited" below.
        edgeSet = visited.union(self.edgeSetByToken[token])

        # (edges, index of the next token in the edge, edge type feature
        # prefix, chain direction marker).
        # NOTE(review): the edge type prefix differs between the two
        # directions ("dep_" vs. "dep_dist_"). This looks unintentional,
        # but the strings are kept as they are because they define the
        # feature names.
        directions = (
            (inEdges, 0, "dep_", "-frw_"),
            (outEdges, 1, "dep_dist_", "-rev_"),
        )
        for edges, nextEnd, depPrefix, directionTag in directions:
            for edge in edges:
                if edge in visited:
                    continue
                edgeType = edge[2].get("type")
                self.setFeature(depPrefix + strDepthLeft + edgeType, 1)

                nextToken = edge[nextEnd]
                tokenFeatures, tokenWeights = self.getTokenFeatures(nextToken, sentenceGraph)
                for tokenFeature in tokenFeatures:
                    self.setFeature(strDepthLeft + tokenFeature, tokenWeights[tokenFeature])

                extendedChain = chain + directionTag + edgeType
                self.setFeature("chain_dist_" + strDepthLeft + extendedChain, 1)
                self.buildChains(nextToken, sentenceGraph, depthLeft - 1, extendedChain, edgeSet)
276