Package TEES :: Package ExampleBuilders :: Package FeatureBuilders :: Module TokenFeatureBuilder
[hide private]

Source Code for Module TEES.ExampleBuilders.FeatureBuilders.TokenFeatureBuilder

 1  from FeatureBuilder import FeatureBuilder 
 2  #import Stemming.PorterStemmer as PorterStemmer 
 3   
class TokenFeatureBuilder(FeatureBuilder):
    """Builds binary features describing tokens and their linear context.

    Every feature is written as ``self.features[featureId] = 1`` where the id
    comes from ``self.featureSet.getId(featureName)``.  ``self.features`` is
    not created in this class; presumably the FeatureBuilder base class or the
    caller assigns it before any build method is called (verify against
    FeatureBuilder).

    The ``sentenceGraph`` arguments are used through ``tokens`` (an indexable
    sequence), ``getTokenText(token)``, ``tokenIsName[token]`` and
    ``tokenIsEntityHead[token]``; tokens are read through ``attrib["POS"]``.
    """

    def __init__(self, featureSet):
        """Initialize the base FeatureBuilder with ``featureSet``."""
        FeatureBuilder.__init__(self, featureSet)

    def _setFeature(self, name):
        """Set the feature called ``name`` to 1 in the current feature vector."""
        self.features[self.featureSet.getId(name)] = 1

    def _buildLinearTokenFeatures(self, token, sentenceGraph, tag):
        """Emit the text, POS and (if applicable) isName features of one token.

        ``tag`` is the complete feature-name prefix, including the direction
        and distance of the token.
        """
        self._setFeature(tag + "txt_" + sentenceGraph.getTokenText(token))
        self._setFeature(tag + "POS_" + token.attrib["POS"])
        if sentenceGraph.tokenIsName[token]:
            self._setFeature(tag + "isName")

    def buildLinearOrderFeatures(self, tokenIndex, sentenceGraph, rangePos=999, rangeNeg=999, preTag=""):
        """Build features for the tokens surrounding ``tokenIndex``.

        Tokens after the given index produce features prefixed with
        ``preTag + "linear_+<distance>_"`` and tokens before it produce
        features prefixed with ``preTag + "linear_-<distance>_"``, where the
        distance starts from 1 for the adjacent token.

        Parameters:
            tokenIndex: index of the central token in ``sentenceGraph.tokens``
                (the central token itself produces no features here).
            sentenceGraph: object providing the tokens and their properties.
            rangePos: maximum number of following tokens to describe.
            rangeNeg: maximum number of preceding tokens to describe.
            preTag: string prepended to every generated feature name.
        """
        tokens = sentenceGraph.tokens
        # Following tokens, nearest first, clipped to the end of the sentence.
        lastIndex = min(len(tokens), tokenIndex + rangePos + 1)
        for i in range(tokenIndex + 1, lastIndex):
            tag = preTag + "linear_+" + str(i - tokenIndex) + "_"
            self._buildLinearTokenFeatures(tokens[i], sentenceGraph, tag)
        # Preceding tokens, nearest first, clipped to the start of the sentence.
        firstIndex = max(tokenIndex - rangeNeg - 1, -1)
        for i in range(tokenIndex - 1, firstIndex, -1):
            tag = preTag + "linear_-" + str(tokenIndex - i) + "_"
            self._buildLinearTokenFeatures(tokens[i], sentenceGraph, tag)

    def buildContentFeatures(self, tokenIndex, text, duplets=True, triplets=True):
        """Build features from the characters of a token's text.

        Generates capitalization features, digit/hyphen/slash features and,
        optionally, lowercased character bigram ("dt_") and trigram ("tt_")
        features.

        Parameters:
            tokenIndex: index of the token; "upper_case_start" is only
                generated when this is greater than 0.
            text: the token text. An empty string produces no features.
            duplets: if true, generate character bigram features.
            triplets: if true, generate character trigram features.
        """
        # The emptiness check prevents an IndexError on text[0] for empty text.
        if tokenIndex > 0 and text and text[0].isalpha() and text[0].isupper():
            self._setFeature("upper_case_start")
        for j, char in enumerate(text):
            if j > 0 and char.isalpha() and char.isupper():
                self._setFeature("upper_case_middle")
            # Numbers and special characters
            if char.isdigit():
                self._setFeature("has_digits")
                # A digit directly preceded by a hyphen, e.g. the "2" in "IL-2"
                if j > 0 and text[j - 1] == "-":
                    self._setFeature("has_hyphenated_digit")
            elif char == "-":
                self._setFeature("has_hyphen")
            elif char == "/":
                self._setFeature("has_fslash")
            elif char == "\\":
                self._setFeature("has_bslash")
            # Character bigram ending at position j
            if j > 0 and duplets:
                self._setFeature("dt_" + text[j - 1:j + 1].lower())
            # Character trigram ending at position j
            if j > 1 and triplets:
                self._setFeature("tt_" + text[j - 2:j + 1].lower())

    def buildTokenGrams(self, startTokenIndex, endTokenIndex, sentenceGraph, tag="", max=3):
        """Build token n-gram features for a span of tokens.

        For every token index ``i`` from ``startTokenIndex`` to
        ``endTokenIndex`` (inclusive), the n-grams that end at ``i`` and
        extend backwards, but not past ``startTokenIndex``, are generated for
        lengths 1..``max``.  Each n-gram yields three features: one from the
        token texts, one from the POS tags and one from the entity types
        ("noAnnType" for tokens that are not entity heads).

        Parameters:
            startTokenIndex: index of the first token of the span.
            endTokenIndex: index of the last token of the span (inclusive).
            sentenceGraph: object providing the tokens and their properties.
            tag: string added after the "lin_" prefix of each feature name.
            max: maximum n-gram length.  NOTE: the name shadows the builtin
                ``max`` inside this method; it is kept because it is part of
                the method's interface.
        """
        tag = "lin_" + tag
        for i in range(startTokenIndex, endTokenIndex + 1):
            text = ""
            POS = ""
            annType = ""
            count = 0
            # Walk backwards from token i, growing the n-gram by one token
            # (prepended on the left) on each step.
            for j in range(i, startTokenIndex - 1, -1):
                if count >= max:
                    break
                token = sentenceGraph.tokens[j]
                text = "_" + sentenceGraph.getTokenText(token) + text
                POS = "_" + token.attrib["POS"] + POS
                entity = sentenceGraph.tokenIsEntityHead[token]
                if entity is not None:
                    annType = "_" + entity.attrib["type"] + annType
                else:
                    annType = "_" + "noAnnType" + annType
                self._setFeature(tag + "_text" + text)
                self._setFeature(tag + "_POS" + POS)
                # The annotation type feature is built from annType; it was
                # previously built from POS, which made it a duplicate of the
                # POS n-gram and left annType unused.
                self._setFeature(tag + "_annType" + annType)
                count += 1
74