1  from FeatureBuilder import FeatureBuilder 
 2   
 3   
 7       
 9          count = 1 
10          for i in range(tokenIndex+1,min(len(sentenceGraph.tokens), tokenIndex+rangePos+1)): 
11              tag = preTag + "linear_+" + str(count) + "_" 
12              t = sentenceGraph.tokens[i] 
13              self.features[self.featureSet.getId(tag+"txt_"+sentenceGraph.getTokenText(t))] = 1 
14              self.features[self.featureSet.getId(tag+"POS_"+t.attrib["POS"])] = 1 
15              if sentenceGraph.tokenIsName[t]: 
16                  self.features[self.featureSet.getId(tag+"isName")] = 1 
17              count += 1 
18          count = 1 
19          for i in range(tokenIndex-1,max(tokenIndex-rangeNeg-1, -1),-1): 
20              tag = preTag + "linear_-" + str(count) + "_" 
21              t = sentenceGraph.tokens[i] 
22              self.features[self.featureSet.getId(tag+"txt_"+sentenceGraph.getTokenText(t))] = 1 
23              self.features[self.featureSet.getId(tag+"POS_"+t.attrib["POS"])] = 1 
24              if sentenceGraph.tokenIsName[t]: 
25                  self.features[self.featureSet.getId(tag+"isName")] = 1 
26              count += 1 
 27       
def buildContentFeatures(self, tokenIndex, text, duplets=True, triplets=True):
    """Set binary surface-form features describing one token's text.

    Each feature is recorded by storing 1 in ``self.features`` under the
    id that ``self.featureSet.getId`` returns for the feature name.

    Args:
        tokenIndex: Index of the token; "upper_case_start" is only emitted
            when this is greater than 0.
        text: The token text to inspect. An empty string yields no
            features.
        duplets: When true, emit a "dt_" feature for every lowercased
            two-character substring of ``text``.
        triplets: When true, emit a "tt_" feature for every lowercased
            three-character substring of ``text``.
    """
    features = self.features
    getId = self.featureSet.getId

    # The ``text`` check keeps an empty token from raising IndexError on
    # text[0]; previously that only happened to be safe at tokenIndex == 0.
    if tokenIndex > 0 and text and text[0].isalpha() and text[0].isupper():
        features[getId("upper_case_start")] = 1

    for j, char in enumerate(text):
        # Capitals anywhere after the first character.
        if j > 0 and char.isalpha() and char.isupper():
            features[getId("upper_case_middle")] = 1

        if char.isdigit():
            features[getId("has_digits")] = 1
            # A digit directly preceded by a hyphen, as in "IL-2".
            if j > 0 and text[j - 1] == "-":
                features[getId("has_hyphenated_digit")] = 1
        elif char == "-":
            features[getId("has_hyphen")] = 1
        elif char == "/":
            features[getId("has_fslash")] = 1
        elif char == "\\":
            features[getId("has_bslash")] = 1

        # Character n-grams that end at position j.
        if j > 0 and duplets:
            features[getId("dt_" + text[j - 1:j + 1].lower())] = 1

        if j > 1 and triplets:
            features[getId("tt_" + text[j - 2:j + 1].lower())] = 1
 52   
def buildTokenGrams(self, startTokenIndex, endTokenIndex, sentenceGraph, tag="", max=3):
    """Set binary n-gram features over a contiguous span of tokens.

    For every token in [startTokenIndex, endTokenIndex] this emits the
    n-grams that end at that token and extend leftwards, never reaching
    before ``startTokenIndex`` and never exceeding ``max`` tokens. Three
    parallel n-grams are produced for each window: token text, POS tag
    and entity annotation type. Each feature is recorded by storing 1 in
    ``self.features`` under the id returned by ``self.featureSet.getId``.

    Args:
        startTokenIndex: Index of the first token of the span (inclusive).
        endTokenIndex: Index of the last token of the span (inclusive).
        sentenceGraph: Object providing ``tokens``, ``getTokenText(token)``
            and the ``tokenIsEntityHead`` mapping; tokens must expose
            ``attrib["POS"]`` and mapped entities ``attrib["type"]``.
        tag: Extra label inserted after the "lin_" feature-name prefix.
        max: Maximum n-gram length in tokens. The name shadows the builtin
            but is kept because callers may pass it by keyword.
    """
    tag = "lin_" + tag
    features = self.features
    getId = self.featureSet.getId

    for i in range(startTokenIndex, endTokenIndex + 1):
        text = ""
        POS = ""
        annType = ""
        # Walk leftwards from token i, growing the n-gram by one token per step.
        for length, j in enumerate(range(i, startTokenIndex - 1, -1)):
            if length >= max:
                break
            token = sentenceGraph.tokens[j]
            text = "_" + sentenceGraph.getTokenText(token) + text
            POS = "_" + token.attrib["POS"] + POS

            entity = sentenceGraph.tokenIsEntityHead[token]
            if entity is not None:
                annType = "_" + entity.attrib["type"] + annType
            else:
                annType = "_" + "noAnnType" + annType

            features[getId(tag + "_text" + text)] = 1
            features[getId(tag + "_POS" + POS)] = 1
            # Fixed: this feature used to be built from POS, so the
            # annotation-type n-gram was never actually emitted.
            features[getId(tag + "_annType" + annType)] = 1
  74