1 from FeatureBuilder import FeatureBuilder
2
3
7
9 count = 1
10 for i in range(tokenIndex+1,min(len(sentenceGraph.tokens), tokenIndex+rangePos+1)):
11 tag = preTag + "linear_+" + str(count) + "_"
12 t = sentenceGraph.tokens[i]
13 self.features[self.featureSet.getId(tag+"txt_"+sentenceGraph.getTokenText(t))] = 1
14 self.features[self.featureSet.getId(tag+"POS_"+t.attrib["POS"])] = 1
15 if sentenceGraph.tokenIsName[t]:
16 self.features[self.featureSet.getId(tag+"isName")] = 1
17 count += 1
18 count = 1
19 for i in range(tokenIndex-1,max(tokenIndex-rangeNeg-1, -1),-1):
20 tag = preTag + "linear_-" + str(count) + "_"
21 t = sentenceGraph.tokens[i]
22 self.features[self.featureSet.getId(tag+"txt_"+sentenceGraph.getTokenText(t))] = 1
23 self.features[self.featureSet.getId(tag+"POS_"+t.attrib["POS"])] = 1
24 if sentenceGraph.tokenIsName[t]:
25 self.features[self.featureSet.getId(tag+"isName")] = 1
26 count += 1
27
def buildContentFeatures(self, tokenIndex, text, duplets=True, triplets=True):
    """Add character-level content features for a single token's text.

    Every feature is stored as ``self.features[featureId] = 1`` where
    ``featureId`` comes from ``self.featureSet.getId(name)``. Names used:

    * ``upper_case_start``     - first character is an upper-case letter
                                 and ``tokenIndex > 0``
    * ``upper_case_middle``    - any later character is an upper-case letter
    * ``has_digits``           - some character is a digit
    * ``has_hyphenated_digit`` - a digit is directly preceded by "-"
    * ``has_hyphen`` / ``has_fslash`` / ``has_bslash`` - text contains
                                 "-", "/" or a backslash
    * ``dt_<xy>``              - every lower-cased character pair (if duplets)
    * ``tt_<xyz>``             - every lower-cased character triple (if triplets)

    tokenIndex -- index of the token; only compared against 0 here
                  (presumably to skip sentence-initial capitalisation;
                  confirm against callers)
    text       -- the token's text; an empty string yields no features
    duplets    -- when true, emit the ``dt_`` character-pair features
    triplets   -- when true, emit the ``tt_`` character-triple features
    """
    features = self.features
    getId = self.featureSet.getId

    # Check for empty text first: text[0] would raise IndexError otherwise.
    if text and tokenIndex > 0 and text[0].isalpha() and text[0].isupper():
        features[getId("upper_case_start")] = 1

    for j, char in enumerate(text):
        if j > 0 and char.isalpha() and char.isupper():
            features[getId("upper_case_middle")] = 1

        # Digits and special characters.
        if char.isdigit():
            features[getId("has_digits")] = 1
            if j > 0 and text[j - 1] == "-":
                features[getId("has_hyphenated_digit")] = 1
        elif char == "-":
            features[getId("has_hyphen")] = 1
        elif char == "/":
            features[getId("has_fslash")] = 1
        elif char == "\\":
            features[getId("has_bslash")] = 1

        # Character pair ending at position j.
        if j > 0 and duplets:
            features[getId("dt_" + text[j - 1:j + 1].lower())] = 1

        # Character triple ending at position j.
        if j > 1 and triplets:
            features[getId("tt_" + text[j - 2:j + 1].lower())] = 1
52
def buildTokenGrams(self, startTokenIndex, endTokenIndex, sentenceGraph, tag="", max = 3):
    """Add linear token n-gram features over a span of the sentence.

    For each token position ``i`` in ``startTokenIndex..endTokenIndex``
    (inclusive), n-grams ending at ``i`` are grown leftwards one token at a
    time, never crossing ``startTokenIndex`` and never exceeding ``max``
    tokens. For every n-gram three binary features are set in
    ``self.features`` (ids from ``self.featureSet.getId``):

    * ``lin_<tag>_text_<t1>_<t2>...``    - token texts
    * ``lin_<tag>_POS_<p1>_<p2>...``     - token ``attrib["POS"]`` values
    * ``lin_<tag>_annType_<a1>_<a2>...`` - ``attrib["type"]`` of the entity
      in ``sentenceGraph.tokenIsEntityHead`` for each token, or
      ``noAnnType`` when that entry is None

    startTokenIndex -- first token index of the span (inclusive)
    endTokenIndex   -- last token index of the span (inclusive)
    sentenceGraph   -- object providing ``tokens``, ``getTokenText(token)``
                       and ``tokenIsEntityHead[token]``
    tag             -- extra string inserted after the ``lin_`` prefix
    max             -- maximum n-gram length; the name shadows the builtin
                       but is kept because callers may pass it by keyword
    """
    tag = "lin_" + tag
    features = self.features
    getId = self.featureSet.getId

    for i in range(startTokenIndex, endTokenIndex + 1):
        text = ""
        POS = ""
        annType = ""
        # Walk leftwards from i, prepending one token per step.
        for count, j in enumerate(range(i, startTokenIndex - 1, -1)):
            if count >= max:
                break
            token = sentenceGraph.tokens[j]
            text = "_" + sentenceGraph.getTokenText(token) + text
            POS = "_" + token.attrib["POS"] + POS
            entity = sentenceGraph.tokenIsEntityHead[token]
            if entity is not None:
                annType = "_" + entity.attrib["type"] + annType
            else:
                annType = "_" + "noAnnType" + annType
            features[getId(tag + "_text" + text)] = 1
            features[getId(tag + "_POS" + POS)] = 1
            # Use annType here; building this feature from POS merely
            # duplicated the POS n-gram under another prefix.
            features[getId(tag + "_annType" + annType)] = 1
74