TEES.ExampleBuilders.FeatureBuilders.RELFeatureBuilder

def __init__(self, featureSet):
    """Create the builder by delegating to FeatureBuilder.__init__.

    featureSet is handed to the base initializer unchanged; no additional
    state is set up here.
    """
    # Options that are currently switched off:
    #self.noAnnType = False
    #self.edgeTypesForFeatures = []
    #self.useNonNameEntities = False
    FeatureBuilder.__init__(self, featureSet)

def findAminoAcid(self, string):
    """Locate the first amino acid mentioned in string.

    The text is lower-cased and every entry of the module-level
    aminoAcids table is tried in table order. For each entry:

    * entry[0] is searched as a plain substring; any hit is accepted.
    * otherwise entry[1] (the three letter code) is searched, and a hit
      only counts when it is not glued to a letter on either side, so
      that the code is not just a fragment of a longer word.

    Every occurrence of the code is examined, not only the first one, so
    a standalone code is still found when an earlier occurrence is
    embedded in a word (e.g. "salad-ala").

    Returns a tuple (index, entry) for the first match, or (-1, None)
    when nothing matches.
    """
    # NOTE(review): the table entries are presumably lower-case, since the
    # text is lower-cased before searching -- confirm against the table.
    string = string.lower()
    textLength = len(string)
    for aa in aminoAcids:
        position = string.find(aa[0])
        if position != -1:
            return position, aa
        code = aa[1]  # three letter code
        position = string.find(code)
        while position != -1:
            end = position + len(code)
            # The code must not be part of a word (where it could be
            # just a substring).
            cleanStart = position == 0 or not string[position - 1].isalpha()
            cleanEnd = end >= textLength or not string[end].isalpha()
            if cleanStart and cleanEnd:
                return position, aa
            # This occurrence was embedded in a word; try the next one.
            position = string.find(code, position + 1)
    return -1, None

63

def buildAllFeatures(self, tokens, tokenIndex):
    """Run every REL feature builder for the token at tokenIndex.

    The token's "text" value is lower-cased once and given to the
    text-based builders; the builders that look at neighbouring tokens
    receive the full token list and the index. The builders run in a
    fixed order: amino acid, DNA, substring, range, known word.
    """
    loweredText = tokens[tokenIndex].get("text").lower()

    for buildFromText in (self.buildAminoAcidFeatures, self.buildDNAFeatures):
        buildFromText(loweredText)
    for buildFromContext in (self.buildSubstringFeatures, self.buildRangeFeatures):
        buildFromContext(tokens, tokenIndex)
    self.buildKnownWordFeatures(loweredText)

73

def buildAminoAcidFeatures(self, string):
    """Set amino acid features when string mentions an amino acid.

    Uses self.findAminoAcid for the lookup. On a match, a generic
    "RELaminoacid_string" feature is set, followed by one feature named
    after the second element of the matched entry. Nothing is set when
    there is no match.
    """
    _, match = self.findAminoAcid(string)
    if match is None:
        return
    self.setFeature("RELaminoacid_string")
    self.setFeature("RELaminoacid_" + match[1])

79

def findSubstring(self, string, substring, tag=None):
    """Set features describing whether and where substring occurs.

    tag names the features; when it is omitted the substring itself is
    used. If substring is found in string, "RELsubstring_<tag>" is set,
    followed by "RELsubstring_terminal_<tag>" when the first occurrence
    ends exactly at the end of string, or "RELsubstring_nonterminal_<tag>"
    otherwise. Nothing is set when substring does not occur.
    """
    label = substring if tag is None else tag
    position = string.find(substring)
    if position == -1:
        return
    self.setFeature("RELsubstring_" + label)
    reachesEnd = position + len(substring) == len(string)
    prefix = "RELsubstring_terminal_" if reachesEnd else "RELsubstring_nonterminal_"
    self.setFeature(prefix + label)

90

def buildSubstringFeatures(self, tokens, tokenIndex):
    """Probe the text preceding a token for a fixed list of substrings.

    The "text" values of up to six tokens before tokenIndex are joined,
    lower-cased and stripped of hyphens and spaces. Each known substring
    is then passed to self.findSubstring together with the tag used to
    name the resulting features (None means "use the substring itself").
    """
    # Clamp the window start at 0: with tokenIndex < 6 a negative slice
    # start would wrap around to the end of the list and produce an empty
    # or wrong window instead of the leading tokens.
    windowStart = max(0, tokenIndex - 6)
    # TODO the actual token does not seem to be included
    string = "".join([t.get("text") for t in tokens[windowStart:tokenIndex]])
    string = string.lower().replace("-", "").replace(" ", "")

    # (substring, tag) pairs, probed in this order.
    probes = (
        # nfkb
        ("nfkappab", "nfkb"),
        ("nfkb", None),
        ("nfkappab", "complex"),
        ("nfkb", "complex"),
        # kappa-b
        ("kappab", None),
        # ap-1
        ("ap1", None),
        ("activatingprotein1", "ap1"),
        ("ap1", "complex"),
        ("activatingprotein1", "complex"),
        # proteasome
        ("proteasome", None),
        ("proteasome", "complex"),
        # base pairs
        ("bp", "bp"),
        ("basepair", "bp"),
        # primes
        ("5'", "5prime"),
        ("3'", "3prime"),
    )
    for substring, tag in probes:
        self.findSubstring(string, substring, tag)

118

def buildDNAFeatures(self, string):
    """Set "RELDNA_sequence" when string contains only a, g, t and c.

    The comparison is against lower-case letters only. A string with any
    other character sets nothing; an empty string has no offending
    character and therefore also sets the feature.
    """
    if any(letter not in ("a", "g", "t", "c") for letter in string):
        return
    self.setFeature("RELDNA_sequence")

124

def buildRangeFeatures(self, tokens, tokenIndex):
    """Set "RELnumeric_range" when the token closes a numeric range.

    A range is recognised when the previous token's text is "to", "and"
    or "-" (case-insensitive) and both the token two positions back and
    the current token consist of digits only, after dropping one optional
    leading "-" or "+" sign from each. Nothing is set for the first two
    tokens, which have no two predecessors.
    """
    if tokenIndex < 2:
        return
    connector = tokens[tokenIndex - 1].get("text").lower()
    if connector not in ("to", "and", "-"):
        return

    bounds = []
    for token in (tokens[tokenIndex - 2], tokens[tokenIndex]):
        text = token.get("text")
        # startswith() is safe on an empty text, where indexing text[0]
        # would raise IndexError.
        if text.startswith(("-", "+")):
            text = text[1:]
        bounds.append(text)

    if bounds[0].isdigit() and bounds[1].isdigit():
        self.setFeature("RELnumeric_range")

136

def buildKnownWordFeatures(self, string):
    """Set features for words listed as known subcomponents or supergroups.

    string is lower-cased and looked up in the module-level subcomponent
    and supergroup collections. When it ends in "s", the form without
    that final letter is looked up as well. A hit in subcomponent sets
    "RELknown_subcomponent"; a hit in supergroup sets
    "RELknown_supergroup". Both may be set for the same word.
    """
    string = string.lower()

    # endswith() also copes with an empty string, where string[-1] would
    # raise IndexError.
    if string.endswith("s"):
        singular = string[:-1]
    else:
        singular = None

    if string in subcomponent or singular in subcomponent:
        self.setFeature("RELknown_subcomponent")
    if string in supergroup or singular in supergroup:
        self.setFeature("RELknown_supergroup")

Source Code for Module TEES.ExampleBuilders.FeatureBuilders.RELFeatureBuilder