1 from FeatureBuilder import FeatureBuilder
2
3
4
5
# Lower-case terms naming a part or sub-region of a larger biological entity
# (the name suggests "subcomponent" relations; the code that consumes this set
# is outside this view).  Each term is listed exactly once: set() would
# silently collapse repeated literals, which only hides copy/paste slips.
subcomponent = set([
    "region", "promoter", "upstream", "fragment", "site",
    "sequence", "segment", "repeat", "element",
    "duplication", "exon", "downstream", "terminus", "motif",
    "frame", "carboxy-terminus", "domain", "subunit", "codon",
    "enhancer", "locus", "ltr", "helix-loop-helix",
    "zinc-finger", "portion", "residue", "box", "intron",
])

# Lower-case terms naming a larger assembly or grouping (consumer code is
# outside this view).
supergroup = set(["complex", "family", "octamer", "microtubule"])
14
# Table of the twenty standard amino acids, all strings lower-case.
# Each entry is a 5-tuple:
#   (full name, three-letter code, one-letter code, polarity class, charge)
aminoAcids = [
    # nonpolar, neutral
    ("glycine", "gly", "g", "nonpolar", "neutral"),
    ("alanine", "ala", "a", "nonpolar", "neutral"),
    ("valine", "val", "v", "nonpolar", "neutral"),
    ("leucine", "leu", "l", "nonpolar", "neutral"),
    ("isoleucine", "ile", "i", "nonpolar", "neutral"),
    ("methionine", "met", "m", "nonpolar", "neutral"),
    ("phenylalanine", "phe", "f", "nonpolar", "neutral"),
    ("tryptophan", "trp", "w", "nonpolar", "neutral"),
    ("proline", "pro", "p", "nonpolar", "neutral"),
    # hydrophilic, neutral
    ("serine", "ser", "s", "hydrophilic", "neutral"),
    ("threonine", "thr", "t", "hydrophilic", "neutral"),
    ("cysteine", "cys", "c", "hydrophilic", "neutral"),
    ("tyrosine", "tyr", "y", "hydrophilic", "neutral"),
    ("asparagine", "asn", "n", "hydrophilic", "neutral"),
    ("glutamine", "gln", "q", "hydrophilic", "neutral"),
    # hydrophilic, negatively charged
    ("aspartic acid", "asp", "d", "hydrophilic", "negative"),
    ("glutamic acid", "glu", "e", "hydrophilic", "negative"),
    # hydrophilic, positively charged
    ("lysine", "lys", "k", "hydrophilic", "positive"),
    ("arginine", "arg", "r", "hydrophilic", "positive"),
    ("histidine", "his", "h", "hydrophilic", "positive"),
]
40
44
45
46
47
# NOTE(review): this span reads like a function body (it uses `return` and an
# otherwise undefined name `string`), but no `def` line is visible here and the
# original indentation is missing, so the nesting is not shown -- confirm
# against the real file.
# As far as the visible statements show: lower-case `string`, look for an amino
# acid from the `aminoAcids` table in it, and return `(index, table_entry)` for
# the first table entry that matches, or `(-1, None)` otherwise.
# `aminoAcids` is only read below, never assigned.
49 global aminoAcids
50
51 string = string.lower()
# Entries are tried in table order.
52 for aa in aminoAcids:
# aa[0] is the full name; str.find reports its first occurrence (or -1) and no
# word-boundary check is applied to it.
53 word = string.find(aa[0])
54 if word != -1:
55 return word, aa
56 else:
# aa[1] is the three-letter code; only its first occurrence is examined.
57 tlc = string.find(aa[1])
58 if tlc != -1:
59
# Accept the code only when it is not embedded in a longer word: the character
# before it and the character after its three letters must be non-alphabetic,
# or the code must touch the start / end of the string.
60 if (tlc == 0 or not string[tlc-1].isalpha()) and (tlc + 3 >= len(string) or not string[tlc + 3].isalpha()):
61 return tlc, aa
# Presumably reached once the loop ends without any match -- nesting not
# visible, confirm.
62 return -1, None
63
73
79
# NOTE(review): no `def` line is visible for this span and indentation is
# missing.  The calls `self.findSubstring(string, <substring>[, <tag>])`
# elsewhere in this file appear to target this code -- confirm.
# Sets features recording whether `substring` occurs in `string` and whether
# that occurrence ends the string.
81 if tag == None:
# Without an explicit tag, the substring itself is used in the feature names.
82 tag = substring
# str.find returns the index of the first occurrence, or -1 if absent.
83 index = string.find(substring)
84 if index != -1:
85 self.setFeature("RELsubstring_"+tag)
# "terminal" here means that the first occurrence ends exactly at the end of
# `string`; later occurrences are not considered.
86 if index + len(substring) == len(string):
87 self.setFeature("RELsubstring_terminal_"+tag)
88 else:
89 self.setFeature("RELsubstring_nonterminal_"+tag)
90
# NOTE(review): no `def` line is visible for this span and indentation is
# missing; `tokens` and `tokenIndex` are presumably parameters -- confirm.
# Concatenates the text of the tokens preceding `tokenIndex` into one
# normalized string and probes it for a fixed list of substrings.
92 string = ""
# The slice ends before `tokenIndex`, so the token at `tokenIndex` itself is
# excluded and at most six tokens are taken.
# NOTE(review): when tokenIndex < 6 the start `tokenIndex-6` is negative, and
# Python interprets a negative slice start relative to the END of `tokens`, so
# the slice does not select tokens 0..tokenIndex-1 -- confirm whether intended.
93 for t in tokens[tokenIndex-6:tokenIndex]:
94
95 string += t.get("text")
# Lower-case and drop hyphens and spaces, so e.g. "NF-kappa B" becomes
# "nfkappab" before matching.
96 string = string.lower().replace("-", "").replace(" ", "")
97
# Both spellings are reported under the tag "nfkb" (the second call uses the
# substring as its own tag) and additionally under the shared tag "complex".
98 self.findSubstring(string, "nfkappab", "nfkb")
99 self.findSubstring(string, "nfkb")
100 self.findSubstring(string, "nfkappab", "complex")
101 self.findSubstring(string, "nfkb", "complex")
102
103 self.findSubstring(string, "kappab")
104
# Same scheme: both spellings map to tag "ap1" and also to "complex".
105 self.findSubstring(string, "ap1")
106 self.findSubstring(string, "activatingprotein1", "ap1")
107 self.findSubstring(string, "ap1", "complex")
108 self.findSubstring(string, "activatingprotein1", "complex")
109
110 self.findSubstring(string, "proteasome")
111 self.findSubstring(string, "proteasome", "complex")
112
# "bp" and "basepair" share the tag "bp".
113 self.findSubstring(string, "bp", "bp")
114 self.findSubstring(string, "basepair", "bp")
115
# The tags spell out the prime mark instead of using the apostrophe.
116 self.findSubstring(string, "5'", "5prime")
117 self.findSubstring(string, "3'", "3prime")
118
# NOTE(review): no `def` line is visible for this span and indentation is
# missing; where `string` comes from is not shown.
# Sets "RELDNA_sequence" only when every character of `string` is one of the
# lower-case letters a, g, t, c; the first other character returns early
# without setting the feature.
# NOTE(review): an empty `string` never enters the loop body, so the feature
# would be set for it -- assuming the setFeature line follows the loop; confirm.
120 for letter in string:
121 if letter not in ["a", "g", "t", "c"]:
122 return
123 self.setFeature("RELDNA_sequence")
124
# NOTE(review): no `def` line is visible for this span and indentation is
# missing; `tokens` and `tokenIndex` are presumably parameters -- confirm.
# Sets "RELnumeric_range" for patterns such as "<number> to <number>": the
# token before `tokenIndex` is a connector and the tokens on either side of it
# are digit strings.
# tokenIndex > 1 guarantees that index tokenIndex-2 is not negative.
126 if tokenIndex > 1:
127 if tokens[tokenIndex-1].get("text").lower() in ["to", "and", "-"]:
128 t1Text = tokens[tokenIndex-2].get("text")
# Strip one leading sign so that values like "-10" or "+5" pass isdigit().
# NOTE(review): indexing [0] assumes the token text is non-empty; an empty
# text would raise IndexError -- confirm tokens always carry text.
129 if t1Text[0] == "-" or t1Text[0] == "+":
130 t1Text = t1Text[1:]
131 t2Text = tokens[tokenIndex].get("text")
132 if t2Text[0] == "-" or t2Text[0] == "+":
133 t2Text = t2Text[1:]
134 if t1Text.isdigit() and t2Text.isdigit():
135 self.setFeature("RELnumeric_range")
136
150