Package TEES :: Package ExampleBuilders :: Package FeatureBuilders :: Module RELFeatureBuilder
[hide private]

Source Code for Module TEES.ExampleBuilders.FeatureBuilders.RELFeatureBuilder

  1  from FeatureBuilder import FeatureBuilder 
  2   
  3  # Amino acids from http://www.bio.davidson.edu/courses/genomics/jmol/aatable.html 
  4  #amino acid     three letter code     single letter code 
  5   
  6  subcomponent = set(["region", "promoter", "upstream", "fragment", "site", 
  7                "sequence", "segment", "repeat", "repeat", "element", 
  8                "duplication", "exon", "downstream", "terminus", "motif", 
  9                "frame", "carboxy-terminus", "domain", "subunit", "codon", 
 10                "promoter", "enhancer", "locus", "ltr", "helix-loop-helix", 
 11                "zinc-finger", "portion", "residue", "box", "intron"]) 
 12   
 13  supergroup = set(["complex", "family", "octamer", "microtubule"]) 
 14   
 15  aminoAcids = [ 
 16      #nonpolar (hydrophobic) 
 17      ("glycine", "gly", "g", "nonpolar", "neutral"),  
 18      ("alanine", "ala", "a", "nonpolar", "neutral"), 
 19      ("valine", "val", "v", "nonpolar", "neutral"), 
 20      ("leucine", "leu", "l", "nonpolar", "neutral"), 
 21      ("isoleucine", "ile", "i", "nonpolar", "neutral"), 
 22      ("methionine", "met", "m", "nonpolar", "neutral"), 
 23      ("phenylalanine", "phe", "f", "nonpolar", "neutral"), 
 24      ("tryptophan", "trp", "w", "nonpolar", "neutral"), 
 25      ("proline", "pro", "p", "nonpolar", "neutral"),  
 26      #polar (hydrophilic) 
 27      ("serine", "ser", "s", "hydrophilic", "neutral"), 
 28      ("threonine", "thr", "t", "hydrophilic", "neutral"), 
 29      ("cysteine", "cys", "c", "hydrophilic", "neutral"), 
 30      ("tyrosine", "tyr", "y", "hydrophilic", "neutral"), 
 31      ("asparagine", "asn", "n", "hydrophilic", "neutral"), 
 32      ("glutamine", "gln", "q", "hydrophilic", "neutral"), 
 33      #electrically charged (negative and hydrophilic) 
 34      ("aspartic acid", "asp", "d", "hydrophilic", "negative"), 
 35      ("glutamic acid", "glu", "e", "hydrophilic", "negative"), 
 36      #electrically charged (positive and hydrophilic) 
 37      ("lysine", "lys", "k", "hydrophilic", "positive"), 
 38      ("arginine", "arg", "r", "hydrophilic", "positive"), 
 39      ("histidine", "his", "h", "hydrophilic", "positive")] 
 40   
class RELFeatureBuilder(FeatureBuilder):
    """Builds "REL"-prefixed features from the text of a token and its context.

    Features are emitted through ``self.setFeature``, which is not defined in
    this class (presumably inherited from ``FeatureBuilder`` -- confirm
    there). The module-level ``aminoAcids``, ``subcomponent`` and
    ``supergroup`` tables supply the vocabularies.
    """

    # (substring, tag) pairs tried by buildSubstringFeatures, in this order.
    # A tag of None means the substring itself is used as the tag.
    _SUBSTRING_PATTERNS = (
        # nfkb
        ("nfkappab", "nfkb"),
        ("nfkb", None),
        ("nfkappab", "complex"),
        ("nfkb", "complex"),
        # kappa-b
        ("kappab", None),
        # ap-1
        ("ap1", None),
        ("activatingprotein1", "ap1"),
        ("ap1", "complex"),
        ("activatingprotein1", "complex"),
        # proteasome
        ("proteasome", None),
        ("proteasome", "complex"),
        # base pairs
        ("bp", "bp"),
        ("basepair", "bp"),
        # primes
        ("5'", "5prime"),
        ("3'", "3prime"),
    )

    def __init__(self, featureSet):
        """Initialise the FeatureBuilder base class with ``featureSet``."""
        FeatureBuilder.__init__(self, featureSet)

    def findAminoAcid(self, string):
        """Find the first amino acid mentioned in ``string``.

        The amino acids are tried in table order. A full name matches anywhere
        in the lower-cased string. A three letter code matches only when it is
        not adjacent to another letter, so that it cannot be a fragment of a
        longer word. Every occurrence of the code is examined; earlier
        versions gave up after the first occurrence and so missed a standalone
        code that followed an embedded one.

        Returns:
            ``(index, aa)`` where ``index`` is the match position and ``aa``
            the matching ``aminoAcids`` row, or ``(-1, None)`` if nothing
            matches.
        """
        string = string.lower()
        length = len(string)
        for aa in aminoAcids:
            nameIndex = string.find(aa[0])
            if nameIndex != -1:
                return nameIndex, aa
            code = aa[1]  # three letter code
            codeIndex = string.find(code)
            while codeIndex != -1:
                end = codeIndex + len(code)
                startsClean = codeIndex == 0 or not string[codeIndex - 1].isalpha()
                endsClean = end >= length or not string[end].isalpha()
                if startsClean and endsClean:
                    return codeIndex, aa
                codeIndex = string.find(code, codeIndex + 1)
        return -1, None

    def buildAllFeatures(self, tokens, tokenIndex):
        """Build every feature group for the token at ``tokens[tokenIndex]``.

        Each token must provide its text through ``get("text")``.
        """
        token = tokens[tokenIndex]
        tokText = token.get("text").lower()

        self.buildAminoAcidFeatures(tokText)
        self.buildDNAFeatures(tokText)
        self.buildSubstringFeatures(tokens, tokenIndex)
        self.buildRangeFeatures(tokens, tokenIndex)
        self.buildKnownWordFeatures(tokText)

    def buildAminoAcidFeatures(self, string):
        """Set amino acid features if ``string`` mentions an amino acid."""
        _, aa = self.findAminoAcid(string)
        if aa is not None:
            self.setFeature("RELaminoacid_string")
            self.setFeature("RELaminoacid_" + aa[1])

    def findSubstring(self, string, substring, tag=None):
        """Set features describing whether ``substring`` occurs in ``string``.

        Only the first occurrence is considered. Besides the plain
        ``RELsubstring_<tag>`` feature, a ``terminal`` or ``nonterminal``
        variant records whether that occurrence ends the string. ``tag``
        defaults to ``substring``.
        """
        if tag is None:
            tag = substring
        index = string.find(substring)
        if index == -1:
            return
        self.setFeature("RELsubstring_" + tag)
        if index + len(substring) == len(string):
            self.setFeature("RELsubstring_terminal_" + tag)
        else:
            self.setFeature("RELsubstring_nonterminal_" + tag)

    def buildSubstringFeatures(self, tokens, tokenIndex):
        """Set substring features from the up-to-six tokens before the token.

        The texts of the preceding tokens are concatenated, lower-cased and
        stripped of hyphens and spaces before matching.

        The window start is clamped at 0. Earlier versions used
        ``tokens[tokenIndex-6:tokenIndex]`` directly, whose negative start
        counted from the end of the list for the first six tokens and
        typically produced an empty window. Models trained on the old output
        may need retraining.
        """
        # TODO the actual token does not seem to be included
        start = max(0, tokenIndex - 6)
        string = "".join([t.get("text") for t in tokens[start:tokenIndex]])
        string = string.lower().replace("-", "").replace(" ", "")
        for substring, tag in self._SUBSTRING_PATTERNS:
            self.findSubstring(string, substring, tag)

    def buildDNAFeatures(self, string):
        """Set ``RELDNA_sequence`` if ``string`` consists only of a, g, t, c.

        The comparison is case-sensitive; callers pass lower-cased text. An
        empty string is not treated as a sequence.
        """
        if string and all(letter in ("a", "g", "t", "c") for letter in string):
            self.setFeature("RELDNA_sequence")

    @staticmethod
    def _stripSign(text):
        """Return ``text`` without a single leading "-" or "+"."""
        # Slicing instead of indexing keeps empty text from raising IndexError.
        if text[:1] in ("-", "+"):
            return text[1:]
        return text

    def buildRangeFeatures(self, tokens, tokenIndex):
        """Set ``RELnumeric_range`` for patterns like "<number> to <number>".

        The current token and the token two positions back must both be
        digits (after removing one leading sign), and the token in between
        must be "to", "and" or "-".
        """
        if tokenIndex <= 1:
            return
        if tokens[tokenIndex - 1].get("text").lower() not in ("to", "and", "-"):
            return
        t1Text = self._stripSign(tokens[tokenIndex - 2].get("text"))
        t2Text = self._stripSign(tokens[tokenIndex].get("text"))
        if t1Text.isdigit() and t2Text.isdigit():
            self.setFeature("RELnumeric_range")

    def buildKnownWordFeatures(self, string):
        """Set features if ``string`` is a known subcomponent/supergroup word.

        The lower-cased string is looked up as-is and, when it ends in "s",
        also with that final "s" removed.
        """
        string = string.lower()
        candidates = [string]
        # endswith (rather than string[-1]) is safe for empty text.
        if string.endswith("s"):
            candidates.append(string[:-1])
        if any(word in subcomponent for word in candidates):
            self.setFeature("RELknown_subcomponent")
        if any(word in supergroup for word in candidates):
            self.setFeature("RELknown_supergroup")
150