Package TEES :: Package ExampleBuilders :: Package FeatureBuilders :: Module BacteriaRenamingFeatureBuilder
[hide private]

Source Code for Module TEES.ExampleBuilders.FeatureBuilders.BacteriaRenamingFeatureBuilder

  1  import sys, os 
  2  from FeatureBuilder import FeatureBuilder 
  3  sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/../..") 
  4  import Utils.Settings as Settings 
  5  import Utils.Download 
  6   
  7  # 1) Lowercase bacsu names, there are differences 
  8  # 2) Assert matching to bacsu 
  9  # 3) Bacsu-order seems to be the same as the former/new order 
 10  # Bacsu doesn't have everything 
 11  # 4) http://www.subtiwiki.uni-goettingen.de 
 12   
def readBacsu(filename):
    """
    Read a bacsu-style gene listing into a synonym dictionary.

    Only lines that start with "BSU" are used. Such a line is split on
    whitespace and the tokens from the fifth one onwards are taken as gene
    names: every ";" is removed and the name is lowercased. The first name
    is the primary name, the remaining ones are its synonyms.

    If the same primary name occurs on several lines, a warning is written
    to stderr and the synonyms of the later lines are appended to the
    existing list.

    filename -- path of the file to read
    returns  -- dict mapping primary name (str) to a list of synonyms (str)
    """
    synDict = {}
    # "with" closes the file even if parsing a line raises.
    with open(filename) as f:
        for line in f:
            if not line.startswith("BSU"):
                continue
            synList = [name.replace(";", "").lower() for name in line.split()[4:]]
            if not synList:
                # A BSU line without any name columns has no primary name;
                # skip it instead of failing with an IndexError.
                continue
            primary = synList[0]
            if primary in synDict:
                sys.stderr.write("Warning, %s already a primary name\n" % primary)
                synDict[primary].extend(synList[1:])
            else:
                synDict[primary] = synList[1:]
    return synDict
#print readBacsu("/home/jari/data/BioNLP11SharedTask/bacsu-modified.txt")
def readSubtiwiki(filename):
    """
    Read a comma-separated synonym file into a synonym dictionary.

    Each non-blank line is stripped, split on "," and lowercased. The first
    field is the primary name, the remaining fields are its synonyms.

    If the same primary name occurs on several lines, a warning is written
    to stderr and the synonyms of the later lines are appended to the
    existing list.

    filename -- path of the file to read
    returns  -- dict mapping primary name (str) to a list of synonyms (str)
    """
    synDict = {}
    # "with" closes the file even if parsing a line raises.
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line would otherwise register "" as a primary name
                # (and trigger a bogus duplicate warning on the next one).
                continue
            synList = [name.lower() for name in line.split(",")]
            primary = synList[0]
            if primary in synDict:
                sys.stderr.write("Warning, %s already a primary name\n" % primary)
                synDict[primary].extend(synList[1:])
            else:
                synDict[primary] = synList[1:]
    return synDict
52
def installRENData(destPath=None, downloadPath=None, redownload=False, updateLocalSettings=False):
    """
    Download and extract the TEES resource files used for REN.

    destPath            -- directory passed to downloadAndExtract as the
                           destination; defaults to
                           <Settings.DATAPATH>/resources
    downloadPath        -- directory passed to downloadAndExtract as the
                           download location; defaults to
                           <Settings.DATAPATH>/resources/download
    redownload          -- forwarded unchanged to
                           Utils.Download.downloadAndExtract
    updateLocalSettings -- forwarded unchanged to Settings.setLocal

    After the download, "TEES_RESOURCES" is registered with
    Settings.setLocal using destPath as its value.
    """
    sys.stderr.write("--------------- Downloading TEES data files for REN ---------------\n")
    sys.stderr.write("These files are derived from UniProt bacsu and SubtiWiki\n")
    # Identity comparison is the idiomatic (and __eq__-proof) test for None.
    if destPath is None:
        destPath = os.path.join(Settings.DATAPATH, "resources")
    if downloadPath is None:
        downloadPath = os.path.join(Settings.DATAPATH, "resources/download")
    Utils.Download.downloadAndExtract(Settings.URL["TEES_RESOURCES"], destPath, downloadPath, redownload=redownload)
    Settings.setLocal("TEES_RESOURCES", destPath, updateLocalSettings)
62 63
class BacteriaRenamingFeatureBuilder(FeatureBuilder):
    """
    Builds features for a pair of entities from gene name synonym
    dictionaries and from simple substrings of the entity texts.

    Entities are only accessed through get("text"), so any object offering
    that call works (presumably XML elements -- confirm against callers).
    Features are emitted through self.setFeature, which is not defined in
    this class and is expected to come from FeatureBuilder.
    """

    def __init__(self, featureSet):
        """
        Load the synonym dictionaries and precompute their combinations.

        If Settings has no TEES_RESOURCES attribute, the resource files are
        installed first via installRENData(updateLocalSettings=True).

        featureSet -- passed on unchanged to FeatureBuilder.__init__
        """
        FeatureBuilder.__init__(self, featureSet)
        if not hasattr(Settings, "TEES_RESOURCES"):
            sys.stderr.write("TEES example builder data files not installed, installing now\n")
            installRENData(updateLocalSettings=True)
        # primary name -> list of synonyms, one dictionary per source file
        self.bacsu = readBacsu(os.path.join(Settings.TEES_RESOURCES, "bacsu-modified.txt"))
        self.subti = readSubtiwiki(os.path.join(Settings.TEES_RESOURCES, "Subtiwiki-Synonyms.csv"))
        # self.any: synonyms listed for the key in at least one source (OR)
        # self.all: synonyms listed for the key in both sources (AND)
        # Both map every primary name found in either source to a sorted list.
        self.any = {}
        self.all = {}
        for key in sorted(set(self.bacsu) | set(self.subti)):
            bacsuSynonyms = set(self.bacsu.get(key, ()))
            subtiSynonyms = set(self.subti.get(key, ()))
            self.any[key] = sorted(bacsuSynonyms | subtiSynonyms)
            self.all[key] = sorted(bacsuSynonyms & subtiSynonyms)

    def buildPairFeatures(self, e1, e2):
        """
        Build the synonym features of the pair (e1, e2) for each of the four
        dictionaries, in the order bacsu, subti, any, all.
        """
        taggedDicts = (
            (self.bacsu, "bacsu"),
            (self.subti, "subti"),
            (self.any, "any"),
            (self.all, "all"),
        )
        for synDict, synTag in taggedDicts:
            self.buildPairFeaturesDict(e1, e2, synDict, synTag)

    def buildPairFeaturesDict(self, e1, e2, synDict, synTag):
        """
        Set "<direction><synTag>_synonym" when one entity's text is a primary
        name in synDict and the other entity's text is among its synonyms.

        Both directions are checked: "frw_" uses e1 as the primary name,
        "rev_" uses e2. Texts are stripped and lowercased before the lookup.

        synDict -- dict mapping primary name to a collection of synonyms
        synTag  -- name of the dictionary, used inside the feature name
        """
        for tag, (primary, candidate) in (("frw_", (e1, e2)), ("rev_", (e2, e1))):
            primaryText = primary.get("text").strip().lower()
            candidateText = candidate.get("text").strip().lower()
            if primaryText in synDict and candidateText in synDict[primaryText]:
                self.setFeature(tag + synTag + "_synonym")

    def _getPrefixAndLastLetter(self, entity):
        """
        Return (first three letters, last letter) of the entity's stripped,
        lowercased text, or ("NONE", "NONE") if that text is empty.
        """
        text = entity.get("text").strip().lower()
        if text == "":
            return "NONE", "NONE"
        return text[0:3], text[-1]

    def buildSubstringFeatures(self, e1, e2):
        """
        Set two features that pair up substrings of the two entity texts:
        "REN_subpair_f3_<e1 prefix>_<e2 prefix>" for the first three letters
        and "REN_subpair_l1_<e1 last>_<e2 last>" for the last letter.
        """
        e1FirstThreeLetters, e1LastLetter = self._getPrefixAndLastLetter(e1)
        e2FirstThreeLetters, e2LastLetter = self._getPrefixAndLastLetter(e2)
        self.setFeature("REN_subpair_f3_" + e1FirstThreeLetters + "_" + e2FirstThreeLetters)
        self.setFeature("REN_subpair_l1_" + e1LastLetter + "_" + e2LastLetter)
138