Package TEES :: Package ExampleBuilders :: Package FeatureBuilders :: Module DrugFeatureBuilder
[hide private]

Source Code for Module TEES.ExampleBuilders.FeatureBuilders.DrugFeatureBuilder

  1  import sys, os 
  2  sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../..")) 
  3  from Core.IdSet import IdSet 
  4  import Core.ExampleUtils as ExampleUtils 
  5  from FeatureBuilder import FeatureBuilder 
  6  import Utils.Settings as Settings 
  7  import Utils.Download 
  8  import Utils.ElementTreeUtils as ETUtils 
  9  from collections import defaultdict 
 10   
def installDrugBank(destPath=None, downloadPath=None, redownload=False, updateLocalSettings=False):
    """
    Download and extract the DrugBank XML and record its location in Settings.

    @param destPath: directory passed to the downloader as the extraction
        target; when None, "resources" under Settings.DATAPATH is used
    @param downloadPath: directory passed to the downloader for the download
        itself; when None, "resources/download" under Settings.DATAPATH is used
    @param redownload: forwarded unchanged to Utils.Download.downloadAndExtract
    @param updateLocalSettings: forwarded unchanged to Settings.setLocal
    """
    banner = ("---------------", "Downloading Drug Bank XML", "---------------")
    sys.stderr.write(" ".join(banner) + "\n")
    sys.stderr.write("See http://www.drugbank.ca/downloads for conditions of use" + "\n")

    # Fall back to the default locations below the data directory
    if destPath is None:
        destPath = os.path.join(Settings.DATAPATH, "resources")
    if downloadPath is None:
        downloadPath = os.path.join(Settings.DATAPATH, "resources/download")

    extracted = Utils.Download.downloadAndExtract(
        Settings.URL["DRUG_BANK_XML"], destPath, downloadPath, redownload=redownload)
    # Exactly one entry is expected back; it is used as the file name below
    assert len(extracted) == 1
    xmlPath = os.path.join(destPath, extracted[0])
    Settings.setLocal("DRUG_BANK_XML", xmlPath, updateLocalSettings)
21
class DrugFeatureBuilder(FeatureBuilder):
    """
    Builds DrugBank- and "mtmx"-attribute-based features for a pair of elements.

    The DrugBank tables are loaded by the first constructor call and stored on
    the class, so every instance shares them:
      - data: first value returned by prepareDrugBank
      - nameToId: second value returned by prepareDrugBank
        (normalized drug name -> list of DrugBank ids)
      - interactionPairs: value returned by buildInteractionPairs
        (id -> id -> True for resolved interactions)
    """
    # Stays None until the first instance has loaded the DrugBank data
    data = None

    def __init__(self, featureSet=None):
        """
        Initialize the base FeatureBuilder and make sure DrugBank is loaded.

        @param featureSet: passed unchanged to FeatureBuilder.__init__
        """
        FeatureBuilder.__init__(self, featureSet)
        if not hasattr(Settings, "DRUG_BANK_XML"):
            sys.stderr.write("Drug Bank XML not installed, installing now" + "\n")
            installDrugBank(updateLocalSettings=True)
        drugBankFile = Settings.DRUG_BANK_XML
        #drugBankFile = "/home/jari/data/DDIExtraction2011/resources/drugbank.xml"
        # Load drug data into memory on first call to constructor
        if DrugFeatureBuilder.data is None:
            DrugFeatureBuilder.data, DrugFeatureBuilder.nameToId = prepareDrugBank(drugBankFile)
            DrugFeatureBuilder.interactionPairs = buildInteractionPairs(DrugFeatureBuilder.data)

    def buildPairFeatures(self, e1, e2):
        """
        Set features describing whether DrugBank lists the pair as interacting.

        @param e1: first element; its "text" attribute is read with get()
        @param e2: second element; its "text" attribute is read with get()
        """
        e1Name = normalizeDrugName(e1.get("text"))
        e2Name = normalizeDrugName(e2.get("text"))
        # Look the pair up once; the result is True, False or "UNKNOWN_NAME"
        interactionType = self.getInteraction(e1Name, e2Name)
        # Compare against True explicitly: "UNKNOWN_NAME" is a truthy string
        # and must not be treated as a known interaction.
        if interactionType == True:
            self.setFeature("DrugBankPairTrueInt")
        else:
            # NOTE(review): the listing this was recovered from lost its
            # indentation; the nested reading below is assumed, since the
            # flat reading would also set "DrugBankPairFalseInt" for true
            # interactions.
            self.setFeature("NotDrugInteraction")
            if interactionType == "UNKNOWN_NAME":
                self.setFeature("PairNotInDrugBank")
            else:
                self.setFeature("DrugBankPairFalseInt")

    def getMTMXAttrs(self, e1, e2, attr):
        """
        Return the given attribute of both elements as a sorted two-item list.

        Each value is converted with str(), lower-cased and stripped of
        spaces; an empty result is replaced by "none" (a missing attribute
        becomes "none" as well, via str(None)).

        @param e1: first element (queried with get())
        @param e2: second element (queried with get())
        @param attr: name of the attribute to read
        @return: list of two strings in sorted order
        """
        values = []
        for element in (e1, e2):
            value = str(element.get(attr)).lower().replace(" ", "")
            if value == "":
                value = "none"
            values.append(value)
        values.sort()
        return values

    def buildMTMXFeatures(self, e1, e2):
        """
        Set features from the mtmxName, mtmxNameShort, mtmxCui, mtmxProb and
        mtmxSemTypes attributes of the two elements.

        @param e1: first element (queried with get())
        @param e2: second element (queried with get())
        """
        # Names
        names = self.getMTMXAttrs(e1, e2, "mtmxName")
        self.setFeature("mtmxNames-" + "-".join(names))
        if names[0] == names[1]:
            if names[0] in ["", "none"]:
                self.setFeature("mtmxNames-both_unknown")
            else:
                self.setFeature("mtmxNames-both_identical")
        shortNames = self.getMTMXAttrs(e1, e2, "mtmxNameShort")
        self.setFeature("mtmxShortNames-" + "-".join(shortNames))
        # Concept identifiers: one feature per value plus one for the pair
        mtmxCuis = self.getMTMXAttrs(e1, e2, "mtmxCui")
        for mtmxCui in mtmxCuis:
            self.setFeature("mtmxCui_" + mtmxCui)
        self.setFeature("mtmxCuis-" + "-".join(mtmxCuis))
        # Probabilities: integer strings of at most 1000, scaled by 1/1000;
        # a missing value counts as 0
        probs = self.getMTMXAttrs(e1, e2, "mtmxProb")
        if probs[0] in ["", "none"]:
            probs[0] = "0"
        if probs[1] in ["", "none"]:
            probs[1] = "0"
        probs[0] = int(probs[0])
        probs[1] = int(probs[1])
        assert probs[0] <= 1000 and probs[1] <= 1000, (probs[0], probs[1])
        # getMTMXAttrs sorted the values as strings; re-sort them numerically
        probs.sort()
        self.setFeature("mtmxProbMin", float(probs[0]) / 1000.0)
        self.setFeature("mtmxProbMax", float(probs[1]) / 1000.0)
        # Semtypes: every combination of the comma-separated values
        sem = self.getMTMXAttrs(e1, e2, "mtmxSemTypes")
        for i in sem[0].split(","):
            for j in sem[1].split(","):
                semPair = [i, j]
                semPair.sort()
                self.setFeature("semPair-" + "-".join(semPair))
                # NOTE(review): placed inside the inner loop because j is
                # used; the original indentation was not recoverable.
                self.setFeature("semType-" + i)
                self.setFeature("semType-" + j)

    def getInteraction(self, e1Name, e2Name):
        """
        Look up whether two drug names are recorded as interacting.

        @param e1Name: first drug name (normalized here before the lookup)
        @param e2Name: second drug name (normalized here before the lookup)
        @return: "UNKNOWN_NAME" if either name maps to no DrugBank id,
            True if any id of the first name is paired with any id of the
            second in interactionPairs, otherwise False
        """
        # Use get() instead of indexing: both tables are defaultdicts (see
        # mapNamesToIds and buildInteractionPairs), so indexing with an
        # unseen key would insert a new empty entry into the shared
        # class-level tables on every miss.
        e1Ids = DrugFeatureBuilder.nameToId.get(normalizeDrugName(e1Name))
        e2Ids = DrugFeatureBuilder.nameToId.get(normalizeDrugName(e2Name))
        if not e1Ids or not e2Ids:
            return "UNKNOWN_NAME" # unknown drug name
        pairs = DrugFeatureBuilder.interactionPairs
        for id1 in e1Ids:
            partners = pairs.get(id1)
            if not partners:
                continue
            for id2 in e2Ids:
                if partners.get(id2):
                    return True
        return False
105
def normalizeDrugName(text):
    """
    Return a lookup key for a drug name.

    Hyphens, slashes, commas, backslashes and spaces are removed and the
    remainder is lower-cased.

    @param text: the drug name as a string
    @return: the normalized name
    """
    for separator in ("-", "/", ",", "\\", " "):
        text = text.replace(separator, "")
    return text.lower()
108
def getNestedItems(parent, term, data, preTag, termPlural=None, verbose=False):
    """
    Append the text of every <term> child of a container element to data[term].

    The container is the child of parent named termPlural, or term + "s"
    when termPlural is None. A parent without that container contributes
    nothing and data is left untouched.

    @param parent: element whose container child is searched
    @param term: tag name (without namespace) of the items, also the key
        under which their texts are appended in data
    @param data: mapping from term to a list; data[term] must be appendable
        (e.g. a defaultdict(list))
    @param preTag: namespace prefix prepended to every tag name
    @param termPlural: tag name of the container when it is not term + "s"
    @param verbose: when true, print each collected item
    """
    if termPlural is None:
        termPlural = term + "s"
    container = parent.find(preTag + termPlural)
    # find() returns None for a missing child. Test with "is None": an
    # element without children is falsy, so a truthiness test would be wrong.
    if container is None:
        return
    for item in container.findall(preTag + term):
        data[term].append(item.text)
        if verbose:
            # %-formatting also copes with an item that has no text (None)
            print(" %s: %s" % (term, item.text))
117
def resolveInteractions(data, verbose=False):
    """
    Rewrite each interaction's partner id to "DB" + five digits and fill in
    the partner's name. The interaction lists in data are modified in place.

    Every interaction is a list whose item 0 is the partner id and whose
    item 1 receives the partner's name when that id is a key of data;
    otherwise item 1 is left as it was.

    A partner id that already starts with "DB" is kept unchanged. Without
    that check such an id would be prefixed a second time ("DBDB00001"),
    would never match a key of data, and calling this function twice would
    corrupt the ids.

    @param data: mapping of DrugBank id to a record providing "name" and
        "interaction" entries
    @param verbose: when true, print progress and the final counts
    """
    counts = defaultdict(int)
    if verbose:
        print("Resolving Interactions")
    for drugId in data:
        if verbose:
            print("%s %s" % (drugId, data[drugId]["name"]))
        for interaction in data[drugId]["interaction"]:
            partnerDBId = str(interaction[0])
            if not partnerDBId.startswith("DB"):
                # Left-pad the bare number with zeros to five characters
                partnerDBId = "DB" + partnerDBId.rjust(5, "0")
            interaction[0] = partnerDBId
            if partnerDBId in data:
                interaction[1] = data[partnerDBId]["name"]
                counts["found-partner-ids"] += 1
            else:
                counts["missing-partner-ids"] += 1
            if verbose:
                print("%s %s" % (" ", interaction))
    if verbose:
        print("%s %s" % ("Interaction resolution counts:", counts))
134
def buildInteractionPairs(data):
    """
    Build a symmetric lookup table of interacting id pairs.

    Only interactions whose item 1 (the partner name) is not None are
    recorded; each is stored in both directions.

    @param data: mapping of id to a record with an "interaction" list; each
        interaction is indexable with the partner id at 0 and its name at 1
    @return: nested defaultdict where pairs[a][b] is True for a recorded
        pair and False for anything else
    """
    pairs = defaultdict(lambda: defaultdict(lambda: False))
    for drugId in data:
        for entry in data[drugId]["interaction"]:
            if entry[1] is None:
                # Partner name was never filled in: skip this interaction
                continue
            partnerId = entry[0]
            pairs[drugId][partnerId] = True
            pairs[partnerId][drugId] = True
    return pairs
143
def mapNamesToIds(data, normalize=True, verbose=False):
    """
    Map every name, synonym and brand of each drug to the ids that use it.

    Ids are visited in sorted order and each id appears at most once per
    name.

    @param data: mapping of id to a record whose "name" is a single value
        and whose "synonym" and "brand" entries are lists
    @param normalize: when true, keys are passed through normalizeDrugName
    @param verbose: when true, print names shared by more than two ids and
        a histogram of ids-per-name
    @return: defaultdict(list) from name to list of ids
    """
    index = defaultdict(list)
    for drugId in sorted(data.keys()):
        record = data[drugId]
        allNames = [record["name"]] + record["synonym"] + record["brand"]
        for name in allNames:
            #assert name not in index, name
            key = normalizeDrugName(name) if normalize else name
            ids = index[key]
            if drugId not in ids:
                ids.append(drugId)
    # count how many names map to 1, 2, 3, ... ids
    histogram = defaultdict(int)
    for key in index:
        ids = index[key]
        histogram[len(ids)] += 1
        if len(ids) > 2 and verbose:
            print("%s %s %s %s" % ("Multiple ids:", len(ids), key, ids))
    if verbose:
        print("%s %s" % ("Name to id:", histogram))
    return index
161
def loadDrugBank(filename, preTag="{http://drugbank.ca}", verbose=False):
    """
    Read the DrugBank XML into a nested dictionary keyed by drug id.

    For every <drug> element the record receives:
      - "name": text of the <name> child
      - "synonym", "brand", "group", "category": lists filled by
        getNestedItems
      - "interaction": one [partner id text, None, description text] list
        per <drug-interaction>; the None slot is a placeholder

    @param filename: passed to ETUtils.ETFromObj to obtain the XML tree
    @param preTag: namespace prefix prepended to every tag name
    @param verbose: when true, print the id and name of each drug
    @return: defaultdict of id -> defaultdict(list) record
    """
    drugs = defaultdict(lambda: defaultdict(list))
    print("Loading DrugBank XML")
    tree = ETUtils.ETFromObj(filename)
    print("Processing DrugBank XML")
    rootElement = tree.getroot()
    assert rootElement.tag == preTag + "drugs", rootElement.tag
    for drugElement in rootElement.findall(preTag + "drug"):
        drugId = drugElement.find(preTag + "drugbank-id").text
        drugName = drugElement.find(preTag + "name").text
        if verbose:
            print("%s %s" % (drugId, drugName))
        # Each id may occur only once in the file
        assert drugId not in drugs
        record = drugs[drugId]
        record["name"] = drugName
        # TODO: Enzymes & targets
        # TODO: hydrophobicity
        nestedTerms = (("synonym", None), ("brand", None), ("group", None),
                       ("category", "categories"))
        for term, plural in nestedTerms:
            getNestedItems(drugElement, term, record, preTag, plural)
        container = drugElement.find(preTag + "drug-interactions")
        for entry in container.findall(preTag + "drug-interaction"):
            partner = entry.find(preTag + "drug").text
            description = entry.find(preTag + "description").text
            record["interaction"].append([partner, None, description])
    return drugs
185
def prepareDrugBank(drugBankFile):
    """
    Load DrugBank, resolve its interactions and build the name index.

    @param drugBankFile: passed to loadDrugBank
    @return: tuple (data, nameToId) of the loaded records and the result of
        mapNamesToIds on them
    """
    drugData = loadDrugBank(drugBankFile)
    # Interactions are resolved in place before the name index is built
    resolveInteractions(drugData)
    return drugData, mapNamesToIds(drugData)
if __name__ == "__main__":
    # Manual smoke test: build the feature builder (which loads DrugBank)
    # and print the lookup result for three name pairs.
#    drugBankFile = "/home/jari/data/DDIExtraction2011/resources/drugbank.xml"
#    data = loadDrugBank(drugBankFile, verbose=True)
#    nameToId = mapNamesToIds(data, verbose=True)
#    #print nameToId
#    #resolveInteractions(data)
    f = DrugFeatureBuilder()
    #print f.interactionPairs
    queries = [
        ("1:", "Refludan", "Treprostinil"),
        ("2:", "Refludan", "TreprostinilBlahBlah"),
        ("3:", "Refludan", "[4-({5-(AMINOCARBONYL)-4-[(3-METHYLPHENYL)AMINO]PYRIMIDIN-2-YL}AMINO)PHENYL]ACETIC ACID"),
    ]
    for label, firstName, secondName in queries:
        print("%s %s" % (label, f.getInteraction(firstName, secondName)))