
Source Code for Module TEES.Utils.Convert.convertDDI

import sys, os, time
import random # used by getSets below
import shutil
import tempfile
import subprocess
thisPath = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.abspath(os.path.join(thisPath,"../../")))
import Utils.STFormat.STTools as ST
import Utils.STFormat.ConvertXML as STConvert
import Utils.InteractionXML.RemoveUnconnectedEntities
import Utils.InteractionXML.DivideSets
import Utils.Download
import Utils.ProteinNameSplitter as ProteinNameSplitter
import Utils.Settings as Settings
import Utils.Stream as Stream
import Utils.FindHeads as FindHeads
import Tools.SentenceSplitter
import Tools.BLLIPParser
import Tools.StanfordParser
#import Utils.InteractionXML.CopyParse
try:
    import cElementTree as ET
except ImportError:
    import xml.etree.cElementTree as ET
import Utils.ElementTreeUtils as ETUtils
from collections import defaultdict
import Utils.Range as Range
import DDITools

def getSets(popSize):
    random.seed(15)
    pop = range(popSize)
    random.shuffle(pop)
    floatPopSize = float(popSize)
    trainSet = set(pop[0:int(0.5 * floatPopSize)])
    develSet = set(pop[int(0.5 * floatPopSize):int(0.75 * floatPopSize)])
    testSet = set(pop[int(0.75 * floatPopSize):])
    assert len(trainSet) + len(develSet) + len(testSet) == popSize

    division = []
    for i in xrange(popSize):
        if i in trainSet:
            division.append("t")
        elif i in develSet:
            division.append("d")
        else:
            division.append("e")
    assert len(division) == popSize
    return division

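getSets produces a deterministic 50/25/25 train/devel/test division (the random seed is fixed, so repeated calls give the same shuffle). A minimal usage sketch, assuming a population of eight items; the function is not actually called elsewhere in this module:

    division = getSets(8)
    # indices 0..7 are shuffled reproducibly, then split 4/2/2
    print division.count("t"), division.count("d"), division.count("e") # 4 2 2
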
def fixEntities(xml):
    counts = defaultdict(int)
    for sentence in xml.getiterator("sentence"):
        sText = sentence.get("text")
        for entity in sentence.findall("entity"):
            charOffset = entity.get("charOffset")
            if charOffset == "-":
                assert False, str(entity)
                sentence.remove(entity)
                counts["removed-invalid"] += 1
            else:
                charOffset = Range.charOffsetToSingleTuple(charOffset)
                # fix length
                realLength = len(entity.get("text"))
                lenDiff = (charOffset[1] - charOffset[0] + 1) - realLength
                if lenDiff != 0:
                    counts["incorrect-ent-offset"] += 1
                    counts["incorrect-ent-offset-diff" + str(lenDiff)] += 1
                    if abs(lenDiff) > 2:
                        print "Warning, lenDiff:", (lenDiff, charOffset, sText, entity.get("text"), entity.get("id"))
                charOffset = (charOffset[0], charOffset[0] + realLength - 1)
                # find starting position
                entIndex = sText.find(entity.get("text"), charOffset[0])
                if entIndex == -1:
                    for i in [-1, -2, -3]:
                        entIndex = sText.find(entity.get("text"), charOffset[0] + i)
                        if entIndex != -1:
                            break
                if entIndex != 0: # could be lowercase
                    sTextLower = sText.lower()
                    for i in [0, -1, -2, -3]:
                        lowerEntIndex = sTextLower.find(entity.get("text"), charOffset[0] + i)
                        if lowerEntIndex != -1:
                            break
                    if lowerEntIndex != -1 and abs(lowerEntIndex - charOffset[0]) < abs(entIndex - charOffset[0]):
                        entIndex = lowerEntIndex
                assert entIndex != -1, (charOffset, sText, entity.get("text"), entity.get("id"))
                indexDiff = entIndex - charOffset[0]
                if indexDiff != 0:
                    counts["incorrect-ent-index"] += 1
                    counts["incorrect-ent-index-diff" + str(indexDiff)] += 1
                    print "Warning, indexDiff:", (indexDiff, charOffset, sText, entity.get("text"), entity.get("id"))
                # move offset
                charOffset = (charOffset[0] + indexDiff, charOffset[1] + indexDiff)
                # validate new offset
                sEntity = sText[charOffset[0]:charOffset[1] + 1]
                assert sEntity == entity.get("text") or sEntity.lower() == entity.get("text"), (charOffset, sText, entity.get("text"), entity.get("id"))
                entity.set("charOffset", Range.tuplesToCharOffset((charOffset[0], charOffset[1] + 1)))
                entity.set("isName", "True")
        for interaction in sentence.findall("interaction"):
            interaction.set("type", "DDI")
    print "Fix counts:", counts

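The heart of the offset repair is a bounded string search around the annotated start position. A standalone sketch of that idea, using a hypothetical sentence with a deliberately off-by-one annotation (no TEES dependencies assumed):

    sText = "Aspirin interacts with warfarin."
    entText = "warfarin"
    annotatedStart = 24 # hypothetical annotation; the true start index is 23
    entIndex = sText.find(entText, annotatedStart)
    if entIndex == -1: # scan a few characters back, as fixEntities does
        for i in [-1, -2, -3]:
            entIndex = sText.find(entText, annotatedStart + i)
            if entIndex != -1:
                break
    print entIndex # 23
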
def convertToInteractions(xml):
    print "Renaming pair-elements"
    counts = defaultdict(int)
    for sentence in xml.getiterator("sentence"):
        sentence.set("charOffset", "0-" + str(len(sentence.get("text")) - 1))
        for pair in sentence.findall("pair"):
            if pair.get("interaction") == "true":
                pair.tag = "interaction"
                pair.set("type", "DDI")
                counts["pos"] += 1
            else:
                sentence.remove(pair)
                counts["neg"] += 1
    print "Pair counts:", counts

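The retagging happens in place: a positive pair keeps its attributes and gains type="DDI", while negative pairs are dropped from the sentence. A minimal sketch with a hypothetical pair id:

    demoPair = ET.fromstring('<pair id="DrugDDI.d0.s0.p0" interaction="true"/>')
    demoPair.tag = "interaction"
    demoPair.set("type", "DDI")
    print ET.tostring(demoPair) # same attributes, now an interaction element with type="DDI"
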
def loadDocs(url, outDir, tempDir, idStart=0):
    inDir = Utils.Download.downloadAndExtract(url, tempDir, outDir)[0]
    inDir = os.path.join(tempDir, inDir)

    print "Loading documents from", inDir
    sentences = {"positive":[], "negative":[]}
    docCounts = {}
    docById = {}
    documents = []
    for filename in sorted(os.listdir(inDir)):
        if filename.endswith(".xml"):
            print "Reading", filename,
            xml = ETUtils.ETFromObj(os.path.join(inDir, filename))
            for document in xml.getiterator("document"):
                counts = [0, 0]
                for sentence in document.findall("sentence"):
                    #sentence.set("document.get("origId") + "." + sentence.get("origId"))
                    truePairs = False
                    for pair in sentence.findall("pair"):
                        if pair.get("interaction") == "true":
                            truePairs = True
                            break
                    if truePairs:
                        counts[0] += 1
                        sentences["positive"].append(sentence)
                    else:
                        counts[1] += 1
                        sentences["negative"].append(sentence)
                assert document.get("id") not in docCounts
                docCounts[document.get("id")] = counts
                docById[document.get("id")] = document
                documents.append(document)
                print counts,
                #print ETUtils.toStr(document)
            print
    print "Positive sentences:", len(sentences["positive"])
    print "Negative sentences:", len(sentences["negative"])
    return documents, docById, docCounts

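loadDocs is called from convertDDI below as

    documents, docById, docCounts = loadDocs(trainUnified, outDir + "/DDI11-original", tempdir)

and its three return values are parallel views of the same documents; their shape, with hypothetical ids and counts:

    # documents: [<document Element>, ...] in file order
    # docById:   {"DrugDDI.d1": <document Element>, ...}
    # docCounts: {"DrugDDI.d1": [3, 2], ...} # [sentences with a true pair, sentences without]
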
def convertDDI(outDir, trainUnified=None, trainMTMX=None, testUnified=None, testMTMX=None, downloadDir=None, redownload=False, makeIntermediateFiles=True, debug=False):
    cwd = os.getcwd()
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'11 corpus", "======================="

    bigfileName = os.path.join(outDir, "DDI")
    #oldXML = ETUtils.ETFromObj(bigfileName+".xml")
    if trainUnified == None:
        trainUnified = Settings.URL["DDI_TRAIN_UNIFIED"]
    if trainMTMX == None:
        trainMTMX = Settings.URL["DDI_TRAIN_MTMX"]
    if testUnified == None:
        testUnified = Settings.URL["DDI_TEST_UNIFIED"]
    if testMTMX == None:
        testMTMX = Settings.URL["DDI_TEST_MTMX"]

    # Load main documents
    tempdir = tempfile.mkdtemp()
    print >> sys.stderr, "Temporary files directory at", tempdir
    documents, docById, docCounts = loadDocs(trainUnified, outDir + "/DDI11-original", tempdir)
    # Divide training data into a train and devel set
    sortedDocCounts = sorted(docCounts.iteritems(), key=lambda (k,v): (v,k), reverse=True)
    datasetCounts = {"train":[0,0], "devel":[0,0], "test":[0,0]}
    for i in range(0, len(sortedDocCounts)-3, 4):
        for j in [0,1]:
            docById[sortedDocCounts[i+j][0]].set("set", "train")
            datasetCounts["train"][0] += sortedDocCounts[i+j][1][0]
            datasetCounts["train"][1] += sortedDocCounts[i+j][1][1]
        docById[sortedDocCounts[i+2][0]].set("set", "train") #docById[sortedDocCounts[i+2][0]].set("set", "devel")
        docById[sortedDocCounts[i+3][0]].set("set", "devel") #docById[sortedDocCounts[i+3][0]].set("set", "test")
        datasetCounts["train"][0] += sortedDocCounts[i+2][1][0] #datasetCounts["devel"][0] += sortedDocCounts[i+2][1][0]
        datasetCounts["train"][1] += sortedDocCounts[i+2][1][1] #datasetCounts["devel"][1] += sortedDocCounts[i+2][1][1]
        datasetCounts["devel"][0] += sortedDocCounts[i+3][1][0] #datasetCounts["test"][0] += sortedDocCounts[i+3][1][0]
        datasetCounts["devel"][1] += sortedDocCounts[i+3][1][1] #datasetCounts["test"][1] += sortedDocCounts[i+3][1][1]
    for document in documents: # leftovers from the groups of four go to train
        if document.get("set") == None:
            document.set("set", "train")
    # Print division results
    print >> sys.stderr, datasetCounts
    for key in datasetCounts.keys():
        if datasetCounts[key][1] != 0:
            print key, datasetCounts[key][0] / float(datasetCounts[key][1])
        else:
            print key, datasetCounts[key][0], "/", float(datasetCounts[key][1])
    # Some of the train and test ids overlap. Change the train set ids, because the test set
    # ones are needed for the final evaluation.
    changeIdCount = 1000
    for trainId in ['DrugDDI.d312', 'DrugDDI.d316', 'DrugDDI.d332', 'DrugDDI.d334', 'DrugDDI.d337',
                    'DrugDDI.d342', 'DrugDDI.d349', 'DrugDDI.d354', 'DrugDDI.d373', 'DrugDDI.d379',
                    'DrugDDI.d383', 'DrugDDI.d388', 'DrugDDI.d392', 'DrugDDI.d396', 'DrugDDI.d398',
                    'DrugDDI.d409', 'DrugDDI.d411', 'DrugDDI.d415', 'DrugDDI.d425', 'DrugDDI.d430',
                    'DrugDDI.d433', 'DrugDDI.d448', 'DrugDDI.d450', 'DrugDDI.d452', 'DrugDDI.d462',
                    'DrugDDI.d467', 'DrugDDI.d470', 'DrugDDI.d474', 'DrugDDI.d480', 'DrugDDI.d482',
                    'DrugDDI.d485', 'DrugDDI.d492', 'DrugDDI.d494', 'DrugDDI.d496', 'DrugDDI.d498',
                    'DrugDDI.d500', 'DrugDDI.d503', 'DrugDDI.d506', 'DrugDDI.d518', 'DrugDDI.d523',
                    'DrugDDI.d528', 'DrugDDI.d535', 'DrugDDI.d539', 'DrugDDI.d552', 'DrugDDI.d554',
                    'DrugDDI.d558', 'DrugDDI.d561', 'DrugDDI.d570', 'DrugDDI.d578']:
        newId = "DrugDDI.d" + str(changeIdCount)
        print >> sys.stderr, "Changing train/devel id", trainId, "to", newId
        for element in docById[trainId].getiterator():
            for attrName, attrValue in element.attrib.iteritems():
                if trainId in attrValue:
                    element.set(attrName, attrValue.replace(trainId, newId))
        docById[newId] = docById[trainId]
        del docById[trainId]
        changeIdCount += 1
    # If the test set exists, load it too
    if testUnified != None:
        testDocuments, testDocById, testDocCounts = loadDocs(testUnified, outDir + "/DDI11-original", tempdir)
        for document in testDocuments:
            document.set("set", "test")
        documents = documents + testDocuments
        overlappingIds = []
        for key in docById:
            if key in testDocById:
                overlappingIds.append(key)
        for key in docById:
            assert key not in testDocById, (key, docById[key].get("origId"), testDocById[key].get("origId"), sorted(docById.keys()), sorted(testDocById.keys()), sorted(overlappingIds))
        docById.update(testDocById)

    # Add all documents into one XML
    xmlTree = ET.ElementTree(ET.Element("corpus"))
    root = xmlTree.getroot()
    root.set("source", "DrugDDI")
    for document in documents:
        root.append(document)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents-notfixed.xml")
    xml = xmlTree
    print >> sys.stderr, "Fixing DDI XML"
    fixEntities(xml)
    convertToInteractions(xml)
    # Add MTMX
    if trainMTMX != None:
        inDir = Utils.Download.getTopDir(tempdir, Utils.Download.downloadAndExtract(trainMTMX, tempdir, outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if testMTMX != None:
        inDir = Utils.Download.getTopDir(tempdir, Utils.Download.downloadAndExtract(testMTMX, tempdir, outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents.xml")

    print >> sys.stderr, "---------------", "Inserting TEES-generated analyses", "---------------"
    Utils.Download.downloadAndExtract(Settings.URL["TEES_PARSES"], os.path.join(Settings.DATAPATH, "TEES-parses"), downloadDir, redownload=redownload)
    extractedFilename = os.path.join(Settings.DATAPATH, "TEES-parses") + "/DDI"
    print >> sys.stderr, "Making sentences"
    Tools.SentenceSplitter.makeSentences(xml, extractedFilename, None)
    print >> sys.stderr, "Inserting McCC parses"
    Tools.BLLIPParser.insertParses(xml, extractedFilename, None, extraAttributes={"source":"TEES-preparsed"})
    print >> sys.stderr, "Inserting Stanford conversions"
    Tools.StanfordParser.insertParses(xml, extractedFilename, None, extraAttributes={"stanfordSource":"TEES-preparsed"})
    print >> sys.stderr, "Protein Name Splitting"
    splitTarget = "McCC"
    #ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget)
    ProteinNameSplitter.mainFunc(xml, None, splitTarget, removeOld=True)
    print >> sys.stderr, "Head Detection"
    #xml = FindHeads.findHeads(xml, "split-"+splitTarget, tokenization=None, output=None, removeExisting=True)
    xml = FindHeads.findHeads(xml, splitTarget, tokenization=None, output=None, removeExisting=True)

    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outDir, "DDI", ".xml")

    Stream.closeLog(logFileName)
    if not debug:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)

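The division loop above walks the documents in descending order of their sentence counts, four at a time, sending three of each group to train and the fourth to devel (the commented-out assignments preserve an earlier train/devel/test scheme). The pattern in isolation, on hypothetical ids already sorted by counts:

    ids = ["d%d" % i for i in range(8)] # hypothetical, pre-sorted document ids
    sets = {}
    for i in range(0, len(ids) - 3, 4):
        for j in [0, 1, 2]: # first three of each group of four
            sets[ids[i + j]] = "train"
        sets[ids[i + 3]] = "devel"
    print sorted(sets.items()) # 6 documents in train, 2 in devel
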
if __name__=="__main__":
    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    from optparse import OptionParser
    from Utils.Parameters import *
    optparser = OptionParser(usage="%prog [options]\nDDI'11 Shared Task corpus conversion")
    optparser.add_option("-o", "--outdir", default=os.path.normpath(Settings.DATAPATH + "/corpora"), dest="outdir", help="directory for output files")
    optparser.add_option("-d", "--downloaddir", default=None, dest="downloaddir", help="directory to download corpus files to")
    optparser.add_option("--intermediateFiles", default=False, action="store_true", dest="intermediateFiles", help="save intermediate corpus files")
    optparser.add_option("--redownload", default=False, action="store_true", dest="redownload", help="re-download all source files")
    optparser.add_option("--debug", default=False, action="store_true", dest="debug", help="keep temporary files")
    (options, args) = optparser.parse_args()

    convertDDI(options.outdir, None, None, None, None, options.downloaddir, options.redownload, options.intermediateFiles, options.debug)
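
The module can be run directly; a typical invocation (hypothetical paths; the corpus and MTMX sources default to the Settings.URL entries when not passed explicitly):

    python convertDDI.py -o /data/corpora -d /data/downloads --intermediateFiles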