import sys, os, time
import shutil
import tempfile
import subprocess
import random # needed for the corpus division below
thisPath = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.abspath(os.path.join(thisPath,"../../")))
import Utils.STFormat.STTools as ST
import Utils.STFormat.ConvertXML as STConvert
import Utils.InteractionXML.RemoveUnconnectedEntities
import Utils.InteractionXML.DivideSets
import Utils.Download
import Utils.ProteinNameSplitter as ProteinNameSplitter
import Utils.Settings as Settings
import Utils.Stream as Stream
import Utils.FindHeads as FindHeads
import Tools.SentenceSplitter
import Tools.BLLIPParser
import Tools.StanfordParser

try:
    import cElementTree as ET
except ImportError:
    import xml.etree.cElementTree as ET
import Utils.ElementTreeUtils as ETUtils
from collections import defaultdict
import Utils.Range as Range
import DDITools

# NOTE: the original "def" line of this helper was lost; the name below is assumed,
# the signature and body follow the surviving code.
def makeDivision(popSize):
    # Deterministically shuffle the document indices and split them
    # 50/25/25 into train, devel and test sets.
    random.seed(15)
    pop = range(popSize)
    random.shuffle(pop)
    floatPopSize = float(popSize)
    trainSet = set(pop[0:int(0.5 * floatPopSize)])
    develSet = set(pop[int(0.5 * floatPopSize):int(0.75 * floatPopSize)])
    testSet = set(pop[int(0.75 * floatPopSize):])
    assert len(trainSet) + len(develSet) + len(testSet) == popSize

    division = []
    for i in xrange(popSize):
        if i in trainSet:
            division.append("t")
        elif i in develSet:
            division.append("d")
        else:
            division.append("e")
    assert len(division) == popSize
    return division

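# Sanity-check and repair the entity character offsets in the unified XML: offsets whose
# length or position does not match the annotated entity text are shifted until the text
# is found in the sentence, and existing interaction elements are typed as "DDI".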
def fixEntities(xml):
    counts = defaultdict(int)
    for sentence in xml.getiterator("sentence"):
        sText = sentence.get("text")
        for entity in sentence.findall("entity"):
            charOffset = entity.get("charOffset")
            if charOffset == "-":
                assert False, str(entity)
                sentence.remove(entity)
                counts["removed-invalid"] += 1
            else:
                charOffset = Range.charOffsetToSingleTuple(charOffset)

                # Compare the annotated offset length to the length of the entity text
                realLength = len(entity.get("text"))
                lenDiff = (charOffset[1] - charOffset[0] + 1) - realLength
                if lenDiff != 0:
                    counts["incorrect-ent-offset"] += 1
                    counts["incorrect-ent-offset-diff"+str(lenDiff)] += 1
                    if abs(lenDiff) > 2:
                        print "Warning, lenDiff:", (lenDiff, charOffset, sText, entity.get("text"), entity.get("id"))
                charOffset = (charOffset[0], charOffset[0] + realLength - 1)

                # Find where the entity text actually appears, allowing the offset to be off by a few characters
                entIndex = sText.find(entity.get("text"), charOffset[0])
                if entIndex == -1:
                    for i in [-1,-2,-3]:
                        entIndex = sText.find(entity.get("text"), charOffset[0]+i)
                        if entIndex != -1:
                            break
                if entIndex != 0: # also try the lowercased sentence and keep whichever match is closer
                    sTextLower = sText.lower()
                    for i in [0,-1,-2,-3]:
                        lowerEntIndex = sTextLower.find(entity.get("text"), charOffset[0]+i)
                        if lowerEntIndex != -1:
                            break
                    if lowerEntIndex != -1 and abs(lowerEntIndex - charOffset[0]) < abs(entIndex - charOffset[0]):
                        entIndex = lowerEntIndex
                assert entIndex != -1, (charOffset, sText, entity.get("text"), entity.get("id"))
                indexDiff = entIndex - charOffset[0]
                if indexDiff != 0:
                    counts["incorrect-ent-index"] += 1
                    counts["incorrect-ent-index-diff"+str(indexDiff)] += 1
                    print "Warning, indexDiff:", (indexDiff, charOffset, sText, entity.get("text"), entity.get("id"))
                # Shift the offset to the position where the entity text was found
                charOffset = (charOffset[0]+indexDiff, charOffset[1]+indexDiff)

                sEntity = sText[charOffset[0]:charOffset[1]+1]
                assert sEntity == entity.get("text") or sEntity.lower() == entity.get("text"), (charOffset, sText, entity.get("text"), entity.get("id"))
                entity.set("charOffset", Range.tuplesToCharOffset( (charOffset[0], charOffset[1] + 1) ))
            entity.set("isName", "True")
        for interaction in sentence.findall("interaction"):
            interaction.set("type", "DDI")
    print "Fix counts:", counts

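# Convert the DDI "pair" elements into Interaction XML "interaction" elements:
# pairs annotated as true interactions are kept and typed as "DDI", the rest are removed.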
def convertToInteractions(xml):
    print "Renaming pair-elements"
    counts = defaultdict(int)
    for sentence in xml.getiterator("sentence"):
        sentence.set("charOffset", "0-" + str(len(sentence.get("text"))-1))
        for pair in sentence.findall("pair"):
            if pair.get("interaction") == "true":
                pair.tag = "interaction"
                pair.set("type", "DDI")
                counts["pos"] += 1
            else:
                sentence.remove(pair)
                counts["neg"] += 1
    print "Pair counts:", counts

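# Download and read the unified-format corpus files, returning the document elements,
# a mapping from document id to element, and per-document [positive, negative] sentence counts.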
def loadDocs(url, outDir, tempDir, idStart=0):
    inDir = Utils.Download.downloadAndExtract(url, tempDir, outDir)[0]
    inDir = os.path.join(tempDir, inDir)

    print "Loading documents from", inDir
    sentences = {"positive":[], "negative":[]}
    docCounts = {}
    docById = {}
    documents = []
    for filename in sorted(os.listdir(inDir)):
        if filename.endswith(".xml"):
            print "Reading", filename,
            xml = ETUtils.ETFromObj(os.path.join(inDir, filename))
            for document in xml.getiterator("document"):
                counts = [0,0]
                for sentence in document.findall("sentence"):

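                    # A sentence counts as positive if it contains at least one pair
                    # annotated as a true drug-drug interaction.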
                    truePairs = False
                    for pair in sentence.findall("pair"):
                        if pair.get("interaction") == "true":
                            truePairs = True
                            break
                    if truePairs:
                        counts[0] += 1
                        sentences["positive"].append(sentence)
                    else:
                        counts[1] += 1
                        sentences["negative"].append(sentence)
                assert document.get("id") not in docCounts
                docCounts[document.get("id")] = counts
                docById[document.get("id")] = document
                documents.append(document)
                print counts,

    print
    print "Positive sentences:", len(sentences["positive"])
    print "Negative sentences:", len(sentences["negative"])
    return documents, docById, docCounts

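# Convert the DDI'11 (DrugDDI) corpus into Interaction XML: download the source files,
# split the training data into train/devel sets, merge in the test set, fix the
# annotations, insert preparsed analyses and write out the per-set corpus files.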
def convertDDI(outDir, trainUnified=None, trainMTMX=None, testUnified=None, testMTMX=None, downloadDir=None, redownload=False, makeIntermediateFiles=True, debug=False):
    cwd = os.getcwd()
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'11 corpus", "======================="

    bigfileName = os.path.join(outDir, "DDI")

    if trainUnified == None:
        trainUnified = Settings.URL["DDI_TRAIN_UNIFIED"]
    if trainMTMX == None:
        trainMTMX = Settings.URL["DDI_TRAIN_MTMX"]
    if testUnified == None:
        testUnified = Settings.URL["DDI_TEST_UNIFIED"]
    if testMTMX == None:
        testMTMX = Settings.URL["DDI_TEST_MTMX"]

    tempdir = tempfile.mkdtemp()
    print >> sys.stderr, "Temporary files directory at", tempdir
    documents, docById, docCounts = loadDocs(trainUnified, outDir + "/DDI11-original", tempdir)

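    # Distribute the training documents into train and devel sets in a 3:1 ratio.
    # Documents are sorted by their sentence counts and taken in groups of four
    # (three to train, one to devel), which keeps the mix of positive-heavy and
    # negative-only documents similar in both sets. Any documents left over from
    # the grouping default to train.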
    sortedDocCounts = sorted(docCounts.iteritems(), key=lambda (k,v): (v,k), reverse=True)
    datasetCounts = {"train":[0,0], "devel":[0,0], "test":[0,0]}
    for i in range(0, len(sortedDocCounts)-3, 4):
        for j in [0,1]:
            docById[sortedDocCounts[i+j][0]].set("set", "train")
            datasetCounts["train"][0] += sortedDocCounts[i+j][1][0]
            datasetCounts["train"][1] += sortedDocCounts[i+j][1][1]
        docById[sortedDocCounts[i+2][0]].set("set", "train")
        docById[sortedDocCounts[i+3][0]].set("set", "devel")
        datasetCounts["train"][0] += sortedDocCounts[i+2][1][0]
        datasetCounts["train"][1] += sortedDocCounts[i+2][1][1]
        datasetCounts["devel"][0] += sortedDocCounts[i+3][1][0]
        datasetCounts["devel"][1] += sortedDocCounts[i+3][1][1]
    for document in documents:
        if document.get("set") == None:
            document.set("set", "train")

    print >> sys.stderr, datasetCounts
    for key in datasetCounts.keys():
        if datasetCounts[key][1] != 0:
            print key, datasetCounts[key][0] / float(datasetCounts[key][1])
        else:
            print key, datasetCounts[key][0], "/", float(datasetCounts[key][1])

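    # Renumber a fixed list of train/devel document ids into the DrugDDI.d1000+ range,
    # presumably to avoid clashes with document ids in the separately distributed test
    # set (see the overlap assertion below).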
    changeIdCount = 1000
    for trainId in ['DrugDDI.d312', 'DrugDDI.d316', 'DrugDDI.d332', 'DrugDDI.d334', 'DrugDDI.d337',
                    'DrugDDI.d342', 'DrugDDI.d349', 'DrugDDI.d354', 'DrugDDI.d373', 'DrugDDI.d379',
                    'DrugDDI.d383', 'DrugDDI.d388', 'DrugDDI.d392', 'DrugDDI.d396', 'DrugDDI.d398',
                    'DrugDDI.d409', 'DrugDDI.d411', 'DrugDDI.d415', 'DrugDDI.d425', 'DrugDDI.d430',
                    'DrugDDI.d433', 'DrugDDI.d448', 'DrugDDI.d450', 'DrugDDI.d452', 'DrugDDI.d462',
                    'DrugDDI.d467', 'DrugDDI.d470', 'DrugDDI.d474', 'DrugDDI.d480', 'DrugDDI.d482',
                    'DrugDDI.d485', 'DrugDDI.d492', 'DrugDDI.d494', 'DrugDDI.d496', 'DrugDDI.d498',
                    'DrugDDI.d500', 'DrugDDI.d503', 'DrugDDI.d506', 'DrugDDI.d518', 'DrugDDI.d523',
                    'DrugDDI.d528', 'DrugDDI.d535', 'DrugDDI.d539', 'DrugDDI.d552', 'DrugDDI.d554',
                    'DrugDDI.d558', 'DrugDDI.d561', 'DrugDDI.d570', 'DrugDDI.d578']:
        newId = "DrugDDI.d" + str(changeIdCount)
        print >> sys.stderr, "Changing train/devel id", trainId, "to", newId
        for element in docById[trainId].getiterator():
            for attrName, attrValue in element.attrib.iteritems():
                if trainId in attrValue:
                    element.set(attrName, attrValue.replace(trainId, newId))
        docById[newId] = docById[trainId]
        del docById[trainId]
        changeIdCount += 1

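    # Load the test set (if given), mark its documents and merge them in after
    # verifying that no document id occurs in both the train/devel and test data.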
    if testUnified != None:
        testDocuments, testDocById, testDocCounts = loadDocs(testUnified, outDir + "/DDI11-original", tempdir)
        for document in testDocuments:
            document.set("set", "test")
        documents = documents + testDocuments
        overlappingIds = []
        for key in docById:
            if key in testDocById:
                overlappingIds.append(key)
        for key in docById:
            assert key not in testDocById, (key, docById[key].get("origId"), testDocById[key].get("origId"), sorted(docById.keys()), sorted(testDocById.keys()), sorted(overlappingIds))
        docById.update(testDocById)

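    # Build a single <corpus> element containing every train, devel and test document,
    # then fix the entity offsets and convert pair elements to interactions.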
    xmlTree = ET.ElementTree(ET.Element("corpus"))
    root = xmlTree.getroot()
    root.set("source", "DrugDDI")
    for document in documents:
        root.append(document)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents-notfixed.xml")
    xml = xmlTree
    print >> sys.stderr, "Fixing DDI XML"
    fixEntities(xml)
    convertToInteractions(xml)

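    # Merge in the optional MTMX data distributed with the corpus, then write the
    # fixed intermediate corpus file.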
    if trainMTMX != None:
        inDir = Utils.Download.getTopDir(tempdir, Utils.Download.downloadAndExtract(trainMTMX, tempdir, outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if testMTMX != None:
        inDir = Utils.Download.getTopDir(tempdir, Utils.Download.downloadAndExtract(testMTMX, tempdir, outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents.xml")

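    # Insert the TEES-distributed preprocessed analyses (sentence splitting, BLLIP/McCC
    # parses and Stanford dependency conversions), split multi-token names and detect
    # syntactic heads, then divide the corpus into the per-set output files.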
    print >> sys.stderr, "---------------", "Inserting TEES-generated analyses", "---------------"
    Utils.Download.downloadAndExtract(Settings.URL["TEES_PARSES"], os.path.join(Settings.DATAPATH, "TEES-parses"), downloadDir, redownload=redownload)
    extractedFilename = os.path.join(Settings.DATAPATH, "TEES-parses") + "/DDI"
    print >> sys.stderr, "Making sentences"
    Tools.SentenceSplitter.makeSentences(xml, extractedFilename, None)
    print >> sys.stderr, "Inserting McCC parses"
    Tools.BLLIPParser.insertParses(xml, extractedFilename, None, extraAttributes={"source":"TEES-preparsed"})
    print >> sys.stderr, "Inserting Stanford conversions"
    Tools.StanfordParser.insertParses(xml, extractedFilename, None, extraAttributes={"stanfordSource":"TEES-preparsed"})
    print >> sys.stderr, "Protein Name Splitting"
    splitTarget = "McCC"
    ProteinNameSplitter.mainFunc(xml, None, splitTarget, removeOld=True)
    print >> sys.stderr, "Head Detection"
    xml = FindHeads.findHeads(xml, splitTarget, tokenization=None, output=None, removeExisting=True)

    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outDir, "DDI", ".xml")

    Stream.closeLog(logFileName)
    if not debug:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)

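# Command-line entry point. Illustrative invocation (script name and paths are examples only):
#   python convertDDI.py --outdir /path/to/corpora --downloaddir /path/to/downloads --intermediateFiles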
if __name__=="__main__":
    # Use the Psyco JIT compiler if it is installed
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    from optparse import OptionParser
    from Utils.Parameters import *
    optparser = OptionParser(usage="%prog [options]\nDDI'11 Shared Task corpus conversion")
    optparser.add_option("-o", "--outdir", default=os.path.normpath(Settings.DATAPATH + "/corpora"), dest="outdir", help="directory for output files")
    optparser.add_option("-d", "--downloaddir", default=None, dest="downloaddir", help="directory to download corpus files to")
    optparser.add_option("--intermediateFiles", default=False, action="store_true", dest="intermediateFiles", help="save intermediate corpus files")
    optparser.add_option("--redownload", default=False, action="store_true", dest="redownload", help="re-download all source files")
    optparser.add_option("--debug", default=False, action="store_true", dest="debug", help="keep temporary files")
    (options, args) = optparser.parse_args()

    convertDDI(options.outdir, None, None, None, None, options.downloaddir, options.redownload, options.intermediateFiles, options.debug)