1 import sys, os, shutil
2 import subprocess
3 import tempfile
4 thisPath = os.path.dirname(os.path.abspath(__file__))
5 sys.path.append(os.path.join(thisPath,".."))
6 import Utils.Settings as Settings
7 import Utils.Download as Download
8
9
10
11
12 evaluateGE09 = None
13
14
15
16 -def install(destDir=None, downloadDir=None, redownload=False):
17 print >> sys.stderr, "Installing BioNLP'11 evaluators"
18 settings = {}
19 if downloadDir == None:
20 downloadDir = Settings.DATAPATH
21 if destDir == None:
22 destDir = Settings.DATAPATH
23 for corpus in ["GE", "BB", "BI", "CO"]:
24 print >> sys.stderr, "Installing BioNLP'11", corpus, "evaluator"
25 settings[corpus + "_EVALUATOR"] = Download.getTopDir(destDir + "/tools/evaluators/", Download.downloadAndExtract(Settings.URL[corpus + "_EVALUATOR"], destDir + "/tools/evaluators/", downloadDir + "/tools/download/"))
26 print >> sys.stderr, "Installing BioNLP'11", corpus, "evaluator gold data"
27 Download.downloadAndExtract(Settings.URL[corpus + "_DEVEL"], destDir + "/tools/evaluators/gold/" + corpus + "-devel", downloadDir + "/corpora/BioNLP11-original/corpus/", os.path.basename(Settings.URL[corpus + "_DEVEL"])[:-len(".tar.gz")])
28 return settings
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
46 lines = lines[3:]
47 results = {}
48 for line in lines:
49 if line[0] == "-":
50 continue
51 splits = line.strip().split()
52
53 name = splits[0]
54 name = name.replace("=","")
55 name = name.replace("[","")
56 name = name.replace("]","")
57 results[name] = {}
58
59 results[name]["gold"] = int(splits[1])
60 results[name]["gold_match"] = int(splits[3][:-1])
61 results[name]["answer"] = int(splits[4])
62 results[name]["answer_match"] = int(splits[6][:-1])
63 results[name]["recall"] = float(splits[7])
64 results[name]["precision"] = float(splits[8])
65 results[name]["fscore"] = float(splits[9])
66 return results
67
69 for line in lines:
70 print >> sys.stderr, line[:-1]
71
73 import Core.Split as Split
74 files = os.listdir(path)
75 docNumbers = set()
76 for file in files:
77 numPart = file.split(".",1)[0]
78 if numPart.isdigit():
79 docNumbers.add(int(numPart))
80 docNumbers = list(docNumbers)
81 folds = Split.getFolds(len(docNumbers), folds, seed)
82 foldByDocNumber = {}
83 for i in range(len(docNumbers)):
84 foldByDocNumber[docNumbers[i]] = folds[i]
85 return foldByDocNumber
86
88 files = os.listdir(path)
89 for file in files:
90 numPart = file.split(".",1)[0]
91 if numPart.isdigit():
92 numPart = int(numPart)
93 assert folds.has_key(numPart)
94 if folds[numPart] == foldToRemove:
95 os.remove(os.path.join(path, file))
96
98 results = []
99 for i in range(folds):
100 results.append( evaluate(sourceDir, task, folds, i) )
101 print >> sys.stderr, "##### Variance estimation results #####"
102 for r in results:
103 print >> sys.stderr, r["approximate"]["ALL-TOTAL"]
104
106 goldDocIds = set()
107 for filename in os.listdir(goldDir):
108 if filename[-4:] == ".txt":
109 goldDocIds.add(filename.split(".", 1)[0])
110 for filename in os.listdir(sourceDir):
111 if filename.find(".a2") != -1:
112 if filename.split(".", 1)[0] in goldDocIds:
113 return True
114 return False
115
116
117
118
119
120
121
122
123
124
125
126
128 if task in ["GE", "GE09"]:
129 path = ["approximate", "ALL-TOTAL", "fscore"]
130 elif task in ["EPI", "ID", "REN"]:
131 path = ["TOTAL", "fscore"]
132 elif task in ["BB", "BI"]:
133 path = ["fscore"]
134 elif task == "CO":
135 path = ["MENTION LINKING", "fscore"]
136 else:
137 assert False
138
139 current = results
140 for step in path:
141 if step in current:
142 current = current[step]
143 else:
144 return -1
145 return current
146
147 -def evaluate(source, task, goldDir=None, debug=False):
148 print >> sys.stderr, "BioNLP'11 task", task, "devel evaluation"
149
150 subTask = "1"
151 if "." in task:
152 task, subTask = task.split(".")
153
154 if task in ["GE", "GE09"]:
155 results = evaluateGE(source, task, subTask, goldDir=goldDir, debug=debug)
156 elif task in ["EPI", "ID"]:
157 results = evaluateEPIorID(task, source, goldDir)
158 elif task == "REN":
159 results = evaluateREN(source, goldDir)
160 elif task in ["BB", "BI"]:
161 results = evaluateBX(task, source, goldDir)
162 elif task == "CO":
163 results = evaluateCO(source, goldDir)
164 else:
165 results = None
166 print >> sys.stderr, "No BioNLP'11 evaluator for task", task
167
168 if results == None:
169 return None
170 return (getFScore(results, task), results)
171
173
174 if not hasattr(Settings, "BIONLP_EVALUATOR_DIR"):
175 print >> sys.stderr, corpus, "BIONLP_EVALUATOR_DIR setting not defined"
176 evaluatorDir = None
177 else:
178 evaluatorDir = os.path.join(Settings.BIONLP_EVALUATOR_DIR, Settings.EVALUATOR[corpus])
179
180 tempdir = None
181 if sourceDir.endswith(".tar.gz"):
182 tempdir = tempfile.mkdtemp()
183 Download.extractPackage(sourceDir, os.path.join(tempdir, "source"))
184 sourceDir = os.path.join(tempdir, "source")
185 elif corpus == "GE09":
186 tempdir = tempfile.mkdtemp()
187 shutil.copytree(sourceDir, os.path.join(tempdir, "source"))
188 sourceDir = os.path.join(tempdir, "source")
189
190 if goldDir == None:
191 if not hasattr(Settings, "BIONLP_EVALUATOR_GOLD_DIR"):
192 print >> sys.stderr, corpus, "BIONLP_EVALUATOR_GOLD_DIR setting not defined"
193 return evaluatorDir, None
194 goldDir = os.path.join(Settings.BIONLP_EVALUATOR_GOLD_DIR, Settings.EVALUATOR[corpus + "-gold"])
195 if not os.path.exists(goldDir):
196 print >> sys.stderr, corpus, "Evaluator gold data directory", goldDir, "does not exist"
197 goldDir = None
198 if goldDir != None and goldDir.endswith(".tar.gz"):
199 if tempdir == None:
200 tempdir = tempfile.mkdtemp()
201 goldDir = Download.getTopDir(os.path.join(tempdir, "gold"), Download.extractPackage(goldDir, os.path.join(tempdir, "gold")))
202 print >> sys.stderr, "Uncompressed evaluation gold to", goldDir
203 if goldDir != None and not hasGoldDocuments(sourceDir, goldDir):
204 print >> sys.stderr, "Evaluation input has no gold documents"
205 goldDir = None
206
207 sourceDir = os.path.abspath(sourceDir)
208 if evaluatorDir != None:
209 evaluatorDir = os.path.abspath(evaluatorDir)
210 if goldDir != None:
211 goldDir = os.path.abspath(goldDir)
212 if tempdir != None:
213 tempdir = os.path.abspath(tempdir)
214 return evaluatorDir, sourceDir, goldDir, tempdir
215
216 -def evaluateGE(sourceDir, mainTask="GE", task=1, goldDir=None, folds=-1, foldToRemove=-1, evaluations=["strict", "approximate", "decomposition"], verbose=True, silent=False, debug=False):
217 task = str(task)
218 assert mainTask in ["GE", "GE09"], mainTask
219 assert task in ["1","2","3"], task
220 if not silent:
221 print >> sys.stderr, mainTask, "task", task, "evaluation of", sourceDir, "against", goldDir
222 if mainTask == "GE":
223 evaluatorDir, sourceDir, goldDir, tempDir = checkEvaluator("GE", sourceDir, goldDir)
224 taskSuffix = ".a2"
225 else:
226 evaluatorDir, sourceDir, goldDir, tempDir = checkEvaluator("GE09", sourceDir, goldDir)
227
228 taskSuffix = ".a2.t1"
229 for filename in os.listdir(sourceDir):
230 if filename.endswith(".a2"):
231 if task == 1:
232 taskSuffix = ".a2.t1"
233 elif task == 2:
234 taskSuffix = ".a2.t12"
235 else:
236 taskSuffix = ".a2.t123"
237 shutil.move(os.path.join(sourceDir, filename), os.path.join(sourceDir, filename.rsplit(".", 1)[0] + taskSuffix))
238 if goldDir == None:
239 return None
240
241 origDir = os.getcwd()
242 os.chdir(evaluatorDir)
243 if tempDir == None:
244 tempDir = tempfile.mkdtemp()
245 if folds != -1:
246 folds = getFolds(sourceDir, folds)
247 sourceSubsetDir = tempDir + "/source-subset"
248 if os.path.exists(sourceSubsetDir):
249 shutil.rmtree(sourceSubsetDir)
250 shutil.copytree(sourceDir, sourceSubsetDir)
251 removeDocuments(sourceSubsetDir, folds, foldToRemove)
252 else:
253 sourceSubsetDir = sourceDir
254
255 results = {}
256
257
258 if mainTask == "GE09":
259 preparedGoldDir = os.path.join(tempDir, "prepared-gold")
260 commands = "perl prepare-gold.pl " + goldDir + " " + preparedGoldDir
261 p = subprocess.Popen(commands, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
262 if verbose and not silent:
263 printLines(p.stderr.readlines())
264 printLines(p.stdout.readlines())
265 else:
266 p.stderr.readlines()
267 p.stdout.readlines()
268 goldDir = preparedGoldDir
269
270
271 outDir = tempDir + "/output"
272 if mainTask == "GE":
273 commands = "perl a2-normalize.pl -g " + goldDir
274 commands += " -o " + outDir
275 commands += " " + sourceSubsetDir + "/*" + taskSuffix
276 else:
277 commands = "perl prepare-eval.pl -g " + goldDir
278 commands += " " + sourceSubsetDir + " " + outDir
279 p = subprocess.Popen(commands, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
280 if verbose and not silent:
281 printLines(p.stderr.readlines())
282 printLines(p.stdout.readlines())
283 else:
284 p.stderr.readlines()
285 p.stdout.readlines()
286
287 if "strict" in evaluations:
288
289 commands = "perl a2-evaluate.pl"
290 if mainTask == "GE": commands += " -t " + str(task)
291 if debug: commands += " -v -d"
292 commands += " -g " + goldDir + " " + outDir + "/*" + taskSuffix
293 p = subprocess.Popen(commands, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
294 stderrLines = p.stderr.readlines()
295 stdoutLines = p.stdout.readlines()
296 if not silent:
297 printLines(stderrLines)
298 print >> sys.stderr, "##### strict evaluation mode #####"
299 printLines(stdoutLines)
300 results["strict"] = parseResults(stdoutLines)
301
302 if "approximate" in evaluations:
303 if not silent:
304 print >> sys.stderr, "##### approximate span and recursive mode #####"
305
306 commands = "perl a2-evaluate.pl"
307 if mainTask == "GE": commands += " -t " + str(task)
308 if debug: commands += " -v -d"
309 commands += " -g " + goldDir + " -sp " + outDir + "/*" + taskSuffix
310 p = subprocess.Popen(commands, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
311 stderrLines = p.stderr.readlines()
312 stdoutLines = p.stdout.readlines()
313 if not silent:
314 printLines(stderrLines)
315 printLines(stdoutLines)
316 results["approximate"] = parseResults(stdoutLines)
317
318 if "decomposition" in evaluations:
319 if not silent:
320 print >> sys.stderr, "##### event decomposition in the approximate span mode #####"
321
322 commands = "perl a2-evaluate.pl"
323 if mainTask == "GE": commands += " -t " + str(task)
324 if debug: commands += " -v -d"
325 commands += " -g " + goldDir + " -sp " + outDir + "/*" + taskSuffix
326 p = subprocess.Popen(commands, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
327 stderrLines = p.stderr.readlines()
328 stdoutLines = p.stdout.readlines()
329 if not silent:
330 printLines(stderrLines)
331 printLines(stdoutLines)
332 results["decomposition"] = parseResults(stdoutLines)
333
334 if not debug:
335 shutil.rmtree(tempDir)
336 else:
337 print >> sys.stderr, "Temporary directory left at", tempDir
338
339
340 os.chdir(origDir)
341 return results
342
344
345
346 for line in lines:
347 print >> sys.stderr, line[:-1]
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363 -def evaluateBX(corpusName, sourceDir, goldDir=None, silent=False):
364 assert corpusName in ["BI", "BB"], corpusName
365 evaluatorDir, sourceDir, goldDir, tempDir = checkEvaluator(corpusName, sourceDir, goldDir)
366 if goldDir == None:
367 return None
368
369 if corpusName == "BI":
370 commands = "java -jar " + evaluatorDir + "/BioNLP-ST_2011_bacteria_interactions_evaluation_software.jar " + goldDir + " " + sourceDir
371 elif corpusName == "BB":
372 commands = "java -jar " + evaluatorDir + "/BioNLP-ST_2011_Bacteria_Biotopes_evaluation_software.jar " + goldDir + " " + sourceDir
373 else:
374 assert False, corpusName
375
376 p = subprocess.Popen(commands, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
377 stderrLines = p.stderr.readlines()
378 stdoutLines = p.stdout.readlines()
379 if not silent:
380 printLinesBX(stderrLines)
381 printLinesBX(stdoutLines)
382
383 results = {}
384 if corpusName == "BI":
385 category = None
386 for line in stdoutLines:
387 if ":" in line:
388 category = line.split(":")[0].strip()
389 if category == "Global scores" and line.startswith(" "):
390 key, value = line.strip().split("=")
391 key = key.strip()
392 value = value.strip()
393 assert key not in results
394 if key == "f-score":
395 key = "fscore"
396 if value == "NaN":
397 results[key] = 0.0
398 else:
399 results[key] = float(value)
400 elif corpusName == "BB":
401 for line in stdoutLines:
402 key, value = line.strip().split("=")
403 key = key.strip()
404 value = value.strip()
405 assert key not in results
406 if key == "F-score":
407 key = "fscore"
408 if value == "NaN":
409 results[key] = 0.0
410 else:
411 results[key] = float(value)
412 if tempDir != None:
413 shutil.rmtree(tempDir)
414 return results
415
417 assert corpus in ["EPI", "ID"], corpus
418 evaluatorDir, sourceDir, goldDir, tempDir = checkEvaluator(corpus, sourceDir, goldDir)
419 if goldDir == None:
420 return None
421 commands = "cd " + evaluatorDir
422 commands += " ; " + "python evaluation.py -s -p -r " + goldDir + " " + sourceDir + "/*.a2"
423 p = subprocess.Popen(commands, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
424 stderrLines = p.stderr.readlines()
425 stdoutLines = p.stdout.readlines()
426 if not silent:
427 for line in stderrLines:
428 print >> sys.stderr, line,
429 for line in stdoutLines:
430 print >> sys.stderr, line,
431 print >> sys.stderr
432 for line in stderrLines + stdoutLines:
433 if "No such file or directory" in line:
434 return None
435 if tempDir != None:
436 shutil.rmtree(tempDir)
437 return parseResults(stdoutLines)
438
439 -def evaluateREN(sourceDir, goldDir=None, silent=False):
440 evaluatorDir, sourceDir, goldDir, tempDir = checkEvaluator("REN", sourceDir, goldDir)
441 if goldDir == None:
442 return None
443 commands = "cd " + evaluatorDir
444 commands += " ; " + "java -jar eval_rename.jar " + goldDir + " " + sourceDir
445 p = subprocess.Popen(commands, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
446 stderrLines = p.stderr.readlines()
447 stdoutLines = p.stdout.readlines()
448 if not silent:
449 for line in stderrLines:
450 print >> sys.stderr, line,
451 for line in stdoutLines:
452 print >> sys.stderr, line,
453 print >> sys.stderr
454 results = {}
455 for line in stdoutLines:
456 category, value = line.strip().split(":")
457 value = value.strip()
458 if value == "NaN":
459 value = 0.0
460 elif "." in value:
461 value = float(value)
462 else:
463 value = int(value)
464 results[category.strip()] = value
465 if tempDir != None:
466 shutil.rmtree(tempDir)
467 return results
468
469 -def evaluateCO(sourceDir, goldDir=None, silent=False):
470 evaluatorDir, sourceDir, goldDir, tempDir = checkEvaluator("CO", sourceDir, goldDir)
471 if goldDir == None:
472 return None
473
474 if tempDir == None:
475 tempDir = tempfile.mkdtemp()
476 resultDir = os.path.join(tempDir, "result")
477 os.makedirs(resultDir)
478 commands = "cd " + evaluatorDir
479 commands += " ; " + "java -jar CRScorer.jar " + goldDir + " " + sourceDir + " " + resultDir
480 p = subprocess.Popen(commands, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
481 stderrLines = p.stderr.readlines()
482 stdoutLines = p.stdout.readlines()
483 if not silent:
484 for i in range(len(stdoutLines)):
485
486 if (not stdoutLines[i].strip().endswith("...")) or (i < len(stdoutLines) - 1 and not stdoutLines[i+1].strip().endswith("...")):
487 print >> sys.stderr, stdoutLines[i],
488 for line in stderrLines:
489 print >> sys.stderr, line,
490 print >> sys.stderr
491
492 f = open(os.path.join(resultDir, "eval.results"), "rt")
493 resultLines = f.readlines()
494 f.close()
495 results = {"MENTION DETECTION":{}, "MENTION LINKING":{}}
496 currentBlock = None
497 for line in resultLines:
498 line = line.replace("\t", " ")
499 print >> sys.stderr, line.rstrip()
500 if line[0] == "*":
501 continue
502 if "EVALUATION OF MENTION DETECTION" in line:
503 currentBlock = results["MENTION DETECTION"]
504 elif "EVALUATION OF MENTION LINKING" in line:
505 currentBlock = results["MENTION LINKING"]
506 elif ":" in line:
507 name, value = line.split(":")
508 name = name.strip()
509 value = int(value)
510 currentBlock[name] = value
511 elif line[0] == "P":
512 splits = line.split()
513 assert splits[0] == "P" and splits[1] == "=" and splits[3] == "R" and splits[4] == "=" and splits[6] == "F" and splits[7] == "=", line
514 currentBlock["precision"] = float(splits[2])
515 currentBlock["recall"] = float(splits[5])
516 currentBlock["fscore"] = float(splits[8])
517
518 if tempDir != None:
519 shutil.rmtree(tempDir)
520 return results
521
522 if __name__=="__main__":
523
524 try:
525 import psyco
526 psyco.full()
527 print >> sys.stderr, "Found Psyco, using"
528 except ImportError:
529 print >> sys.stderr, "Psyco not installed"
530
531 from optparse import OptionParser
532 optparser = OptionParser(description="Evaluate BioNLP Shared Task predictions")
533 optparser.add_option("-i", "--input", default=None, dest="input", help="input directory with predicted shared task files", metavar="FILE")
534 optparser.add_option("-g", "--gold", default=None, dest="gold", help="optional gold directory (default is the task development set)", metavar="FILE")
535 optparser.add_option("-t", "--task", default="GE.2", dest="task", help="")
536 optparser.add_option("-v", "--variance", default=0, type="int", dest="variance", help="variance folds")
537 optparser.add_option("-d", "--debug", default=False, action="store_true", dest="debug", help="debug")
538 optparser.add_option("--install", default=None, dest="install", help="Install directory (or DEFAULT)")
539 (options, args) = optparser.parse_args()
540
541
542 if options.install == None:
543 assert(options.input != None)
544 evalResult = evaluate(options.input, options.task, options.gold, debug=options.debug)
545 if options.debug:
546 print >> sys.stderr, "evaluate output:", evalResult
547 else:
548 downloadDir = None
549 destDir = None
550 if options.install != "DEFAULT":
551 if "," in options.install:
552 destDir, downloadDir = options.install.split(",")
553 else:
554 destDir = options.install
555 settings = install(destDir, downloadDir)
556 for key in sorted(settings.keys()):
557 print key + "=\"" + str(settings[key]) + "\""
558
559
560
561
562
563
564
565
566
567
568
569
570