import sys, os
import STTools
import tempfile
import shutil
from collections import defaultdict
thisPath = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.abspath(os.path.join(thisPath,"../../Evaluators")))
import BioNLP11GeniaTools
from pylab import *
import time, datetime
sys.path.append(os.path.abspath(os.path.join(thisPath,"../Statistics")))
import Utils.Libraries.stats as stats # bind "stats" for the lstdev/lmean calls below

def getResults(results):
    if "approximate" in results and "ALL-TOTAL" in results["approximate"]:
        return results["approximate"]["ALL-TOTAL"]
    else:
        return results["TOTAL"]
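# The evaluator output is assumed to look roughly like
#   {"approximate": {"ALL-TOTAL": {...}}} or {"TOTAL": {...}},
# where the selected entry carries at least "precision", "recall",
# "fscore" and "answer" values (see resultsToGraph below).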

def getScoreDict(scoreString):
    """
    Extract individual class scores from a comma-separated "name=score" list
    """
    scoreDict = {}
    for pairString in scoreString.split(","):
        className, score = pairString.split("=")
        score = float(score)
        assert className not in scoreDict
        scoreDict[className] = score
    return scoreDict
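# Example with a hypothetical score string:
#   getScoreDict("Phosphorylation=1.52,neg=-0.73")
#   => {"Phosphorylation": 1.52, "neg": -0.73}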

def updateRange(rangeDict, sourceDict):
    for key in sourceDict:
        if rangeDict[key][0] == None or rangeDict[key][0] > sourceDict[key]:
            rangeDict[key][0] = sourceDict[key]
        if rangeDict[key][1] == None or rangeDict[key][1] < sourceDict[key]:
            rangeDict[key][1] = sourceDict[key]
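# Example: if d["Binding"] == [0.5, 0.9], then updateRange(d, {"Binding": 0.4})
# widens it to [0.4, 0.9]; unseen keys start from [None, None] via the
# defaultdicts built in getRangeDicts below.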

def getRangeDicts(documents):
    rangeDicts = {}
    rangeDicts["unmerging"] = defaultdict(lambda:[None, None])
    rangeDicts["triggers"] = defaultdict(lambda:[None, None])
    rangeDicts["arguments"] = defaultdict(lambda:[None, None])
    for doc in documents:
        for event in doc.events:
            updateRange(rangeDicts["triggers"], event.trigger.triggerScoreDict)
            updateRange(rangeDicts["unmerging"], event.trigger.unmergingScoreDict)
            for argScoreDict in event.argScoreDicts:
                updateRange(rangeDicts["arguments"], argScoreDict)
    print "Ranges", rangeDicts
    return rangeDicts

def getStatValues(documents):
    statValues = {}
    triggerValues = []
    unmergingValues = []
    argValues = []
    for doc in documents:
        for event in doc.events:
            for value in sorted(event.trigger.triggerScoreDict.values()):
                triggerValues.append(value)
            if hasattr(event.trigger, "unmergingScoreDict"):
                for value in sorted(event.trigger.unmergingScoreDict.values()):
                    unmergingValues.append(value)
            for argScoreDict in event.argScoreDicts:
                for value in sorted(argScoreDict.values()):
                    argValues.append(value)
        for relation in doc.relations:
            for argScoreDict in relation.argScoreDicts:
                for value in sorted(argScoreDict.values()):
                    argValues.append(value)

    if len(triggerValues) > 0:
        statValues["trigger-stdev"] = stats.lstdev(triggerValues)
        statValues["trigger-mean"] = stats.lmean(triggerValues)
    if len(unmergingValues) > 0:
        statValues["unmerging-stdev"] = stats.lstdev(unmergingValues)
        statValues["unmerging-mean"] = stats.lmean(unmergingValues)
    statValues["arg-stdev"] = stats.lstdev(argValues)
    statValues["arg-mean"] = stats.lmean(argValues)
    return statValues
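# The mean/stdev pairs computed here feed standardize(); the "arg" statistics
# are always set, while the "trigger"/"unmerging" entries exist only when the
# corresponding score lists are non-empty.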

def standardize(score, statValues, scoreType):
    return (score - statValues[scoreType+"-mean"]) / statValues[scoreType+"-stdev"]
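# standardize() is a plain z-score. For example, with a mean of 0.2 and a
# standard deviation of 0.5:
#   standardize(0.7, {"trigger-mean": 0.2, "trigger-stdev": 0.5}, "trigger") == 1.0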

def getEventEVEXScore(event, statValues):
    scores = []
    if event.trigger != None:
        scores.append( standardize(event.trigger.triggerScore, statValues, "trigger") )
        scores.append( standardize(event.trigger.unmergingScore, statValues, "unmerging") )
    if hasattr(event, "argScores"):
        for argScore in event.argScores:
            scores.append( standardize(argScore, statValues, "arg") )

    score = min(scores)
    for arg in event.arguments:
        if arg[1].id[0] == "E": # the argument target is itself an event
            score = min(score, getEventEVEXScore(arg[1], statValues))
    return score
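# The EVEX-style score of an event is the minimum of its standardized
# component scores, taken recursively over nested events (arguments whose
# target id starts with "E"), so a single weak component caps the confidence
# of the whole event.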

def getScore(scoreDict, typeString=None):
    """
    Get the highest score (optionally for a known type)
    """
    currentScore = None
    highestKey = None

    if typeString == "Site" and "SiteArg" in scoreDict:
        assert "Site" not in scoreDict, scoreDict.keys()
        typeString = "SiteArg"

    for key in scoreDict:
        if typeString != None:
            # combined classes are separated by "---"; match any component
            for keySplit in key.split("---"):
                if keySplit == typeString and (currentScore == None or currentScore < scoreDict[key]):
                    currentScore = scoreDict[key]
                    highestKey = key
        else:
            if currentScore == None or currentScore < scoreDict[key]:
                currentScore = scoreDict[key]
                highestKey = key
    assert highestKey != "neg", (typeString, scoreDict)
    assert currentScore != None, (typeString, scoreDict)
    return currentScore, highestKey
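# Example with a hypothetical score dict:
#   getScore({"Binding": 0.4, "neg": -1.0}, "Binding") => (0.4, "Binding")
# Combined classes such as "Binding---Regulation" match when any "---"
# component equals the requested type.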

def normalizeScore(value, key, rangeDict):
    return (value - rangeDict[key][0]) / (abs(rangeDict[key][0]) + abs(rangeDict[key][1]))
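# Shifts a value by the observed per-class minimum and scales by
# |min| + |max|; note this is an approximate range normalization rather than
# an exact min-max scaling to [0, 1].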

def processScores(documents, normalize=False):
    """
    Convert each event's score strings into single float values
    """
    print "Extracting scores"
    for document in documents:
        for event in document.events:
            if event.trigger != None:
                if event.trigger.triggerScores != None:
                    event.trigger.triggerScoreDict = getScoreDict(event.trigger.triggerScores)
                if event.trigger.unmergingScores != None:
                    event.trigger.unmergingScoreDict = getScoreDict(event.trigger.unmergingScores)
            event.argScoreDicts = []
            for arg in event.arguments:
                event.argScoreDicts.append( getScoreDict(arg[3]) )
        for relation in document.relations:
            relation.argScoreDicts = []
            relation.argScoreDicts.append( getScoreDict(relation.arguments[0][3]) )

    counts = defaultdict(int)
    if normalize:
        print "Normalizing ranges"
        rangeDicts = getRangeDicts(documents)
        statValues = getStatValues(documents)
    for document in documents:
        counts["documents"] += 1
        for event in document.events + document.relations:
            counts["events"] += 1
            if event.trigger != None:
                if event.trigger.triggerScores != None:
                    event.trigger.triggerScore, event.trigger.triggerScoreKey = getScore(event.trigger.triggerScoreDict, event.trigger.type)
                    if normalize:
                        event.trigger.triggerScore = normalizeScore(event.trigger.triggerScore, event.trigger.triggerScoreKey, rangeDicts["triggers"])
                    counts["event-trigger-scores"] += 1
                if event.trigger.unmergingScores != None:
                    event.trigger.unmergingScore, event.trigger.unmergingScoreKey = getScore(event.trigger.unmergingScoreDict)
                    if normalize:
                        event.trigger.unmergingScore = normalizeScore(event.trigger.unmergingScore, event.trigger.unmergingScoreKey, rangeDicts["unmerging"])
                    counts["event-unmerging-scores"] += 1
            event.argScores = []
            event.argScoreKeys = []
            for i in range(len(event.arguments)):
                if i < len(event.argScoreDicts): # relations may have fewer score dicts than arguments
                    argScore, argScoreKey = getScore(event.argScoreDicts[i])
                    if normalize:
                        argScore = normalizeScore(argScore, argScoreKey, rangeDicts["arguments"])
                    event.argScores.append(argScore)
                    event.argScoreKeys.append(argScoreKey)
    return counts
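# Typical use (sketch): counts = processScores(documents, normalize=True)
# attaches float triggerScore/unmergingScore/argScores attributes derived
# from the raw score strings, and returns per-type counts.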


def sortByScore(documents, sortMethod):
    """
    Make an ordered list of all events in all documents
    """
    eventList = []
    if "EVEX" in sortMethod or "standardize" in sortMethod:
        statValues = getStatValues(documents)
        print "Stat values:", statValues
    for document in documents:
        for event in document.events + document.relations:
            if "unmerging" in sortMethod:
                score = event.trigger.unmergingScore
                if "standardize" in sortMethod:
                    score = standardize(score, statValues, "unmerging")
                eventList.append( (score, event.id, event, document) )
            elif "triggers" in sortMethod:
                score = event.trigger.triggerScore
                if "standardize" in sortMethod:
                    score = standardize(score, statValues, "trigger")
                eventList.append( (score, event.id, event, document) )
            elif "EVEX" in sortMethod:
                eventList.append( (getEventEVEXScore(event, statValues), event.id, event, document) )
    eventList.sort()
    return eventList
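# The list sorts ascending by score (ties broken by event id), so the least
# confident events come first, ready to be pruned by markForRemoval().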

def markForRemoval(eventList, cutoff):
    """
    Take an ordered event list and mark the given fraction of events for
    removal by setting their arguments to [], causing them to be dropped
    in validation.
    """
    breakPoint = cutoff * len(eventList)
    for i in range(len(eventList)):
        if i >= breakPoint:
            break
        eventList[i][2].arguments = []
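# Example: with cutoff 0.3 and 100 events in the sorted list, the 30
# lowest-scoring events get arguments == [], so validation drops them when
# the set is written out.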

def evaluate(documents, sortMethod, verbose, cutoffs=[], task="GE.2"):
    workdir = tempfile.gettempdir()
    outdir = os.path.join(workdir, "events")
    cutoffs.sort()
    eventList = sortByScore(documents, sortMethod)
    results = {}
    startTime = time.time()
    for cutoff in cutoffs:
        print "Cutoff", cutoff, str(datetime.timedelta(seconds=time.time()-startTime))
        markForRemoval(eventList, cutoff)
        STTools.writeSet(documents, outdir, validate=True)
        if "REL" not in task:
            results[cutoff] = getResults(BioNLP11GeniaTools.evaluate(outdir, task=task)[1])
        else:
            results[cutoff] = {}
        print results

    maxEvents = results[0.0]["answer"]
    print "Max events", maxEvents
    return results, maxEvents
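# Each cutoff re-prunes the same ascending event list and re-runs the
# official evaluator on the freshly written set; results[0.0] keeps every
# event and defines maxEvents.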

def resultsToGraph(results, outputname, maxEvents=None, manualEvaluationFile=None, graphs="prf"):
    fig = figure()

    ax = subplot(111)
    ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)
    ax.yaxis.grid(True, linestyle='-', which='minor', color='lightgrey', alpha=0.5)
    ax.xaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)
    ax.xaxis.grid(True, linestyle='-', which='minor', color='lightgrey', alpha=0.5)

    ylabel('precision / recall / F-score [%]', size=12)
    if maxEvents != None:
        xlabel('events [%]', size=12)
    else:
        xlabel('events', size=12)

    plots = {}
    plotNames = []
    legendText = []
    plotColours = {}
    lineStyles = {}
    markerStyles = {}
    graphs = graphs.lower()
    if "p" in graphs:
        plotNames.append("precision")
        legendText.append("precision (BioNLP'11)")
        plotColours["precision"] = "red"
        lineStyles["precision"] = "-"
        markerStyles["precision"] = "v"
    if "r" in graphs:
        plotNames.append("recall")
        legendText.append("recall (BioNLP'11)")
        plotColours["recall"] = "green"
        lineStyles["recall"] = "-"
        markerStyles["recall"] = "^"
    if "f" in graphs:
        plotNames.append("fscore")
        legendText.append("fscore (BioNLP'11)")
        plotColours["fscore"] = "blue"
        lineStyles["fscore"] = "-"
        markerStyles["fscore"] = "s"
    for name in plotNames:
        plots[name] = []
    xValues = []
    for key in sorted(results):
        for name in plotNames:
            plots[name].append(results[key][name])
        xValue = results[key]["answer"]
        if maxEvents != None:
            xValue = float(xValue) / maxEvents * 100.0
        xValues.append(xValue)

    if manualEvaluationFile != None:
        manualPrecisions = getManualEvaluationPrecisions(manualEvaluationFile)
        plotManualEvaluationPrecisions(manualPrecisions, binSize=5, makeFig=False)

    for name in plotNames:
        plot(xValues, plots[name], marker=markerStyles[name], color=plotColours[name], linestyle=lineStyles[name], markersize=4)

    ylim([0, 80])

    if manualEvaluationFile != None:
        legendText = ["precision (EVEX)"] + legendText

    leg = legend(legendText, 'lower right')
    ltext = leg.get_texts()
    setp(ltext, fontsize='small')
    savefig(outputname, bbox_inches='tight')
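# The x-values come from the evaluator's "answer" counts; when maxEvents is
# given they are rescaled to a percentage of the cutoff-0.0 event count.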


def getManualEvaluationPrecisions(manualEvaluationFile):
    f = open(manualEvaluationFile, "rt")
    lines = f.readlines()
    f.close()

    events = []
    truePositives = 0
    falsePositives = 0
    for line in lines:
        begin, middle = line.split("--->")
        end = "\n"
        if "#" in line:
            middle, end = middle.split("#")
        eventEvaluation = middle.split(",")[-1].strip()
        if "F" in eventEvaluation: # judged a false positive
            eventIsTrue = False
            falsePositives += 1
        else:
            eventIsTrue = True
            truePositives += 1

        beginSplits = begin.split()
        eventWeight = float(beginSplits[3])
        fromAbstract = beginSplits[4] == "ABSTRACT"

        events.append( (eventWeight, eventIsTrue) )
    events.sort()
    precisions = [float(truePositives) / (truePositives + falsePositives)]
    count = 0
    for event in events:
        if event[1]:
            truePositives -= 1
        else:
            falsePositives -= 1
        if truePositives + falsePositives > 0:
            precisions.append(float(truePositives) / (truePositives + falsePositives))
        count += 1

    return precisions
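# The returned list starts from the precision of the full event set;
# precisions[i] is then the precision after removing the i lowest-weight
# events, mirroring the cutoff-based pruning above.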

def plotManualEvaluationPrecisions(precisions, binSize, makeFig=True):
    if makeFig:
        fig = figure()

        ax = subplot(111)
        ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)
        ax.yaxis.grid(True, linestyle='-', which='minor', color='lightgrey', alpha=0.5)
        ax.xaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)
        ax.xaxis.grid(True, linestyle='-', which='minor', color='lightgrey', alpha=0.5)
        ylabel('precision', size=12)
        xlabel('events [%]', size=12)

    binnedScores = []
    currentBin = []
    count = 0
    for precision in precisions:
        currentBin.append(precision)
        count += 1
        if count >= binSize:
            binnedScores.append( float(sum(currentBin)) / len(currentBin) * 100.0 )
            currentBin = []
            count = 0

    numEvents = len(binnedScores)
    xValues = []
    for i in range(numEvents):
        xValues.append( float(numEvents-i)/numEvents*100 )
    plot(xValues, binnedScores, marker="o", color="red", linestyle="-", markersize=4)

    if makeFig:
        savefig("manual-scores-binned.pdf", bbox_inches='tight')
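# Binning averages each run of binSize consecutive precision values (scaled
# to percent); a trailing partial bin with fewer than binSize values is
# discarded.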

if __name__=="__main__":
    from optparse import OptionParser

    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using it"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(description="Analyze confidence scores")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Input corpus in the shared task format", metavar="FILE")
    optparser.add_option("-t", "--task", default="GE.2", dest="task", help="Shared task to evaluate against, e.g. GE.2")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output figure file", metavar="FILE")
    optparser.add_option("-m", "--manual", default=None, dest="manual", help="Manual evaluation results file", metavar="FILE")
    optparser.add_option("-g", "--graphs", default="prf", dest="graphs", help="Curves to plot: any combination of p (precision), r (recall) and f (F-score)")
    optparser.add_option("-s", "--sortmethod", default="unmerging", dest="sortmethod", help="Event ranking method: unmerging, triggers or EVEX, optionally combined with standardize and/or normalize")
    optparser.add_option("-v", "--verbose", default=False, action="store_true", dest="verbose", help="Verbose output")
    optparser.add_option("--steps", default=10, type="int", dest="steps", help="Number of event removal cutoff steps")
    optparser.add_option("--binSize", default=1, type="int", dest="binSize", help="Bin size for the manual evaluation precision plot")
    (options, args) = optparser.parse_args()

    if options.manual != None and options.input == None:
        precisions = getManualEvaluationPrecisions(options.manual)
        plotManualEvaluationPrecisions(precisions, options.binSize)
    else:
        # e.g. with the default ten steps: 0.0, 0.1, ..., 0.9 (1.0 would remove everything)
        cutoffs = [float(x)/options.steps for x in range(options.steps)]
        print "Loading documents"
        documents = STTools.loadSet(options.input, readScores=True)
        print "Processing scores"
        print processScores(documents, normalize="normalize" in options.sortmethod)
        print "Evaluating"
        results, maxEvents = evaluate(documents, options.sortmethod, verbose=options.verbose, cutoffs=cutoffs, task=options.task)
        if options.output == None:
            output = "scorefig-" + options.sortmethod + ".pdf"
        else:
            output = options.output
        resultsToGraph(results, output, maxEvents, manualEvaluationFile=options.manual)
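# Example invocation (sketch; the script and input names are hypothetical):
#   python AnalyzeScores.py -i predictions-with-scores -t GE.2 -s unmerging --steps 10
# This writes scorefig-unmerging.pdf, plotting precision/recall/F-score
# against the fraction of highest-confidence events kept.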