import sys, os
import STTools
import tempfile
import shutil
from collections import defaultdict
thisPath = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.abspath(os.path.join(thisPath,"../../Evaluators")))
import BioNLP11GeniaTools
from pylab import *
import time, datetime
sys.path.append(os.path.abspath(os.path.join(thisPath,"../Statistics")))
import Utils.Libraries.stats as stats # bind "stats" for the lstdev/lmean calls below

def getResults(results):
    if "approximate" in results and "ALL-TOTAL" in results["approximate"]:
        return results["approximate"]["ALL-TOTAL"]
    else:
        return results["TOTAL"]
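# The evaluator output is assumed to look roughly like
#   {"approximate": {"ALL-TOTAL": {...}}} or {"TOTAL": {...}},
# where the selected entry carries at least "precision", "recall",
# "fscore" and "answer" values (see resultsToGraph below).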

def getScoreDict(scoreString):
    """
    Extract individual class scores from a comma-separated "name=score" list
    """
    scoreDict = {}
    for pairString in scoreString.split(","):
        className, score = pairString.split("=")
        score = float(score)
        assert className not in scoreDict
        scoreDict[className] = score
    return scoreDict
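# Example with a hypothetical score string:
#   getScoreDict("Phosphorylation=1.52,neg=-0.73")
#   => {"Phosphorylation": 1.52, "neg": -0.73}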

def updateRange(rangeDict, sourceDict):
    for key in sourceDict:
        if rangeDict[key][0] == None or rangeDict[key][0] > sourceDict[key]:
            rangeDict[key][0] = sourceDict[key]
        if rangeDict[key][1] == None or rangeDict[key][1] < sourceDict[key]:
            rangeDict[key][1] = sourceDict[key]
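# Example: if d["Binding"] == [0.5, 0.9], then updateRange(d, {"Binding": 0.4})
# widens it to [0.4, 0.9]; unseen keys start from [None, None] via the
# defaultdicts built in getRangeDicts below.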

def getRangeDicts(documents):
    rangeDicts = {}
    rangeDicts["unmerging"] = defaultdict(lambda:[None, None])
    rangeDicts["triggers"] = defaultdict(lambda:[None, None])
    rangeDicts["arguments"] = defaultdict(lambda:[None, None])
    for doc in documents:
        for event in doc.events:
            updateRange(rangeDicts["triggers"], event.trigger.triggerScoreDict)
            updateRange(rangeDicts["unmerging"], event.trigger.unmergingScoreDict)
            for argScoreDict in event.argScoreDicts:
                updateRange(rangeDicts["arguments"], argScoreDict)
    print "Ranges", rangeDicts
    return rangeDicts

def getStatValues(documents):
    statValues = {}
    triggerValues = []
    unmergingValues = []
    argValues = []
    for doc in documents:
        for event in doc.events:
            for value in sorted(event.trigger.triggerScoreDict.values()):
                triggerValues.append(value)
            if hasattr(event.trigger, "unmergingScoreDict"):
                for value in sorted(event.trigger.unmergingScoreDict.values()):
                    unmergingValues.append(value)
            for argScoreDict in event.argScoreDicts:
                for value in sorted(argScoreDict.values()):
                    argValues.append(value)
        for relation in doc.relations:
            for argScoreDict in relation.argScoreDicts:
                for value in sorted(argScoreDict.values()):
                    argValues.append(value)

    if len(triggerValues) > 0:
        statValues["trigger-stdev"] = stats.lstdev(triggerValues)
        statValues["trigger-mean"] = stats.lmean(triggerValues)
    if len(unmergingValues) > 0:
        statValues["unmerging-stdev"] = stats.lstdev(unmergingValues)
        statValues["unmerging-mean"] = stats.lmean(unmergingValues)
    statValues["arg-stdev"] = stats.lstdev(argValues)
    statValues["arg-mean"] = stats.lmean(argValues)
    return statValues
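# The mean/stdev pairs computed here feed standardize(); the "arg" statistics
# are always set, while the "trigger"/"unmerging" entries exist only when the
# corresponding score lists are non-empty.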

def standardize(score, statValues, scoreType):
    return (score - statValues[scoreType+"-mean"]) / statValues[scoreType+"-stdev"]
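# standardize() is a plain z-score. For example, with a mean of 0.2 and a
# standard deviation of 0.5:
#   standardize(0.7, {"trigger-mean": 0.2, "trigger-stdev": 0.5}, "trigger") == 1.0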

def getEventEVEXScore(event, statValues):
    scores = []
    if event.trigger != None:
        scores.append( standardize(event.trigger.triggerScore, statValues, "trigger") )
        scores.append( standardize(event.trigger.unmergingScore, statValues, "unmerging") )
    if hasattr(event, "argScores"):
        for argScore in event.argScores:
            scores.append( standardize(argScore, statValues, "arg") )

    score = min(scores)
    for arg in event.arguments:
        if arg[1].id[0] == "E": # the argument target is itself an event
            score = min(score, getEventEVEXScore(arg[1], statValues))
    return score
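# The EVEX-style score of an event is the minimum of its standardized
# component scores, taken recursively over nested events (arguments whose
# target id starts with "E"), so a single weak component caps the confidence
# of the whole event.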

def getScore(scoreDict, typeString=None):
    """
    Get the highest score (optionally for a known type)
    """
    currentScore = None
    highestKey = None

    if typeString == "Site" and "SiteArg" in scoreDict:
        assert "Site" not in scoreDict, scoreDict.keys()
        typeString = "SiteArg"

    for key in scoreDict:
        if typeString != None:
            # combined classes are separated by "---"; match any component
            for keySplit in key.split("---"):
                if keySplit == typeString and (currentScore == None or currentScore < scoreDict[key]):
                    currentScore = scoreDict[key]
                    highestKey = key
        else:
            if currentScore == None or currentScore < scoreDict[key]:
                currentScore = scoreDict[key]
                highestKey = key
    assert highestKey != "neg", (typeString, scoreDict)
    assert currentScore != None, (typeString, scoreDict)
    return currentScore, highestKey
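# Example with a hypothetical score dict:
#   getScore({"Binding": 0.4, "neg": -1.0}, "Binding") => (0.4, "Binding")
# Combined classes such as "Binding---Regulation" match when any "---"
# component equals the requested type.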

def normalizeScore(value, key, rangeDict):
    return (value - rangeDict[key][0]) / (abs(rangeDict[key][0]) + abs(rangeDict[key][1]))
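# Shifts a value by the observed per-class minimum and scales by
# |min| + |max|; note this is an approximate range normalization rather than
# an exact min-max scaling to [0, 1].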

def processScores(documents, normalize=False):
    """
    Convert each event's score strings into single float values
    """
    print "Extracting scores"
    for document in documents:
        for event in document.events:
            if event.trigger != None:
                if event.trigger.triggerScores != None:
                    event.trigger.triggerScoreDict = getScoreDict(event.trigger.triggerScores)
                if event.trigger.unmergingScores != None:
                    event.trigger.unmergingScoreDict = getScoreDict(event.trigger.unmergingScores)
            event.argScoreDicts = []
            for arg in event.arguments:
                event.argScoreDicts.append( getScoreDict(arg[3]) )
        for relation in document.relations:
            relation.argScoreDicts = []
            relation.argScoreDicts.append( getScoreDict(relation.arguments[0][3]) )

    counts = defaultdict(int)
    if normalize:
        print "Normalizing ranges"
        rangeDicts = getRangeDicts(documents)
        statValues = getStatValues(documents)
    for document in documents:
        counts["documents"] += 1
        for event in document.events + document.relations:
            counts["events"] += 1
            if event.trigger != None:
                if event.trigger.triggerScores != None:
                    event.trigger.triggerScore, event.trigger.triggerScoreKey = getScore(event.trigger.triggerScoreDict, event.trigger.type)
                    if normalize:
                        event.trigger.triggerScore = normalizeScore(event.trigger.triggerScore, event.trigger.triggerScoreKey, rangeDicts["triggers"])
                    counts["event-trigger-scores"] += 1
                if event.trigger.unmergingScores != None:
                    event.trigger.unmergingScore, event.trigger.unmergingScoreKey = getScore(event.trigger.unmergingScoreDict)
                    if normalize:
                        event.trigger.unmergingScore = normalizeScore(event.trigger.unmergingScore, event.trigger.unmergingScoreKey, rangeDicts["unmerging"])
                    counts["event-unmerging-scores"] += 1
            event.argScores = []
            event.argScoreKeys = []
            for i in range(len(event.arguments)):
                if i < len(event.argScoreDicts): # relations may have fewer score dicts than arguments
                    argScore, argScoreKey = getScore(event.argScoreDicts[i])
                    if normalize:
                        argScore = normalizeScore(argScore, argScoreKey, rangeDicts["arguments"])
                    event.argScores.append(argScore)
                    event.argScoreKeys.append(argScoreKey)
    return counts
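# Typical use (sketch): counts = processScores(documents, normalize=True)
# attaches float triggerScore/unmergingScore/argScores attributes derived
# from the raw score strings, and returns per-type counts.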


def sortByScore(documents, sortMethod):
    """
    Make an ordered list of all events in all documents
    """
    eventList = []
    if "EVEX" in sortMethod or "standardize" in sortMethod:
        statValues = getStatValues(documents)
        print "Stat values:", statValues
    for document in documents:
        for event in document.events + document.relations:
            if "unmerging" in sortMethod:
                score = event.trigger.unmergingScore
                if "standardize" in sortMethod:
                    score = standardize(score, statValues, "unmerging")
                eventList.append( (score, event.id, event, document) )
            elif "triggers" in sortMethod:
                score = event.trigger.triggerScore
                if "standardize" in sortMethod:
                    score = standardize(score, statValues, "trigger")
                eventList.append( (score, event.id, event, document) )
            elif "EVEX" in sortMethod:
                eventList.append( (getEventEVEXScore(event, statValues), event.id, event, document) )
    eventList.sort()
    return eventList
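# The list sorts ascending by score (ties broken by event id), so the least
# confident events come first, ready to be pruned by markForRemoval().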

def markForRemoval(eventList, cutoff):
    """
    Take an ordered event list and mark the given fraction of events for
    removal by setting their arguments to [], causing them to be dropped
    in validation.
    """
    breakPoint = cutoff * len(eventList)
    for i in range(len(eventList)):
        if i >= breakPoint:
            break
        eventList[i][2].arguments = []
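# Example: with cutoff 0.3 and 100 events in the sorted list, the 30
# lowest-scoring events get arguments == [], so validation drops them when
# the set is written out.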

def evaluate(documents, sortMethod, verbose, cutoffs=[], task="GE.2"):
    workdir = tempfile.gettempdir()
    outdir = os.path.join(workdir, "events")
    cutoffs.sort()
    eventList = sortByScore(documents, sortMethod)
    results = {}
    startTime = time.time()
    for cutoff in cutoffs:
        print "Cutoff", cutoff, str(datetime.timedelta(seconds=time.time()-startTime))
        markForRemoval(eventList, cutoff)
        STTools.writeSet(documents, outdir, validate=True)
        if "REL" not in task:
            results[cutoff] = getResults(BioNLP11GeniaTools.evaluate(outdir, task=task)[1])
        else:
            results[cutoff] = {}
        print results

    maxEvents = results[0.0]["answer"]
    print "Max events", maxEvents
    return results, maxEvents
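# Each cutoff re-prunes the same ascending event list and re-runs the
# official evaluator on the freshly written set; results[0.0] keeps every
# event and defines maxEvents.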

def resultsToGraph(results, outputname, maxEvents=None, manualEvaluationFile=None, graphs="prf"):
    fig = figure()

    ax = subplot(111)
    ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)
    ax.yaxis.grid(True, linestyle='-', which='minor', color='lightgrey', alpha=0.5)
    ax.xaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)
    ax.xaxis.grid(True, linestyle='-', which='minor', color='lightgrey', alpha=0.5)

    ylabel('precision / recall / F-score [%]', size=12)
    if maxEvents != None:
        xlabel('events [%]', size=12)
    else:
        xlabel('events', size=12)

    plots = {}
    plotNames = []
    legendText = []
    plotColours = {}
    lineStyles = {}
    markerStyles = {}
    graphs = graphs.lower()
    if "p" in graphs:
        plotNames.append("precision")
        legendText.append("precision (BioNLP'11)")
        plotColours["precision"] = "red"
        lineStyles["precision"] = "-"
        markerStyles["precision"] = "v"
    if "r" in graphs:
        plotNames.append("recall")
        legendText.append("recall (BioNLP'11)")
        plotColours["recall"] = "green"
        lineStyles["recall"] = "-"
        markerStyles["recall"] = "^"
    if "f" in graphs:
        plotNames.append("fscore")
        legendText.append("fscore (BioNLP'11)")
        plotColours["fscore"] = "blue"
        lineStyles["fscore"] = "-"
        markerStyles["fscore"] = "s"
    for name in plotNames:
        plots[name] = []
    xValues = []
    for key in sorted(results):
        for name in plotNames:
            plots[name].append(results[key][name])
        xValue = results[key]["answer"]
        if maxEvents != None:
            xValue = float(xValue) / maxEvents * 100.0
        xValues.append(xValue)

    if manualEvaluationFile != None:
        manualPrecisions = getManualEvaluationPrecisions(manualEvaluationFile)
        plotManualEvaluationPrecisions(manualPrecisions, binSize=5, makeFig=False)

    for name in plotNames:
        plot(xValues, plots[name], marker=markerStyles[name], color=plotColours[name], linestyle=lineStyles[name], markersize=4)

    ylim([0, 80])

    if manualEvaluationFile != None:
        legendText = ["precision (EVEX)"] + legendText

    leg = legend(legendText, 'lower right')
    ltext = leg.get_texts()
    setp(ltext, fontsize='small')
    savefig(outputname, bbox_inches='tight')
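# The x-values come from the evaluator's "answer" counts; when maxEvents is
# given they are rescaled to a percentage of the cutoff-0.0 event count.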


def getManualEvaluationPrecisions(manualEvaluationFile):
    f = open(manualEvaluationFile, "rt")
    lines = f.readlines()
    f.close()

    events = []
    truePositives = 0
    falsePositives = 0
    for line in lines:
        begin, middle = line.split("--->")
        end = "\n"
        if "#" in line:
            middle, end = middle.split("#")
        eventEvaluation = middle.split(",")[-1].strip()
        if "F" in eventEvaluation: # judged a false positive
            eventIsTrue = False
            falsePositives += 1
        else:
            eventIsTrue = True
            truePositives += 1

        beginSplits = begin.split()
        eventWeight = float(beginSplits[3])
        fromAbstract = beginSplits[4] == "ABSTRACT"

        events.append( (eventWeight, eventIsTrue) )
    events.sort()
    precisions = [float(truePositives) / (truePositives + falsePositives)]
    count = 0
    for event in events:
        if event[1]:
            truePositives -= 1
        else:
            falsePositives -= 1
        if truePositives + falsePositives > 0:
            precisions.append(float(truePositives) / (truePositives + falsePositives))
        count += 1

    return precisions
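# The returned list starts from the precision of the full event set;
# precisions[i] is then the precision after removing the i lowest-weight
# events, mirroring the cutoff-based pruning above.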

def plotManualEvaluationPrecisions(precisions, binSize, makeFig=True):
    if makeFig:
        fig = figure()

        ax = subplot(111)
        ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)
        ax.yaxis.grid(True, linestyle='-', which='minor', color='lightgrey', alpha=0.5)
        ax.xaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)
        ax.xaxis.grid(True, linestyle='-', which='minor', color='lightgrey', alpha=0.5)
        ylabel('precision', size=12)
        xlabel('events [%]', size=12)

    binnedScores = []
    currentBin = []
    count = 0
    for precision in precisions:
        currentBin.append(precision)
        count += 1
        if count >= binSize:
            binnedScores.append( float(sum(currentBin)) / len(currentBin) * 100.0 )
            currentBin = []
            count = 0

    numEvents = len(binnedScores)
    xValues = []
    for i in range(numEvents):
        xValues.append( float(numEvents-i)/numEvents*100 )
    plot(xValues, binnedScores, marker="o", color="red", linestyle="-", markersize=4)

    if makeFig:
        savefig("manual-scores-binned.pdf", bbox_inches='tight')
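# Binning averages each run of binSize consecutive precision values (scaled
# to percent); a trailing partial bin with fewer than binSize values is
# discarded.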

if __name__=="__main__":
    from optparse import OptionParser

    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using it"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(description="Analyze confidence scores")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Input corpus in the shared task format", metavar="FILE")
    optparser.add_option("-t", "--task", default="GE.2", dest="task", help="Shared task to evaluate against, e.g. GE.2")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output figure file", metavar="FILE")
    optparser.add_option("-m", "--manual", default=None, dest="manual", help="Manual evaluation results file", metavar="FILE")
    optparser.add_option("-g", "--graphs", default="prf", dest="graphs", help="Curves to plot: any combination of p (precision), r (recall) and f (F-score)")
    optparser.add_option("-s", "--sortmethod", default="unmerging", dest="sortmethod", help="Event ranking method: unmerging, triggers or EVEX, optionally combined with standardize and/or normalize")
    optparser.add_option("-v", "--verbose", default=False, action="store_true", dest="verbose", help="Verbose output")
    optparser.add_option("--steps", default=10, type="int", dest="steps", help="Number of event removal cutoff steps")
    optparser.add_option("--binSize", default=1, type="int", dest="binSize", help="Bin size for the manual evaluation precision plot")
    (options, args) = optparser.parse_args()

    if options.manual != None and options.input == None:
        precisions = getManualEvaluationPrecisions(options.manual)
        plotManualEvaluationPrecisions(precisions, options.binSize)
    else:
        # e.g. with the default ten steps: 0.0, 0.1, ..., 0.9 (1.0 would remove everything)
        cutoffs = [float(x)/options.steps for x in range(options.steps)]
        print "Loading documents"
        documents = STTools.loadSet(options.input, readScores=True)
        print "Processing scores"
        print processScores(documents, normalize="normalize" in options.sortmethod)
        print "Evaluating"
        results, maxEvents = evaluate(documents, options.sortmethod, verbose=options.verbose, cutoffs=cutoffs, task=options.task)
        if options.output == None:
            output = "scorefig-" + options.sortmethod + ".pdf"
        else:
            output = options.output
        resultsToGraph(results, output, maxEvents, manualEvaluationFile=options.manual)
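# Example invocation (sketch; the script and input names are hypothetical):
#   python AnalyzeScores.py -i predictions-with-scores -t GE.2 -s unmerging --steps 10
# This writes scorefig-unmerging.pdf, plotting precision/recall/F-score
# against the fraction of highest-confidence events kept.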