TEES.Utils.STFormat.Compare

import sys
from collections import defaultdict

from STTools import *

def getCounts(documents):
    """Count the annotations found in a collection of documents.

    For every protein, trigger, event and relation two counters are
    incremented: the category total (e.g. "event") and a per-type counter
    (e.g. "event(<type>)"). Events with a non-None speculation or negation
    attribute add to "modifier" and to "modifier(spec)" / "modifier(neg)".
    Every argument of an event or relation adds to "arg" and to
    "arg(<first element of the argument>)".

    Parameters:
        documents: iterable of objects exposing the attributes proteins,
            triggers, events and relations. Each attribute is either None
            or an iterable of annotations.

    Returns:
        A defaultdict(int) mapping counter names to counts. The three
        modifier counters are always present, even when they are zero.
    """
    counts = defaultdict(int)
    # Seed the modifier counters so they are reported even when unused.
    for key in ("modifier", "modifier(spec)", "modifier(neg)"):
        counts[key] = 0

    def tally(category, subtype):
        # Bump both the per-type counter and the category total.
        counts[category + "(" + subtype + ")"] += 1
        counts[category] += 1

    for doc in documents:
        if doc.proteins is not None:
            for protein in doc.proteins:
                tally("protein", protein.type)
        if doc.triggers is not None:
            for trigger in doc.triggers:
                tally("trigger", trigger.type)
        if doc.events is not None:
            for event in doc.events:
                tally("event", event.type)
                if event.speculation is not None:
                    tally("modifier", "spec")
                if event.negation is not None:
                    tally("modifier", "neg")
        if doc.relations is not None:
            for relation in doc.relations:
                tally("relation", relation.type)
        # Arguments are counted for events and relations alike. The two
        # lists are walked separately (instead of being concatenated) so
        # that a missing list (None) is skipped rather than raising.
        for annotations in (doc.events, doc.relations):
            if annotations is None:
                continue
            for ann in annotations:
                for arg in ann.arguments:
                    tally("arg", arg[0])
    return counts

37

def _statusMarks(ratio, step=0.01, maxCount=30):
    """Return one "!" per started `step` by which `ratio` deviates from 1.0.

    At most `maxCount` marks are produced; when the cap is reached a "+" is
    appended to show that the deviation is larger than the bar can display.
    """
    marks = ""
    dist = abs(1.0 - ratio)
    count = 0
    # Repeated subtraction (rather than a division) is kept on purpose so
    # that floating point rounding yields the same bar length as before.
    while dist > 0:
        dist -= step
        count += 1
        marks += "!"
        if count >= maxCount:
            marks += "+"
            break
    return marks


def compare(a, b, a2Tag="a2"):
    """Print a per-document comparison of annotation counts in two sets.

    Both sets are loaded with loadSet and counted with getCounts. For every
    counter found in either set a row is written to standard output with
    the average count per document in A and in B, the ratio A/B ("N/A" when
    either average is zero) and a status bar: one "!" per started 0.01 of
    deviation of the ratio from 1.0, capped at 30 marks plus a "+", or "-"
    when no ratio is available.

    Parameters:
        a: location of document set A, passed unchanged to loadSet.
        b: location of document set B, passed unchanged to loadSet.
        a2Tag: forwarded to loadSet as its a2Tag keyword argument.

    Returns:
        None. The report goes to standard output; a one-line banner goes to
        standard error.
    """

    def emit(*fields):
        # Same text as `print f1, f2, ...`: fields joined by single spaces.
        sys.stdout.write(" ".join([str(field) for field in fields]) + "\n")

    sys.stderr.write("Comparing BioNLP Shared Task format document sets" + "\n")
    emit("Loading set A:", a)
    docsA = loadSet(a, a2Tag=a2Tag)
    emit("Loading set B:", b)
    docsB = loadSet(b, a2Tag=a2Tag)
    countsA = getCounts(docsA)
    countsB = getCounts(docsB)
    numA = len(docsA)
    numB = len(docsB)
    # Set union instead of keys() + keys(), which only works on Python 2.
    allKeys = sorted(set(countsA) | set(countsB))
    maxKeyLength = max([len(x) for x in allKeys])

    # Sets
    emit("Sets")
    emit("A:", a, "(documents: " + str(numA) + ")")
    emit("B:", b, "(documents: " + str(numB) + ")")

    # Column start offsets, each 10 characters wide after the key column.
    colA = maxKeyLength + 1
    colB = maxKeyLength + 11
    colDiff = maxKeyLength + 21
    colStatus = maxKeyLength + 31

    # Make title
    titleLine = "Category".ljust(colA)
    titleLine += "A" + " " * 9
    titleLine += "B" + " " * 9
    titleLine += "Diff" + " " * 6
    titleLine += "Status"
    emit(titleLine)

    # Make lines
    for key in allKeys:
        # An empty document set has no per-document average; report 0.0 so
        # the row shows "N/A" instead of failing with a division by zero.
        if numA:
            valA = countsA.get(key, 0) / float(numA)
        else:
            valA = 0.0
        if numB:
            valB = countsB.get(key, 0) / float(numB)
        else:
            valB = 0.0

        line = key.ljust(colA)
        line = (line + "%.2f" % valA).ljust(colB)
        line = (line + "%.2f" % valB).ljust(colDiff)
        # Diff
        if valA == 0 or valB == 0:
            diff = None
            line += "N/A"
        else:
            diff = valA / valB
            line += "%.2f" % diff
        # Dist
        line = line.ljust(colStatus)
        if diff is not None:
            line += _statusMarks(diff)
        else:
            line += "-"
        emit(line)

if __name__ == "__main__":
    import sys
    from optparse import OptionParser

    # Psyco is optional: enable it when importable and report the outcome
    # on standard error either way.
    try:
        import psyco
        psyco.full()
        psycoStatus = "Found Psyco, using"
    except ImportError:
        psycoStatus = "Psyco not installed"
    sys.stderr.write(psycoStatus + "\n")

    # Command line: the two document sets to compare and the a2 tag that is
    # forwarded to compare().
    parser = OptionParser(description="Compare event distribution")
    parser.add_option("-a", "--inputA", dest="inputA", default=None,
                      help="", metavar="FILE")
    parser.add_option("-b", "--inputB", dest="inputB", default=None, help="")
    parser.add_option("-t", "--a2Tag", dest="a2Tag", default="a2", help="")
    parsed = parser.parse_args()
    opts = parsed[0]

    compare(opts.inputA, opts.inputB, a2Tag=opts.a2Tag)

Source Code for Module TEES.Utils.STFormat.Compare