1 from collections import defaultdict
2 from STTools import *
3
def getCounts(documents):
    """Tally annotation counts over a collection of documents.

    Each document is expected to expose ``proteins``, ``triggers``,
    ``events`` and ``relations`` attributes, any of which may be ``None``.
    Proteins, triggers, events and relations are read through ``.type``;
    events additionally through ``.speculation`` and ``.negation``; events
    and relations through ``.arguments``, whose items are indexed with
    ``[0]`` to obtain the argument type name.

    Returns a ``defaultdict(int)`` with two kinds of keys: totals such as
    ``"protein"``, ``"event"``, ``"modifier"`` or ``"arg"``, and per-type
    keys such as ``"event(<type>)"`` or ``"arg(<type>)"``.  The three
    modifier keys are always present, even when their count is zero.
    """
    counts = defaultdict(int)
    # Seed the modifier keys so they are reported even if no event has one.
    for key in ("modifier", "modifier(spec)", "modifier(neg)"):
        counts[key] = 0

    for doc in documents:
        # Normalise missing annotation lists to empty ones.  The argument
        # pass below used to concatenate doc.events and doc.relations
        # directly, which raised TypeError when either was None.
        proteins = doc.proteins if doc.proteins is not None else []
        triggers = doc.triggers if doc.triggers is not None else []
        events = doc.events if doc.events is not None else []
        relations = doc.relations if doc.relations is not None else []

        for protein in proteins:
            counts["protein(" + protein.type + ")"] += 1
            counts["protein"] += 1

        for trigger in triggers:
            counts["trigger(" + trigger.type + ")"] += 1
            counts["trigger"] += 1

        for event in events:
            counts["event(" + event.type + ")"] += 1
            counts["event"] += 1
            # An event carrying both modifiers adds two to "modifier".
            if event.speculation is not None:
                counts["modifier"] += 1
                counts["modifier(spec)"] += 1
            if event.negation is not None:
                counts["modifier"] += 1
                counts["modifier(neg)"] += 1

        for relation in relations:
            counts["relation(" + relation.type + ")"] += 1
            counts["relation"] += 1

        # Arguments are counted for events and relations alike.  Iterating
        # the two groups in turn avoids concatenating the containers.
        for group in (events, relations):
            for ann in group:
                for arg in ann.arguments:
                    counts["arg(" + arg[0] + ")"] += 1
                    counts["arg"] += 1

    return counts
37
def _perDocument(count, numDocs):
    """Return count / numDocs as a float, or 0.0 for an empty document set."""
    if numDocs == 0:
        return 0.0
    return count / float(numDocs)


def _statusBar(diff, maxCount=30, step=0.01):
    """Render how far the ratio `diff` is from 1.0 as a run of '!' marks.

    One '!' is emitted per `step` of distance; after `maxCount` marks a '+'
    is appended and the bar stops.  A `diff` of None renders as '-'.
    """
    if diff is None:
        return "-"
    bar = ""
    remaining = abs(1.0 - diff)
    count = 0
    # Kept as a subtraction loop (not a division) so the number of marks is
    # exactly what the repeated float subtraction yields.
    while remaining > 0:
        remaining -= step
        count += 1
        bar += "!"
        if count >= maxCount:
            bar += "+"
            break
    return bar


def _formatRow(key, valA, valB, keyWidth):
    """Build one report line: key, per-document A and B, ratio and status."""
    line = key.ljust(keyWidth + 1) + "%.2f" % valA
    line = line.ljust(keyWidth + 11) + "%.2f" % valB
    line = line.ljust(keyWidth + 21)
    if valA == 0 or valB == 0:
        # A ratio is meaningless when either side has no occurrences.
        diff = None
        line += "N/A"
    else:
        diff = valA / valB
        line += "%.2f" % diff
    return line.ljust(keyWidth + 31) + _statusBar(diff)


def compare(a, b, a2Tag="a2"):
    """Load two document sets and print a per-category comparison table.

    `a` and `b` are handed unchanged to `loadSet` together with `a2Tag`
    (presumably set locations -- confirm against STTools).  For every count
    key produced by `getCounts` on either set, the table shows the average
    count per document in A and in B, the ratio A/B ("N/A" when either
    average is zero) and a status bar of '!' marks that grows with the
    distance of the ratio from 1.0.

    The banner goes to stderr; everything else is printed to stdout.
    Nothing is returned.
    """
    # Imported here because this function needs `sys` regardless of whether
    # the module is run as a script or imported.
    import sys

    # Each print() call receives exactly one string so that the statement
    # behaves identically under Python 2 and Python 3.
    sys.stderr.write("Comparing BioNLP Shared Task format document sets\n")
    print("Loading set A: %s" % (a,))
    docsA = loadSet(a, a2Tag=a2Tag)
    print("Loading set B: %s" % (b,))
    docsB = loadSet(b, a2Tag=a2Tag)
    countsA = getCounts(docsA)
    countsB = getCounts(docsB)
    allKeys = sorted(set(countsA) | set(countsB))
    maxKeyLength = max(len(key) for key in allKeys)

    print("Sets")
    print("A: %s (documents: %d)" % (a, len(docsA)))
    print("B: %s (documents: %d)" % (b, len(docsB)))

    titleLine = "Category".ljust(maxKeyLength + 1)
    titleLine += "A" + " " * 9
    titleLine += "B" + " " * 9
    titleLine += "Diff" + " " * 6
    titleLine += "Status"
    print(titleLine)

    for key in allKeys:
        # An empty set yields 0.0 averages (shown as N/A in the Diff column)
        # instead of raising ZeroDivisionError.
        valA = _perDocument(countsA[key], len(docsA))
        valB = _perDocument(countsB[key], len(docsB))
        print(_formatRow(key, valA, valB, maxKeyLength))
108
if __name__ == "__main__":
    # Command-line entry point: parse the two input sets and the a2 tag,
    # then print the comparison produced by compare().
    import sys

    from optparse import OptionParser

    # psyco is optional: when the import fails the script only reports it
    # on stderr and carries on.
    try:
        import psyco
        psyco.full()
        sys.stderr.write("Found Psyco, using\n")
    except ImportError:
        sys.stderr.write("Psyco not installed\n")

    optparser = OptionParser(description="Compare event distribution")
    optparser.add_option("-a", "--inputA", default=None, dest="inputA", help="", metavar="FILE")
    optparser.add_option("-b", "--inputB", default=None, dest="inputB", help="")
    optparser.add_option("-t", "--a2Tag", default="a2", dest="a2Tag", help="")

    (options, args) = optparser.parse_args()

    # Both inputs default to None; stop with a usage message rather than
    # passing None on to compare().
    if options.inputA is None or options.inputB is None:
        optparser.error("both --inputA and --inputB must be given")

    compare(options.inputA, options.inputB, a2Tag=options.a2Tag)
129