import CorpusElements
import SentenceElements
import sys, os
# Make the parent directory importable so that the Utils package resolves.
thisPath = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.abspath(os.path.join(thisPath,"..")))
import Utils.ElementTreeUtils as ETUtils
from optparse import OptionParser
from collections import defaultdict
10
def compareEntities(entity1, entity2):
    """Return True if two entity elements are duplicates of each other.

    Two entities are duplicates when they have the same character offset
    and the same type. In that case their head offset and text are
    asserted to match as well; a mismatch indicates corrupt data.
    """
    if entity1.get("charOffset") == entity2.get("charOffset") and entity1.get("type") == entity2.get("type"):
        # Same span + same type must imply identical remaining attributes.
        assert entity1.get("headOffset") == entity2.get("headOffset")
        assert entity1.get("text") == entity2.get("text")
        return True
    else:
        return False
19
def compareInteractions(interaction1, interaction2):
    """Return True if two interaction elements are duplicates.

    Interactions are duplicates when they connect the same pair of
    entity ids (e1, e2) with the same type.
    """
    if (interaction1.get("e1") == interaction2.get("e1") and
        interaction1.get("e2") == interaction2.get("e2") and
        interaction1.get("type") == interaction2.get("type")):
        assert interaction1.get("interaction") == interaction2.get("interaction")
        return True
    else:
        return False
26
28 entitiesByType = {}
29 duplicatesRemovedByType = {}
30 globalEntityIsDuplicateOf = {}
31 for sentence in sentences:
32 entityIsDuplicateOf = {}
33 for k in sentence.entitiesById.keys():
34 assert k not in entityIsDuplicateOf
35 entityIsDuplicateOf[k] = None
36 if not entitiesByType.has_key(sentence.entitiesById[k].attrib["type"]):
37 entitiesByType[sentence.entitiesById[k].attrib["type"]] = 0
38 entitiesByType[sentence.entitiesById[k].attrib["type"]] += 1
39
40 for i in range(len(sentence.entities)-1):
41 if entityIsDuplicateOf[sentence.entities[i].attrib["id"]] == None:
42 for j in range(i+1,len(sentence.entities)):
43 if compareEntities(sentence.entities[i], sentence.entities[j]):
44 entityIsDuplicateOf[sentence.entities[j].attrib["id"]] = sentence.entities[i].attrib["id"]
45
46 for k,v in entityIsDuplicateOf.iteritems():
47 assert k not in globalEntityIsDuplicateOf, k
48 globalEntityIsDuplicateOf[k] = v
49 if v != None:
50 entityToRemove = sentence.entitiesById[k]
51 if not duplicatesRemovedByType.has_key(entityToRemove.attrib["type"]):
52 duplicatesRemovedByType[entityToRemove.attrib["type"]] = 0
53 duplicatesRemovedByType[entityToRemove.attrib["type"]] += 1
54 sentence.sentence.remove(entityToRemove)
55 if debug: print "Removing Entity", k, "duplicate of", v
56
57 for sentence in sentences:
58 for pair in sentence.pairs + sentence.interactions:
59
60
61
62
63 if pair.get("e1") not in globalEntityIsDuplicateOf or pair.get("e2") not in globalEntityIsDuplicateOf:
64 print >> sys.stderr, "Warning, interaction", pair.get("id"), [pair.get("e1"), pair.get("e2")], "links to a non-existing entity"
65 continue
66 if globalEntityIsDuplicateOf[pair.attrib["e1"]] != None:
67 pair.attrib["e1"] = globalEntityIsDuplicateOf[pair.attrib["e1"]]
68 if debug: print "Remapping", pair.get("id"), "arg e1 from", pair.get("e1"), "to", globalEntityIsDuplicateOf[pair.get("e1")]
69 if globalEntityIsDuplicateOf[pair.attrib["e2"]] != None:
70 pair.attrib["e2"] = globalEntityIsDuplicateOf[pair.attrib["e2"]]
71 if debug: print "Remapping", pair.get("id"), "arg e2 from", pair.get("e2"), "to", globalEntityIsDuplicateOf[pair.get("e2")]
72
73
74
75
76
77
78 return entitiesByType, duplicatesRemovedByType
79
81 interactionsByType = {}
82 duplicatesRemovedByType = {}
83 for sentence in sentences:
84 interactions = sentence.pairs + sentence.interactions
85 interactionIsDuplicateOf = {}
86 for interaction in interactions:
87 interactionIsDuplicateOf[interaction.attrib["id"]] = None
88 if not interactionsByType.has_key(interaction.attrib["type"]):
89 interactionsByType[interaction.attrib["type"]] = 0
90 interactionsByType[interaction.attrib["type"]] += 1
91
92 for i in range(len(interactions)-1):
93 if interactionIsDuplicateOf[interactions[i].attrib["id"]] == None:
94 for j in range(i+1,len(interactions)):
95 if compareInteractions(interactions[i], interactions[j]):
96 interactionIsDuplicateOf[interactions[j].attrib["id"]] = interactions[i].attrib["id"]
97
98 for k,v in interactionIsDuplicateOf.iteritems():
99 if v != None:
100 elementToRemove = None
101 if k.rsplit(".",1)[-1][0] == "p":
102 for pair in sentence.pairs:
103 if pair.attrib["id"] == k:
104 elementToRemove = pair
105 break
106 elif k.rsplit(".",1)[-1][0] == "i":
107 for interaction in sentence.interactions:
108 if interaction.attrib["id"] == k:
109 elementToRemove = interaction
110 break
111
112 if not duplicatesRemovedByType.has_key(elementToRemove.attrib["type"]):
113 duplicatesRemovedByType[elementToRemove.attrib["type"]] = 0
114 duplicatesRemovedByType[elementToRemove.attrib["type"]] += 1
115 sentence.sentence.remove(elementToRemove)
116 if debug: print "Removing Interaction", k, "duplicate of", v
117
118 return interactionsByType, duplicatesRemovedByType
119
def printStats(origItemsByType, duplicatesRemovedByType):
    """Print per-type and total duplicate-removal counts to stderr.

    origItemsByType -- type -> original element count
    duplicatesRemovedByType -- type -> number of removed duplicates
    """
    print >> sys.stderr, "Removed duplicates (original count in parenthesis):"
    for key in sorted(duplicatesRemovedByType.keys()):
        print >> sys.stderr, " " + key + ": " + str(duplicatesRemovedByType[key]) + " (" + str(origItemsByType[key]) + ")"
    print >> sys.stderr, " ---------------------------------"
    print >> sys.stderr, " Total: " + str(sum(duplicatesRemovedByType.values())) + " (" + str(sum(origItemsByType.values())) + ")"
128
def mergeAll(input, output=None, debug=False, iterate=False):
    """Merge duplicate entities and interactions in an interaction XML corpus.

    input -- path to the corpus (as accepted by the corpus loaders)
    output -- optional path for writing the processed corpus
    debug -- print each removal/remapping to stdout
    iterate -- process the corpus one document at a time (lower memory
               footprint); in this mode nothing is returned

    Returns the loaded CorpusElements object in non-iterate mode, else None.
    """
    if iterate:
        # Accumulate statistics across documents; the iterator writes the
        # processed documents to 'output' as it goes.
        origItems = defaultdict(int)
        removedItems = defaultdict(int)
        for docSentences in SentenceElements.getCorpusIterator(input, output):
            entitiesByType, duplicatesRemovedByType = mergeDuplicateEntities(docSentences, debug)
            for key in entitiesByType: origItems[key] += entitiesByType[key]
            for key in duplicatesRemovedByType: removedItems[key] += duplicatesRemovedByType[key]
            interactionsByType, duplicatesRemovedByType = mergeDuplicateInteractions(docSentences, debug)
            for key in interactionsByType: origItems[key] += interactionsByType[key]
            for key in duplicatesRemovedByType: removedItems[key] += duplicatesRemovedByType[key]
        printStats(origItems, removedItems)
        return None
    else:
        corpusElements = CorpusElements.loadCorpus(input, removeIntersentenceInteractions=False)
        print >> sys.stderr, "Merging duplicate entities"
        entitiesByType, duplicatesRemovedByType = mergeDuplicateEntities(corpusElements.sentences, debug)
        printStats(entitiesByType, duplicatesRemovedByType)
        print >> sys.stderr, "Merging duplicate interactions"
        interactionsByType, duplicatesRemovedByType = mergeDuplicateInteractions(corpusElements.sentences, debug)
        printStats(interactionsByType, duplicatesRemovedByType)
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
        return corpusElements
154
if __name__=="__main__":
    print >> sys.stderr, "##### Merge duplicate entities and interactions #####"

    # Psyco is an optional Python 2 JIT compiler; use it when available.
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(usage="%prog [options]\nMerge duplicate entities and interactions in a corpus.")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in analysis format", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Corpus in analysis format", metavar="FILE")
    optparser.add_option("-d", "--debug", default=False, action="store_true", dest="debug", help="")
    optparser.add_option("-r", "--iterate", default=False, action="store_true", dest="iterate", help="")
    (options, args) = optparser.parse_args()
    assert(options.input != None)

    mergeAll(options.input, options.output, options.debug, options.iterate)
175