Package TEES :: Package Utils :: Package InteractionXML :: Module MergeDuplicateEntities
[hide private]

Source Code for Module TEES.Utils.InteractionXML.MergeDuplicateEntities

  1  import CorpusElements 
  2  import SentenceElements 
  3  import sys, os 
  4  thisPath = os.path.dirname(os.path.abspath(__file__)) 
  5  sys.path.append(os.path.abspath(os.path.join(thisPath,".."))) 
  6  import Utils.ElementTreeUtils as ETUtils 
  7  from optparse import OptionParser 
  8  import sys 
  9  from collections import defaultdict 
 10   
def compareEntities(entity1, entity2):
    """Tell whether two entity elements are duplicates of each other.

    Two entities are duplicates when their "charOffset" and "type"
    attributes are both equal.  Both arguments only need a ``get(name)``
    method (e.g. ElementTree elements).

    Returns:
        True for duplicates, False otherwise.

    Raises:
        AssertionError: the entities are duplicates but differ in
            "headOffset" or "text".
    """
    # Identity of an entity: where it is and what it is.
    for attribute in ("charOffset", "type"):
        if entity1.get(attribute) != entity2.get(attribute):
            return False
    # Duplicates must agree on the derived attributes as well.  "isName" is
    # intentionally not checked (that check is disabled).
    for attribute in ("headOffset", "text"):
        assert entity1.get(attribute) == entity2.get(attribute)
    return True
19
def compareInteractions(interaction1, interaction2):
    """Tell whether two pair/interaction elements are duplicates.

    Two elements are duplicates when their "e1", "e2" and "type"
    attributes are all equal.  Both arguments only need a ``get(name)``
    method (e.g. ElementTree elements).

    Returns:
        True for duplicates, False otherwise.

    Raises:
        AssertionError: the elements are duplicates but their
            "interaction" attributes differ.
    """
    sameEdge = all(
        interaction1.get(name) == interaction2.get(name)
        for name in ("e1", "e2", "type")
    )
    if not sameEdge:
        return False
    # Duplicates are expected to carry the same "interaction" flag.
    assert interaction1.get("interaction") == interaction2.get("interaction")
    return True
26
def mergeDuplicateEntities(sentences, debug=False):
    """Remove duplicate entities and re-point pairs/interactions at the survivors.

    Within each sentence, entities are compared with compareEntities().
    The first entity of a duplicate group is kept; the others are removed
    from the sentence's XML element (``sentence.sentence``).  Afterwards the
    "e1"/"e2" attributes of every pair and interaction in all sentences are
    rewritten to refer to the kept entity.

    Args:
        sentences: sentence objects providing ``entities``, ``entitiesById``,
            ``pairs``, ``interactions`` and ``sentence`` (the XML element).
        debug: when true, print each removal and remapping to stdout.

    Returns:
        Tuple ``(entitiesByType, duplicatesRemovedByType)``: two dicts
        mapping entity type to the number of entities seen / removed.

    Raises:
        AssertionError: an entity id occurs in more than one sentence, or
            compareEntities() finds inconsistent duplicates.
    """
    entitiesByType = {}
    duplicatesRemovedByType = {}
    # entity id -> id of the entity it duplicates (None for kept entities),
    # collected over all sentences so cross-sentence references can be remapped.
    globalEntityIsDuplicateOf = {}
    for sentence in sentences:
        entityIsDuplicateOf = {}
        for entityId, entity in sentence.entitiesById.items():
            entityIsDuplicateOf[entityId] = None
            entityType = entity.attrib["type"]
            entitiesByType[entityType] = entitiesByType.get(entityType, 0) + 1
        # Mark entities for removal
        entities = sentence.entities
        for i in range(len(entities) - 1):
            keptId = entities[i].attrib["id"]
            if entityIsDuplicateOf[keptId] is not None:
                continue  # already a duplicate of an earlier entity
            for j in range(i + 1, len(entities)):
                if compareEntities(entities[i], entities[j]):
                    entityIsDuplicateOf[entities[j].attrib["id"]] = keptId
        # Remove entities from sentence element
        for entityId, keptId in entityIsDuplicateOf.items():
            assert entityId not in globalEntityIsDuplicateOf, entityId
            globalEntityIsDuplicateOf[entityId] = keptId
            if keptId is None:
                continue
            entityToRemove = sentence.entitiesById[entityId]
            removedType = entityToRemove.attrib["type"]
            duplicatesRemovedByType[removedType] = duplicatesRemovedByType.get(removedType, 0) + 1
            sentence.sentence.remove(entityToRemove)
            if debug:
                sys.stdout.write("Removing Entity %s duplicate of %s\n" % (entityId, keptId))
    # Remap pairs and interactions that used the removed entities
    for sentence in sentences:
        for pair in sentence.pairs + sentence.interactions:
            if pair.get("e1") not in globalEntityIsDuplicateOf or pair.get("e2") not in globalEntityIsDuplicateOf:
                sys.stderr.write("Warning, interaction %s %s links to a non-existing entity\n"
                                 % (pair.get("id"), [pair.get("e1"), pair.get("e2")]))
                continue
            for argName in ("e1", "e2"):
                oldId = pair.attrib[argName]
                newId = globalEntityIsDuplicateOf[oldId]
                if newId is None:
                    continue
                pair.attrib[argName] = newId
                if debug:
                    # Report the ids captured before the attribute was overwritten;
                    # re-reading the attribute here would show the new id as the
                    # source and None as the target.
                    sys.stdout.write("Remapping %s arg %s from %s to %s\n"
                                     % (pair.get("id"), argName, oldId, newId))

    return entitiesByType, duplicatesRemovedByType
79
def mergeDuplicateInteractions(sentences, debug=False):
    """Remove duplicate pairs and interactions from each sentence.

    Pairs and interactions of a sentence are pooled and compared with
    compareInteractions().  The first element of a duplicate group is kept;
    the others are removed from the sentence's XML element
    (``sentence.sentence``).

    Args:
        sentences: sentence objects providing ``pairs``, ``interactions``
            and ``sentence`` (the XML element).
        debug: when true, print each removal to stdout.

    Returns:
        Tuple ``(interactionsByType, duplicatesRemovedByType)``: two dicts
        mapping interaction type to the number of elements seen / removed.

    Raises:
        AssertionError: compareInteractions() finds inconsistent duplicates.
    """
    interactionsByType = {}
    duplicatesRemovedByType = {}
    for sentence in sentences:
        interactions = sentence.pairs + sentence.interactions
        # element id -> id of the element it duplicates (None when kept)
        interactionIsDuplicateOf = {}
        # element id -> element, so a duplicate can be removed without guessing
        # from the id suffix which list it lives in (first occurrence wins).
        elementById = {}
        for interaction in interactions:
            interactionId = interaction.attrib["id"]
            interactionIsDuplicateOf[interactionId] = None
            elementById.setdefault(interactionId, interaction)
            interactionType = interaction.attrib["type"]
            interactionsByType[interactionType] = interactionsByType.get(interactionType, 0) + 1
        # Mark interactions for removal
        for i in range(len(interactions) - 1):
            keptId = interactions[i].attrib["id"]
            if interactionIsDuplicateOf[keptId] is not None:
                continue  # already a duplicate of an earlier element
            for j in range(i + 1, len(interactions)):
                if compareInteractions(interactions[i], interactions[j]):
                    interactionIsDuplicateOf[interactions[j].attrib["id"]] = keptId
        # Remove interactions from sentence element
        for duplicateId, keptId in interactionIsDuplicateOf.items():
            if keptId is None:
                continue
            elementToRemove = elementById[duplicateId]
            removedType = elementToRemove.attrib["type"]
            duplicatesRemovedByType[removedType] = duplicatesRemovedByType.get(removedType, 0) + 1
            sentence.sentence.remove(elementToRemove)
            if debug:
                sys.stdout.write("Removing Interaction %s duplicate of %s\n" % (duplicateId, keptId))

    return interactionsByType, duplicatesRemovedByType
119
def printStats(origItemsByType, duplicatesRemovedByType):
    """Write a per-type summary of removed duplicates to stderr.

    One line is written for every type in ``duplicatesRemovedByType`` (in
    sorted order) showing the removed count and, in parentheses, the
    original count, followed by a separator and a totals line.

    Args:
        origItemsByType: dict mapping type name to the original item count.
            A type missing from this dict is reported with a count of 0.
        duplicatesRemovedByType: dict mapping type name to the number of
            duplicates removed.
    """
    report = ["Removed duplicates (original count in parenthesis):"]
    for key in sorted(duplicatesRemovedByType):
        report.append(" " + key + ": " + str(duplicatesRemovedByType[key])
                      + " (" + str(origItemsByType.get(key, 0)) + ")")
    report.append(" ---------------------------------")
    report.append(" Total: " + str(sum(duplicatesRemovedByType.values()))
                  + " (" + str(sum(origItemsByType.values())) + ")")
    sys.stderr.write("\n".join(report) + "\n")
def mergeAll(input, output=None, debug=False, iterate=False):
    """Merge duplicate entities, then duplicate interactions, in a corpus.

    Args:
        input: corpus passed to CorpusElements.loadCorpus() or, when
            ``iterate`` is set, to SentenceElements.getCorpusIterator().
        output: optional output destination.  Without ``iterate`` the
            corpus root element is written there with ETUtils.write();
            with ``iterate`` it is handed to the corpus iterator.
        debug: forwarded to the two merge functions.
        iterate: process the sentence groups yielded by the corpus
            iterator instead of loading the whole corpus.

    Returns:
        The loaded corpus elements object, or None when ``iterate`` is set.
    """
    if not iterate:
        corpusElements = CorpusElements.loadCorpus(input, removeIntersentenceInteractions=False)
        sys.stderr.write("Merging duplicate entities\n")
        originalCounts, removedCounts = mergeDuplicateEntities(corpusElements.sentences, debug)
        printStats(originalCounts, removedCounts)
        sys.stderr.write("Merging duplicate interactions\n")
        originalCounts, removedCounts = mergeDuplicateInteractions(corpusElements.sentences, debug)
        printStats(originalCounts, removedCounts)
        if output is not None:
            sys.stderr.write("Writing output to %s\n" % (output,))
            ETUtils.write(corpusElements.rootElement, output)
        return corpusElements

    # Iterative mode: accumulate entity and interaction statistics together
    # over every group of sentences and report once at the end.
    origItems = defaultdict(int)
    removedItems = defaultdict(int)
    for docSentences in SentenceElements.getCorpusIterator(input, output):
        for mergeFunction in (mergeDuplicateEntities, mergeDuplicateInteractions):
            originalCounts, removedCounts = mergeFunction(docSentences, debug)
            for key, count in originalCounts.items():
                origItems[key] += count
            for key, count in removedCounts.items():
                removedItems[key] += count
    printStats(origItems, removedItems)
    return None
if __name__ == "__main__":
    # Command-line entry point: parse the options and run mergeAll().
    sys.stderr.write("##### Merge duplicate entities and interactions #####\n")
    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        sys.stderr.write("Found Psyco, using\n")
    except ImportError:
        sys.stderr.write("Psyco not installed\n")

    optparser = OptionParser(usage="%prog [options]\nMerge duplicate entities and interactions in a corpus.")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in analysis format", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Corpus in analysis format", metavar="FILE")
    optparser.add_option("-d", "--debug", default=False, action="store_true", dest="debug", help="Print every removed and remapped element")
    optparser.add_option("-r", "--iterate", default=False, action="store_true", dest="iterate", help="Process the corpus through the corpus iterator instead of loading it whole")
    (options, args) = optparser.parse_args()
    # The input is mandatory; report a usage error instead of relying on an
    # assert, which is stripped under -O and only yields a bare traceback.
    if options.input is None:
        optparser.error("an input corpus must be given with -i/--input")
    # The output is optional: without it the merged corpus is not written.

    mergeAll(options.input, options.output, options.debug, options.iterate)