Package TEES :: Package Utils :: Package InteractionXML :: Module RemoveUnconnectedEntities
[hide private]

Source Code for Module TEES.Utils.InteractionXML.RemoveUnconnectedEntities

 1  import sys, os 
 2  thisPath = os.path.dirname(os.path.abspath(__file__)) 
 3  sys.path.append(os.path.abspath(os.path.join(thisPath,"../.."))) 
 4  import Utils.ElementTreeUtils as ETUtils 
 5   
def _removeUnconnectedFromDocument(document):
    """
    Detach the unconnected, non-name entities of one document element.

    An entity is connected when its id appears as the "e1" or "e2"
    attribute of any "interaction" element below the document.

    @param document: the "document" element to process (modified in place)
    @return: tuple (removed, preserved) of entity counts; entities with
             isName="True" are skipped and counted in neither
    """
    # Collect the ids over the whole document rather than per sentence, so
    # an entity referenced from another sentence still counts as connected.
    connected = set()
    for interaction in document.getiterator("interaction"):
        connected.add(interaction.get("e1"))
        connected.add(interaction.get("e2"))

    # Record the real parent of every entity. Removing through this map
    # avoids guessing the parent from the id string, which fails when the id
    # does not follow the "<sentence id>.<suffix>" pattern or when the entity
    # is not a direct child of the guessed element.
    parentOf = {}
    for parent in document.getiterator():
        for child in parent:
            if child.tag == "entity":
                parentOf[child] = parent

    removed = 0
    preserved = 0
    # Take a snapshot of the entities, as the tree is modified in the loop
    for entity in list(document.getiterator("entity")):
        if entity.get("isName") == "True": # never remove named entities
            continue
        if entity.get("id") in connected:
            preserved += 1
        else:
            parentOf[entity].remove(entity)
            removed += 1
    return removed, preserved


def removeUnconnectedEntities(input, output=None):
    """
    Remove entities that are not an argument of any interaction.

    For each "document" element under the corpus root, every entity whose
    id is not referenced by an interaction's "e1" or "e2" attribute is
    removed from the tree. Entities with isName="True" are always kept.
    The number of removed and preserved entities is reported on stderr.

    @param input: the corpus, in any form accepted by ETUtils.ETFromObj
    @param output: if not None, the corpus root is written to this target
                   with ETUtils.write
    @return: the object returned by ETUtils.ETFromObj, with the unconnected
             entities removed from its tree
    """
    input = ETUtils.ETFromObj(input)
    root = input.getroot()
    removed = 0
    preserved = 0
    for document in root.findall("document"):
        docRemoved, docPreserved = _removeUnconnectedFromDocument(document)
        removed += docRemoved
        preserved += docPreserved

    print >> sys.stderr, "Removed", removed, "entities, preserved", preserved, "entities"

    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(root, output)
    return input

if __name__ == "__main__":
    # Command line entry point: both the input and the output file are
    # required; the corpus is read, pruned and written in one call.
    import sys
    from optparse import OptionParser

    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    parser = OptionParser(usage="%prog [options]\n")
    parser.add_option("-i", "--input", default=None, dest="input", help="Corpus in interaction xml format", metavar="FILE")
    parser.add_option("-o", "--output", default=None, dest="output", help="Output file in interaction xml format.")
    options, args = parser.parse_args()

    # The input is checked before the output, so only the first missing
    # option is reported before exiting.
    problem = None
    if options.input is None:
        problem = "Error, first input file not defined."
    elif options.output is None:
        problem = "Error, output file not defined."
    if problem is not None:
        print >> sys.stderr, problem
        parser.print_help()
        sys.exit(1)

    removeUnconnectedEntities(options.input, options.output)