Package TEES :: Package Utils :: Package InteractionXML :: Module SplitMergedElements
[hide private]

Source Code for Module TEES.Utils.InteractionXML.SplitMergedElements

  1  import sys, os, copy 
  2  extraPath = os.path.dirname(os.path.abspath(__file__))+"/../../" 
  3  sys.path.append(extraPath) 
  4  from Utils.ProgressCounter import ProgressCounter 
  5  try: 
  6      import xml.etree.cElementTree as ET 
  7  except ImportError: 
  8      import cElementTree as ET 
  9  import Utils.ElementTreeUtils as ETUtils 
 10  import IDUtils 
 11   
 12  # Splits merged types generated from overlapping entities/edges into their components 
def getElementTypes(element, separator="---"):
    """Return the component types stored in an element's "type" attribute.

    A merged type is a single string in which several type names are joined
    by ``separator`` (for example "Binding---Regulation").

    Parameters:
        element: any object with a ``get(name)`` method, such as an
            ElementTree element.
        separator: the string that joins the type names of a merged type.

    Returns:
        A list of type names. It has one item when the type is not merged
        and is empty when the element has no "type" attribute.
    """
    typeName = element.get("type")
    if typeName is None:
        # Elements without a type have nothing to split
        return []
    # split() already yields a one-item list when the separator is absent
    return typeName.split(separator)
19
def splitMerged(sentence, elementName, countsByType):
    """Replace each merged-type child of a sentence with one element per type.

    Every direct child of ``sentence`` with the tag ``elementName`` whose
    "type" attribute holds more than one type (see getElementTypes) is
    removed. One new element per component type is inserted at the position
    the removed element occupied, in the order the types appear in the
    merged type string.

    A new element receives a copy of all attributes of the original, a
    single "type", and a new "id". Only attributes are copied: text and
    child elements of the original are not carried over.

    Parameters:
        sentence: the parent element; it is modified in place.
        elementName: tag of the children to process, e.g. "entity".
        countsByType: None, or a dict mapping a tag to a two-item list
            [removed, created]. The entry for ``elementName`` is increased
            by the number of removed and created elements.
    """
    elements = sentence.findall(elementName)
    # Running number for the new ids, shared by all elements of this sentence
    newIdCount = IDUtils.getNextFreeId(elements)
    removeCount = 0
    createCount = 0
    for element in elements:
        types = getElementTypes(element)
        if len(types) < 2:
            continue
        # The id is expected to look like "<parent id>.<letter><number>":
        # keep the parent id and the letter, replace the number
        idSplits = element.get("id").rsplit(".", 1)
        idStem = idSplits[0] + "." + idSplits[1][0]
        # Elements compare by identity, so this finds this exact child
        position = list(sentence).index(element)
        sentence.remove(element)
        removeCount += 1
        for offset, typeName in enumerate(types):
            newElement = ET.Element(elementName)
            for key, value in element.attrib.items():
                newElement.set(key, value)
            newElement.set("type", typeName)
            newElement.set("id", idStem + str(newIdCount))
            newIdCount += 1
            # Consecutive positions keep the components in type order
            sentence.insert(position + offset, newElement)
            createCount += 1
    # increment counts
    if countsByType is not None:
        countsByType[elementName][0] += removeCount
        countsByType[elementName][1] += createCount
59 60 # Splits entities/edges with merged types into separate elements
def processSentence(sentence, countsByType):
    """Split the merged-type entities, interactions and pairs of one sentence.

    Calls splitMerged on ``sentence`` once per element name, in the order
    "entity", "interaction", "pair", and passes ``countsByType`` through
    unchanged.
    """
    for elementName in ("entity", "interaction", "pair"):
        splitMerged(sentence, elementName, countsByType)
65
def splitMergedElements(inputFilename, outputFilename=None):
    """Split all merged-type elements of a corpus and optionally save it.

    The corpus is loaded with ETUtils.ETFromObj. Every "sentence" child of
    every "document" child of the corpus root is passed to processSentence.
    Progress and the per-tag numbers of removed and created elements are
    printed to stderr.

    Parameters:
        inputFilename: the corpus, passed as is to ETUtils.ETFromObj.
        outputFilename: when not None, the corpus root is written to this
            destination with ETUtils.write.

    Returns:
        The corpus tree returned by ETUtils.ETFromObj, after processing.
    """
    print >> sys.stderr, "##### Split elements with merged types #####"
    print >> sys.stderr, "Loading corpus", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()

    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    # tag -> [number of removed elements, number of created elements]
    countsByType = {}
    for tag in ("entity", "interaction", "pair"):
        countsByType[tag] = [0, 0]
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, countsByType)

    print >> sys.stderr, "Results"
    for tag in sorted(countsByType):
        removed, created = countsByType[tag]
        print >> sys.stderr, " " + tag + ": removed", removed, "created", created

    if outputFilename is not None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
if __name__ == "__main__":
    # Command line use: read a corpus, split its merged-type elements and
    # write the result. Both the input and the output file are required.
    from optparse import OptionParser
    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(usage="%prog [options]\nSplit elements with merged types.")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in interaction xml format", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file in interaction xml format.")
    (options, args) = optparser.parse_args()

    if options.input is None:
        print >> sys.stderr, "Error, input file not defined."
        optparser.print_help()
        sys.exit(1)
    if options.output is None:
        print >> sys.stderr, "Error, output file not defined."
        optparser.print_help()
        sys.exit(1)

    # splitMergedElements is the entry point defined by this module
    splitMergedElements(options.input, options.output)