Package TEES :: Package Utils :: Package InteractionXML :: Module RecalculateIds
[hide private]

Source Code for Module TEES.Utils.InteractionXML.RecalculateIds

  1  import sys, os 
  2  sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/../..") 
  3  try: 
  4      import xml.etree.cElementTree as ET 
  5  except ImportError: 
  6      import cElementTree as ET 
  7  import Utils.ElementTreeUtils as ETUtils 
  8   
def recalculateIds(input, output=None, onlyWithinSentence=False, docIndexStart=0):
    """
    Rebuild the hierarchical ids of an interaction XML corpus.

    Ids follow the scheme ``<source>.d<N>.s<N>.e<N>`` for entities,
    ``...i<N>`` for interactions and ``...p<N>`` for pairs, where ``<source>``
    is the "source" attribute of the corpus root element. The "e1"/"e2"
    attributes of interactions and pairs are rewritten to the new entity ids.

    @param input: the corpus, handed as-is to ETUtils.ETFromObj
        (presumably a file name or an already parsed tree -- confirm against
        ETUtils).
    @param output: if not None, the modified corpus root is written there
        with ETUtils.write.
    @param onlyWithinSentence: if True, document and sentence ids are left
        untouched and each sentence's existing "id" attribute is used as the
        prefix for its entity, interaction and pair ids.
    @param docIndexStart: index given to the first document; later documents
        count up from it.
    @return: the tree object returned by ETUtils.ETFromObj, modified in place.
    @raise AssertionError: if two entities share the same original id.
    @raise KeyError: if a required attribute is missing, or if a pair refers
        to an entity id that was not seen in the corpus.
    """
    def sentencePrefix(sentence, docIndex, sentIndex):
        # Evaluated per child element rather than once per sentence, so a
        # sentence's "id" attribute is only read when there is a child to
        # renumber.
        if onlyWithinSentence:
            return sentence.attrib["id"]
        return corpusName + ".d" + str(docIndex) + ".s" + str(sentIndex)

    # sys.stderr.write is used instead of the print statement so the
    # function produces the same messages under Python 2 and Python 3.
    sys.stderr.write("##### Recalculate hierarchical interaction XML ids #####\n")
    sys.stderr.write("Loading corpus %s\n" % (input,))
    corpusTree = ETUtils.ETFromObj(input)
    sys.stderr.write("Corpus file loaded\n")
    corpusRoot = corpusTree.getroot()

    # Rebuild hierarchical ids
    sys.stderr.write("Recalculating interaction xml ids\n")
    corpusName = corpusRoot.attrib["source"]
    documents = corpusRoot.findall("document")

    # Pass 1: recalculate ids for documents, sentences and entities, and
    # record the old -> new entity id mapping for the whole corpus.
    entDictionary = {}
    for docOffset, document in enumerate(documents):
        docIndex = docIndexStart + docOffset
        if not onlyWithinSentence:
            document.attrib["id"] = corpusName + ".d" + str(docIndex)
        for sentIndex, sentence in enumerate(document.findall("sentence")):
            if not onlyWithinSentence:
                sentence.attrib["id"] = sentencePrefix(sentence, docIndex, sentIndex)
            for entIndex, entity in enumerate(sentence.findall("entity")):
                entNewId = sentencePrefix(sentence, docIndex, sentIndex) + ".e" + str(entIndex)
                entOldId = entity.attrib["id"]
                # Original ids must be unique, otherwise the mapping used to
                # rewrite e1/e2 references below would be ambiguous.
                assert entOldId not in entDictionary, entOldId
                entDictionary[entOldId] = entNewId
                entity.attrib["id"] = entNewId

    # Pass 2: recalculate ids for interactions and pairs. This is a separate
    # pass so that the mapping already covers every entity in the corpus
    # before any e1/e2 reference is rewritten.
    for docOffset, document in enumerate(documents):
        docIndex = docIndexStart + docOffset
        for sentIndex, sentence in enumerate(document.findall("sentence")):
            for intIndex, interaction in enumerate(sentence.findall("interaction")):
                interaction.attrib["id"] = sentencePrefix(sentence, docIndex, sentIndex) + ".i" + str(intIndex)
                # Interaction endpoints that are not known entity ids are
                # left unchanged.
                for endpoint in ("e1", "e2"):
                    if interaction.attrib[endpoint] in entDictionary:
                        interaction.attrib[endpoint] = entDictionary[interaction.attrib[endpoint]]
            for pairIndex, pair in enumerate(sentence.findall("pair")):
                pair.attrib["id"] = sentencePrefix(sentence, docIndex, sentIndex) + ".p" + str(pairIndex)
                # Unlike interactions, pair endpoints are looked up
                # unconditionally: an unknown entity id raises KeyError.
                pair.attrib["e1"] = entDictionary[pair.attrib["e1"]]
                pair.attrib["e2"] = entDictionary[pair.attrib["e2"]]

    if output is not None:
        sys.stderr.write("Writing output to %s\n" % (output,))
        ETUtils.write(corpusRoot, output)
    return corpusTree
if __name__ == "__main__":
    # Command line entry point: parse the options and run recalculateIds.
    import sys

    from optparse import OptionParser
    # Import Psyco if available (optional; the script runs without it)
    try:
        import psyco
        psyco.full()
        sys.stderr.write("Found Psyco, using\n")
    except ImportError:
        sys.stderr.write("Psyco not installed\n")

    defaultCorpusFilename = "BioInfer.xml"
    defaultOutputName = "BioInfer.xml"
    # The usage text previously read "Path generator.", which does not
    # describe this script.
    optparser = OptionParser(usage="%prog [options]\nRecalculate hierarchical interaction XML ids.")
    optparser.add_option("-i", "--input", default=defaultCorpusFilename, dest="input", help="Corpus in interaction xml format", metavar="FILE")
    optparser.add_option("-o", "--output", default=defaultOutputName, dest="output", help="Output file in interaction xml format.")
    optparser.add_option("-s", "--sentence", action="store_true", default=False, dest="sentence", help="Only recalculate within a sentence element.")
    optparser.add_option("-d", "--docIndexStart", type="int", default=0, dest="docIndexStart", help="Start document indexing from.")
    (options, args) = optparser.parse_args()

    # With the defaults above neither option is None after parsing; the
    # checks are kept as a guard in case the defaults are ever removed.
    if options.input is None:
        sys.stderr.write("Error, input file not defined.\n")
        optparser.print_help()
        sys.exit(1)
    if options.output is None:
        sys.stderr.write("Error, output file not defined.\n")
        optparser.print_help()
        sys.exit(1)

    recalculateIds(options.input, options.output, options.sentence, options.docIndexStart)