1 import sys, os
2 sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/../..")
3 try:
4 import xml.etree.cElementTree as ET
5 except ImportError:
6 import cElementTree as ET
7 import Utils.ElementTreeUtils as ETUtils
8
def _renumberEntities(documents, corpusName, onlyWithinSentence, docIndexStart):
    """
    First pass of recalculateIds: renumber documents, sentences and entities.

    Unless onlyWithinSentence is set, every document gets the id
    "<corpusName>.d<N>" (N counted from docIndexStart) and every sentence the
    id "<document id>.s<M>". Entities always get "<sentence id>.e<K>".

    Returns a dict mapping each entity's previous id to its new id.
    Raises AssertionError (carrying the offending id) if two entities share
    the same previous id, and KeyError if a required "id" attribute is missing.
    """
    entDictionary = {}
    for docIndex, document in enumerate(documents, docIndexStart):
        if not onlyWithinSentence:
            document.attrib["id"] = corpusName + ".d" + str(docIndex)
        for sentIndex, sentence in enumerate(document.findall("sentence")):
            if not onlyWithinSentence:
                sentence.attrib["id"] = document.attrib["id"] + ".s" + str(sentIndex)
            for entIndex, entity in enumerate(sentence.findall("entity")):
                # The sentence id is either the freshly assigned one or, in
                # onlyWithinSentence mode, the one already present in the file.
                entNewId = sentence.attrib["id"] + ".e" + str(entIndex)
                entOldId = entity.attrib["id"]
                # Explicit check instead of "assert": a duplicate would silently
                # overwrite the mapping if assertions were disabled with -O.
                if entOldId in entDictionary:
                    raise AssertionError(entOldId)
                entDictionary[entOldId] = entNewId
                entity.attrib["id"] = entNewId
    return entDictionary

def _renumberRelations(documents, entDictionary):
    """
    Second pass of recalculateIds: renumber interactions and pairs.

    Interactions get the id "<sentence id>.i<K>" and pairs "<sentence id>.p<K>".
    Their "e1"/"e2" attributes are rewritten through entDictionary (previous
    entity id -> new entity id).

    An interaction whose e1/e2 is not in entDictionary keeps that value as is;
    a pair whose e1/e2 is not in entDictionary raises KeyError.
    """
    for document in documents:
        for sentence in document.findall("sentence"):
            for intIndex, interaction in enumerate(sentence.findall("interaction")):
                interaction.attrib["id"] = sentence.attrib["id"] + ".i" + str(intIndex)
                for attrName in ("e1", "e2"):
                    # References to entities that were not renumbered are left untouched
                    if interaction.attrib[attrName] in entDictionary:
                        interaction.attrib[attrName] = entDictionary[interaction.attrib[attrName]]
            for pairIndex, pair in enumerate(sentence.findall("pair")):
                pair.attrib["id"] = sentence.attrib["id"] + ".p" + str(pairIndex)
                # NOTE(review): unlike interactions, pairs are strict here and
                # fail with KeyError on an unknown entity id. Kept as in the
                # original; confirm whether the asymmetry is intended.
                pair.attrib["e1"] = entDictionary[pair.attrib["e1"]]
                pair.attrib["e2"] = entDictionary[pair.attrib["e2"]]

def recalculateIds(input, output=None, onlyWithinSentence=False, docIndexStart=0):
    """
    Recalculate the hierarchical ids of an interaction XML corpus.

    The corpus is modified in place: documents, sentences, entities,
    interactions and pairs are renumbered consecutively (from 0, documents
    from docIndexStart) and the e1/e2 references of interactions and pairs are
    updated to the new entity ids.

    @param input: the corpus, in any form accepted by ETUtils.ETFromObj
    @param output: if not None, the corpus root is written here with ETUtils.write
    @param onlyWithinSentence: if True, document and sentence ids are left as
           they are and only the elements inside each sentence are renumbered,
           using the existing sentence id as the prefix
    @param docIndexStart: index given to the first document (an integer)
    @return: the corpus ElementTree
    """
    print >> sys.stderr, "##### Recalculate hierarchical interaction XML ids #####"
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    print >> sys.stderr, "Recalculating interaction xml ids"
    corpusName = corpusRoot.attrib["source"]
    documents = corpusRoot.findall("document")

    # All entities must be renumbered before any reference is rewritten,
    # because the id map is shared across the whole corpus.
    entDictionary = _renumberEntities(documents, corpusName, onlyWithinSentence, docIndexStart)
    _renumberRelations(documents, entDictionary)

    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
79
if __name__=="__main__":
    # Command line entry point: parse the options and run recalculateIds once.
    from optparse import OptionParser

    # Psyco is optional: use it when it can be imported, otherwise carry on without it
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    # NOTE: input and output default to the same file name, so running without
    # -i/-o rewrites BioInfer.xml in place.
    defaultCorpusFilename = "BioInfer.xml"
    defaultOutputName = "BioInfer.xml"
    optparser = OptionParser(usage="%prog [options]\nRecalculate hierarchical interaction XML ids.")
    optparser.add_option("-i", "--input", default=defaultCorpusFilename, dest="input", help="Corpus in interaction xml format", metavar="FILE")
    optparser.add_option("-o", "--output", default=defaultOutputName, dest="output", help="Output file in interaction xml format.")
    optparser.add_option("-s", "--sentence", action="store_true", default=False, dest="sentence", help="Only recalculate within a sentence element.")
    optparser.add_option("-d", "--docIndexStart", type="int", default=0, dest="docIndexStart", help="Start document indexing from.")
    (options, args) = optparser.parse_args()

    # With the non-None defaults above these guards cannot trigger; they only
    # matter if the defaults are ever changed to None.
    if options.input is None:
        print >> sys.stderr, "Error, input file not defined."
        optparser.print_help()
        sys.exit(1)
    if options.output is None:
        print >> sys.stderr, "Error, output file not defined."
        optparser.print_help()
        sys.exit(1)

    recalculateIds(options.input, options.output, options.sentence, options.docIndexStart)
111