1 __version__ = "$Revision: 1.1 $"
2
import os
import sys

# The project-local Utils package is looked up through the parent directory of
# this file, so that directory has to be on sys.path *before* any Utils import
# is attempted. The entry is appended (lowest priority), so an installation
# where Utils is already importable keeps resolving it exactly as before.
sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/..")

try:
    import xml.etree.cElementTree as ET
except ImportError:
    # Fall back to the standalone cElementTree module when the xml.etree
    # variant cannot be imported.
    import cElementTree as ET

import Utils.ElementTreeUtils as ETUtils
import Utils.Range as Range
from Utils.ProgressCounter import ProgressCounter
13
def makeSentences(input, output=None):
    """
    Shift the "altOffset" attribute of entities by their sentence's offset.

    NOTE(review): the "def" line is absent from the reviewed text. The name
    and the parameter names are reconstructed from the keyword call
    makeSentences(input=..., output=...) in the __main__ section, and the
    None default of "output" from the None-check at the end of this function.
    Confirm against the upstream file.

    For every "sentence" element of the corpus, the first value of the
    sentence's "charOffset" is subtracted from both ends of each range stored
    in the "altOffset" attribute of the sentence's direct "entity" children
    (presumably turning document-level offsets into sentence-relative ones).
    Entities without an "altOffset" attribute are left untouched. The
    operation is not idempotent: applying it twice subtracts the sentence
    offset twice.

    input  -- corpus object handed to ETUtils.ETFromObj
    output -- when not None, the modified corpus root is written to this
              destination with ETUtils.write

    Returns the tree obtained from ETUtils.ETFromObj, modified in place.
    """
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    # Materialize the sentence list up front: the progress counter needs the
    # total number of sentences before the loop starts.
    sentences = list(corpusRoot.getiterator("sentence"))
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    fixCount = 0

    for sentence in sentences:
        # %-formatting keeps the progress message usable even when a sentence
        # has no "id" attribute (plain concatenation would raise TypeError).
        counter.update(1, "Fixing AltOffsets for sentence (%s): " % sentence.get("id"))
        sentStart = Range.charOffsetToSingleTuple(sentence.get("charOffset"))[0]
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString is None:
                continue

            altOffsets = Range.charOffsetToTuples(altOffsetString)
            # Sanity check only (stripped under -O): exactly one range is
            # expected per altOffset attribute.
            assert len(altOffsets) == 1, "expected exactly one altOffset range, got %d" % len(altOffsets)
            shifted = [(span[0] - sentStart, span[1] - sentStart) for span in altOffsets]
            entity.set("altOffset", Range.tuplesToCharOffset(shifted))
            fixCount += 1

    print >> sys.stderr, "Fixed", fixCount, "altOffsets"

    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
48
if __name__=="__main__":
    # Command-line entry point: parse -i/-o and run makeSentences on them.
    from optparse import OptionParser
    import sys

    # Psyco is optional; a missing module only produces a notice on stderr.
    # This runs before option parsing so the notice is printed even when
    # the parser exits early.
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    cliParser = OptionParser(usage="%prog [options]\n")
    cliParser.add_option(
        "-i", "--input",
        dest="input",
        default=None,
        metavar="FILE",
        help="Corpus in interaction xml format",
    )
    cliParser.add_option(
        "-o", "--output",
        dest="output",
        default=None,
        help="Output file in interaction xml format.",
    )
    # Positional arguments are accepted by the parser but not used.
    parsed = cliParser.parse_args()
    cliOptions = parsed[0]

    makeSentences(input=cliOptions.input, output=cliOptions.output)
67