1 import sys, os
2 thisPath = os.path.dirname(os.path.abspath(__file__))
3 sys.path.append(os.path.abspath(os.path.join(thisPath,"../..")))
4 import gzip, codecs
5 try:
6 import xml.etree.cElementTree as ET
7 except ImportError:
8 import cElementTree as ET
9 import Utils.ElementTreeUtils as ETUtils
10 import RecalculateIds
11
def catenate(inputs, output, fast):
    """Catenate interaction XML corpora into a single output file.

    Creates the directory part of ``output`` when it does not exist yet and
    then delegates to ``catenateFiles`` (when ``fast`` is true) or to
    ``catenateElements`` (otherwise).

    Parameters:
        inputs: the corpora to catenate; passed through unchanged to the
            chosen implementation.
        output: path of the file to write.
        fast: truthy selects the line-based ``catenateFiles``, falsy the
            element-based ``catenateElements``.

    Returns:
        ``output``, unchanged.
    """
    outputDir = os.path.dirname(output)
    # dirname() is "" for a bare file name such as "merged.xml"; calling
    # os.makedirs("") raises, so only create a directory when one is named.
    if outputDir and not os.path.exists(outputDir):
        os.makedirs(outputDir)
    if fast:
        catenateFiles(inputs, output)
    else:
        # NOTE(review): the catenateElements body in this file reads
        # input1, input2 and output; confirm that this two-argument call
        # matches its actual signature.
        catenateElements(inputs, output)
    return output
20
def _openXMLFile(path, mode):
    """Open ``path`` in binary ``mode``, via gzip when it ends with ".gz"."""
    if path.endswith(".gz"):
        return gzip.open(path, mode)
    return open(path, mode)


def catenateFiles(inputs, output):
    """Catenate interaction XML files line by line, without parsing them.

    Each input is read as UTF-8 text and copied to ``output`` (also UTF-8).
    Everything up to and including the line containing ``<corpus`` is kept
    only from the first input, and everything from the line containing
    ``</corpus`` onwards is kept only from the last input, so the result has
    a single corpus element wrapping the contents of all inputs.

    The tags are detected by substring match on whole lines, so a line that
    holds ``<corpus`` together with other content is dropped as a unit for
    every input but the first.

    Parameters:
        inputs: sequence of at least two file paths; names ending in ".gz"
            are read through gzip.
        output: path of the file to write; gzip-compressed when it ends
            with ".gz".

    Raises:
        AssertionError: if fewer than two inputs are given, or if an input
            has ``<corpus`` / ``</corpus`` lines out of the expected order.
    """
    # sys.stderr.write is used instead of the print statement so the same
    # code runs on both Python 2 and Python 3; the emitted text is unchanged.
    sys.stderr.write("##### Catenate interaction XML as files #####\n")
    assert len(inputs) > 1
    sys.stderr.write("Writing catenated XML to %s\n" % output)
    lastIndex = len(inputs) - 1
    outFile = _openXMLFile(output, "wb")
    # try/finally (rather than "with") keeps the handles closed on any error
    # while staying valid on old Python 2 releases, where gzip files are not
    # context managers.
    try:
        outWriter = codecs.getwriter("utf-8")(outFile)
        for i, path in enumerate(inputs):
            sys.stderr.write("Catenating %s\n" % path)
            inFile = _openXMLFile(path, "rb")
            try:
                # BEGIN: before <corpus, MIDDLE: inside it, END: from </corpus on
                state = "BEGIN"
                for line in codecs.getreader("utf-8")(inFile):
                    if "<corpus" in line:
                        assert state == "BEGIN"
                        state = "MIDDLE"
                        if i > 0:
                            continue  # opening tag is kept from the first file only
                    elif "</corpus" in line:
                        assert state == "MIDDLE"
                        state = "END"
                    if state == "BEGIN" and i > 0:
                        continue  # prolog is kept from the first file only
                    if state == "END" and i < lastIndex:
                        continue  # closing part is kept from the last file only
                    outWriter.write(line)
            finally:
                inFile.close()
    finally:
        outFile.close()
53
def catenateElements(input1, input2, output):
    """Merge the documents of two interaction XML corpora as parsed elements.

    Both inputs are passed through ``RecalculateIds.recalculateIds``; the
    second call receives the number of ``document`` children of the first
    corpus as its last argument (presumably an id offset -- confirm against
    RecalculateIds). The ``document`` children of the second corpus are then
    appended to the root of the first, and the ``id`` attributes of all
    entity, interaction, sentence and document elements are checked to be
    unique across the merged tree.

    NOTE(review): the ``def`` line is absent from the listing this block was
    restored from; the parameter list is inferred from the names the body
    reads -- confirm order and defaults against the real signature.

    Parameters:
        input1: first corpus, handed to ``recalculateIds``.
        input2: second corpus, handed to ``recalculateIds``.
        output: path written with ``ETUtils.write``; nothing is written when
            it is None.

    Returns:
        The object returned by ``recalculateIds`` for ``input1``, whose root
        now also holds the documents of the second corpus.

    Raises:
        AssertionError: if an id occurs more than once among the checked
            elements.
    """
    print >> sys.stderr, "##### Catenate interaction XML as elements #####"
    firstTree = RecalculateIds.recalculateIds(input1, None, False, 0)
    numDocs = len(firstTree.getroot().findall("document"))
    print >> sys.stderr, "Documents in input 1:", numDocs
    secondTree = RecalculateIds.recalculateIds(input2, None, False, numDocs)

    print >> sys.stderr, "Appending documents"
    mergedRoot = firstTree.getroot()
    for document in secondTree.getroot().findall("document"):
        mergedRoot.append(document)

    print >> sys.stderr, "Validating ids"
    seenIds = set()
    # One shared set: ids must be unique across all four element kinds.
    for tag in ("entity", "interaction", "sentence", "document"):
        for element in mergedRoot.getiterator(tag):
            elementId = element.get("id")
            assert elementId not in seenIds
            seenIds.add(elementId)

    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(mergedRoot, output)
    return firstTree
89
if __name__ == "__main__":
    # Command-line entry point: parse the options, validate them and run
    # catenate() on the comma-separated input list.
    import sys

    from optparse import OptionParser

    # psyco is optional: psyco.full() is called when the module can be
    # imported, otherwise its absence is only reported.
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(usage="%prog [options]\n")
    optparser.add_option(
        "-i", "--inputs", dest="inputs", default=None, metavar="FILE",
        help="A comma-separated list of corpora in interaction xml format")
    optparser.add_option(
        "-o", "--output", dest="output", default=None,
        help="Output file in interaction xml format.")
    optparser.add_option(
        "-f", "--fast", dest="fast", default=False, action="store_true",
        help="Fast, but unsafe catenation")
    (options, args) = optparser.parse_args()

    # Both -i and -o are mandatory; the input check comes first, and either
    # failure prints the usage text and exits with status 1.
    if options.inputs is None:
        print >> sys.stderr, "Error, input files not defined."
        optparser.print_help()
        sys.exit(1)
    if options.output is None:
        print >> sys.stderr, "Error, output file not defined."
        optparser.print_help()
        sys.exit(1)

    catenate(options.inputs.split(","), options.output, options.fast)
119