1 import sys, os
2 thisPath = os.path.dirname(os.path.abspath(__file__))
3 sys.path.append(os.path.abspath(os.path.join(thisPath,"../..")))
4 from Utils.ProgressCounter import ProgressCounter
5 try:
6 import xml.etree.cElementTree as ET
7 except ImportError:
8 import cElementTree as ET
9 import Utils.ElementTreeUtils as ETUtils
10
def _newCorpusLike(corpusRoot):
    """Return a new, empty "corpus" element with all attributes of corpusRoot copied onto it."""
    corpus = ET.Element("corpus")
    for key, value in corpusRoot.attrib.items():
        corpus.set(key, value)
    return corpus

def processCorpus(input, outDir, stem, tail, mergedSets=None, saveCombined=False, verbose=False):
    """
    Split a corpus into one output file per value of the documents' "set" attribute.

    The corpus is loaded with ETUtils.ETFromObj(input). Every "document" child
    of the root element is appended to a new "corpus" element chosen by the
    document's "set" attribute; each new corpus element receives a copy of the
    root element's attributes. Documents without a "set" attribute are counted
    under "No set" and are not written to any per-set file. The document
    elements themselves are not copied: the same element objects are appended
    to the new corpus elements.

    input        -- passed as-is to ETUtils.ETFromObj (presumably a file name
                    or an already parsed tree; see that helper)
    outDir       -- directory for the per-set files; created if it does not exist
    stem, tail   -- per-set files are named stem + "-" + <set> + tail
    mergedSets   -- optional sequence of sequences of set names. For each inner
                    sequence an extra corpus is built that holds the documents
                    of all named sets; its name is the sorted set names joined
                    with "-and-". Set names with no documents in the corpus
                    are reported on stderr and skipped.
    saveCombined -- if true, the loaded corpus is also written to stem + tail
    verbose      -- if true, warn on stderr about each document that has no set

    Progress and per-set document counts are printed to stderr. Returns None.
    """
    # None replaces a mutable list default; an omitted argument still means "no merged sets".
    if mergedSets is None:
        mergedSets = []

    newCorpora = {}
    print >> sys.stderr, "Loading corpus file", input
    corpusRoot = ETUtils.ETFromObj(input).getroot()

    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {}
    for document in documents:
        counter.update()
        docSet = document.get("set")
        if docSet is None:
            if verbose:
                print >> sys.stderr, "Warning, no set defined for document", document.get("id")
            countsByType["No set"] = countsByType.get("No set", 0) + 1
            continue
        if docSet not in newCorpora:
            newCorpora[docSet] = _newCorpusLike(corpusRoot)
            countsByType[docSet] = 0
        newCorpora[docSet].append(document)
        countsByType[docSet] += 1

    # Build the additional corpora that combine several of the sets collected above.
    for mergedSet in mergedSets:
        tag = "-and-".join(sorted(mergedSet))
        if tag not in newCorpora:
            newCorpora[tag] = _newCorpusLike(corpusRoot)
            countsByType[tag] = 0
        for componentSet in mergedSet:
            if componentSet not in newCorpora:
                # A set name without any documents used to fail with a bare KeyError here.
                print >> sys.stderr, "Warning, set", componentSet, "requested for merged set", tag, "has no documents, skipping"
                continue
            for element in newCorpora[componentSet].findall("document"):
                newCorpora[tag].append(element)
                countsByType[tag] += 1

    print >> sys.stderr, "Documents per set"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, " " + str(k) + ":", countsByType[k]

    if not os.path.exists(outDir):
        os.makedirs(outDir)

    print >> sys.stderr, "Writing output files to directory", outDir
    if saveCombined:
        # NOTE(review): the combined file goes to stem + tail exactly as given,
        # i.e. it is not placed inside outDir like the per-set files below.
        # Confirm this is intended before changing it.
        print >> sys.stderr, "Saving combined input to", stem + tail
        ETUtils.write(corpusRoot, stem + tail)
    else:
        print >> sys.stderr, "Combined input not saved"
    for docSet in sorted(newCorpora.keys()):
        outFilename = os.path.join(outDir, stem + "-" + docSet + tail)
        print >> sys.stderr, "Writing set", docSet, "to", outFilename
        ETUtils.write(newCorpora[docSet], outFilename)
66
if __name__=="__main__":
    # Command line entry point: split the corpus given with -i into per-set
    # files written to the directory given with -o.
    print >> sys.stderr, "##### Split elements with merged types #####"

    from optparse import OptionParser

    # Psyco is optional; the script runs the same way without it.
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(usage="%prog [options]\nPath generator.")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in interaction xml format", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output directory for the per-set files.")
    optparser.add_option("-s", "--stem", default=None, dest="stem", help="Output file stem.")
    optparser.add_option("-t", "--tail", default=None, dest="tail", help="Output file tail.")
    optparser.add_option("-m", "--merged", default=None, dest="merged", help="Additional merged sets to write: set names joined with '+', groups separated by ',' (e.g. train+devel,devel+test).")
    (options, args) = optparser.parse_args()

    # All four of these options are mandatory; report the first missing one and exit.
    requiredOptions = [
        (options.input, "Error, input file not defined."),
        (options.output, "Error, output directory not defined."),
        (options.stem, "Error, output stem not defined."),
        (options.tail, "Error, output tail not defined."),
    ]
    for value, message in requiredOptions:
        if value is None:
            print >> sys.stderr, message
            optparser.print_help()
            sys.exit(1)

    # --merged was previously parsed but never used. Turn "a+b,c+d" into
    # [["a", "b"], ["c", "d"]] and hand it to processCorpus as mergedSets.
    mergedSets = []
    if options.merged is not None:
        for group in options.merged.split(","):
            setNames = [setName.strip() for setName in group.split("+") if setName.strip() != ""]
            if len(setNames) > 0:
                mergedSets.append(setNames)

    processCorpus(options.input, options.output, options.stem, options.tail, mergedSets)
106