1 """
2 Pseudorandomly distributed subsets
3 """
4 __version__ = "$Revision: 1.3 $"
5
6 import Split
7 import sys, os
8 sys.path.append("..")
9 try:
10 import xml.etree.cElementTree as ET
11 except ImportError:
12 import cElementTree as ET
13 import Utils.ElementTreeUtils as ETUtils
14 import Utils.InteractionXML.CorpusElements as CorpusElements
15
if __name__ == "__main__":
    # Command-line entry point: distribute the documents of one corpus file
    # over N new "corpus" trees (one per fold) and write each tree to its own
    # file named "<input>.fold<i>".
    #
    # NOTE: this block uses the Python 2 print statement, like the original
    # script.
    from optparse import OptionParser
    optparser = OptionParser(usage="%prog [options]\n")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in analysis format", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output directory")
    optparser.add_option("-f", "--folds", type="int", default=10, dest="folds", help="X-fold cross validation")
    (options, args) = optparser.parse_args()

    # Fail early with a usage message instead of an obscure error further
    # down (the input name is both loaded and used to build output names).
    if options.input is None:
        optparser.error("no input corpus given (use -i/--input)")
    if options.folds < 1:
        optparser.error("the number of folds must be at least 1")

    corpusElements = CorpusElements.loadCorpus(options.input)

    # One empty output root per fold, each carrying a copy of the attributes
    # of the input corpus root element.
    rootAttributes = corpusElements.rootElement.attrib
    outputTrees = []
    for i in range(options.folds):
        newRoot = ET.Element("corpus")
        for key, value in rootAttributes.items():
            newRoot.set(key, value)
        outputTrees.append(newRoot)

    print >> sys.stderr, "Reading document ids"
    documentIds = []
    seenIds = set()  # constant-time duplicate detection
    for document in corpusElements.documents:
        docId = document.attrib["id"]
        # Ids are used as dictionary keys below, so a duplicate would
        # silently overwrite the fold of an earlier document. An explicit
        # exception is raised because an assert is skipped under "python -O".
        if docId in seenIds:
            raise ValueError("Duplicate document id: %r" % (docId,))
        seenIds.add(docId)
        documentIds.append(docId)

    print >> sys.stderr, "Calculating document division"
    # The result is indexed by document position below; each value is used
    # as an index into outputTrees.
    sample = Split.getFolds(len(documentIds), options.folds)
    division = {}
    for i, docId in enumerate(documentIds):
        division[docId] = sample[i]

    print >> sys.stderr, "Dividing documents"
    for document in corpusElements.documents:
        docId = document.attrib["id"]
        outputTrees[division[docId]].append(document)

    # Without an output directory the fold files are written next to the
    # input file; otherwise they go into the output directory under the
    # input file's base name.
    if options.output is None:
        outputStem = options.input
    else:
        # Create the output directory if needed so that writing the first
        # fold does not fail on a missing directory. An empty string means
        # the current directory and needs no creation.
        if options.output and not os.path.isdir(options.output):
            os.makedirs(options.output)
        outputStem = os.path.join(options.output, os.path.basename(options.input))

    for i, tree in enumerate(outputTrees):
        filename = outputStem + ".fold" + str(i)
        print >> sys.stderr, "Writing file", filename
        ETUtils.write(tree, filename)
59