Package TEES :: Package Core :: Module DivideCorpus
[hide private]

Source Code for Module TEES.Core.DivideCorpus

 1  """ 
 2  Divide the documents of a corpus into pseudorandomly distributed subsets (folds) 
 3  """ 
 4  __version__ = "$Revision: 1.3 $" 
 5   
 6  import Split 
 7  import sys, os 
 8  sys.path.append("..") 
 9  try: 
10      import xml.etree.cElementTree as ET 
11  except ImportError: 
12      import cElementTree as ET 
13  import Utils.ElementTreeUtils as ETUtils 
14  import Utils.InteractionXML.CorpusElements as CorpusElements 
15   
def collectDocumentIds(documents):
    """
    Return the "id" attribute of every document element, in input order.

    documents -- iterable of elements exposing an 'attrib' mapping with an "id" key

    Raises ValueError if the same id occurs on more than one document, since
    the fold division below is keyed by document id.
    """
    documentIds = []
    seenIds = set()  # set membership keeps the duplicate check O(1) per document
    for document in documents:
        docId = document.attrib["id"]
        if docId in seenIds:
            raise ValueError("Duplicate document id: " + str(docId))
        seenIds.add(docId)
        documentIds.append(docId)
    return documentIds


def getFoldFilename(inputPath, outputDir, fold):
    """
    Return the path of the file that fold number 'fold' is written to.

    inputPath -- path of the input corpus file
    outputDir -- output directory, or None to write next to the input file
    fold      -- index of the fold

    The name is always the input file name followed by ".fold<index>".
    """
    if outputDir is None:
        return inputPath + ".fold" + str(fold)
    return os.path.join(outputDir, os.path.basename(inputPath) + ".fold" + str(fold))


if __name__ == "__main__":
    from optparse import OptionParser
    optparser = OptionParser(usage="%prog [options]\n")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in analysis format", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output directory")
    optparser.add_option("-f", "--folds", type="int", default=10, dest="folds", help="X-fold cross validation")
    (options, args) = optparser.parse_args()

    # Fail early with a usage message instead of an obscure error further down.
    if options.input is None:
        optparser.error("no input corpus given (use -i/--input)")
    if options.folds < 1:
        optparser.error("the number of folds must be at least 1")

    # Load the corpus
    corpusElements = CorpusElements.loadCorpus(options.input)

    # One new "corpus" root per fold, each carrying a copy of the original
    # root element's attributes.
    outputTrees = []
    for i in range(options.folds):
        newRoot = ET.Element("corpus")
        newRoot.attrib.update(corpusElements.rootElement.attrib)
        outputTrees.append(newRoot)

    sys.stderr.write("Reading document ids\n")
    documentIds = collectDocumentIds(corpusElements.documents)

    sys.stderr.write("Calculating document division\n")
    # sample[i] is used as the index of the fold that document i goes to.
    sample = Split.getFolds(len(documentIds), options.folds)
    division = {}
    for index, docId in enumerate(documentIds):
        division[docId] = sample[index]

    sys.stderr.write("Dividing documents\n")
    for document in corpusElements.documents:
        outputTrees[division[document.attrib["id"]]].append(document)

    # Make sure the target directory exists before writing, in case
    # ETUtils.write does not create it itself.
    if options.output is not None and not os.path.isdir(options.output):
        os.makedirs(options.output)

    for i in range(options.folds):
        filename = getFoldFilename(options.input, options.output, i)
        sys.stderr.write("Writing file " + filename + "\n")
        ETUtils.write(outputTrees[i], filename)
59