"""
Pseudorandomly distributed subsets

Divides the examples of a line-oriented example file into folds, one
output file per fold, so that all examples that share a document id are
written to the same fold.
"""
__version__ = "$Revision: 1.4 $"

import sys

import Split

def getDocumentId(idString):
    """
    Return the document part of a dotted example identifier.

    The last two dot-separated components are dropped, so
    "corpus.d3.s1.x7" becomes "corpus.d3". An identifier with fewer
    than two dots collapses to its first component.
    """
    return idString.rsplit(".", 2)[0]

def getIdFromLine(line):
    """
    Extract the example identifier stored in a line's trailing comment.

    The identifier is whatever follows the last "#" on the line, with
    surrounding whitespace (including the newline) removed.

    Raises AssertionError if the line contains no "#" at all.
    """
    assert "#" in line, "example line has no '#' comment: %r" % (line,)
    # Only the text after the final "#" matters, so split once from the right.
    return line.rsplit("#", 1)[-1].strip()

def getDocumentIds(filename):
    """
    Collect the distinct document ids found in an example file.

    Blank lines and lines starting with "#" are skipped. Every other
    line must carry its example id after a "#" (see getIdFromLine); the
    document id is derived from it with getDocumentId.

    Returns a list of document ids in order of first appearance.
    """
    documentIds = []
    seen = set()  # O(1) membership test; the list alone preserves order
    with open(filename, "rt") as inputFile:
        for line in inputFile:
            # Lines read from a file keep their newline, so an "empty"
            # line is one that holds nothing but whitespace.
            if not line.strip() or line.startswith("#"):
                continue
            docId = getDocumentId(getIdFromLine(line))
            if docId not in seen:
                seen.add(docId)
                documentIds.append(docId)
    return documentIds

def getDocumentFolds(documentIds, folds):
    """
    Assign every document id to a fold.

    The assignment comes from Split.getFolds, called with the number of
    documents and the number of folds. Its result is indexed in parallel
    with documentIds (presumably one fold index per document -- confirm
    against the Split module).

    Returns a dict mapping each document id to its entry in that result.
    """
    sample = Split.getFolds(len(documentIds), folds)
    division = {}
    for index, docId in enumerate(documentIds):
        division[docId] = sample[index]
    return division

def divideExamples(filename, outputFilenames):
    """
    Split the examples of one file into one output file per fold.

    The number of folds equals len(outputFilenames). Document ids are
    read from the file, assigned to folds with getDocumentFolds, and
    then every example line is copied unchanged to the output file
    whose index is the fold of the line's document. Blank lines and
    lines starting with "#" are not copied.

    Progress messages are written to stderr.
    """
    sys.stderr.write("Reading document ids\n")
    documentIds = getDocumentIds(filename)

    sys.stderr.write("Dividing documents into folds\n")
    division = getDocumentFolds(documentIds, len(outputFilenames))

    sys.stderr.write("Dividing examples\n")

    outputFiles = []
    try:
        for name in outputFilenames:
            outputFiles.append(open(name, "wt"))

        with open(filename, "rt") as inputFile:
            for line in inputFile:
                # Lines keep their newline, so test for whitespace-only
                # rather than zero length.
                if not line.strip() or line.startswith("#"):
                    continue
                docId = getDocumentId(getIdFromLine(line))
                outputFiles[division[docId]].write(line)
    finally:
        # Close whatever was opened, even if opening a later file or
        # processing a line raised.
        for outputFile in outputFiles:
            outputFile.close()

# Command-line entry point: build one output filename per fold and divide
# the input file's examples between them.
if __name__ == "__main__":
    from optparse import OptionParser

    defaultAnalysisFilename = "/usr/share/biotext/ComplexPPI/BioInferForComplexPPIVisible.xml"
    optparser = OptionParser(usage="%prog [options]\nDivide an example file into folds, keeping each document's examples together.")
    optparser.add_option("-i", "--input", default=defaultAnalysisFilename, dest="input", help="Corpus in analysis format", metavar="FILE")
    optparser.add_option("-o", "--output", default="", dest="output", help="Output directory")
    optparser.add_option("-f", "--folds", type="int", default=10, dest="folds", help="X-fold cross validation")
    (options, args) = optparser.parse_args()

    if options.folds < 1:
        optparser.error("--folds must be at least 1")

    # NOTE(review): the output option is used as a plain string prefix of
    # the full input path; no path separator is inserted and the input's
    # directory part is kept. Confirm whether a real directory join of
    # the input's base name was intended.
    outputFilenames = [
        options.output + options.input + ".fold" + str(i)
        for i in range(options.folds)
    ]

    divideExamples(options.input, outputFilenames)