1 """
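Pseudorandomly distributed subsets: divide the examples of a file into
folds so that all examples of one document end up in the same fold.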
3 """
4 __version__ = "$Revision: 1.4 $"
5
6 import Split
7 import sys
8
def getDocumentId(idString):
    """
    Return idString without its last two dot-separated components.

    For example "corpus.d12.s3.x7" gives "corpus.d12". An id with at most
    two dots gives its first component ("a.b" -> "a", "a" -> "a").

    NOTE(review): presumably ids look like <document>.<sentence>.<example>,
    so the result identifies the document -- confirm against the data.
    """
    return idString.rsplit(".", 2)[0]
11
def getIdFromLine(line):
    """
    Return the text after the last "#" of a line, with whitespace stripped.

    line: a string that must contain at least one "#".
    Raises AssertionError if the line contains no "#".
    """
    # Deliberately still an assert: callers keep seeing AssertionError,
    # now with a message that shows the offending line.
    assert "#" in line, "no '#' found in line: %r" % (line,)
    return line.split("#")[-1].strip()
15
def getDocumentIds(filename):
    """
    Collect the distinct document ids found in an example file.

    filename: path of a text file; every line that is not skipped must
        contain a "#" followed by an id (see getIdFromLine).
    Returns a list of document ids in order of first appearance.

    Blank lines and lines whose first character is "#" are skipped.
    """
    documentIds = []
    seen = set()  # O(1) membership test; the list only preserves order
    inputFile = open(filename, "rt")
    try:
        for line in inputFile:
            # Lines read from a file keep their newline, so an "empty"
            # line has to be detected after stripping whitespace.
            if line.strip() == "" or line[0] == "#":
                continue
            docId = getDocumentId(getIdFromLine(line))
            if docId not in seen:
                seen.add(docId)
                documentIds.append(docId)
    finally:
        inputFile.close()
    return documentIds
29
def getDocumentFolds(documentIds, folds):
    """
    Map every document id to a fold.

    documentIds: sequence of document ids.
    folds: number of folds, passed on to Split.getFolds.
    Returns a dict in which documentIds[i] maps to element i of
    Split.getFolds(len(documentIds), folds).
    """
    sample = Split.getFolds(len(documentIds), folds)
    division = {}
    # Indexing (rather than zip) keeps the IndexError if sample is too short.
    for index, docId in enumerate(documentIds):
        division[docId] = sample[index]
    return division
36
def divideExamples(filename, outputFilenames):
    """
    Copy the lines of an example file into one output file per fold.

    filename: path of the example file to divide.
    outputFilenames: paths of the files to write, one per fold; the number
        of folds equals len(outputFilenames).

    Each document id is assigned a fold by getDocumentFolds, and every
    line is written to the output file of its document's fold. Blank lines
    and lines whose first character is "#" are not copied. Progress
    messages go to stderr.
    """
    # sys.stderr.write emits the same text as the print statement did and
    # behaves the same on Python 2 and Python 3.
    sys.stderr.write("Reading document ids\n")
    documentIds = getDocumentIds(filename)

    sys.stderr.write("Dividing documents into folds\n")
    division = getDocumentFolds(documentIds, len(outputFilenames))

    sys.stderr.write("Dividing examples\n")

    outputFiles = []
    try:
        for name in outputFilenames:
            outputFiles.append(open(name, "wt"))

        inputFile = open(filename, "rt")
        try:
            for line in inputFile:
                # Lines read from a file keep their newline, so an "empty"
                # line has to be detected after stripping whitespace.
                if line.strip() == "" or line[0] == "#":
                    continue
                docId = getDocumentId(getIdFromLine(line))
                outputFiles[division[docId]].write(line)
        finally:
            inputFile.close()
    finally:
        # Runs on errors too, so files opened so far are never leaked.
        for outputFile in outputFiles:
            outputFile.close()
62
if __name__ == "__main__":
    # Command-line entry point: divide the input file into --folds files
    # named <output><input>.fold<i>.
    from optparse import OptionParser
    # NOTE(review): the default is an XML corpus, but divideExamples reads
    # line-based files with "#"-introduced ids -- confirm this default.
    defaultAnalysisFilename = "/usr/share/biotext/ComplexPPI/BioInferForComplexPPIVisible.xml"
    optparser = OptionParser(usage="%prog [options]\nDivide an example file into folds, keeping the examples of each document together.")
    optparser.add_option("-i", "--input", default=defaultAnalysisFilename, dest="input", help="Corpus in analysis format", metavar="FILE")
    optparser.add_option("-o", "--output", default="", dest="output", help="Output directory")
    optparser.add_option("-f", "--folds", type="int", default=10, dest="folds", help="X-fold cross validation")
    (options, args) = optparser.parse_args()

    # NOTE(review): --output is prepended as a plain string prefix; no path
    # separator is inserted and the input's directory part is kept.
    outputFilenames = []
    for i in range(options.folds):
        outputFilenames.append(options.output + options.input + ".fold" + str(i))

    divideExamples(options.input, outputFilenames)
77