1 import sys, os
2 sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/../..")
3 import Utils.ElementTreeUtils as ETUtils
4 import Core.Split
5 import shutil
6
def makeSubset(input, output=None, ratio=1.0, seed=0):
    """
    Keep only a fraction of the "document" elements of an interaction XML corpus.

    The documents are divided into 100 folds with Core.Split.getFolds, and
    every document whose fold number is not among the first
    int(ratio * 100) folds is removed from the corpus root element.

    input  -- the corpus. It is handed to ETUtils.ETFromObj, except when
              ratio is 1.0 and an output is given, in which case it is
              handed to shutil.copy2 and must therefore be a file path.
    output -- where the result is written, or None to write nothing.
    ratio  -- fraction of the folds to keep. With exactly 1.0 nothing is
              parsed or removed.
    seed   -- passed through to Core.Split.getFolds.

    Returns 'output' if it was given. Without an output, returns 'input'
    when ratio is 1.0 and None otherwise.
    NOTE(review): in that last case the reduced tree is only reachable by the
    caller if ETUtils.ETFromObj returns the caller's own tree object -- confirm.
    """
    if ratio == 1.0:
        # Nothing to remove: the result is the input itself or a plain copy.
        if output is None:
            return input
        shutil.copy2(input, output)
        return output

    totalFolds = 100
    # The product can land just below the intended integer in binary floating
    # point (0.29 * 100.0 is 28.999999999999996), which a bare int() would
    # truncate to one fold too few. Rounding to a few decimals first removes
    # that noise while still rounding real fractions (e.g. 29.5) down.
    selectedFolds = int(round(ratio * float(totalFolds), 6))
    print >> sys.stderr, "====== Making subset ======"
    print >> sys.stderr, "Subset for ", input, "ratio", ratio, "seed", seed
    xml = ETUtils.ETFromObj(input).getroot()

    # Snapshot the documents once, so that removing elements from the root
    # below cannot disturb the iteration.
    documents = list(xml.findall("document"))
    count = len(documents)
    sentCount = sum(len(document.findall("sentence")) for document in documents)

    # Presumably one fold number per document, in document order -- the
    # result is indexed by document position below.
    division = Core.Split.getFolds(count, totalFolds, seed)

    removeCount = 0
    sentRemoveCount = 0
    for index, document in enumerate(documents):
        if division[index] > selectedFolds - 1:
            xml.remove(document)
            sentRemoveCount += len(document.findall("sentence"))
            removeCount += 1
    print >> sys.stderr, "Subset", "doc:", count, "removed:", removeCount, "sent:", sentCount, "sentremoved:", sentRemoveCount

    # Record how the subset was made on the corpus root element.
    xml.set("subsetRatio", str(ratio))
    xml.set("subsetSeed", str(seed))
    if output is not None:
        ETUtils.write(xml, output)
    return output
41
if __name__=="__main__":
    # Command-line entry point: parse the options and run makeSubset once.
    print >> sys.stderr, "##### Making subset #####"

    # Psyco is optional; report whether it was found and carry on either way.
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    from optparse import OptionParser
    optparser = OptionParser()
    optparser.add_option("-i", "--input", default=None, dest="input", help="input interaction XML file")
    optparser.add_option("-o", "--output", default=None, dest="output", help="output interaction XML file")
    optparser.add_option("-f", "--fraction", default=1.0, type="float", dest="fraction", help="fraction of the documents to keep (1.0 keeps everything)")
    optparser.add_option("-s", "--seed", default=1, type="int", dest="seed", help="seed for dividing the documents into folds")
    (options, args) = optparser.parse_args()

    # Without an input there is nothing to subset; stop with a usage message
    # instead of failing later inside the copy or the XML loading.
    if options.input is None:
        optparser.error("no input file given (use -i/--input)")

    makeSubset(options.input, options.output, options.fraction, options.seed)
62