1  import sys, os 
 2  sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/../..") 
 3  import Utils.ElementTreeUtils as ETUtils 
 4  import Core.Split 
 5  import shutil 
 6   
 7 -def makeSubset(input, output=None, ratio=1.0, seed=0): 
  8      if ratio == 1.0: 
 9          if output != None: 
10              shutil.copy2(input, output) 
11              return output 
12          else: 
13              return input 
14      totalFolds = 100 
15      selectedFolds = int(ratio * 100.0) 
16      print >> sys.stderr, "====== Making subset ======" 
17      print >> sys.stderr, "Subset for ", input, "ratio", ratio, "seed", seed 
18      xml = ETUtils.ETFromObj(input).getroot() 
19      count = 0 
20      sentCount = 0 
21      for document in xml.findall("document"): 
22          sentCount += len(document.findall("sentence")) 
23          count += 1 
24      division = Core.Split.getFolds(count, totalFolds, seed) 
25       
26      index = 0 
27      removeCount = 0 
28      sentRemoveCount = 0 
29      for document in xml.findall("document"): 
30          if division[index] > selectedFolds - 1: 
31              xml.remove(document) 
32              sentRemoveCount += len(document.findall("sentence")) 
33              removeCount += 1 
34          index += 1 
35      print >> sys.stderr, "Subset", "doc:", count, "removed:", removeCount, "sent:", sentCount, "sentremoved:", sentRemoveCount 
36      xml.set("subsetRatio", str(ratio)) 
37      xml.set("subsetSeed", str(seed)) 
38      if output != None: 
39          ETUtils.write(xml, output) 
40      return output 
 41   
if __name__ == "__main__":
    # Command-line entry point: read the options and build the subset.
    import sys
    sys.stderr.write("##### Making subset #####\n")

    # Psyco is optional; report whether it is in use and carry on either way.
    try:
        import psyco
        psyco.full()
        sys.stderr.write("Found Psyco, using\n")
    except ImportError:
        sys.stderr.write("Psyco not installed\n")

    from optparse import OptionParser
    cliParser = OptionParser()
    # (flags, keyword settings) for every supported option.
    optionTable = (
        (("-i", "--input"),
         dict(default=None, dest="input", help="input interaction XML file")),
        (("-o", "--output"),
         dict(default=None, dest="output", help="output interaction XML file")),
        (("-f", "--fraction"),
         dict(default=1.0, type="float", dest="fraction", help="")),
        (("-s", "--seed"),
         dict(default=1, type="int", dest="seed", help="")),
    )
    for flags, settings in optionTable:
        cliParser.add_option(*flags, **settings)
    parsed, leftovers = cliParser.parse_args()

    makeSubset(parsed.input, parsed.output, parsed.fraction, parsed.seed)
62