Package TEES :: Package Utils :: Package InteractionXML :: Module MakeSubset
[hide private]

Source Code for Module TEES.Utils.InteractionXML.MakeSubset

 1  import sys, os 
 2  sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/../..") 
 3  import Utils.ElementTreeUtils as ETUtils 
 4  import Core.Split 
 5  import shutil 
 6   
def makeSubset(input, output=None, ratio=1.0, seed=0):
    """
    Make a document-level subset of an interaction XML corpus.

    Every "document" child of the corpus root is looked up by its position in
    the division returned by Core.Split.getFolds(count, 100, seed). Documents
    whose division value is greater than (selected fold count - 1) are removed
    from the root. The root is then tagged with the "subsetRatio" and
    "subsetSeed" attributes.

    NOTE(review): the root element is modified in place. If ETUtils.ETFromObj
    hands back the caller's own tree when given an already parsed object, the
    caller's tree is modified too -- confirm against ETUtils.

    @param input: the corpus, passed to ETUtils.ETFromObj. When ratio == 1.0
                  and output is given, it is passed to shutil.copy2 and must
                  therefore be a file path.
    @param output: path the resulting corpus is written to, or None to skip
                   writing.
    @param ratio: fraction of the 100 folds to keep. 1.0 keeps everything and
                  skips parsing altogether.
    @param seed: seed passed on to Core.Split.getFolds and recorded in the
                 "subsetSeed" attribute.
    @return: output when it is given. Otherwise input itself when
             ratio == 1.0, and the subset's root element when ratio != 1.0
             (previously None was returned in that case, which discarded the
             computed subset).
    """
    if ratio == 1.0:
        # Nothing to remove: at most copy the file, never parse it.
        if output is not None:
            shutil.copy2(input, output)
            return output
        return input

    totalFolds = 100
    # ratio * 100.0 can land just below an integer because of binary floating
    # point (0.29 * 100.0 == 28.999999999999996), and a bare int() would then
    # keep one fold too few. Rounding to six decimals removes that error while
    # a genuinely fractional fold count is still truncated as before.
    selectedFolds = int(round(ratio * float(totalFolds), 6))
    print >> sys.stderr, "====== Making subset ======"
    print >> sys.stderr, "Subset for ", input, "ratio", ratio, "seed", seed

    xml = ETUtils.ETFromObj(input).getroot()
    # Snapshot the documents so that removing from the root below cannot
    # disturb the iteration.
    documents = list(xml.findall("document"))
    count = len(documents)
    sentCount = sum(len(document.findall("sentence")) for document in documents)

    division = Core.Split.getFolds(count, totalFolds, seed)
    removeCount = 0
    sentRemoveCount = 0
    for index, document in enumerate(documents):
        if division[index] > selectedFolds - 1:
            xml.remove(document)
            sentRemoveCount += len(document.findall("sentence"))
            removeCount += 1
    print >> sys.stderr, "Subset", "doc:", count, "removed:", removeCount, "sent:", sentCount, "sentremoved:", sentRemoveCount

    xml.set("subsetRatio", str(ratio))
    xml.set("subsetSeed", str(seed))
    if output is None:
        # Without an output file the element is the only handle on the result.
        return xml
    ETUtils.write(xml, output)
    return output
# Command-line entry point: parse the options and run makeSubset once.
if __name__ == "__main__":
    print >> sys.stderr, "##### Making subset #####"

    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    from optparse import OptionParser
    optparser = OptionParser()
    optparser.add_option("-i", "--input", default=None, dest="input", help="input interaction XML file")
    optparser.add_option("-o", "--output", default=None, dest="output", help="output interaction XML file")
    optparser.add_option("-f", "--fraction", default=1.0, type="float", dest="fraction", help="fraction of the corpus to keep (default 1.0 keeps everything)")
    # NOTE: the command-line default seed is 1, while makeSubset() itself
    # defaults to seed=0.
    optparser.add_option("-s", "--seed", default=1, type="int", dest="seed", help="seed used when dividing the documents into folds (default 1)")
    (options, args) = optparser.parse_args()

    # Without an input there is nothing to subset. Report it as a usage error
    # instead of passing None on to makeSubset.
    if options.input is None:
        optparser.error("no input file given (use -i/--input)")

    makeSubset(options.input, options.output, options.fraction, options.seed)