Package TEES :: Package Utils :: Package InteractionXML :: Module Subset
[hide private]

Source Code for Module TEES.Utils.InteractionXML.Subset

  1  import sys, os 
  2  thisPath = os.path.dirname(os.path.abspath(__file__)) 
  3  sys.path.append(os.path.abspath(os.path.join(thisPath,"../.."))) 
  4  try: 
  5      import xml.etree.cElementTree as ET 
  6  except ImportError: 
  7      import cElementTree as ET 
  8  import Utils.ElementTreeUtils as ETUtils 
  9  import random 
 10   
 11  # From Split.py, which should be moved to CommonUtils 
def getSample(popSize, sampleFraction, seed=0):
    """
    Build a 0/1 membership vector marking a pseudorandom sample of a population.

    The global ``random`` generator is reseeded with ``seed`` on every call, so
    the same arguments always produce the same vector (and any random state
    the caller had is reset as a side effect).

    @param popSize: number of items in the population (length of the result)
    @param sampleFraction: fraction of the population to sample; the sample
        size is int(sampleFraction * popSize), i.e. truncated towards zero
    @param seed: seed passed to random.seed before sampling
    @return: list of length popSize where position i holds 0 if item i was
        drawn into the sample and 1 otherwise
    @raise ValueError: raised by random.sample when the computed sample size
        is negative or larger than popSize
    """
    random.seed(seed)
    sampleSize = int(sampleFraction * float(popSize))
    # A set gives constant-time membership tests; checking against the list
    # returned by random.sample made the loop below quadratic in popSize.
    sampled = set(random.sample(range(popSize), sampleSize))
    return [0 if i in sampled else 1 for i in range(popSize)]
if __name__=="__main__":
    # Command-line entry point: load an interaction XML corpus, drop the
    # <document> elements that are not part of the requested subset and write
    # the remaining tree to the output file. The subset is defined either by
    # a file of sentence origIds (-d) or by a seeded pseudorandom sample of
    # the documents (-f / -s); -v inverts the selection.
    import sys
    print >> sys.stderr, "##### Create a subset of documents from an interaction XML-file #####"

    from optparse import OptionParser
    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    # NOTE(review): the default output name equals the default input name, so
    # running without -o presumably writes the subset over the default input
    # file -- confirm this is intended.
    defaultCorpusFilename = "BioInfer.xml"
    defaultOutputName = "BioInfer.xml"
    optparser = OptionParser(usage="%prog [options]\n.")
    optparser.add_option("-i", "--input", default=defaultCorpusFilename, dest="input", help="Corpus in interaction xml format", metavar="FILE")
    optparser.add_option("-o", "--output", default=defaultOutputName, dest="output", help="Output file in interaction xml format.")
    optparser.add_option("-d", "--IDs", default=None, dest="ids", help="id list in file")
    optparser.add_option("-f", "--fraction", type="float", default=1.0, dest="fraction", help="Selected set fraction")
    optparser.add_option("-s", "--seed", type="int", default=0, dest="seed", help="Seed for random set")
    optparser.add_option("-v", "--invert", default=False, dest="invert", action="store_true", help="Invert")
    (options, args) = optparser.parse_args()

    if options.input is None:
        print >> sys.stderr, "Error, input file not defined."
        optparser.print_help()
        sys.exit(1)
    if options.output is None:
        print >> sys.stderr, "Error, output file not defined."
        optparser.print_help()
        sys.exit(1)

    useIds = options.ids is not None

    # Sentence ids are kept in a set: every sentence of the corpus is tested
    # for membership, which would be a linear scan per sentence with a list.
    idSet = set()
    if useIds:
        print >> sys.stderr, "Loading set ids from file", options.ids
        idListFile = open(options.ids)
        # try/finally guarantees the handle is closed even if reading fails
        try:
            for line in idListFile:
                idSet.add(line.strip())
        finally:
            idListFile.close()

    print >> sys.stderr, "Loading corpus file", options.input
    corpusTree = ET.parse(options.input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    documents = corpusRoot.findall("document")
    documentSets = None
    if not useIds:
        print >> sys.stderr, "No id-file, defining pseudorandom distribution"
        documentSets = getSample(len(documents), options.fraction, options.seed)

    # Remove those documents not in subset
    keptDocuments = 0
    keptSentences = 0
    removedDocuments = 0
    removedSentences = 0
    # "documents" is a separate list returned by findall, so removing
    # elements from corpusRoot while looping over it is safe.
    for i, document in enumerate(documents):
        sentences = document.findall("sentence")
        if useIds:
            # All sentences of a document must agree on whether they are
            # selected. A document without sentences leaves keep as None and
            # is therefore removed, with or without --invert.
            keep = None
            for sentence in sentences:
                selection = sentence.attrib["origId"] in idSet
                if options.invert:
                    selection = not selection
                assert keep is None or keep == selection, \
                    "Sentences of document " + str(document.get("id")) + " are only partially covered by the id list"
                keep = selection
        else:
            # getSample marks sampled documents with 0; those are the ones
            # kept unless the selection is inverted.
            keep = documentSets[i] == 0
            if options.invert:
                keep = not keep

        if keep:
            keptDocuments += 1
            keptSentences += len(sentences)
        else:
            corpusRoot.remove(document)
            removedDocuments += 1
            removedSentences += len(sentences)

    print >> sys.stderr, "Corpus:", keptDocuments + removedDocuments, "documents,", keptSentences + removedSentences, "sentences."
    print >> sys.stderr, "Removed:", removedDocuments, "documents,", removedSentences, "sentences."
    print >> sys.stderr, "Subset:", keptDocuments, "documents,", keptSentences, "sentences."

    print >> sys.stderr, "Writing subset to", options.output
    ETUtils.write(corpusRoot, options.output)