1 import sys, os
2 thisPath = os.path.dirname(os.path.abspath(__file__))
3 sys.path.append(os.path.abspath(os.path.join(thisPath,"../..")))
4 try:
5 import xml.etree.cElementTree as ET
6 except ImportError:
7 import cElementTree as ET
8 import Utils.ElementTreeUtils as ETUtils
9 import random
10
11
def getSample(popSize, sampleFraction, seed=0):
    """Build a 0/1 membership vector for a pseudorandom sample.

    The module-level ``random`` generator is re-seeded with ``seed`` so that
    the same arguments always yield the same vector. Note that this resets
    the global generator state for any other user of ``random``.

    Arguments:
      popSize        -- number of items in the population
      sampleFraction -- fraction of the population to draw; the sample size
                        is int(sampleFraction * popSize), i.e. truncated
      seed           -- seed passed to random.seed()

    Returns a list of length popSize in which sampled positions hold 0 and
    all other positions hold 1.

    Raises whatever random.sample() raises for an invalid sample size
    (e.g. a fraction above 1.0 or below 0.0 on a non-empty population).
    """
    random.seed(seed)
    sampleSize = int(sampleFraction * float(popSize))
    # Start with every item marked as "not sampled" and flip the drawn
    # indices. This avoids testing each index against a list (quadratic).
    vector = [1] * popSize
    # range() rather than xrange(): draws the same indices on Python 2 and
    # also works on Python 3.
    for index in random.sample(range(popSize), sampleSize):
        vector[index] = 0
    return vector
22
if __name__=="__main__":
    # Command-line entry point: load an interaction XML corpus, drop the
    # <document> elements that are not selected (either by an id list or by
    # a seeded pseudorandom sample) and write the remaining tree out.
    import sys
    print >> sys.stderr, "##### Create a subset of documents from an interaction XML-file #####"

    from optparse import OptionParser

    # Psyco is an optional accelerator; the script runs the same without it.
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    # NOTE(review): input and output share the same default file name, so
    # running without -i/-o overwrites the input corpus -- confirm intended.
    defaultCorpusFilename = "BioInfer.xml"
    defaultOutputName = "BioInfer.xml"
    optparser = OptionParser(usage="%prog [options]\n.")
    optparser.add_option("-i", "--input", default=defaultCorpusFilename, dest="input", help="Corpus in interaction xml format", metavar="FILE")
    optparser.add_option("-o", "--output", default=defaultOutputName, dest="output", help="Output file in interaction xml format.")
    optparser.add_option("-d", "--IDs", default=None, dest="ids", help="id list in file")
    optparser.add_option("-f", "--fraction", type="float", default=1.0, dest="fraction", help="Selected set fraction")
    optparser.add_option("-s", "--seed", type="int", default=0, dest="seed", help="Seed for random set")
    optparser.add_option("-v", "--invert", default=False, dest="invert", action="store_true", help="Invert")
    (options, args) = optparser.parse_args()

    if options.input is None:
        print >> sys.stderr, "Error, input file not defined."
        optparser.print_help()
        sys.exit(1)
    if options.output is None:
        print >> sys.stderr, "Error, output file not defined."
        optparser.print_help()
        sys.exit(1)

    # One id per line, surrounding whitespace stripped. A set keeps the
    # per-sentence membership test below constant-time.
    idSet = set()
    if options.ids is not None:
        print >> sys.stderr, "Loading set ids from file", options.ids
        idListFile = open(options.ids)
        try:
            for line in idListFile:
                idSet.add(line.strip())
        finally:
            # Always release the file handle, even if reading fails.
            idListFile.close()

    print >> sys.stderr, "Loading corpus file", options.input
    corpusTree = ET.parse(options.input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    documents = corpusRoot.findall("document")
    if options.ids is None:
        print >> sys.stderr, "No id-file, defining pseudorandom distribution"
        # One entry per document: 0 = sampled (kept), 1 = not sampled.
        documentSets = getSample(len(documents), options.fraction, options.seed)

    keptDocuments = 0
    keptSentences = 0
    removedDocuments = 0
    removedSentences = 0
    # 'documents' is a separate list from findall(), so removing elements
    # from corpusRoot while iterating it is safe.
    for i, document in enumerate(documents):
        sentences = document.findall("sentence")
        if options.ids is not None:
            # Id mode: a document is kept when its sentences' origIds are
            # selected. All sentences of one document must agree.
            keep = None
            for sentence in sentences:
                selection = sentence.attrib["origId"] in idSet
                if options.invert:
                    selection = not selection
                assert keep is None or keep == selection, "Sentences of one document disagree on selection"
                keep = selection
            # A document without sentences leaves keep as None and is
            # therefore removed, regardless of --invert.
            if not keep:
                corpusRoot.remove(document)
                removedDocuments += 1
                removedSentences += len(sentences)
            else:
                keptDocuments += 1
                keptSentences += len(sentences)
        else:
            # Random mode: here 'selection' means "selected for removal",
            # i.e. the document was not drawn into the sample.
            selection = documentSets[i] != 0
            if options.invert:
                selection = not selection
            if selection:
                corpusRoot.remove(document)
                removedDocuments += 1
                removedSentences += len(sentences)
            else:
                keptDocuments += 1
                keptSentences += len(sentences)

    print >> sys.stderr, "Corpus:", keptDocuments + removedDocuments, "documents,", keptSentences + removedSentences, "sentences."
    print >> sys.stderr, "Removed:", removedDocuments, "documents,", removedSentences, "sentences."
    print >> sys.stderr, "Subset:", keptDocuments, "documents,", keptSentences, "sentences."

    print >> sys.stderr, "Writing subset to", options.output
    ETUtils.write(corpusRoot, options.output)
115