1 """
2 Base class for ExampleBuilders
3 """
4 import sys, os, types
5 sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/..")
6 from Core.SentenceGraph import getCorpusIterator
7 from Core.IdSet import IdSet
8 import gzip
9 import itertools
10 from Utils.ProgressCounter import ProgressCounter
11 import Utils.Parameters
12 import Core.ExampleUtils as ExampleUtils
13 import Core.SentenceGraph
14 from ExampleBuilders.ExampleStats import ExampleStats
17 """
18 ExampleBuilder is the abstract base class for specialized example builders.
19 Example builders take some data and convert it to examples usable by e.g. SVMs.
20 An example builder writes three files, an example-file (in extended Joachims'
21 SVM format) and .class_names and .feature_names files, which contain the names
22 for the class and feature id-numbers. An example builder can also be given
23 pre-existing sets of class and feature ids (optionally in files) so that the
24 generated examples are consistent with other, previously generated examples.
25 """
def __init__(self, classSet=None, featureSet=None):
    """
    Initialize the id sets and the state shared by all example builders.

    classSet and featureSet may each be an id set object, None, or a
    string. A string is treated as a file name and handed to IdSet via its
    "filename" argument; any other value is stored as given.
    """
    # isinstance + StringTypes (instead of type(x) == StringType) so that
    # unicode file names and str subclasses are recognised as file names too,
    # in line with the string checks in processCorpus() and run().
    if isinstance(classSet, types.StringTypes):
        self.classSet = IdSet(filename=classSet)
    else:
        self.classSet = classSet

    if isinstance(featureSet, types.StringTypes):
        self.featureSet = IdSet(filename=featureSet)
    else:
        self.featureSet = featureSet

    # Prefix prepended to feature names when features are registered
    self.featureTag = ""
    self.exampleStats = ExampleStats()
    # Parse and tokenization names; run() fills these in before processing
    self.parse = None
    self.tokenization = None

    # Files the id sets are written to by saveIds(); None means "do not save"
    self.classIdFilename = None
    self.featureIdFilename = None

    self.styles = None
    # Both dictionaries are created lazily by _setDefaultParameters()
    self._defaultParameters = None
    self._parameterValueLimits = None
    self._setDefaultParameters(["sentenceLimit"])
49
51
52 if self._defaultParameters == None:
53 self._defaultParameters = {}
54 if self._parameterValueLimits == None:
55 self._parameterValueLimits = {}
56 newParameters = Utils.Parameters.get({}, defaults, valueLimits=valueLimits)
57 self._defaultParameters.update(newParameters)
58 if valueLimits != None:
59 self._parameterValueLimits.update(valueLimits)
60
62 return Utils.Parameters.get(parameters, defaults=self._defaultParameters, valueLimits=self._parameterValueLimits)
63
65 self.features[self.featureSet.getId(self.featureTag+name)] = value
66
68 print >> sys.stderr, "Counting elements:",
69 if filename.endswith(".gz"):
70 f = gzip.open(filename, "rt")
71 else:
72 f = open(filename, "rt")
73 counts = {"documents":0, "sentences":0}
74 for line in f:
75 if "<document " in line:
76 counts["documents"] += 1
77 elif "<sentence " in line:
78 counts["sentences"] += 1
79 f.close()
80 print >> sys.stderr, counts
81 return counts
82
84 if self.classIdFilename != None:
85 print >> sys.stderr, "Saving class names to", self.classIdFilename
86 self.classSet.write(self.classIdFilename)
87 else:
88 print >> sys.stderr, "Class names not saved"
89 if self.featureIdFilename != None:
90 print >> sys.stderr, "Saving feature names to", self.featureIdFilename
91 self.featureSet.write(self.featureIdFilename)
92 else:
93 print >> sys.stderr, "Feature names not saved"
94
def processCorpus(self, input, output, gold=None, append=False, allowNewIds=True):
    """
    Build examples for every document of a corpus and write them to a file.

    input        -- the corpus, passed on to getCorpusIterator(). If it is a
                    string it is also scanned with getElementCounts() so the
                    progress counter knows the number of sentences.
    output       -- path of the example file. Missing parent directories are
                    created and a ".gz" suffix selects gzip output.
    gold         -- optional gold corpus, iterated in lockstep with input.
    append       -- if true, the output file is opened for appending.
    allowNewIds  -- if true, saveIds() is called once all examples are built.

    The output file is closed even if building the examples fails.
    """
    # Create intermediate directories if needed
    outputDir = os.path.dirname(output)
    if outputDir != "" and not os.path.exists(outputDir):
        os.makedirs(outputDir)

    # Open the output file
    openStyle = "at" if append else "wt"
    if output.endswith(".gz"):
        outfile = gzip.open(output, openStyle)
    else:
        outfile = open(output, openStyle)

    try:
        # Set up counters. The sentence total is only known when the input
        # is a file name and that file contains at least one sentence.
        self.exampleCount = 0
        self.elementCounts = None
        sentenceTotal = None
        if isinstance(input, types.StringTypes):
            counts = self.getElementCounts(input)
            if counts["sentences"] > 0:
                self.elementCounts = counts
                sentenceTotal = counts["sentences"]
        self.progress = ProgressCounter(sentenceTotal, "Build examples")

        self.calculatePredictedRange(self.getSentences(input, self.parse, self.tokenization))

        inputIterator = getCorpusIterator(input, None, self.parse, self.tokenization)

        if gold is not None:
            goldIterator = getCorpusIterator(gold, None, self.parse, self.tokenization)
            # izip_longest pads the shorter iterator with None, so a
            # difference in document counts is caught by the asserts below
            # instead of being silently truncated.
            for inputSentences, goldSentences in itertools.izip_longest(inputIterator, goldIterator, fillvalue=None):
                assert inputSentences is not None, "gold corpus has more documents than the input corpus"
                assert goldSentences is not None, "input corpus has more documents than the gold corpus"
                self.processDocument(inputSentences, goldSentences, outfile)
        else:
            for inputSentences in inputIterator:
                self.processDocument(inputSentences, None, outfile)
    finally:
        # Always release the file handle, also when example building raises
        outfile.close()
    self.progress.endUpdate()

    # Show statistics
    print >> sys.stderr, "Examples built:", self.exampleCount
    print >> sys.stderr, "Features:", len(self.featureSet.getNames())
    print >> sys.stderr, "Style:", Utils.Parameters.toString(self.getParameters(self.styles))
    if self.exampleStats.getExampleCount() > 0:
        self.exampleStats.printStats()

    # Save ids
    if allowNewIds:
        self.saveIds()
149
151
152 for i in range(len(sentences)):
153 sentence = sentences[i]
154 goldSentence = None
155 if goldSentences != None:
156 goldSentence = goldSentences[i]
157 self.progress.update(1, "Building examples ("+sentence.sentence.get("id")+"): ")
158 self.processSentence(sentence, outfile, goldSentence)
159
161
162 if self.styles["sentenceLimit"]:
163
164 limitRules = self.styles["sentenceLimit"]
165 if type(limitRules) in types.StringTypes:
166 limitRules = [limitRules]
167
168 sentenceElement = sentence.sentence
169 sentenceAttributes = sorted(sentenceElement.attrib.keys())
170
171 for rule in limitRules:
172 for sentAttr in sentenceAttributes:
173
174
175
176 if rule.startswith(sentAttr + "."):
177 value = rule.split(".", 1)[-1]
178 if value not in sentenceElement.get(sentAttr):
179 return
180
181 if sentence.sentenceGraph != None:
182 goldGraph = None
183 if goldSentence != None:
184 goldGraph = goldSentence.sentenceGraph
185 self.exampleCount += self.buildExamplesFromGraph(sentence.sentenceGraph, outfile, goldGraph)
186
@classmethod
def run(cls, input, output, parse, tokenization, style, classIds=None, featureIds=None, gold=None, append=False, allowNewIds=True):
    """
    Create a builder of this class, process a corpus with it and return it.

    The settings are first logged to stderr. The id sets are obtained with
    getIdSets() from the classIds/featureIds file names, and the same file
    names are stored on the builder as the places to save the ids to. The
    builder is constructed with a "style" keyword argument, so the concrete
    class must accept one. A style that is not a string is converted with
    Utils.Parameters.toString() before use.
    """
    print >> sys.stderr, "Running", cls.__name__
    print >> sys.stderr, " input:", input
    if gold is not None:
        print >> sys.stderr, " gold:", gold
    print >> sys.stderr, " output:", output, "(append:", str(append) + ")"
    print >> sys.stderr, " add new class/feature ids:", allowNewIds
    if not isinstance(style, types.StringTypes):
        style = Utils.Parameters.toString(style)
    print >> sys.stderr, " style:", style
    if tokenization is None:
        print >> sys.stderr, " parse:", parse
    else:
        # %-formatting rather than "+": concatenation raised a TypeError when
        # parse was None (or any non-string), aborting the run on a log line.
        print >> sys.stderr, " parse:", "%s, tokenization:" % (parse,), tokenization
    classSet, featureSet = cls.getIdSets(classIds, featureIds, allowNewIds)
    builder = cls(style=style, classSet=classSet, featureSet=featureSet)

    builder.classIdFilename = classIds
    builder.featureIdFilename = featureIds
    builder.parse = parse
    builder.tokenization = tokenization
    builder.processCorpus(input, output, gold, append=append, allowNewIds=allowNewIds)
    return builder
210
212 raise NotImplementedError
213
216
219
@classmethod
def getIdSets(self, classIds=None, featureIds=None, allowNewIds=True):
    """
    Return a (classSet, featureSet) pair for the given id file names.

    For each of the two file names: if it is not None and the file exists,
    an IdSet is created with the given allowNewIds setting and load() is
    called on it with the file name. Otherwise the corresponding member of
    the pair is None. A message describing the choice is written to stderr
    for both, class names first.
    """
    sources = (
        (classIds, "Using predefined class names from", "No predefined class names"),
        (featureIds, "Using predefined feature names from", "No predefined feature names"),
    )
    idSets = []
    for idFile, foundMessage, missingMessage in sources:
        if idFile == None or not os.path.exists(idFile):
            print >> sys.stderr, missingMessage
            idSets.append(None)
            continue
        print >> sys.stderr, foundMessage, idFile
        loadedSet = IdSet(allowNewIds=allowNewIds)
        loadedSet.load(idFile)
        idSets.append(loadedSet)
    return tuple(idSets)
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
def getSentences(self, input, parse, tokenization, removeNameInfo=False):
    """
    Return the sentences of the input as a list of [sentenceGraph, None] pairs.

    If input is already a list (or an instance of a list subclass) it is
    returned unchanged; removeNameInfo must then be False, which is checked
    with an assert. Any other input is loaded with
    Core.SentenceGraph.loadCorpus() and one pair is produced for every
    loaded sentence that has a sentence graph.
    """
    # isinstance instead of an exact type comparison, so list subclasses are
    # also accepted as ready-made input rather than sent to the corpus loader
    if isinstance(input, list):
        assert(removeNameInfo == False)
        return input
    corpusElements = Core.SentenceGraph.loadCorpus(input, parse, tokenization, removeNameInfo=removeNameInfo)
    # Sentences without a graph are left out
    return [[sentence.sentenceGraph, None]
            for sentence in corpusElements.sentences
            if sentence.sentenceGraph != None]
269
271 print >> sys.stderr, "Defining predicted value range:",
272 sentenceElements = []
273 for sentence in sentences:
274 sentenceElements.append(sentence[0].sentenceElement)
275 self.definePredictedValueRange(sentenceElements, "entity")
276 print >> sys.stderr, self.getPredictedValueRange()
277
279 optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in analysis format", metavar="FILE")
280 optparser.add_option("-o", "--output", default=None, dest="output", help="Output file for the examples")
281 optparser.add_option("-p", "--parse", default="split-McClosky", dest="parse", help="parse")
282 optparser.add_option("-x", "--exampleBuilderParameters", default=None, dest="parameters", help="Parameters for the example builder")
283 optparser.add_option("-b", "--exampleBuilder", default="SimpleDependencyExampleBuilder", dest="exampleBuilder", help="Example Builder Class")
284 optparser.add_option("-c", "--classes", default=None, dest="classes", help="Class ids")
285 optparser.add_option("-f", "--features", default=None, dest="features", help="Feature ids")
286 optparser.add_option("-a", "--addIds", default=False, action="store_true", dest="addIds", help="Add new features")
287
if __name__=="__main__":
    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    from optparse import OptionParser
    optparser = OptionParser(usage="%prog [options]\nBuild machine learning examples from interaction XML.")
    addBasicOptions(optparser)
    (options, args) = optparser.parse_args()

    print >> sys.stderr, "Importing modules"
    # Resolve the builder class through the import machinery rather than a
    # string-built exec statement: the name comes from the command line, and
    # exec would run whatever code was embedded in it. A non-empty fromlist
    # makes __import__ return the submodule itself instead of the top-level
    # "ExampleBuilders" package.
    builderName = options.exampleBuilder
    builderModule = __import__("ExampleBuilders." + builderName, fromlist=[builderName])
    ExampleBuilderClass = getattr(builderModule, builderName)

    # The command line offers no tokenization option, so None is passed for it
    ExampleBuilderClass.run(options.input, options.output, options.parse, None, options.parameters,
                            options.classes, options.features, allowNewIds=options.addIds)
308