1  """ 
  2  Base class for ExampleBuilders 
  3  """ 
  4  import sys, os, types 
  5  sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/..") 
  6  from Core.SentenceGraph import getCorpusIterator 
  7  from Core.IdSet import IdSet 
  8  import gzip 
  9  import itertools 
 10  from Utils.ProgressCounter import ProgressCounter 
 11  import Utils.Parameters 
 12  import Core.ExampleUtils as ExampleUtils 
 13  import Core.SentenceGraph 
 14  from ExampleBuilders.ExampleStats import ExampleStats 
 17      """  
 18      ExampleBuilder is the abstract base class for specialized example builders. 
 19      Example builders take some data and convert it to examples usable by e.g. SVMs. 
 20      An example builder writes three files, an example-file (in extended Joachim's 
 21      SVM format) and .class_names and .feature_names files, which contain the names 
 22      for the class and feature id-numbers. An example builder can also be given 
 23      pre-existing sets of class and feature ids (optionally in files) so that the 
 24      generated examples are consistent with other, previously generated examples. 
 25      """     
 26 -    def __init__(self, classSet=None, featureSet=None): 
  27          if(type(classSet) == types.StringType): 
 28              self.classSet = IdSet(filename=classSet) 
 29          else: 
 30              self.classSet = classSet 
 31           
 32          if(type(featureSet) == types.StringType): 
 33              self.featureSet = IdSet(filename=featureSet) 
 34          else: 
 35              self.featureSet = featureSet 
 36           
 37          self.featureTag = ""       
 38          self.exampleStats = ExampleStats() 
 39          self.parse = None 
 40          self.tokenization = None 
 41           
 42          self.classIdFilename = None 
 43          self.featureIdFilename = None 
 44           
 45          self.styles = None 
 46          self._defaultParameters = None 
 47          self._parameterValueLimits = None 
 48          self._setDefaultParameters(["sentenceLimit"]) 
  49       
 51           
 52          if self._defaultParameters == None: 
 53              self._defaultParameters = {} 
 54          if self._parameterValueLimits == None: 
 55              self._parameterValueLimits = {} 
 56          newParameters = Utils.Parameters.get({}, defaults, valueLimits=valueLimits) 
 57          self._defaultParameters.update(newParameters) 
 58          if valueLimits != None: 
 59              self._parameterValueLimits.update(valueLimits) 
  60       
 62          return Utils.Parameters.get(parameters, defaults=self._defaultParameters, valueLimits=self._parameterValueLimits) 
  63       
 65          self.features[self.featureSet.getId(self.featureTag+name)] = value 
  66       
 68          print >> sys.stderr, "Counting elements:", 
 69          if filename.endswith(".gz"): 
 70              f = gzip.open(filename, "rt") 
 71          else: 
 72              f = open(filename, "rt") 
 73          counts = {"documents":0, "sentences":0} 
 74          for line in f: 
 75              if "<document " in line: 
 76                  counts["documents"] += 1 
 77              elif "<sentence " in line: 
 78                  counts["sentences"] += 1 
 79          f.close() 
 80          print >> sys.stderr, counts 
 81          return counts 
  82   
 84          if self.classIdFilename != None: 
 85              print >> sys.stderr, "Saving class names to", self.classIdFilename 
 86              self.classSet.write(self.classIdFilename) 
 87          else: 
 88              print >> sys.stderr, "Class names not saved" 
 89          if self.featureIdFilename != None: 
 90              print >> sys.stderr, "Saving feature names to", self.featureIdFilename 
 91              self.featureSet.write(self.featureIdFilename) 
 92          else: 
 93              print >> sys.stderr, "Feature names not saved" 
  94   
 95 -    def processCorpus(self, input, output, gold=None, append=False, allowNewIds=True): 
  96           
 97          if os.path.dirname(output) != "" and not os.path.exists(os.path.dirname(output)): 
 98              os.makedirs(os.path.dirname(output)) 
 99           
100          openStyle = "wt" 
101          if append: 
102               
103              openStyle = "at" 
104          if output.endswith(".gz"): 
105              outfile = gzip.open(output, openStyle) 
106          else: 
107              outfile = open(output, openStyle) 
108           
109           
110          self.exampleCount = 0 
111          if type(input) in types.StringTypes: 
112              self.elementCounts = self.getElementCounts(input) 
113              if self.elementCounts["sentences"] > 0: 
114                  self.progress = ProgressCounter(self.elementCounts["sentences"], "Build examples") 
115              else: 
116                  self.elementCounts = None 
117                  self.progress = ProgressCounter(None, "Build examples") 
118          else: 
119              self.elementCounts = None 
120              self.progress = ProgressCounter(None, "Build examples") 
121           
122          self.calculatePredictedRange(self.getSentences(input, self.parse, self.tokenization)) 
123           
124          inputIterator = getCorpusIterator(input, None, self.parse, self.tokenization)             
125           
126           
127          if gold != None: 
128              goldIterator = getCorpusIterator(gold, None, self.parse, self.tokenization) 
129              for inputSentences, goldSentences in itertools.izip_longest(inputIterator, goldIterator, fillvalue=None): 
130                  assert inputSentences != None 
131                  assert goldSentences != None 
132                  self.processDocument(inputSentences, goldSentences, outfile) 
133          else: 
134              for inputSentences in inputIterator: 
135                  self.processDocument(inputSentences, None, outfile) 
136          outfile.close() 
137          self.progress.endUpdate() 
138           
139           
140          print >> sys.stderr, "Examples built:", self.exampleCount 
141          print >> sys.stderr, "Features:", len(self.featureSet.getNames()) 
142          print >> sys.stderr, "Style:", Utils.Parameters.toString(self.getParameters(self.styles)) 
143          if self.exampleStats.getExampleCount() > 0: 
144              self.exampleStats.printStats() 
145       
146           
147          if allowNewIds: 
148              self.saveIds() 
 149       
151           
152          for i in range(len(sentences)): 
153              sentence = sentences[i] 
154              goldSentence = None 
155              if goldSentences != None: 
156                  goldSentence = goldSentences[i] 
157              self.progress.update(1, "Building examples ("+sentence.sentence.get("id")+"): ") 
158              self.processSentence(sentence, outfile, goldSentence) 
 159       
161           
162          if self.styles["sentenceLimit"]:  
163               
164              limitRules = self.styles["sentenceLimit"] 
165              if type(limitRules) in types.StringTypes: 
166                  limitRules = [limitRules] 
167               
168              sentenceElement = sentence.sentence 
169              sentenceAttributes = sorted(sentenceElement.attrib.keys()) 
170               
171              for rule in limitRules: 
172                  for sentAttr in sentenceAttributes: 
173                       
174                       
175                       
176                      if rule.startswith(sentAttr + "."):  
177                          value = rule.split(".", 1)[-1]  
178                          if value not in sentenceElement.get(sentAttr):  
179                              return  
180           
181          if sentence.sentenceGraph != None: 
182              goldGraph = None 
183              if goldSentence != None: 
184                  goldGraph = goldSentence.sentenceGraph 
185              self.exampleCount += self.buildExamplesFromGraph(sentence.sentenceGraph, outfile, goldGraph) 
 186   
187      @classmethod 
188 -    def run(cls, input, output, parse, tokenization, style, classIds=None, featureIds=None, gold=None, append=False, allowNewIds=True): 
 189          print >> sys.stderr, "Running", cls.__name__ 
190          print >> sys.stderr, "  input:", input 
191          if gold != None: 
192              print >> sys.stderr, "  gold:", gold 
193          print >> sys.stderr, "  output:", output, "(append:", str(append) + ")" 
194          print >> sys.stderr, "  add new class/feature ids:", allowNewIds 
195          if not isinstance(style, types.StringTypes): 
196              style = Utils.Parameters.toString(style) 
197          print >> sys.stderr, "  style:", style 
198          if tokenization == None:  
199              print >> sys.stderr, "  parse:", parse 
200          else: 
201              print >> sys.stderr, "  parse:", parse + ", tokenization:", tokenization 
202          classSet, featureSet = cls.getIdSets(classIds, featureIds, allowNewIds)  
203          builder = cls(style=style, classSet=classSet, featureSet=featureSet) 
204           
205          builder.classIdFilename = classIds 
206          builder.featureIdFilename = featureIds 
207          builder.parse = parse ; builder.tokenization = tokenization 
208          builder.processCorpus(input, output, gold, append=append, allowNewIds=allowNewIds) 
209          return builder 
 210   
212          raise NotImplementedError 
 213       
216       
219       
220      @classmethod 
221 -    def getIdSets(self, classIds=None, featureIds=None, allowNewIds=True): 
 222           
223           
224           
225          if classIds != None and os.path.exists(classIds): 
226              print >> sys.stderr, "Using predefined class names from", classIds 
227              classSet = IdSet(allowNewIds=allowNewIds) 
228              classSet.load(classIds) 
229          else: 
230              print >> sys.stderr, "No predefined class names" 
231              classSet = None 
232           
233          if featureIds != None and os.path.exists(featureIds): 
234              print >> sys.stderr, "Using predefined feature names from", featureIds 
235              featureSet = IdSet(allowNewIds=allowNewIds) 
236              featureSet.load(featureIds) 
237          else: 
238              print >> sys.stderr, "No predefined feature names" 
239              featureSet = None 
240          return classSet, featureSet 
 241           
242   
243   
244   
245   
246   
247   
248   
249   
250   
251   
252   
253   
254   
255   
256   
257 -    def getSentences(self, input, parse, tokenization, removeNameInfo=False): 
 258          if type(input) != types.ListType: 
259               
260              corpusElements = Core.SentenceGraph.loadCorpus(input, parse, tokenization, removeNameInfo=removeNameInfo) 
261              sentences = [] 
262              for sentence in corpusElements.sentences: 
263                  if sentence.sentenceGraph != None:  
264                      sentences.append( [sentence.sentenceGraph,None] ) 
265              return sentences 
266          else:  
267              assert(removeNameInfo == False) 
268              return input 
 269   
271          print >> sys.stderr, "Defining predicted value range:", 
272          sentenceElements = [] 
273          for sentence in sentences: 
274              sentenceElements.append(sentence[0].sentenceElement) 
275          self.definePredictedValueRange(sentenceElements, "entity") 
276          print >> sys.stderr, self.getPredictedValueRange() 
  277   
279      optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in analysis format", metavar="FILE") 
280      optparser.add_option("-o", "--output", default=None, dest="output", help="Output file for the examples") 
281      optparser.add_option("-p", "--parse", default="split-McClosky", dest="parse", help="parse") 
282      optparser.add_option("-x", "--exampleBuilderParameters", default=None, dest="parameters", help="Parameters for the example builder") 
283      optparser.add_option("-b", "--exampleBuilder", default="SimpleDependencyExampleBuilder", dest="exampleBuilder", help="Example Builder Class") 
284      optparser.add_option("-c", "--classes", default=None, dest="classes", help="Class ids") 
285      optparser.add_option("-f", "--features", default=None, dest="features", help="Feature ids") 
286      optparser.add_option("-a", "--addIds", default=False, action="store_true", dest="addIds", help="Add new features") 
 287   
if __name__=="__main__":
    # Optionally enable Psyco if it is installed
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    from optparse import OptionParser
    optparser = OptionParser(usage="%prog [options]\nBuild machine learning examples from interaction XML.")
    addBasicOptions(optparser)
    (options, args) = optparser.parse_args()

    print >> sys.stderr, "Importing modules"
    # Import the builder module by name. __import__ + getattr replaces an
    # exec of a statement built from the command line option, so the option
    # value can only ever be used as a module/attribute name, never run as code.
    # A non-empty fromlist makes __import__ return the submodule itself.
    # A module lacking the class now raises AttributeError instead of ImportError.
    builderModule = __import__("ExampleBuilders." + options.exampleBuilder, globals(), locals(), [options.exampleBuilder])
    ExampleBuilderClass = getattr(builderModule, options.exampleBuilder)

    # No tokenization is given; options.addIds decides whether new ids may be added
    ExampleBuilderClass.run(options.input, options.output, options.parse, None, options.parameters,
                            options.classes, options.features, allowNewIds=options.addIds )
308