1  """ 
  2  Detect events or relations from text. 
  3  """ 
  4  from train import workdir, getDetector, getSteps 
  5  import sys, os 
  6  import tempfile 
  7  import codecs 
  8  import Utils.Settings as Settings 
  9  import Utils.Stream as Stream 
 10  import Utils.Download 
 11  from Utils.Connection.Connection import getConnection 
 12  import Utils.Download 
 13  from Detectors.Preprocessor import Preprocessor 
 14   
 15 -def classify(input, model, output, workDir=None, step=None, omitSteps=None,  
 16               goldInput=None, detector=None, debug=False, clear=False,  
 17               preprocessorTag="-preprocessed.xml.gz", preprocessorParams=None, bioNLPSTParams=None): 
  18      """ 
 19      Detect events or relations from text. 
 20       
 21      @param input: The input file in either interaction XML or BioNLP ST format. Can also be a PMID or TEES default corpus name. 
 22      @param model: A path to a model file or the name of a TEES default model. 
 23      @param output: The output file stem. Output files will be of the form output-* 
 24      @param workDir: If intermediate files need to be saved, they will go here. 
 25      @param step: A step=substep pair, where the steps are PREPROCESS and CLASSIFY 
 26      @param omitSteps: step=substep parameters, where multiple substeps can be defined. 
 27      @param goldInput: a version of the corpus file with gold annotation. Enables measuring of performance 
 28      @param detector: a Detector object, or a string defining one to be imported. If None, will be read from model. 
 29      @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved 
 30      @param clear: Remove existing workDir 
 31      @param preprocessorTag: preprocessor output file will be output + preprocessorTag 
 32      @param preprocessorParams: Optional parameters controlling preprocessing. If None, will be read from model. 
 33      @param bioNLPSTParams: Optional parameters controlling BioNLP ST format output. If None, will be read from model. 
 34      """ 
 35      input = os.path.abspath(input) 
 36      if goldInput != None: goldInput = os.path.abspath(goldInput) 
 37      if model != None: model = os.path.abspath(model) 
 38       
 39      if workDir != None:  
 40          workdir(workDir, clear) 
 41      Stream.openLog(output + "-log.txt")  
 42       
 43      input, preprocess = getInput(input) 
 44      model = getModel(model) 
 45       
 46      selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, ["PREPROCESS", "CLASSIFY"]) 
 47      if not preprocess: 
 48          selector.markOmitSteps("PREPROCESS") 
 49       
 50      classifyInput = input 
 51      if selector.check("PREPROCESS"): 
 52          preprocessor = Preprocessor() 
 53          preprocessorOutput = output + preprocessorTag 
 54           
 55           
 56           
 57          if os.path.exists(preprocessorOutput) and not clear:  
 58               
 59              print >> sys.stderr, "Preprocessor output", preprocessorOutput, "exists, skipping preprocessing." 
 60              classifyInput = preprocessorOutput  
 61          else: 
 62               
 63              print >> sys.stderr, "Preprocessor output", preprocessorOutput, "does not exist" 
 64              print >> sys.stderr, "------------ Preprocessing ------------" 
 65               
 66               
 67               
 68              classifyInput = preprocessor.process(input, preprocessorOutput, preprocessorParams, model, [], fromStep=detectorSteps["PREPROCESS"], toStep=None, omitSteps=omitDetectorSteps["PREPROCESS"]) 
 69       
 70      if selector.check("CLASSIFY"): 
 71          detector = getDetector(detector, model)[0]()  
 72          detector.debug = debug 
 73          detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams, model) 
 74          detector.classify(classifyInput, model, output, goldData=goldInput, fromStep=detectorSteps["CLASSIFY"], omitSteps=omitDetectorSteps["CLASSIFY"], workDir=workDir) 
  75   
 77      if model == None: 
 78          return None 
 79      if not os.path.exists(model): 
 80          print >> sys.stderr, "Model", model, "doesn't exist, looking for a default model" 
 81          modelName = os.path.basename(model) 
 82          found = None 
 83          if hasattr(Settings, "MODEL_DIR"): 
 84              for suffix in ["", "-test", ".zip", "-test.zip"]: 
 85                  predefined = os.path.join(Settings.MODEL_DIR, modelName + suffix) 
 86                  if os.path.exists(predefined): 
 87                      print >> sys.stderr, "Classifying with default model", predefined 
 88                      found = predefined 
 89                      model = found 
 90                      break 
 91              if found == None: 
 92                  print >> sys.stderr, "No default model found for definition", modelName 
 93          else: 
 94              print >> sys.stderr, "Default model directory MODEL_DIR not defined in Settings" 
 95          if found == None: 
 96              raise Exception("Model " + str(model) + " not found") 
 97      else: 
 98          print >> sys.stderr, "Classifying with model", model 
 99      return os.path.abspath(model) 
 100   
127   
129      print >> sys.stderr, "*************************** NOTE ***************************" 
130      print >> sys.stderr, "Do not attempt to do large-scale classification of PubMed" 
131      print >> sys.stderr, "abstracts with this feature. For that, use the downloadable" 
132      print >> sys.stderr, "PubMed release. This is a demonstration feature only, and" 
133      print >> sys.stderr, "abusing it will cause you to be banned from PubMed!" 
134      print >> sys.stderr, "************************************************************" 
135      print >> sys.stderr, "Downloading PubMed abstract", pmid 
136      tempDir = tempfile.gettempdir() 
137      url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=" + str(pmid) + "&retmode=xml" 
138      downloaded = os.path.join(tempDir, "pmid-" + str(pmid)) 
139      Utils.Download.download(url, downloaded + ".xml", False) 
140       
141      f = codecs.open(downloaded + ".xml", "rt", "utf-8") 
142      textElements = [] 
143      for line in f: 
144          line = line.strip() 
145          for tag in ["<ArticleTitle>", "<AbstractText>"]: 
146              if line.startswith(tag): 
147                  textElements.append(line.split(">", 1)[1].split("<")[0]) 
148      f.close() 
149       
150      f = codecs.open(downloaded + ".txt", "wt", "utf-8") 
151      f.write("\n".join(textElements)) 
152      f.close() 
153       
154      return downloaded + ".txt" 
 155   
if __name__=="__main__":
    # Command-line entry point: parse the options and pass them on to classify().

    # Psyco is optional: enable it when it can be imported, otherwise carry on without it.
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    from optparse import OptionParser
    optparser = OptionParser(description="Predict events/relations")
    optparser.add_option("-i", "--input", default=None, dest="input", help="input")
    optparser.add_option("-o", "--output", default=None, dest="output", help="output file stem")
    optparser.add_option("-w", "--workdir", default=None, dest="workdir", help="output directory")
    optparser.add_option("-m", "--model", default=None, dest="model", help="TEES model")
    optparser.add_option("-d", "--detector", default=None, dest="detector", help="")
    # NOTE: --connection is accepted but its value is not passed on to classify() below.
    optparser.add_option("-c", "--connection", default=None, dest="connection", help="")
    optparser.add_option("-g", "--gold", default=None, dest="gold", help="annotated version of the input file (optional)")
    optparser.add_option("-p", "--preprocessorParams", default=None, dest="preprocessorParams", help="")
    optparser.add_option("-b", "--bioNLPSTParams", default=None, dest="bioNLPSTParams", help="")
    # Step selection and debugging
    optparser.add_option("--step", default=None, dest="step", help="")
    optparser.add_option("--omitSteps", default=None, dest="omitSteps", help="")
    optparser.add_option("--clearAll", default=False, action="store_true", dest="clearAll", help="Delete all files")
    optparser.add_option("--debug", default=False, action="store_true", dest="debug", help="More verbose output")
    (options, args) = optparser.parse_args()

    # The output stem is mandatory. Report a usage error instead of asserting:
    # asserts are stripped under "python -O", which would let a None output
    # reach classify() and fail there with an unrelated error.
    if options.output is None:
        optparser.error("the output file stem must be defined with -o/--output")
    classify(options.input, options.model, options.output, options.workdir, options.step, options.omitSteps,
             options.gold, options.detector, options.debug, options.clearAll,
             preprocessorParams=options.preprocessorParams, bioNLPSTParams=options.bioNLPSTParams)
187