1  import sys, os 
  2  import itertools 
  3  import types 
  4  thisPath = os.path.dirname(os.path.abspath(__file__)) 
  5  sys.path.append(os.path.abspath(os.path.join(thisPath,".."))) 
  6  import Utils.STFormat.STTools 
  7  import Utils.STFormat.ConvertXML 
  8  import Utils.STFormat.Equiv 
  9  import Utils.ElementTreeUtils as ETUtils 
 10  import Tools.GeniaSentenceSplitter 
 11  import Tools.BLLIPParser 
 12  import Tools.StanfordParser 
 13  import Tools.BANNER 
 14  from ToolChain import ToolChain 
 15  import Utils.InteractionXML.DivideSets 
 16  import Utils.ProteinNameSplitter as ProteinNameSplitter 
 17  import Utils.FindHeads as FindHeads 
 18   
 19  import Utils.Stream as Stream 
 20   
 25       
 27          steps = [] 
 28          steps.append( ("CONVERT", self.convert, {"dataSetNames":None, "corpusName":None}, "documents.xml") ) 
 29          steps.append( ("SPLIT-SENTENCES", Tools.GeniaSentenceSplitter.makeSentences, {"debug":False, "postProcess":True}, "sentences.xml") ) 
 30          steps.append( ("NER", Tools.BANNER.run, {"elementName":"entity", "processElement":"sentence", "debug":False, "splitNewlines":True}, "ner.xml") ) 
 31          steps.append( ("PARSE", Tools.BLLIPParser.parse, {"parseName":"McCC", "requireEntities":False, "debug":False}, "parse.xml") ) 
 32          steps.append( ("CONVERT-PARSE", Tools.StanfordParser.convertXML, {"parser":"McCC", "debug":False}, "converted-parse.xml") ) 
 33          steps.append( ("SPLIT-NAMES", ProteinNameSplitter.mainFunc, {"parseName":"McCC", "removeOld":True}, "split-names.xml") ) 
 34          steps.append( ("FIND-HEADS", FindHeads.findHeads, {"parse":"McCC", "removeExisting":True}, "heads.xml") ) 
 35          steps.append( ("DIVIDE-SETS", self.divideSets, {"outputStem":None, "saveCombined":True}) ) 
 36          return steps 
  37       
 38 -    def process(self, source, output, parameters=None, model=None, sourceDataSetNames=None, fromStep=None, toStep=None, omitSteps=None): 
  39           
 40           
 41           
 42           
 43           
 44           
 45           
 46           
 47           
 48           
 49          xml = ToolChain.process(self, source, output, parameters, model, fromStep, toStep, omitSteps) 
 50           
 51           
 52           
 53          return xml 
  54           
 55 -    def convert(self, input, dataSetNames=None, corpusName=None, output=None): 
  56          if os.path.isdir(input) or input.endswith(".tar.gz") or input.endswith(".txt") or "," in input: 
 57              print >> sys.stderr, "Converting ST-format to Interaction XML" 
 58               
 59              dataSetDirs = input 
 60              documents = [] 
 61              if type(dataSetDirs) in types.StringTypes: 
 62                  dataSetDirs = dataSetDirs.split(",") 
 63               
 64              if dataSetNames == None:  
 65                  dataSetNames = [] 
 66              elif type(dataSetNames) in types.StringTypes: 
 67                  dataSetNames = dataSetNames.split(",") 
 68               
 69              for dataSetDir, dataSetName in itertools.izip_longest(dataSetDirs, dataSetNames, fillvalue=None): 
 70                  print >> sys.stderr, "Reading", dataSetDir, "set,", 
 71                  docs = Utils.STFormat.STTools.loadSet(dataSetDir, dataSetName) 
 72                  print >> sys.stderr, len(docs), "documents" 
 73                  documents.extend(docs) 
 74              print >> sys.stderr, "Resolving equivalences" 
 75              Utils.STFormat.Equiv.process(documents) 
 76              if corpusName == None: 
 77                  corpusName = "TEES" 
 78              self.xml = Utils.STFormat.ConvertXML.toInteractionXML(documents, corpusName, output) 
 79          else: 
 80              print >> sys.stderr, "Processing source as interaction XML" 
 81              self.xml = ETUtils.ETFromObj(input) 
 82          return self.xml 
  83       
 84 -    def divideSets(self, input, outputStem, saveCombined=True): 
  85          if outputStem != None: 
 86              print >> sys.stderr, "Dividing into sets" 
 87              outDir, outputStem = os.path.split(outputStem) 
 88              Utils.InteractionXML.DivideSets.processCorpus(input, outDir, outputStem, ".xml", saveCombined=saveCombined) 
 89          else: 
 90              print >> sys.stderr, "No set division" 
   91   
 92  if __name__=="__main__": 
 93       
 94      try: 
 95          import psyco 
 96          psyco.full() 
 97          print >> sys.stderr, "Found Psyco, using" 
 98      except ImportError: 
 99          print >> sys.stderr, "Psyco not installed" 
100      from optparse import OptionParser 
101      optparser = OptionParser(description="A tool chain for making interaction XML, sentence splitting, NER and parsing") 
102      optparser.add_option("-i", "--input", default=None, dest="input", help="") 
103      optparser.add_option("-n", "--inputNames", default=None, dest="inputNames", help="") 
104      optparser.add_option("-c", "--corpus", default=None, dest="corpus", help="corpus name") 
105      optparser.add_option("-o", "--output", default=None, dest="output", help="output directory") 
106      optparser.add_option("-p", "--parameters", default=None, dest="parameters", help="preprocessing parameters") 
107      optparser.add_option("-s", "--step", default=None, dest="step", help="") 
108      optparser.add_option("-t", "--toStep", default=None, dest="toStep", help="") 
109      optparser.add_option("--omitSteps", default=None, dest="omitSteps", help="") 
110      optparser.add_option("--noLog", default=False, action="store_true", dest="noLog", help="") 
111      optparser.add_option("--debug", default=False, action="store_true", dest="debug", help="") 
112      optparser.add_option("--requireEntities", default=False, action="store_true", dest="requireEntities", help="") 
113      (options, args) = optparser.parse_args() 
114      if options.omitSteps != None: 
115          options.omitSteps = options.omitSteps.split(",") 
116       
117      if not options.noLog: 
118          Stream.openLog(os.path.join(options.output + "-log.txt")) 
119           
120      preprocessor = Preprocessor() 
121      preprocessor.setArgForAllSteps("debug", options.debug) 
122      preprocessor.stepArgs("CONVERT")["corpusName"] = options.corpus 
123      preprocessor.stepArgs("PARSE")["requireEntities"] = options.requireEntities 
124      preprocessor.process(options.input, options.output, options.parameters, None, options.inputNames, fromStep=options.step, toStep=options.toStep, omitSteps=options.omitSteps) 
125