Package TEES :: Package Detectors :: Module Preprocessor

Source code for module TEES.Detectors.Preprocessor

  1  import sys, os 
  2  import itertools 
  3  import types 
  4  thisPath = os.path.dirname(os.path.abspath(__file__)) 
  5  sys.path.append(os.path.abspath(os.path.join(thisPath,".."))) 
  6  import Utils.STFormat.STTools 
  7  import Utils.STFormat.ConvertXML 
  8  import Utils.STFormat.Equiv 
  9  import Utils.ElementTreeUtils as ETUtils 
 10  import Tools.GeniaSentenceSplitter 
 11  import Tools.BLLIPParser 
 12  import Tools.StanfordParser 
 13  import Tools.BANNER 
 14  from ToolChain import ToolChain 
 15  import Utils.InteractionXML.DivideSets 
 16  import Utils.ProteinNameSplitter as ProteinNameSplitter 
 17  import Utils.FindHeads as FindHeads 
 18  #from Test.Pipeline import log 
 19  import Utils.Stream as Stream 
 20   
class Preprocessor(ToolChain):
    """
    A ToolChain for preparing a corpus for event extraction.

    The default pipeline converts ST-format (or plain) input into Interaction
    XML, then runs sentence splitting, named-entity recognition, parsing,
    parse conversion, protein-name splitting, head-token detection and
    finally divides the corpus into data sets.
    """
    def __init__(self):
        ToolChain.__init__(self)
        # Key under which preprocessing parameters are stored in a model
        self.modelParameterStringName = "preprocessorParams"

    def getDefaultSteps(self):
        """
        Return the default pipeline as a list of
        (stepName, function, argumentDict, intermediateFileName) tuples.
        """
        steps = []
        steps.append( ("CONVERT", self.convert, {"dataSetNames":None, "corpusName":None}, "documents.xml") )
        steps.append( ("SPLIT-SENTENCES", Tools.GeniaSentenceSplitter.makeSentences, {"debug":False, "postProcess":True}, "sentences.xml") )
        steps.append( ("NER", Tools.BANNER.run, {"elementName":"entity", "processElement":"sentence", "debug":False, "splitNewlines":True}, "ner.xml") )
        steps.append( ("PARSE", Tools.BLLIPParser.parse, {"parseName":"McCC", "requireEntities":False, "debug":False}, "parse.xml") )
        steps.append( ("CONVERT-PARSE", Tools.StanfordParser.convertXML, {"parser":"McCC", "debug":False}, "converted-parse.xml") )
        steps.append( ("SPLIT-NAMES", ProteinNameSplitter.mainFunc, {"parseName":"McCC", "removeOld":True}, "split-names.xml") )
        steps.append( ("FIND-HEADS", FindHeads.findHeads, {"parse":"McCC", "removeExisting":True}, "heads.xml") )
        steps.append( ("DIVIDE-SETS", self.divideSets, {"outputStem":None, "saveCombined":True}) )
        return steps

    def process(self, source, output, parameters=None, model=None, sourceDataSetNames=None, fromStep=None, toStep=None, omitSteps=None):
        """
        Run the tool chain on 'source', writing results under 'output'.

        Returns the processed corpus as an (ElementTree) XML object.

        NOTE(review): 'sourceDataSetNames' is accepted but currently ignored;
        the removed commented-out code suggests it was meant to be forwarded
        to the CONVERT step's "dataSetNames" argument — confirm before wiring
        it up, as doing so would change existing behavior.
        """
        xml = ToolChain.process(self, source, output, parameters, model, fromStep, toStep, omitSteps)
        return xml

    def convert(self, input, dataSetNames=None, corpusName=None, output=None):
        """
        CONVERT step: build Interaction XML from the input.

        ST-format input (a directory, .tar.gz, .txt or a comma-separated list
        of these) is loaded and converted; anything else is treated as
        existing Interaction XML and simply parsed. Sets self.xml and
        returns it.
        """
        if os.path.isdir(input) or input.endswith(".tar.gz") or input.endswith(".txt") or "," in input:
            print >> sys.stderr, "Converting ST-format to Interaction XML"
            # Get input file (or files)
            dataSetDirs = input
            documents = []
            # isinstance covers str/unicode (and subclasses), unlike the old
            # 'type(x) in types.StringTypes' check
            if isinstance(dataSetDirs, types.StringTypes):
                dataSetDirs = dataSetDirs.split(",")
            # Get the list of "train", "devel" etc names for these sets
            if dataSetNames is None:
                dataSetNames = []
            elif isinstance(dataSetNames, types.StringTypes):
                dataSetNames = dataSetNames.split(",")
            # Convert all input files into one corpus; missing names pad as None
            for dataSetDir, dataSetName in itertools.izip_longest(dataSetDirs, dataSetNames, fillvalue=None):
                print >> sys.stderr, "Reading", dataSetDir, "set,",
                docs = Utils.STFormat.STTools.loadSet(dataSetDir, dataSetName)
                print >> sys.stderr, len(docs), "documents"
                documents.extend(docs)
            print >> sys.stderr, "Resolving equivalences"
            Utils.STFormat.Equiv.process(documents)
            if corpusName is None:
                corpusName = "TEES"
            self.xml = Utils.STFormat.ConvertXML.toInteractionXML(documents, corpusName, output)
        else:
            print >> sys.stderr, "Processing source as interaction XML"
            self.xml = ETUtils.ETFromObj(input)
        return self.xml

    def divideSets(self, input, outputStem, saveCombined=True):
        """
        DIVIDE-SETS step: split the corpus into per-set files named
        <outputStem>-<set>.xml in the directory part of 'outputStem'.
        A None 'outputStem' skips the division.
        """
        if outputStem is not None:
            print >> sys.stderr, "Dividing into sets"
            outDir, outputStem = os.path.split(outputStem)
            Utils.InteractionXML.DivideSets.processCorpus(input, outDir, outputStem, ".xml", saveCombined=saveCombined)
        else:
            print >> sys.stderr, "No set division"
if __name__=="__main__":
    # Import Psyco if available (optional Python 2 JIT; best-effort speed-up)
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"
    from optparse import OptionParser
    optparser = OptionParser(description="A tool chain for making interaction XML, sentence splitting, NER and parsing")
    optparser.add_option("-i", "--input", default=None, dest="input", help="")
    optparser.add_option("-n", "--inputNames", default=None, dest="inputNames", help="")
    optparser.add_option("-c", "--corpus", default=None, dest="corpus", help="corpus name")
    optparser.add_option("-o", "--output", default=None, dest="output", help="output directory")
    optparser.add_option("-p", "--parameters", default=None, dest="parameters", help="preprocessing parameters")
    optparser.add_option("-s", "--step", default=None, dest="step", help="")
    optparser.add_option("-t", "--toStep", default=None, dest="toStep", help="")
    optparser.add_option("--omitSteps", default=None, dest="omitSteps", help="")
    optparser.add_option("--noLog", default=False, action="store_true", dest="noLog", help="")
    optparser.add_option("--debug", default=False, action="store_true", dest="debug", help="")
    optparser.add_option("--requireEntities", default=False, action="store_true", dest="requireEntities", help="")
    (options, args) = optparser.parse_args()
    if options.omitSteps != None:
        options.omitSteps = options.omitSteps.split(",")

    if not options.noLog:
        # Fixed: the original wrapped this in a single-argument os.path.join,
        # which is a no-op; the log file sits next to the output stem.
        Stream.openLog(options.output + "-log.txt")
    preprocessor = Preprocessor()
    preprocessor.setArgForAllSteps("debug", options.debug)
    preprocessor.stepArgs("CONVERT")["corpusName"] = options.corpus
    preprocessor.stepArgs("PARSE")["requireEntities"] = options.requireEntities
    preprocessor.process(options.input, options.output, options.parameters, None, options.inputNames, fromStep=options.step, toStep=options.toStep, omitSteps=options.omitSteps)