Package TEES :: Package Detectors :: Module SingleStageDetector
[hide private]

Source Code for Module TEES.Detectors.SingleStageDetector

  1  import sys, os 
  2  import shutil 
  3  import itertools 
  4  import gzip 
  5  sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/..") 
  6  import Utils.ElementTreeUtils as ETUtils 
  7  import Utils.Parameters as Parameters 
  8  from Core.Model import Model 
  9  import Core.ExampleUtils as ExampleUtils 
 10  import Utils.STFormat.ConvertXML 
 11  import Utils.STFormat.Compare 
 12  #from Murska.CSCConnection import CSCConnection 
 13  from StepSelector import StepSelector 
 14  #import Utils.Parameters as Parameters 
 15  import types 
 16  from Detector import Detector 
 17   
 18  from ExampleWriters.BioTextExampleWriter import BioTextExampleWriter 
 19  import Evaluators.EvaluateInteractionXML as EvaluateInteractionXML 
 20  import Utils.InteractionXML as InteractionXML 
 21   
class SingleStageDetector(Detector):
    """
    A Detector for a text mining problem that can be represented as
    a single classification task.
    """

    def __init__(self):
        """
        Initialize the base Detector and enable clean-up of catenated
        training example files.
        """
        Detector.__init__(self)
        # When True, endModel removes the catenated example file that
        # beginModel may have written.
        self.deleteCombinedExamples = True

    def _getCombinedExamplesPath(self, model):
        """
        Return the path of the catenated training example file for an open model.

        Both beginModel (which writes the file) and endModel (which deletes it)
        use this helper so that the two always refer to the same location.
        """
        return self.workDir + os.path.normpath(model.path) + "-" + self.tag + "combined-examples.gz"

    def _getClassifierWorkDir(self, model):
        """
        Return the directory name passed to the classifier as its work
        directory for an open model.
        """
        return self.workDir + os.path.normpath(model.path) + "-" + self.tag + "models"

    def _catenateExamples(self, trainExampleFiles, combinedTrainExamples):
        """
        Catenate several gzipped example files into one gzipped file.

        Every input handle is closed after it has been copied, and the output
        handle is closed even if one of the copies fails.
        """
        combinedTrainExamplesFile = gzip.open(combinedTrainExamples, 'wb')
        try:
            for trainExampleFile in trainExampleFiles:
                print >> sys.stderr, "Catenating", trainExampleFile, "to", combinedTrainExamples
                inputFile = gzip.open(trainExampleFile, 'rb')
                try:
                    shutil.copyfileobj(inputFile, combinedTrainExamplesFile)
                finally:
                    inputFile.close()
        finally:
            combinedTrainExamplesFile.close()

    def beginModel(self, step, model, trainExampleFiles, testExampleFile, importIdsFromModel=None):
        """
        Begin the training process leading to a new model.

        Nothing is done unless self.checkStep(step, False) passes and a model
        is given.

        @param step: name of the training step, checked with self.checkStep
        @param model: the model to train, opened here in "w" mode
        @param trainExampleFiles: a single example file name, or a sequence of
            gzipped example file names that are catenated when there are several
        @param testExampleFile: example file passed to the classifier's optimize call
        @param importIdsFromModel: optional model from which class/feature ids
            and the already selected settings are imported
        """
        if not self.checkStep(step, False):
            return
        if model is None:
            return
        if self.state is not None and step is not None:
            print >> sys.stderr, self.__class__.__name__ + ":" + self.state + ":" + step
        # Create combined model
        model = self.openModel(model, "w")
        assert model.mode in ["a", "w"], (model.path, model.mode)
        # Information can be imported from an existing model. In this case, model is trained
        # with the parameter already defined in the import source. This is used when training
        # the combined model.
        if importIdsFromModel is not None:
            model.importFrom(self.openModel(importIdsFromModel, "r"),
                             [self.tag+"ids.classes", self.tag+"ids.features"],
                             [self.tag+"classifier-parameter", self.tag+"example-style", self.tag+"parse", self.tag+"task"])
            # Train the model with the parameters defined in the import source
            model.addStr(self.tag+"classifier-parameters-train", model.getStr(self.tag+"classifier-parameter"))
        if self.bioNLPSTParams is not None and len(self.bioNLPSTParams) > 0:
            model.addStr("BioNLPSTParams", Parameters.toString(self.bioNLPSTParams))
        # Catenate example files
        if isinstance(trainExampleFiles, types.StringTypes):
            combinedTrainExamples = trainExampleFiles
        elif len(trainExampleFiles) == 1:
            combinedTrainExamples = trainExampleFiles[0]
        else:
            combinedTrainExamples = self._getCombinedExamplesPath(model)
            self._catenateExamples(trainExampleFiles, combinedTrainExamples)
        # Upload training model
        # The parameter grid is stored in the model as "*classifier-parameters-train" so that endModel can
        # use it, and also as annotation for the trained model. The final selected parameter will
        # be stored as "*classifier-parameter"
        classifier = self.Classifier(self.connection)
        classifier.optimize(combinedTrainExamples, self._getClassifierWorkDir(model),
                            model.getStr(self.tag+"classifier-parameters-train"), testExampleFile,
                            model.get(self.tag+"ids.classes"), step="SUBMIT", evaluator=self.evaluator)
        model.save()

    def endModel(self, step, model, testExampleFile):
        """
        Finish the training process started by beginModel.

        Requests the optimization results from the classifier, stores the
        selected classifier model in the model and, if
        self.deleteCombinedExamples is set, removes the catenated training
        example file written by beginModel.

        Nothing is done unless self.checkStep(step, False) passes and a model
        is given.

        @param step: name of the training step, checked with self.checkStep
        @param model: the model to finish, opened here in "a" mode
        @param testExampleFile: example file passed to the classifier's optimize call
        """
        if not self.checkStep(step, False):
            return
        if model is None:
            return
        if self.state is not None and step is not None:
            print >> sys.stderr, self.__class__.__name__ + ":" + self.state + ":" + step
        # Download combined model
        model = self.openModel(model, "a")
        assert model.mode in ["a", "w"]
        trainParameters = model.getStr(self.tag+"classifier-parameters-train")
        classifier = self.Classifier(self.connection)
        # "DUMMY" stands in for the training examples, which are not needed
        # when only the results are requested.
        optimized = classifier.optimize("DUMMY", self._getClassifierWorkDir(model), trainParameters,
                                        testExampleFile, model.get(self.tag+"ids.classes"),
                                        step="RESULTS", evaluator=self.evaluator,
                                        determineThreshold=("TEES.threshold" in trainParameters))
        self.addClassifierModel(model, optimized.model, optimized.parameters, optimized.threshold)
        model.save()
        # Check for catenated example file. The path must be the one beginModel
        # wrote to (including the work directory), otherwise the file is never
        # found and is left behind.
        if self.deleteCombinedExamples:
            combinedTrainExamples = self._getCombinedExamplesPath(model)
            if os.path.exists(combinedTrainExamples):
                print >> sys.stderr, "Deleting catenated training example file", combinedTrainExamples
                os.remove(combinedTrainExamples)

    def train(self, trainData=None, optData=None, model=None, combinedModel=None, exampleStyle=None,
              classifierParameters=None, parse=None, tokenization=None, task=None, fromStep=None, toStep=None,
              workDir=None):
        """
        Train a model and a combined model.

        The steps are "EXAMPLES", "BEGIN-MODEL", "END-MODEL",
        "BEGIN-COMBINED-MODEL" and "END-COMBINED-MODEL". The first model is
        trained on the training examples; the combined model is trained on the
        training and optimization examples together, importing its settings
        from the first model. Both use the optimization examples as the test
        example file.

        @param fromStep: passed to self.enterState together with toStep
        @param workDir: work directory for intermediate files; when given, the
            work directory is reset to "" once training has finished
        """
        self.initVariables(trainData=trainData, optData=optData, model=model, combinedModel=combinedModel,
                           exampleStyle=exampleStyle, classifierParameters=classifierParameters,
                           parse=parse, tokenization=tokenization)
        self.setWorkDir(workDir)
        self.enterState(self.STATE_TRAIN,
                        ["EXAMPLES", "BEGIN-MODEL", "END-MODEL", "BEGIN-COMBINED-MODEL", "END-COMBINED-MODEL"],
                        fromStep, toStep)
        optExamples = self.workDir + self.tag + "opt-examples.gz"
        trainExamples = self.workDir + self.tag + "train-examples.gz"
        if self.checkStep("EXAMPLES"):
            self.model = self.initModel(self.model, [("exampleStyle", self.tag+"example-style"),
                                                     ("classifierParameters", self.tag+"classifier-parameters-train")])
            self.saveStr(self.tag+"parse", parse, self.model)
            self.saveStr(self.tag+"task", task, self.model)
            self.buildExamples(self.model, [optData, trainData], [optExamples, trainExamples], saveIdsToModel=True)
        self.model = self.openModel(model, "a") # Devel model already exists, with ids etc
        self.beginModel("BEGIN-MODEL", self.model, [trainExamples], optExamples)
        self.endModel("END-MODEL", self.model, optExamples)
        self.beginModel("BEGIN-COMBINED-MODEL", self.combinedModel, [trainExamples, optExamples], optExamples, self.model)
        self.endModel("END-COMBINED-MODEL", self.combinedModel, optExamples)
        if workDir is not None:
            self.setWorkDir("")
        self.exitState()

    def classify(self, data, model, output, parse=None, task=None, goldData=None, workDir=None, fromStep=None, omitSteps=None):
        """
        Classify data with a trained model and write the results.

        The predictions are written to output+"-pred.xml.gz" and evaluated
        with EvaluateInteractionXML. Depending on the BioNLP Shared Task
        parameters they are also converted to output+"-events.tar.gz" and
        evaluated with self.stEvaluator.

        @param parse: parse name; read from the model when None
        @param task: task name; read from the model when None
        @param workDir: work directory; a temporary one is set up when None
        @param fromStep: accepted for interface compatibility, not used here
        @param omitSteps: accepted for interface compatibility, not used here
        """
        model = self.openModel(model, "r")
        self.enterState(self.STATE_CLASSIFY)
        self.setWorkDir(workDir)
        if workDir is None:
            self.setTempWorkDir()
        # NOTE(review): the model is opened a second time here, as in the
        # original flow. Presumably a no-op for an already open model --
        # confirm before removing.
        model = self.openModel(model, "r")
        if parse is None:
            parse = self.getStr(self.tag+"parse", model)
        if task is None:
            task = self.getStr(self.tag+"task", model)
        workOutputTag = os.path.join(self.workDir, os.path.basename(output) + "-")
        xml = self.classifyToXML(data, model, None, workOutputTag,
                                 model.get(self.tag+"classifier-model"), goldData, parse,
                                 float(model.getStr("recallAdjustParameter", defaultIfNotExist=1.0)))
        shutil.copy2(workOutputTag+self.tag+"pred.xml.gz", output+"-pred.xml.gz")
        EvaluateInteractionXML.run(self.evaluator, xml, data, parse)
        stParams = self.getBioNLPSharedTaskParams(self.bioNLPSTParams, model)
        if stParams["convert"]:
            Utils.STFormat.ConvertXML.toSTFormat(xml, output+"-events.tar.gz", outputTag="a2")
            # NOTE(review): evaluation reads the archive written by the
            # conversion above, so it is only attempted after converting.
            if stParams["evaluate"]:
                self.stEvaluator.evaluate(output+"-events.tar.gz", task)
        self.deleteTempWorkDir()
        self.exitState()

    def classifyToXML(self, data, model, exampleFileName=None, tag="", classifierModel=None, goldData=None, parse=None, recallAdjust=None, compressExamples=True, exampleStyle=None):
        """
        Classify data and write the predictions into an interaction XML file.

        The predictions are written to tag+self.tag+"pred.xml.gz".

        @param exampleFileName: example file to classify; when None, examples
            are built from data into tag+self.tag+"examples" (with ".gz"
            appended if compressExamples is set)
        @param tag: prefix for the names of all files written by this method
        @param classifierModel: classifier model file, which must exist; read
            from the model when None
        @param parse: parse name; read from the model when None
        @param recallAdjust: passed to ExampleUtils.loadPredictions
        @param exampleStyle: example style; read from the model when None
        @return: the return value of self.exampleWriter.write
        """
        model = self.openModel(model, "r")
        if parse is None:
            parse = self.getStr(self.tag+"parse", model)
        if exampleFileName is None:
            exampleFileName = tag+self.tag+"examples"
            if compressExamples:
                exampleFileName += ".gz"
            # NOTE(review): examples are built only when no example file name
            # was supplied; a supplied file is presumably already built -- confirm.
            self.buildExamples(model, [data], [exampleFileName], [goldData], parse=parse, exampleStyle=exampleStyle)
        if classifierModel is None:
            classifierModel = model.get(self.tag+"classifier-model")
        else:
            assert os.path.exists(classifierModel), classifierModel
        classificationsFileName = tag+self.tag+"classifications"
        classifier = self.Classifier()
        classifier.classify(exampleFileName, classificationsFileName, classifierModel, finishBeforeReturn=True)
        threshold = model.getStr(self.tag+"threshold", defaultIfNotExist=None, asType=float)
        predictions = ExampleUtils.loadPredictions(classificationsFileName, recallAdjust, threshold=threshold)
        classIds = model.get(self.tag+"ids.classes")
        # The evaluation result is not used further; the call is kept because
        # the original flow makes it.
        self.evaluator.evaluate(exampleFileName, predictions, classIds)
        if exampleStyle is None:
            exampleStyle = Parameters.get(model.getStr(self.tag+"example-style")) # no checking, but these should already have passed the ExampleBuilder
        # NOTE: the XML is written even when there are no positive predictions.
        # An earlier variant passed the input through unchanged in that case
        # (open TODO from that variant: e.g. interactions must be removed if
        # task does unmerging).
        return self.exampleWriter.write(exampleFileName, predictions, data, tag+self.tag+"pred.xml.gz",
                                        classIds, parse, exampleStyle=exampleStyle)