1 import sys, os
2 import shutil
3 import itertools
4 import gzip
5 sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/..")
6 import Utils.ElementTreeUtils as ETUtils
7 import Utils.Parameters as Parameters
8 from Core.Model import Model
9 import Core.ExampleUtils as ExampleUtils
10 import Utils.STFormat.ConvertXML
11 import Utils.STFormat.Compare
12
13 from StepSelector import StepSelector
14
15 import types
16 from Detector import Detector
17
18 from ExampleWriters.BioTextExampleWriter import BioTextExampleWriter
19 import Evaluators.EvaluateInteractionXML as EvaluateInteractionXML
20 import Utils.InteractionXML as InteractionXML
21
23 """
24 A Detector for a text mining problem that can be represented as
25 a single classification task.
26 """
30
def beginModel(self, step, model, trainExampleFiles, testExampleFile, importIdsFromModel=None):
    """
    Begin the training process leading to a new model.

    Opens 'model' for writing, optionally imports ids and settings from
    'importIdsFromModel', resolves the training example file (catenating
    several gzipped example files into a single one when more than one is
    given) and calls the classifier's optimize() with step="SUBMIT".
    Nothing is done if checkStep() rejects 'step' or if 'model' is None.

    step -- step name, passed to checkStep()
    model -- the model to create, passed to openModel() with mode "w"
    trainExampleFiles -- one example file name, or a sequence of file names
    testExampleFile -- example file passed on to classifier.optimize()
    importIdsFromModel -- optional model from which class/feature ids and
                          settings are imported into the new model
    """
    if not self.checkStep(step, False) or model is None:
        return
    if self.state is not None and step is not None:
        print >> sys.stderr, self.__class__.__name__ + ":" + self.state + ":" + step

    model = self.openModel(model, "w")
    assert model.mode in ["a", "w"], (model.path, model.mode)

    if importIdsFromModel is not None:
        model.importFrom(self.openModel(importIdsFromModel, "r"),
                         [self.tag+"ids.classes", self.tag+"ids.features"],
                         [self.tag+"classifier-parameter", self.tag+"example-style", self.tag+"parse", self.tag+"task"])
        # NOTE(review): indentation was lost in the source; this line is assumed
        # to belong to the import branch, because it reads the
        # "classifier-parameter" string that importFrom() was just asked for.
        model.addStr(self.tag+"classifier-parameters-train", model.getStr(self.tag+"classifier-parameter"))
    if self.bioNLPSTParams is not None and len(self.bioNLPSTParams) > 0:
        model.addStr("BioNLPSTParams", Parameters.toString(self.bioNLPSTParams))

    # Resolve the single example file the classifier is trained on
    if isinstance(trainExampleFiles, types.StringTypes):
        combinedTrainExamples = trainExampleFiles
    elif len(trainExampleFiles) == 1:
        combinedTrainExamples = trainExampleFiles[0]
    else:
        combinedTrainExamples = self.workDir + os.path.normpath(model.path)+"-"+self.tag+"combined-examples.gz"
        combinedTrainExamplesFile = gzip.open(combinedTrainExamples, 'wb')
        # try/finally guarantees that both the output file and every input
        # file are closed, even if copying one of the inputs fails
        try:
            for trainExampleFile in trainExampleFiles:
                print >> sys.stderr, "Catenating", trainExampleFile, "to", combinedTrainExamples
                sourceFile = gzip.open(trainExampleFile, 'rb')
                try:
                    shutil.copyfileobj(sourceFile, combinedTrainExamplesFile)
                finally:
                    sourceFile.close()
        finally:
            combinedTrainExamplesFile.close()

    # The "classifier-parameters-train" string is passed to optimize() here
    # and read again by endModel()
    classifierWorkDir = self.workDir + os.path.normpath(model.path) + "-" + self.tag + "models"
    classifier = self.Classifier(self.connection)
    classifier.optimize(combinedTrainExamples, classifierWorkDir,
                        model.getStr(self.tag+"classifier-parameters-train"), testExampleFile,
                        model.get(self.tag+"ids.classes"), step="SUBMIT", evaluator=self.evaluator)
    model.save()
72
def endModel(self, step, model, testExampleFile):
    """
    Finish the training process started with beginModel.

    Opens 'model' for appending, calls the classifier's optimize() with
    step="RESULTS", stores the resulting classifier model, parameters and
    threshold with addClassifierModel() and saves the model. If
    self.deleteCombinedExamples is set, the catenated training example file
    is removed when it exists. Nothing is done if checkStep() rejects 'step'
    or if 'model' is None.

    step -- step name, passed to checkStep()
    model -- the model being trained, passed to openModel() with mode "a"
    testExampleFile -- example file passed on to classifier.optimize()
    """
    if not self.checkStep(step, False) or model is None:
        return
    if self.state is not None and step is not None:
        print >> sys.stderr, self.__class__.__name__ + ":" + self.state + ":" + step

    model = self.openModel(model, "a")
    assert model.mode in ["a", "w"]
    modelPathPrefix = self.workDir + os.path.normpath(model.path) + "-" + self.tag
    classifierWorkDir = modelPathPrefix + "models"
    classifier = self.Classifier(self.connection)
    trainParameters = model.getStr(self.tag+"classifier-parameters-train")
    optimized = classifier.optimize("DUMMY", classifierWorkDir, trainParameters, testExampleFile,
                                    model.get(self.tag+"ids.classes"), step="RESULTS", evaluator=self.evaluator,
                                    determineThreshold=("TEES.threshold" in trainParameters))
    self.addClassifierModel(model, optimized.model, optimized.parameters, optimized.threshold)
    model.save()

    if self.deleteCombinedExamples:
        # Must be the same path beginModel writes the catenated examples to,
        # i.e. including the self.workDir prefix; without the prefix the file
        # is never found when a work directory is in use.
        combinedTrainExamples = modelPathPrefix + "combined-examples.gz"
        if os.path.exists(combinedTrainExamples):
            print >> sys.stderr, "Deleting catenated training example file", combinedTrainExamples
            os.remove(combinedTrainExamples)
93
def train(self, trainData=None, optData=None, model=None, combinedModel=None, exampleStyle=None,
          classifierParameters=None, parse=None, tokenization=None, task=None, fromStep=None, toStep=None,
          workDir=None):
    """
    Run the training steps EXAMPLES, BEGIN-MODEL, END-MODEL,
    BEGIN-COMBINED-MODEL and END-COMBINED-MODEL.

    Examples are built for the optimization and training data, a model is
    trained on the training examples, and a combined model is trained on
    the training and optimization examples, importing from the first model.
    'fromStep' and 'toStep' are passed to enterState(). If 'workDir' is
    given, the work directory is reset to "" before exiting the state.
    """
    self.initVariables(trainData=trainData, optData=optData, model=model, combinedModel=combinedModel,
                       exampleStyle=exampleStyle, classifierParameters=classifierParameters,
                       parse=parse, tokenization=tokenization)
    self.setWorkDir(workDir)
    stepNames = ["EXAMPLES", "BEGIN-MODEL", "END-MODEL", "BEGIN-COMBINED-MODEL", "END-COMBINED-MODEL"]
    self.enterState(self.STATE_TRAIN, stepNames, fromStep, toStep)

    if self.checkStep("EXAMPLES"):
        modelStrings = [("exampleStyle", self.tag+"example-style"),
                        ("classifierParameters", self.tag+"classifier-parameters-train")]
        self.model = self.initModel(self.model, modelStrings)
        self.saveStr(self.tag+"parse", parse, self.model)
        self.saveStr(self.tag+"task", task, self.model)
        outputFiles = [self.workDir+self.tag+"opt-examples.gz", self.workDir+self.tag+"train-examples.gz"]
        self.buildExamples(self.model, [optData, trainData], outputFiles, saveIdsToModel=True)

    # NOTE(review): indentation was lost in the source; the model is assumed to
    # be reopened outside the EXAMPLES step, since the following steps use
    # self.model and check their own step names.
    self.model = self.openModel(model, "a")
    trainExamples = self.workDir+self.tag+"train-examples.gz"
    optExamples = self.workDir+self.tag+"opt-examples.gz"
    self.beginModel("BEGIN-MODEL", self.model, [trainExamples], optExamples)
    self.endModel("END-MODEL", self.model, optExamples)
    self.beginModel("BEGIN-COMBINED-MODEL", self.combinedModel, [trainExamples, optExamples], optExamples, self.model)
    self.endModel("END-COMBINED-MODEL", self.combinedModel, optExamples)

    if workDir is not None:
        self.setWorkDir("")
    self.exitState()
113
def classify(self, data, model, output, parse=None, task=None, goldData=None, workDir=None, fromStep=None, omitSteps=None):
    """
    Classify 'data' with 'model' and write the results using 'output' as
    the file name stem.

    The predictions produced in the work directory are copied to
    output+"-pred.xml.gz" and evaluated with EvaluateInteractionXML. If the
    BioNLP Shared Task parameters request it, the result is also converted
    to output+"-events.tar.gz" and evaluated with self.stEvaluator. When no
    'workDir' is given, a temporary work directory is set up. 'parse' and
    'task' default to the values stored in the model. 'fromStep' and
    'omitSteps' are not used by this method.
    """
    model = self.openModel(model, "r")
    self.enterState(self.STATE_CLASSIFY)
    self.setWorkDir(workDir)
    if workDir is None:
        self.setTempWorkDir()
    model = self.openModel(model, "r")
    if parse is None:
        parse = self.getStr(self.tag+"parse", model)
    if task is None:
        task = self.getStr(self.tag+"task", model)

    # Intermediate files go to the work directory, prefixed with the output name
    workOutputTag = os.path.join(self.workDir, os.path.basename(output) + "-")
    classifierModel = model.get(self.tag+"classifier-model")
    recallAdjust = float(model.getStr("recallAdjustParameter", defaultIfNotExist=1.0))
    xml = self.classifyToXML(data, model, None, workOutputTag, classifierModel, goldData, parse, recallAdjust)
    shutil.copy2(workOutputTag+self.tag+"pred.xml.gz", output+"-pred.xml.gz")
    EvaluateInteractionXML.run(self.evaluator, xml, data, parse)

    stParams = self.getBioNLPSharedTaskParams(self.bioNLPSTParams, model)
    if stParams["convert"]:
        Utils.STFormat.ConvertXML.toSTFormat(xml, output+"-events.tar.gz", outputTag="a2")
        # NOTE(review): indentation was lost in the source; evaluation is
        # assumed to be nested here because it reads the file written by the
        # conversion above.
        if stParams["evaluate"]:
            self.stEvaluator.evaluate(output+"-events.tar.gz", task)
    self.deleteTempWorkDir()
    self.exitState()
135
def classifyToXML(self, data, model, exampleFileName=None, tag="", classifierModel=None, goldData=None, parse=None, recallAdjust=None, compressExamples=True, exampleStyle=None):
    """
    Build examples for 'data', classify them and write the predictions.

    Examples are written to 'exampleFileName' (default: tag + self.tag +
    "examples", with ".gz" appended if 'compressExamples' is set), the
    classifications to tag + self.tag + "classifications" and the output to
    tag + self.tag + "pred.xml.gz". 'parse', 'classifierModel' and
    'exampleStyle' default to the values stored in 'model'. An explicitly
    given 'classifierModel' must be an existing path.

    Returns the value returned by self.exampleWriter.write().
    """
    model = self.openModel(model, "r")
    if parse is None:
        parse = self.getStr(self.tag+"parse", model)
    if exampleFileName is None:
        exampleFileName = tag+self.tag+"examples"
        if compressExamples:
            exampleFileName += ".gz"
    # NOTE(review): indentation was lost in the source; examples are assumed to
    # be built whether or not exampleFileName was supplied -- confirm.
    self.buildExamples(model, [data], [exampleFileName], [goldData], parse=parse, exampleStyle=exampleStyle)

    if classifierModel is not None:
        assert os.path.exists(classifierModel), classifierModel
    else:
        classifierModel = model.get(self.tag+"classifier-model")

    classificationsFile = tag+self.tag+"classifications"
    classifier = self.Classifier()
    classifier.classify(exampleFileName, classificationsFile, classifierModel, finishBeforeReturn=True)
    threshold = model.getStr(self.tag+"threshold", defaultIfNotExist=None, asType=float)
    predictions = ExampleUtils.loadPredictions(classificationsFile, recallAdjust, threshold=threshold)
    # The evaluator is called for its effects only; the result is not used
    self.evaluator.evaluate(exampleFileName, predictions, model.get(self.tag+"ids.classes"))

    if exampleStyle is None:
        exampleStyle = Parameters.get(model.getStr(self.tag+"example-style"))
    return self.exampleWriter.write(exampleFileName, predictions, data, tag+self.tag+"pred.xml.gz",
                                    model.get(self.tag+"ids.classes"), parse, exampleStyle=exampleStyle)
159
160
161
162
163
164
165
166
167
168
169
170