1 import sys, os
2 import shutil
3 import types
4 import copy
5 from Detector import Detector
6 from EntityDetector import EntityDetector
7 from EdgeDetector import EdgeDetector
8 from UnmergingDetector import UnmergingDetector
9 from ModifierDetector import ModifierDetector
10
11 import Utils.Parameters as Parameters
12 import Utils.InteractionXML as InteractionXML
13 import Evaluators.EvaluateInteractionXML as EvaluateInteractionXML
14 import Utils.STFormat.ConvertXML
15 import Utils.STFormat.Compare
16 import Evaluators.BioNLP11GeniaTools
17
19 """
20 A multi-stage detector for BioNLP Shared Task style events.
21 """
33
40
# NOTE(review): the `def` line of this method is not present in this view. Judging from the
# Detector.setWorkDir(self, workDir) call below and the self.setWorkDir(...) calls elsewhere
# in this file, this is presumably the body of setWorkDir(self, workDir) -- confirm.
# Set the work directory on this detector itself through the Detector implementation.
42 Detector.setWorkDir(self, workDir)
43
# Hand the same work directory to each component detector; components that are None are skipped.
44 for detector in [self.triggerDetector, self.edgeDetector, self.unmergingDetector, self.modifierDetector]:
45 if detector != None:
46 detector.setWorkDir(workDir)
47
# Train the multi-stage event detector.
#
# The work is divided into the named steps listed in the enterState() call below; each step
# is guarded by self.checkStep(<name>), and fromStep/toStep are forwarded to enterState().
#
# Parameters:
#   trainData, optData   -- data handed to the component detectors' buildExamples(); the trigger
#                           and edge calls invoke .replace("-nodup", "") on them first
#   model, combinedModel -- models set up with initModel() and (re)opened with openModel(..., "a")
#   triggerExampleStyle, edgeExampleStyle, unmergingExampleStyle, modifierExampleStyle,
#   triggerClassifierParameters, edgeClassifierParameters, unmergingClassifierParameters,
#   modifierClassifierParameters, recallAdjustParameters, fullGrid, tokenization
#                        -- forwarded to initVariables()
#   unmerging            -- forwarded to initVariables() (default False)
#   trainModifiers       -- when true, the modifier detector is trained as well (default False)
#   task, parse          -- forwarded to initVariables() and saved into both models under
#                           "<component tag>task" / "<component tag>parse"
#   fromStep, toStep     -- forwarded to enterState()
#   workDir              -- passed to setWorkDir(); when it is not None the work directory is
#                           reset to "" at the end
#
# NOTE(review): indentation is not preserved in this view, so the exact nesting of the
# statements below is not asserted by these comments -- confirm against the original file.
48 - def train(self, trainData=None, optData=None,
49 model=None, combinedModel=None,
50 triggerExampleStyle=None, edgeExampleStyle=None, unmergingExampleStyle=None, modifierExampleStyle=None,
51 triggerClassifierParameters=None, edgeClassifierParameters=None,
52 unmergingClassifierParameters=None, modifierClassifierParameters=None,
53 recallAdjustParameters=None, unmerging=False, trainModifiers=False,
54 fullGrid=False, task=None,
55 parse=None, tokenization=None,
56 fromStep=None, toStep=None,
57 workDir=None):
58
# Hand the training arguments to initVariables() and select the work directory.
59 self.initVariables(trainData=trainData, optData=optData, model=model, combinedModel=combinedModel,
60 triggerExampleStyle=triggerExampleStyle, edgeExampleStyle=edgeExampleStyle,
61 unmergingExampleStyle=unmergingExampleStyle, modifierExampleStyle=modifierExampleStyle,
62 triggerClassifierParameters=triggerClassifierParameters,
63 edgeClassifierParameters=edgeClassifierParameters,
64 unmergingClassifierParameters=unmergingClassifierParameters,
65 modifierClassifierParameters=modifierClassifierParameters,
66 recallAdjustParameters=recallAdjustParameters, unmerging=unmerging, trainModifiers=trainModifiers,
67 fullGrid=fullGrid, task=task, parse=parse, tokenization=tokenization)
68 self.setWorkDir(workDir)
69
# Enter the training state with the ordered list of step names, then put the four
# component detectors into the component-training state.
70 self.enterState(self.STATE_TRAIN, ["EXAMPLES", "BEGIN-MODEL", "END-MODEL", "BEGIN-COMBINED-MODEL",
71 "SELF-TRAIN-EXAMPLES-FOR-UNMERGING", "UNMERGING-EXAMPLES", "BEGIN-UNMERGING-MODEL", "END-UNMERGING-MODEL",
72 "GRID", "BEGIN-COMBINED-MODEL-FULLGRID", "END-COMBINED-MODEL"], fromStep, toStep)
73 self.triggerDetector.enterState(self.STATE_COMPONENT_TRAIN)
74 self.edgeDetector.enterState(self.STATE_COMPONENT_TRAIN)
75 self.unmergingDetector.enterState(self.STATE_COMPONENT_TRAIN)
76 self.modifierDetector.enterState(self.STATE_COMPONENT_TRAIN)
# Step EXAMPLES: initialise both models, record parse/task for each component tag and
# build the opt/train example files of the components.
77 if self.checkStep("EXAMPLES"):
78 self.model = self.initModel(self.model,
79 [("triggerExampleStyle", self.triggerDetector.tag+"example-style"),
80 ("triggerClassifierParameters", self.triggerDetector.tag+"classifier-parameters-train"),
81 ("edgeExampleStyle", self.edgeDetector.tag+"example-style"),
82 ("edgeClassifierParameters", self.edgeDetector.tag+"classifier-parameters-train"),
83 ("unmergingExampleStyle", self.unmergingDetector.tag+"example-style"),
84 ("unmergingClassifierParameters", self.unmergingDetector.tag+"classifier-parameters-train"),
85 ("modifierExampleStyle", self.modifierDetector.tag+"example-style"),
86 ("modifierClassifierParameters", self.modifierDetector.tag+"classifier-parameters-train")])
87 self.combinedModel = self.initModel(self.combinedModel)
# The modifier tag is only included when modifiers are trained.
88 tags = [self.triggerDetector.tag, self.edgeDetector.tag, self.unmergingDetector.tag]
89 if trainModifiers: tags += [self.modifierDetector.tag]
90 stringDict = {}
91 for tag in tags:
92 stringDict[tag+"parse"] = parse
93 stringDict[tag+"task"] = task
# The same strings go into both models (the meaning of the extra False argument is not visible here).
94 self.saveStrings(stringDict, self.model)
95 self.saveStrings(stringDict, self.combinedModel, False)
# Trigger and edge examples are built from the data names with "-nodup" removed;
# the modifier examples below use optData/trainData unchanged.
96 self.triggerDetector.buildExamples(self.model, [optData.replace("-nodup", ""), trainData.replace("-nodup", "")], [self.workDir+self.triggerDetector.tag+"opt-examples.gz", self.workDir+self.triggerDetector.tag+"train-examples.gz"], saveIdsToModel=True)
97 self.edgeDetector.buildExamples(self.model, [optData.replace("-nodup", ""), trainData.replace("-nodup", "")], [self.workDir+self.edgeDetector.tag+"opt-examples.gz", self.workDir+self.edgeDetector.tag+"train-examples.gz"], saveIdsToModel=True)
98 if trainModifiers:
99 self.modifierDetector.buildExamples(self.model, [optData, trainData], [self.workDir+self.modifierDetector.tag+"opt-examples.gz", self.workDir+self.modifierDetector.tag+"train-examples.gz"], saveIdsToModel=True)
100
# Open both models in "a" mode, starting from the model/combinedModel arguments.
101 self.model = self.openModel(model, "a")
102 self.combinedModel = self.openModel(combinedModel, "a")
# Step BEGIN-MODEL: beginModel() on self.model for each component, passing its
# train-examples file (as a list) and its opt-examples file.
103 if self.checkStep("BEGIN-MODEL"):
104
105
106
107 self.triggerDetector.beginModel(None, self.model, [self.workDir+self.triggerDetector.tag+"train-examples.gz"], self.workDir+self.triggerDetector.tag+"opt-examples.gz")
108 self.edgeDetector.beginModel(None, self.model, [self.workDir+self.edgeDetector.tag+"train-examples.gz"], self.workDir+self.edgeDetector.tag+"opt-examples.gz")
109 if trainModifiers:
110 self.modifierDetector.beginModel(None, self.model, [self.workDir+self.modifierDetector.tag+"train-examples.gz"], self.workDir+self.modifierDetector.tag+"opt-examples.gz")
# Step END-MODEL: endModel() on self.model for each component with its opt-examples file.
111 if self.checkStep("END-MODEL"):
112 self.triggerDetector.endModel(None, self.model, self.workDir+self.triggerDetector.tag+"opt-examples.gz")
113 self.edgeDetector.endModel(None, self.model, self.workDir+self.edgeDetector.tag+"opt-examples.gz")
114 if trainModifiers:
115 self.modifierDetector.endModel(None, self.model, self.workDir+self.modifierDetector.tag+"opt-examples.gz")
# Step BEGIN-COMBINED-MODEL: without the full grid, the combined trigger/edge models are
# begun here from both the train and opt example files, with self.model as last argument;
# with the full grid only a message is printed.
116 if self.checkStep("BEGIN-COMBINED-MODEL"):
117 if not self.fullGrid:
118 print >> sys.stderr, "Training combined model before grid search"
119 self.triggerDetector.beginModel(None, self.combinedModel, [self.workDir+self.triggerDetector.tag+"train-examples.gz", self.workDir+self.triggerDetector.tag+"opt-examples.gz"], self.workDir+self.triggerDetector.tag+"opt-examples.gz", self.model)
120 self.edgeDetector.beginModel(None, self.combinedModel, [self.workDir+self.edgeDetector.tag+"train-examples.gz", self.workDir+self.edgeDetector.tag+"opt-examples.gz"], self.workDir+self.edgeDetector.tag+"opt-examples.gz", self.model)
121 else:
122 print >> sys.stderr, "Combined model will be trained after grid search"
123 if trainModifiers:
124 print >> sys.stderr, "Training combined model for modifier detection"
125 self.modifierDetector.beginModel(None, self.combinedModel, [self.workDir+self.modifierDetector.tag+"train-examples.gz", self.workDir+self.modifierDetector.tag+"opt-examples.gz"], self.workDir+self.modifierDetector.tag+"opt-examples.gz", self.model)
# The unmerging-related steps from the step list are handled by trainUnmergingDetector().
126 self.trainUnmergingDetector()
# Step GRID: parameter grid search, delegated to doGrid().
127 if self.checkStep("GRID"):
128 self.doGrid()
# Step BEGIN-COMBINED-MODEL-FULLGRID: with the full grid, the combined models are begun
# here, after the grid search; otherwise only a message is printed.
129 if self.checkStep("BEGIN-COMBINED-MODEL-FULLGRID"):
130 if self.fullGrid:
131 print >> sys.stderr, "Training combined model after grid search"
132 self.triggerDetector.beginModel(None, self.combinedModel, [self.workDir+self.triggerDetector.tag+"train-examples.gz", self.workDir+self.triggerDetector.tag+"opt-examples.gz"], self.workDir+self.triggerDetector.tag+"opt-examples.gz", self.model)
133 self.edgeDetector.beginModel(None, self.combinedModel, [self.workDir+self.edgeDetector.tag+"train-examples.gz", self.workDir+self.edgeDetector.tag+"opt-examples.gz"], self.workDir+self.edgeDetector.tag+"opt-examples.gz", self.model)
134 if trainModifiers:
135 print >> sys.stderr, "Training combined model for modifier detection"
136 self.modifierDetector.beginModel(None, self.combinedModel, [self.workDir+self.modifierDetector.tag+"train-examples.gz", self.workDir+self.modifierDetector.tag+"opt-examples.gz"], self.workDir+self.modifierDetector.tag+"opt-examples.gz", self.model)
137 else:
138 print >> sys.stderr, "Combined model has been trained before grid search"
# Step END-COMBINED-MODEL: endModel() on the combined model for each component.
139 if self.checkStep("END-COMBINED-MODEL"):
140 self.triggerDetector.endModel(None, self.combinedModel, self.workDir+self.triggerDetector.tag+"opt-examples.gz")
141 self.edgeDetector.endModel(None, self.combinedModel, self.workDir+self.edgeDetector.tag+"opt-examples.gz")
142 if trainModifiers:
143 self.modifierDetector.endModel(None, self.combinedModel, self.workDir+self.modifierDetector.tag+"opt-examples.gz")
144
# Reset the work directory to "" if the caller supplied one, then leave the state entered
# above on this detector and on the four component detectors.
145 if workDir != None:
146 self.setWorkDir("")
147 self.exitState()
148 self.triggerDetector.exitState()
149 self.edgeDetector.exitState()
150 self.unmergingDetector.exitState()
151 self.modifierDetector.exitState()
152
# NOTE(review): the `def` line of this method is not present in this view. train() calls
# self.doGrid() in its "GRID" step, so this is presumably the body of doGrid(self) -- confirm.
#
# Grid search over combinations of a trigger classifier parameter, a recall-adjust
# ("booster") value and an edge classifier parameter. Each combination is scored through
# self.evaluateGrid(); the best recall-adjust value is saved to both models.
# NOTE(review): indentation is not preserved in this view, so the nesting of the statements
# below is not asserted by these comments.
154 print >> sys.stderr, "--------- Parameter grid search ---------"
155
# Build trigger examples for the optimisation data once, before the search.
156 self.triggerDetector.buildExamples(self.model, [self.optData], [self.workDir+"grid-trigger-examples.gz"])
157
# Candidate values. With the full grid, the "c" values come from the classifier parameter
# strings held on self and are converted to int. Otherwise they are read from the
# "<tag>classifier-parameter" strings stored in the model and are not converted.
# The booster values are always the comma-separated recallAdjustParameters as floats.
158 if self.fullGrid:
159
160 ALL_PARAMS={
161 "trigger":[int(i) for i in Parameters.get(self.triggerClassifierParameters, valueListKey="c")["c"]],
162 "booster":[float(i) for i in self.recallAdjustParameters.split(",")],
163 "edge":[int(i) for i in Parameters.get(self.edgeClassifierParameters, valueListKey="c")["c"]] }
164 else:
165 ALL_PARAMS={"trigger":Parameters.get(self.model.getStr(self.triggerDetector.tag+"classifier-parameter"), valueListKey="c")["c"],
166 "booster":[float(i) for i in self.recallAdjustParameters.split(",")],
167 "edge":Parameters.get(self.model.getStr(self.edgeDetector.tag+"classifier-parameter"), valueListKey="c")["c"]}
168
169 paramCombinations = Parameters.getCombinations(ALL_PARAMS, ["trigger", "booster", "edge"])
170 prevParams = None
# Path stems of the per-parameter classifier models; the parameter value is appended below.
171 EDGE_MODEL_STEM = os.path.join(self.edgeDetector.workDir, os.path.normpath(self.model.path)+"-edge-models/model-c_")
172 TRIGGER_MODEL_STEM = os.path.join(self.triggerDetector.workDir, os.path.normpath(self.model.path)+"-trigger-models/model-c_")
173 bestResults = None
174 for i in range(len(paramCombinations)):
175 params = paramCombinations[i]
176 print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
177 print >> sys.stderr, "Processing params", str(i+1) + "/" + str(len(paramCombinations)), params
178 print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
179
# Triggers are re-classified only when the trigger parameter or the booster value differs
# from the previously processed combination.
180 if prevParams == None or prevParams["trigger"] != params["trigger"] or prevParams["booster"] != params["booster"]:
181 print >> sys.stderr, "Classifying trigger examples for parameters", "trigger:" + str(params["trigger"]), "booster:" + str(params["booster"])
182 xml = self.triggerDetector.classifyToXML(self.optData, self.model, self.workDir+"grid-trigger-examples", self.workDir+"grid-", classifierModel=TRIGGER_MODEL_STEM+str(params["trigger"]), recallAdjust=params["booster"])
183 prevParams = params
184
185
186
# Classify edges with the edge model for this combination and score the result.
187 edgeClassifierModel=EDGE_MODEL_STEM+str(params["edge"])
188 xml = self.edgeDetector.classifyToXML(xml, self.model, self.workDir+"grid-edge-examples", self.workDir+"grid-", classifierModel=edgeClassifierModel, goldData=self.optData)
189 bestResults = self.evaluateGrid(xml, params, bestResults)
190
# Remove intermediate "grid-<edge|trigger|unmerging>-<examples|pred.xml.gz>" files that exist.
191 for tag1 in ["edge", "trigger", "unmerging"]:
192 for tag2 in ["examples", "pred.xml.gz"]:
193 if os.path.exists(self.workDir+"grid-"+tag1+"-"+tag2):
194 os.remove(self.workDir+"grid-"+tag1+"-"+tag2)
195 print >> sys.stderr, "Parameter grid search complete"
196 print >> sys.stderr, "Tested", len(paramCombinations), "combinations"
# NOTE(review): bestResults is still None here if evaluateGrid() never replaced it, in which
# case the indexing below fails -- confirm whether that can happen.
197 print >> sys.stderr, "Best parameters:", bestResults[0]
198 print >> sys.stderr, "Best result:", bestResults[2]
199
# Store the best booster value in both models; with the full grid, also add the best
# trigger and edge classifier models to self.model.
200 self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]), self.model)
201 self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]), self.combinedModel, False)
202 if self.fullGrid:
203 self.triggerDetector.addClassifierModel(self.model, TRIGGER_MODEL_STEM+str(bestResults[0]["trigger"]), bestResults[0]["trigger"])
204 self.edgeDetector.addClassifierModel(self.model, EDGE_MODEL_STEM+str(bestResults[0]["edge"]), bestResults[0]["edge"])
205
# Final cleanup of grid files.
# NOTE(review): "examples.gz" and "pred.xml.gz" have no leading "-", so the names tested
# here are e.g. "grid-triggerexamples.gz", which differs from the "grid-trigger-examples.gz"
# file built at the top -- confirm whether the hyphen is missing.
206 for stepTag in [self.workDir+"grid-trigger", self.workDir+"grid-edge", self.workDir+"grid-unmerging"]:
207 for fileStem in ["-classifications", "-classifications.log", "examples.gz", "pred.xml.gz"]:
208 if os.path.exists(stepTag+fileStem):
209 os.remove(stepTag+fileStem)
210
# NOTE(review): the `def` line of this method is not present in this view. The grid search
# calls self.evaluateGrid(xml, params, bestResults) and this body uses exactly those names,
# so it is presumably evaluateGrid(self, xml, params, bestResults) -- confirm.
#
# Score the predictions in xml for one parameter combination and return the best result
# so far as a tuple (params, evaluation result, score). When xml is None a message is
# printed and bestResults is returned as it was passed in.
# NOTE(review): indentation is not preserved in this view, so the nesting of the statements
# below is not asserted by these comments.
212 if xml != None:
213
# Interaction XML evaluation against the optimisation data.
214 EIXMLResult = EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, self.optData, self.parse)
215
# When shared task evaluation is enabled, convert the predictions to shared task format.
216 if self.bioNLPSTParams["evaluate"]:
217 Utils.STFormat.ConvertXML.toSTFormat(xml, self.workDir+"grid-flat-geniaformat", "a2")
218 stFormatDir = self.workDir+"grid-flat-geniaformat"
219
# With unmerging enabled, run the unmerging detector and evaluate its output directory instead.
220 if self.unmerging:
221 xml = self.unmergingDetector.classifyToXML(xml, self.model, None, self.workDir+"grid-", goldData=self.optData)
222 if self.bioNLPSTParams["evaluate"]:
223 Utils.STFormat.ConvertXML.toSTFormat(xml, self.workDir+"grid-unmerging-geniaformat", "a2")
224 stFormatDir = self.workDir+"grid-unmerging-geniaformat"
225
226 stEvaluation = None
227 if self.bioNLPSTParams["evaluate"]:
228 stEvaluation = self.stEvaluator.evaluate(stFormatDir, self.task)
# A shared task evaluation is compared through its first element; the interaction XML
# result is compared through getData().fscore.
229 if stEvaluation != None:
230 if bestResults == None or stEvaluation[0] > bestResults[1][0]:
231 bestResults = (params, stEvaluation, stEvaluation[0])
232 else:
233 if bestResults == None or EIXMLResult.getData().fscore > bestResults[1].getData().fscore:
234 bestResults = (params, EIXMLResult, EIXMLResult.getData().fscore)
# Remove the temporary shared task format directories.
235 if self.bioNLPSTParams["evaluate"]:
236 shutil.rmtree(self.workDir+"grid-flat-geniaformat")
237 if os.path.exists(self.workDir+"grid-unmerging-geniaformat"):
238 shutil.rmtree(self.workDir+"grid-unmerging-geniaformat")
239 else:
240 print >> sys.stderr, "No predicted edges"
241 return bestResults
242
# NOTE(review): the `def` line of this method is not present in this view. train() calls
# self.trainUnmergingDetector() just before its "GRID" step and the step names checked below
# are the unmerging steps from train()'s step list, so this is presumably the body of
# trainUnmergingDetector(self) -- confirm.
#
# Runs the unmerging steps: optional self-training predictions, example building, and
# beginning/ending the unmerging model. Every step additionally requires self.unmerging.
# NOTE(review): indentation is not preserved in this view, so the nesting of the statements
# below is not asserted by these comments.
244 xml = None
245 if not self.unmerging:
246 print >> sys.stderr, "No unmerging"
# Step SELF-TRAIN-EXAMPLES-FOR-UNMERGING: predict triggers and edges for the training data.
247 if self.checkStep("SELF-TRAIN-EXAMPLES-FOR-UNMERGING", self.unmerging) and self.unmerging:
248
249 if self.doUnmergingSelfTraining:
250
# Work on copies of the trigger/edge example styles so that the unmerging style's
# sentenceLimit, when set, can be applied to them.
251 triggerStyle = copy.copy(Parameters.get(self.triggerExampleStyle))
252 edgeStyle = copy.copy(Parameters.get(self.edgeExampleStyle))
253 unmergingStyle = Parameters.get(self.unmergingExampleStyle)
254 if "sentenceLimit" in unmergingStyle and unmergingStyle["sentenceLimit"]:
255 triggerStyle["sentenceLimit"] = unmergingStyle["sentenceLimit"]
256 edgeStyle["sentenceLimit"] = unmergingStyle["sentenceLimit"]
257
# Classify the training data with the trigger detector, then the edge detector, using the
# output tag "unmerging-extra-", and evaluate the result against the training data.
258 xml = self.triggerDetector.classifyToXML(self.trainData, self.model, None, self.workDir+"unmerging-extra-", exampleStyle=triggerStyle)
259 xml = self.edgeDetector.classifyToXML(xml, self.model, None, self.workDir+"unmerging-extra-", exampleStyle=edgeStyle)
260 assert xml != None
261 EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, self.trainData, self.parse)
262 else:
263 print >> sys.stderr, "No self-training for unmerging"
# Step UNMERGING-EXAMPLES: build the opt/train example files for the unmerging detector.
264 if self.checkStep("UNMERGING-EXAMPLES", self.unmerging) and self.unmerging:
265
# Gold files are the data names with "-nodup" removed.
266 GOLD_TEST_FILE = self.optData.replace("-nodup", "")
267 GOLD_TRAIN_FILE = self.trainData.replace("-nodup", "")
268 if self.doUnmergingSelfTraining:
# No predictions in memory: fall back to a file path under the "unmerging-extra-" tag
# (presumably written by the edge classification in the self-training step -- confirm).
269 if xml == None:
270 xml = self.workDir+"unmerging-extra-edge-pred.xml.gz"
# The training input is the pair [training data, predictions]; the gold training file
# is given once for each member of that pair.
271 self.unmergingDetector.buildExamples(self.model, [self.optData.replace("-nodup", ""), [self.trainData.replace("-nodup", ""), xml]],
272 [self.workDir+"unmerging-opt-examples.gz", self.workDir+"unmerging-train-examples.gz"],
273 [GOLD_TEST_FILE, [GOLD_TRAIN_FILE, GOLD_TRAIN_FILE]],
274 exampleStyle=self.unmergingExampleStyle, saveIdsToModel=True)
275 xml = None
276 else:
277 self.unmergingDetector.buildExamples(self.model, [self.optData.replace("-nodup", ""), self.trainData.replace("-nodup", "")],
278 [self.workDir+"unmerging-opt-examples.gz", self.workDir+"unmerging-train-examples.gz"],
279 [GOLD_TEST_FILE, GOLD_TRAIN_FILE],
280 exampleStyle=self.unmergingExampleStyle, saveIdsToModel=True)
281 xml = None
282
# Steps BEGIN-UNMERGING-MODEL / END-UNMERGING-MODEL: begin and end the unmerging model on self.model.
283 if self.checkStep("BEGIN-UNMERGING-MODEL", self.unmerging) and self.unmerging:
284 self.unmergingDetector.beginModel(None, self.model, self.workDir+"unmerging-train-examples.gz", self.workDir+"unmerging-opt-examples.gz")
285 if self.checkStep("END-UNMERGING-MODEL", self.unmerging) and self.unmerging:
286 self.unmergingDetector.endModel(None, self.model, self.workDir+"unmerging-opt-examples.gz")
287 print >> sys.stderr, "Adding unmerging classifier model to test-set event model"
# If there is a combined model, copy the unmerging example style, class/feature ids and
# classifier model (with its parameter) from self.model into it, then save it.
288 if self.combinedModel != None:
289 self.combinedModel.addStr("unmerging-example-style", self.model.getStr("unmerging-example-style"))
290 self.combinedModel.insert(self.model.get("unmerging-ids.classes"), "unmerging-ids.classes")
291 self.combinedModel.insert(self.model.get("unmerging-ids.features"), "unmerging-ids.features")
292 self.unmergingDetector.addClassifierModel(self.combinedModel, self.model.get("unmerging-classifier-model"),
293 self.model.getStr("unmerging-classifier-parameter"))
294 self.combinedModel.save()
295
# Run the detection pipeline on data with a trained model.
#
# The steps are TRIGGERS, EDGES, UNMERGING, MODIFIERS and ST-CONVERT, each guarded by
# self.checkStep(); fromStep, toStep and omitSteps are forwarded to enterState().
#
# Parameters:
#   data      -- input to classify; passed to initVariables() as classifyData
#   model     -- model opened in "r" mode with openModel()
#   output    -- output path stem: its basename prefixes the intermediate files in the work
#                directory, and "-events.tar.gz" / "-pred.xml.gz" are appended for the results
#   parse     -- passed to initVariables(); when self.parse is None the edge parse is read from the model
#   task      -- passed to initVariables(); when self.task is None the task is read from the model
#   goldData  -- forwarded to the component detectors; when given, the edge evaluation uses it
#                instead of the classified data
#   workDir   -- work directory; when None, setTempWorkDir() is called and deleteTempWorkDir()
#                is called at the end
#
# NOTE(review): indentation is not preserved in this view, so the exact nesting of the
# statements below is not asserted by these comments -- confirm against the original file.
296 - def classify(self, data, model, output, parse=None, task=None, goldData=None, fromStep=None, toStep=None, omitSteps=None, workDir=None):
297
298 xml = None
299 model = self.openModel(model, "r")
300 self.initVariables(classifyData=data, model=model, xml=None, task=task, parse=parse)
301 self.enterState(self.STATE_CLASSIFY, ["TRIGGERS", "EDGES", "UNMERGING", "MODIFIERS", "ST-CONVERT"], fromStep, toStep, omitSteps)
302
303 self.setWorkDir(workDir)
304 if workDir == None:
305 self.setTempWorkDir()
# Prefix for the intermediate files: <work dir>/<basename of output>-
306 workOutputTag = os.path.join(self.workDir, os.path.basename(output) + "-")
307 self.model = self.openModel(self.model, "r")
308 stParams = self.getBioNLPSharedTaskParams(self.bioNLPSTParams, model)
# Step TRIGGERS: the recall adjustment is read from the model's "recallAdjustParameter" string.
309 if self.checkStep("TRIGGERS"):
310 xml = self.triggerDetector.classifyToXML(self.classifyData, self.model, None, workOutputTag, goldData=goldData, parse=self.parse, recallAdjust=float(self.getStr("recallAdjustParameter", self.model)))
# Step EDGES: use the in-memory trigger result, or the trigger prediction file if xml is None.
311 if self.checkStep("EDGES"):
312 xml = self.getWorkFile(xml, workOutputTag + "trigger-pred.xml.gz")
313 xml = self.edgeDetector.classifyToXML(xml, self.model, None, workOutputTag, goldData=goldData, parse=self.parse)
314 assert xml != None
# The parse used for evaluation comes from the model unless self.parse is set.
315 if self.parse == None:
316 edgeParse = self.getStr(self.edgeDetector.tag+"parse", self.model)
317 else:
318 edgeParse = self.parse
319
# Evaluate against the gold data when given, otherwise against the classified data.
320 if goldData != None:
321 EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, goldData, edgeParse)
322 else:
323 EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, self.classifyData, edgeParse)
# Step UNMERGING: only when the model contains an unmerging classifier model.
324 if self.checkStep("UNMERGING"):
325 if self.model.hasMember("unmerging-classifier-model"):
326
327
# None is passed as the file object, so the edge prediction file on disk is always used here.
328 xml = self.getWorkFile(None, workOutputTag + "edge-pred.xml.gz")
329
330
331
332
333 xml = self.unmergingDetector.classifyToXML(xml, self.model, None, workOutputTag, goldData=goldData, parse=self.parse)
334 else:
335 print >> sys.stderr, "No model for unmerging"
# Step MODIFIERS: only when the model contains a modifier classifier model; the input is
# xml, or else the first existing file of unmerging-pred / edge-pred.
336 if self.checkStep("MODIFIERS"):
337 if self.model.hasMember("modifier-classifier-model"):
338 xml = self.getWorkFile(xml, [workOutputTag + "unmerging-pred.xml.gz", workOutputTag + "edge-pred.xml.gz"])
339 xml = self.modifierDetector.classifyToXML(xml, self.model, None, workOutputTag, goldData=goldData, parse=self.parse)
340 else:
341 print >> sys.stderr, "No model for modifier detection"
# Step ST-CONVERT: when stParams["convert"] is set, write output+"-events.tar.gz" and,
# when stParams["evaluate"] is set, evaluate it for the task.
342 if self.checkStep("ST-CONVERT"):
343 if stParams["convert"]:
344 xml = self.getWorkFile(xml, [workOutputTag + "modifier-pred.xml.gz", workOutputTag + "unmerging-pred.xml.gz", workOutputTag + "edge-pred.xml.gz"])
345 Utils.STFormat.ConvertXML.toSTFormat(xml, output+"-events.tar.gz", outputTag="a2", writeScores=(stParams["scores"] == True))
346 if stParams["evaluate"]:
# The task is read from the model when self.task is None.
347 task = self.task
348 if task == None:
349 task = self.getStr(self.edgeDetector.tag+"task", self.model)
350 self.stEvaluator.evaluate(output + "-events.tar.gz", task)
351 else:
352 print >> sys.stderr, "No BioNLP shared task format conversion"
# Copy the first existing prediction file (modifier, unmerging, edge, in that order) to
# output+"-pred.xml.gz".
353 finalXMLFile = self.getWorkFile(None, [workOutputTag + "modifier-pred.xml.gz", workOutputTag + "unmerging-pred.xml.gz", workOutputTag + "edge-pred.xml.gz"])
354 if finalXMLFile != None:
355 shutil.copy2(finalXMLFile, output+"-pred.xml.gz")
356 self.deleteTempWorkDir()
357 self.exitState()
358
def getWorkFile(self, fileObject, serializedPath=None):
    """
    Returns fileObject if it is not None, otherwise tries all paths in serializedPath
    and returns the first one that exists. Use this to get an intermediate file in a
    stepwise process.

    serializedPath is either a single path string or a sequence of candidate path
    strings in order of preference. An AssertionError is raised if fileObject is
    None and none of the candidate paths exists (including when serializedPath is
    None or empty).
    """
    if fileObject is not None:
        return fileObject
    # types.StringTypes exists only on Python 2; fall back to str elsewhere.
    stringTypes = getattr(types, "StringTypes", (str,))
    if serializedPath is None:
        candidates = []
    elif isinstance(serializedPath, stringTypes):
        # isinstance (rather than an exact type test) also accepts string subclasses,
        # which would otherwise be iterated character by character.
        candidates = [serializedPath]
    else:
        candidates = list(serializedPath)
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    # Raised explicitly instead of through an assert statement, so the check is
    # not stripped when Python runs with -O.
    raise AssertionError("No existing work file among: " + repr(candidates))
375