1 """
2 Detect events or relations from text.
3 """
4 from train import workdir, getDetector, getSteps
5 import sys, os
6 import tempfile
7 import codecs
8 import Utils.Settings as Settings
9 import Utils.Stream as Stream
10 import Utils.Download
11 from Utils.Connection.Connection import getConnection
12 import Utils.Download
13 from Detectors.Preprocessor import Preprocessor
14
def classify(input, model, output, workDir=None, step=None, omitSteps=None,
             goldInput=None, detector=None, debug=False, clear=False,
             preprocessorTag="-preprocessed.xml.gz", preprocessorParams=None, bioNLPSTParams=None):
    """
    Run event/relation detection on a text input.

    The run consists of two stages, PREPROCESS and CLASSIFY. Which of them
    are executed is decided by getSteps() from 'step' and 'omitSteps';
    PREPROCESS is additionally omitted when the flag returned by getInput()
    is false. A log is opened at output + "-log.txt". Nothing is returned.

    @param input: The input file in either interaction XML or BioNLP ST format. Can also be a PMID or TEES default corpus name.
    @param model: A path to a model file or the name of a TEES default model.
    @param output: The output file stem. Output files will be of the form output-*
    @param workDir: If intermediate files need to be saved, they will go here.
    @param step: A step=substep pair, where the steps are PREPROCESS and CLASSIFY
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param goldInput: a version of the corpus file with gold annotation. Enables measuring of performance
    @param detector: a Detector object, or a string defining one to be imported. If None, will be read from model.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param clear: Remove existing workDir. Also forces preprocessing to be redone even if its output file already exists.
    @param preprocessorTag: preprocessor output file will be output + preprocessorTag
    @param preprocessorParams: Optional parameters controlling preprocessing. If None, will be read from model.
    @param bioNLPSTParams: Optional parameters controlling BioNLP ST format output. If None, will be read from model.
    """
    # Work with absolute paths from here on
    input = os.path.abspath(input)
    if goldInput is not None:
        goldInput = os.path.abspath(goldInput)
    if model is not None:
        model = os.path.abspath(model)

    # A work directory is prepared only when the caller asked for one
    if workDir is not None:
        workdir(workDir, clear)
    # The log file is placed next to the other output files
    Stream.openLog(output + "-log.txt")

    # Resolve the input and the model into concrete files
    input, needsPreprocessing = getInput(input)
    model = getModel(model)

    # Decide which stages (and substeps) are going to be run
    stageNames = ["PREPROCESS", "CLASSIFY"]
    selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, stageNames)
    if not needsPreprocessing:
        selector.markOmitSteps("PREPROCESS")

    # Without preprocessing the detector reads the input directly
    classifyInput = input
    if selector.check("PREPROCESS"):
        preprocessor = Preprocessor()
        preprocessorOutput = output + preprocessorTag
        reuseExisting = os.path.exists(preprocessorOutput) and not clear
        if not reuseExisting:
            # NOTE: this branch is also taken when the file exists but 'clear'
            # is set, in which case the first message below is inaccurate.
            print >> sys.stderr, "Preprocessor output", preprocessorOutput, "does not exist"
            print >> sys.stderr, "------------ Preprocessing ------------"
            classifyInput = preprocessor.process(
                input, preprocessorOutput, preprocessorParams, model, [],
                fromStep=detectorSteps["PREPROCESS"], toStep=None,
                omitSteps=omitDetectorSteps["PREPROCESS"])
        else:
            print >> sys.stderr, "Preprocessor output", preprocessorOutput, "exists, skipping preprocessing."
            classifyInput = preprocessorOutput

    if selector.check("CLASSIFY"):
        # getDetector() returns a sequence whose first item is the detector class
        detectorClass = getDetector(detector, model)[0]
        detector = detectorClass()
        detector.debug = debug
        detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams, model)
        detector.classify(
            classifyInput, model, output, goldData=goldInput,
            fromStep=detectorSteps["CLASSIFY"],
            omitSteps=omitDetectorSteps["CLASSIFY"], workDir=workDir)
75
def getModel(model):
    """
    Resolve a model definition into the absolute path of an existing model.

    If 'model' is an existing path it is used as such. Otherwise its base
    name is looked up in Settings.MODEL_DIR, trying the suffixes "",
    "-test", ".zip" and "-test.zip" in this order; the first existing
    candidate is used.

    NOTE(review): the 'def' line of this function was not present in the
    reviewed source. The signature is reconstructed from the call
    getModel(model) in classify() -- confirm no further parameters exist.

    @param model: A path to a model, the name of a default model, or None.
    @return: The absolute path of the model, or None if 'model' is None.
    @raise Exception: If the model does not exist and no default model is
    found (also when Settings has no MODEL_DIR attribute).
    """
    if model is None:
        return None

    if os.path.exists(model):
        print >> sys.stderr, "Classifying with model", model
        return os.path.abspath(model)

    # Not an existing path: treat the base name as the name of a default model
    print >> sys.stderr, "Model", model, "doesn't exist, looking for a default model"
    modelName = os.path.basename(model)
    if not hasattr(Settings, "MODEL_DIR"):
        print >> sys.stderr, "Default model directory MODEL_DIR not defined in Settings"
        raise Exception("Model " + str(model) + " not found")

    for suffix in ["", "-test", ".zip", "-test.zip"]:
        predefined = os.path.join(Settings.MODEL_DIR, modelName + suffix)
        if os.path.exists(predefined):
            print >> sys.stderr, "Classifying with default model", predefined
            return os.path.abspath(predefined)

    print >> sys.stderr, "No default model found for definition", modelName
    raise Exception("Model " + str(model) + " not found")
100
127
def getPubMed(pmid):
    """
    Download one PubMed record and save its title and abstract as plain text.

    The record is fetched as XML with NCBI EFetch into the system temporary
    directory as pmid-<pmid>.xml. The text of every ArticleTitle and
    AbstractText element is then written, one element per line and in
    document order, to pmid-<pmid>.txt (UTF-8) in the same directory.

    NOTE(review): the 'def' line of this function was not present in the
    reviewed source. The parameter name 'pmid' is taken from the body; the
    function name follows the get* naming used in this file and must be
    confirmed against the caller.

    @param pmid: The PubMed identifier of the abstract (converted with str()).
    @return: The path of the generated text file.
    """
    # Local import keeps this block self-contained; Element.iter() and
    # Element.itertext() require Python 2.7 or newer.
    import xml.etree.ElementTree as ElementTree

    print >> sys.stderr, "*************************** NOTE ***************************"
    print >> sys.stderr, "Do not attempt to do large-scale classification of PubMed"
    print >> sys.stderr, "abstracts with this feature. For that, use the downloadable"
    print >> sys.stderr, "PubMed release. This is a demonstration feature only, and"
    print >> sys.stderr, "abusing it will cause you to be banned from PubMed!"
    print >> sys.stderr, "************************************************************"
    print >> sys.stderr, "Downloading PubMed abstract", pmid
    tempDir = tempfile.gettempdir()
    # NCBI serves the E-utilities over HTTPS; plain HTTP is no longer the documented endpoint
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=" + str(pmid) + "&retmode=xml"
    downloaded = os.path.join(tempDir, "pmid-" + str(pmid))
    # The third argument is passed on unchanged; its meaning is defined by Utils.Download.download
    Utils.Download.download(url, downloaded + ".xml", False)

    # Use a real XML parser instead of matching line prefixes, so that
    # elements with attributes (e.g. <AbstractText Label="...">), inline
    # markup inside the text and XML entities are all handled.
    textElements = []
    root = ElementTree.parse(downloaded + ".xml").getroot()
    for element in root.iter():
        if element.tag in ("ArticleTitle", "AbstractText"):
            text = "".join(element.itertext())
            # Collapse whitespace so that each element stays on a single line
            textElements.append(" ".join(text.split()))

    # 'with' guarantees the file is closed even if writing fails
    with codecs.open(downloaded + ".txt", "w", "utf-8") as textFile:
        textFile.write("\n".join(textElements))

    return downloaded + ".txt"
155
if __name__=="__main__":
    # Psyco is optional: use it when it can be imported, otherwise continue without it
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    from optparse import OptionParser
    optparser = OptionParser(description="Predict events/relations")
    optparser.add_option("-i", "--input", default=None, dest="input", help="input")
    optparser.add_option("-o", "--output", default=None, dest="output", help="output file stem")
    optparser.add_option("-w", "--workdir", default=None, dest="workdir", help="output directory")
    optparser.add_option("-m", "--model", default=None, dest="model", help="TEES model")
    optparser.add_option("-d", "--detector", default=None, dest="detector", help="")
    # NOTE: --connection is accepted but is not passed on to classify() below
    optparser.add_option("-c", "--connection", default=None, dest="connection", help="")
    optparser.add_option("-g", "--gold", default=None, dest="gold", help="annotated version of the input file (optional)")
    optparser.add_option("-p", "--preprocessorParams", default=None, dest="preprocessorParams", help="")
    optparser.add_option("-b", "--bioNLPSTParams", default=None, dest="bioNLPSTParams", help="")
    # Debugging and process control
    optparser.add_option("--step", default=None, dest="step", help="")
    optparser.add_option("--omitSteps", default=None, dest="omitSteps", help="")
    optparser.add_option("--clearAll", default=False, action="store_true", dest="clearAll", help="Delete all files")
    optparser.add_option("--debug", default=False, action="store_true", dest="debug", help="More verbose output")
    (options, args) = optparser.parse_args()

    # Validate with the parser instead of 'assert': asserts are removed under
    # "python -O" and give no usage message. classify() calls
    # os.path.abspath(input) unconditionally, so a missing input would
    # otherwise fail there with an unrelated error.
    if options.output is None:
        optparser.error("the output file stem must be defined with -o/--output")
    if options.input is None:
        optparser.error("the input must be defined with -i/--input")

    classify(options.input, options.model, options.output, options.workdir, options.step, options.omitSteps,
             options.gold, options.detector, options.debug, options.clearAll,
             preprocessorParams=options.preprocessorParams, bioNLPSTParams=options.bioNLPSTParams)
187