
Source Code for Module TEES.classify

"""
Detect events or relations from text.
"""
from train import workdir, getDetector, getSteps
import sys, os
import tempfile
import codecs
import Utils.Settings as Settings
import Utils.Stream as Stream
import Utils.Download
from Utils.Connection.Connection import getConnection
from Detectors.Preprocessor import Preprocessor

def classify(input, model, output, workDir=None, step=None, omitSteps=None,
             goldInput=None, detector=None, debug=False, clear=False,
             preprocessorTag="-preprocessed.xml.gz", preprocessorParams=None, bioNLPSTParams=None):
    """
    Detect events or relations from text.

    @param input: The input file in either interaction XML or BioNLP ST format. Can also be a PMID or TEES default corpus name.
    @param model: A path to a model file or the name of a TEES default model.
    @param output: The output file stem. Output files will be of the form output-*
    @param workDir: If intermediate files need to be saved, they will go here.
    @param step: A step=substep pair, where the steps are PREPROCESS and CLASSIFY
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param goldInput: a version of the corpus file with gold annotation. Enables measuring of performance
    @param detector: a Detector object, or a string defining one to be imported. If None, will be read from model.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param clear: Remove existing workDir
    @param preprocessorTag: preprocessor output file will be output + preprocessorTag
    @param preprocessorParams: Optional parameters controlling preprocessing. If None, will be read from model.
    @param bioNLPSTParams: Optional parameters controlling BioNLP ST format output. If None, will be read from model.
    """
    input = os.path.abspath(input)
    if goldInput != None: goldInput = os.path.abspath(goldInput)
    if model != None: model = os.path.abspath(model)
    # Initialize working directory
    if workDir != None: # use a permanent work directory
        workdir(workDir, clear)
    Stream.openLog(output + "-log.txt") # log in the output directory
    # Get input files
    input, preprocess = getInput(input)
    model = getModel(model)
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, ["PREPROCESS", "CLASSIFY"])
    if not preprocess:
        selector.markOmitSteps("PREPROCESS")

    classifyInput = input
    if selector.check("PREPROCESS"):
        preprocessor = Preprocessor()
        preprocessorOutput = output + preprocessorTag
        #preprocessor.debug = debug
        #preprocessor.source = input # This has to be defined already here, needs to be fixed later
        #preprocessor.requireEntitiesForParsing = True # parse only sentences which contain named entities
        if os.path.exists(preprocessorOutput) and not clear: #os.path.exists(preprocessor.getOutputPath("FIND-HEADS")):
            #print >> sys.stderr, "Preprocessor output", preprocessor.getOutputPath("FIND-HEADS"), "exists, skipping preprocessing."
            print >> sys.stderr, "Preprocessor output", preprocessorOutput, "exists, skipping preprocessing."
            classifyInput = preprocessorOutput # preprocessor.getOutputPath("FIND-HEADS")
        else:
            #print >> sys.stderr, "Preprocessor output", preprocessor.getOutputPath("FIND-HEADS"), "does not exist"
            print >> sys.stderr, "Preprocessor output", preprocessorOutput, "does not exist"
            print >> sys.stderr, "------------ Preprocessing ------------"
            # Remove some of the unnecessary intermediate files
            #preprocessor.setIntermediateFiles({"Convert":None, "SPLIT-SENTENCES":None, "PARSE":None, "CONVERT-PARSE":None, "SPLIT-NAMES":None})
            # Process input into interaction XML
            classifyInput = preprocessor.process(input, preprocessorOutput, preprocessorParams, model, [], fromStep=detectorSteps["PREPROCESS"], toStep=None, omitSteps=omitDetectorSteps["PREPROCESS"])

    if selector.check("CLASSIFY"):
        detector = getDetector(detector, model)[0]() # initialize detector object
        detector.debug = debug
        detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams, model)
        detector.classify(classifyInput, model, output, goldData=goldInput, fromStep=detectorSteps["CLASSIFY"], omitSteps=omitDetectorSteps["CLASSIFY"], workDir=workDir)
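For illustration, a direct call to classify() from another Python script might look roughly like the sketch below. The model name, file names and directories are placeholders, and an installed TEES with a configured Settings module is assumed.

    # A minimal usage sketch, not part of the module: all names are placeholders.
    from classify import classify   # module path may differ depending on installation

    classify(input="abstracts.xml",   # interaction XML / BioNLP ST file, corpus name or PMID
             model="GE11-test",       # model path or default model name, resolved by getModel()
             output="out/run1",       # output files are written as out/run1-*
             workDir="work",          # keep intermediate files here
             debug=True)              # more verbose output
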
def getModel(model):
    if model == None:
        return None
    if not os.path.exists(model):
        print >> sys.stderr, "Model", model, "doesn't exist, looking for a default model"
        modelName = os.path.basename(model)
        found = None
        if hasattr(Settings, "MODEL_DIR"):
            for suffix in ["", "-test", ".zip", "-test.zip"]:
                predefined = os.path.join(Settings.MODEL_DIR, modelName + suffix)
                if os.path.exists(predefined):
                    print >> sys.stderr, "Classifying with default model", predefined
                    found = predefined
                    model = found
                    break
            if found == None:
                print >> sys.stderr, "No default model found for definition", modelName
        else:
            print >> sys.stderr, "Default model directory MODEL_DIR not defined in Settings"
        if found == None:
            raise Exception("Model " + str(model) + " not found")
    else:
        print >> sys.stderr, "Classifying with model", model
    return os.path.abspath(model)
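As a rough usage sketch (the model name is a placeholder, and Settings.MODEL_DIR is assumed to point at a directory of downloaded TEES models):

    # A minimal sketch, not part of the module.
    from classify import getModel

    modelPath = getModel("GE11")   # if "GE11" is not an existing path, the suffixes
                                   # "", "-test", ".zip" and "-test.zip" are tried under
                                   # Settings.MODEL_DIR; an Exception is raised if none exist
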
def getInput(input, model=None):
    if input == None: # Get a corpus corresponding to the model
        assert model != None
        input = model.split(".")[0]

    if os.path.basename(input).isdigit(): # PMID
        print >> sys.stderr, "Classifying PubMed abstract", os.path.basename(input)
        input = getPubMed(os.path.basename(input))
        preprocess = True
    elif not os.path.exists(input): # Use a predefined corpus
        defaultInput = os.path.basename(input)
        found = None
        for suffix in ["", ".xml", ".xml.gz"]:
            predefined = os.path.join(Settings.CORPUS_DIR, defaultInput + suffix)
            if os.path.exists(predefined):
                print >> sys.stderr, "Classifying default corpus file", predefined
                found = predefined
                preprocess = False
                break
        if found == None:
            raise Exception("Default corpus file for input " + str(defaultInput) + " not found")
        input = found
    else:
        print >> sys.stderr, "Classifying input", input
        preprocess = True
    return os.path.abspath(input), preprocess
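A brief sketch of the three input cases handled above; all names are placeholders and Settings.CORPUS_DIR is assumed to be configured:

    # A minimal sketch, not part of the module.
    from classify import getInput

    path, preprocess = getInput("my-abstracts.xml")  # existing file: used as-is, preprocess=True
    path, preprocess = getInput("1234567")           # all-digit basename: treated as a PMID and downloaded
    path, preprocess = getInput("GE11-devel")        # otherwise: looked up under Settings.CORPUS_DIR, preprocess=False
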
def getPubMed(pmid):
    print >> sys.stderr, "*************************** NOTE ***************************"
    print >> sys.stderr, "Do not attempt to do large-scale classification of PubMed"
    print >> sys.stderr, "abstracts with this feature. For that, use the downloadable"
    print >> sys.stderr, "PubMed release. This is a demonstration feature only, and"
    print >> sys.stderr, "abusing it will cause you to be banned from PubMed!"
    print >> sys.stderr, "************************************************************"
    print >> sys.stderr, "Downloading PubMed abstract", pmid
    tempDir = tempfile.gettempdir()
    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=" + str(pmid) + "&retmode=xml"
    downloaded = os.path.join(tempDir, "pmid-" + str(pmid))
    Utils.Download.download(url, downloaded + ".xml", False)
    # Read the text from the XML
    f = codecs.open(downloaded + ".xml", "rt", "utf-8")
    textElements = []
    for line in f:
        line = line.strip()
        for tag in ["<ArticleTitle>", "<AbstractText>"]:
            if line.startswith(tag):
                textElements.append(line.split(">", 1)[1].split("<")[0])
    f.close()
    # Save the text file
    f = codecs.open(downloaded + ".txt", "wt", "utf-8")
    f.write("\n".join(textElements))
    f.close()
    # Return text file name
    return downloaded + ".txt"
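As a usage sketch (the PMID is a placeholder): the abstract is fetched with NCBI efetch into the system temporary directory, and only the ArticleTitle and AbstractText element contents are written to the returned text file.

    # A minimal sketch, not part of the module.
    from classify import getPubMed

    textFile = getPubMed("1234567")   # returns e.g. <tempdir>/pmid-1234567.txt
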
if __name__=="__main__":
    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    from optparse import OptionParser
    optparser = OptionParser(description="Predict events/relations")
    optparser.add_option("-i", "--input", default=None, dest="input", help="input file (interaction XML, BioNLP ST format, PMID or default corpus name)")
    optparser.add_option("-o", "--output", default=None, dest="output", help="output file stem")
    optparser.add_option("-w", "--workdir", default=None, dest="workdir", help="output directory")
    optparser.add_option("-m", "--model", default=None, dest="model", help="TEES model")
    optparser.add_option("-d", "--detector", default=None, dest="detector", help="Detector class (read from the model if not given)")
    optparser.add_option("-c", "--connection", default=None, dest="connection", help="")
    optparser.add_option("-g", "--gold", default=None, dest="gold", help="annotated version of the input file (optional)")
    optparser.add_option("-p", "--preprocessorParams", default=None, dest="preprocessorParams", help="parameters for the preprocessor (read from the model if not given)")
    optparser.add_option("-b", "--bioNLPSTParams", default=None, dest="bioNLPSTParams", help="parameters for BioNLP ST format output (read from the model if not given)")
    # Debugging and process control
    optparser.add_option("--step", default=None, dest="step", help="a step=substep pair; the steps are PREPROCESS and CLASSIFY")
    optparser.add_option("--omitSteps", default=None, dest="omitSteps", help="step=substep parameters for steps to skip")
    optparser.add_option("--clearAll", default=False, action="store_true", dest="clearAll", help="Delete all files")
    optparser.add_option("--debug", default=False, action="store_true", dest="debug", help="More verbose output")
    (options, args) = optparser.parse_args()

    assert options.output != None
    classify(options.input, options.model, options.output, options.workdir, options.step, options.omitSteps,
             options.gold, options.detector, options.debug, options.clearAll,
             preprocessorParams=options.preprocessorParams, bioNLPSTParams=options.bioNLPSTParams)
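A typical command line for this script might look roughly like the following; the model name and file names are placeholders.

    python classify.py -m GE11-test -i abstracts.xml -o out/run1 --debug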