
Source Code for Module TEES.Tools.BLLIPParser

import sys, os
import time
import shutil
import subprocess
import tempfile
import codecs
sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/..")
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import cElementTree as ET
import Utils.ElementTreeUtils as ETUtils
import Utils.Settings as Settings
import Utils.Download as Download
import Tool
import StanfordParser
from ProcessUtils import *


escDict = {"-LRB-": "(",
           "-RRB-": ")",
           "-LCB-": "{",
           "-RCB-": "}",
           "-LSB-": "[",
           "-RSB-": "]",
           "``": "\"",
           "''": "\""}

def install(destDir=None, downloadDir=None, redownload=False, updateLocalSettings=False):
    url = Settings.URL["BLLIP_SOURCE"]
    if downloadDir == None:
        downloadDir = os.path.join(Settings.DATAPATH, "tools/download")
    if destDir == None:
        destDir = Settings.DATAPATH + "/tools/BLLIP"
    items = Download.downloadAndExtract(url, destDir, downloadDir + "/bllip.zip", None, False)
    print >> sys.stderr, "Installing BLLIP parser"
    Tool.testPrograms("BLLIP parser", ["make", "flex"], {"flex": "flex --version"})
    parserPath = Download.getTopDir(destDir, items)
    cwd = os.getcwd()
    os.chdir(parserPath)
    print >> sys.stderr, "Compiling first-stage parser"
    subprocess.call("make", shell=True)
    print >> sys.stderr, "Compiling second-stage parser"
    subprocess.call("make reranker", shell=True)
    os.chdir(cwd)
    print >> sys.stderr, "Installing the McClosky biomedical parsing model"
    url = "http://bllip.cs.brown.edu/download/bioparsingmodel-rel1.tar.gz"
    Download.downloadAndExtract(url, destDir, downloadDir, None)
    bioModelDir = os.path.abspath(destDir + "/biomodel")
    # Check that everything works
    Tool.finalizeInstall(["first-stage/PARSE/parseIt", "second-stage/programs/features/best-parses"],
                         {"first-stage/PARSE/parseIt": "first-stage/PARSE/parseIt " + bioModelDir + "/parser/ < /dev/null",
                          "second-stage/programs/features/best-parses": "second-stage/programs/features/best-parses -l " + bioModelDir + "/reranker/features.gz " + bioModelDir + "/reranker/weights.gz < /dev/null"},
                         parserPath, {"BLLIP_PARSER_DIR": os.path.abspath(parserPath),
                                      "MCCLOSKY_BIOPARSINGMODEL_DIR": bioModelDir},
                         updateLocalSettings)
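
# Usage sketch (not part of the original module; the paths below are
# hypothetical examples):
#
#     import Tools.BLLIPParser as BLLIPParser
#     BLLIPParser.install(destDir="/opt/tees/tools/BLLIP",
#                         downloadDir="/opt/tees/tools/download",
#                         updateLocalSettings=True)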

def readPenn(treeLine):
    global escDict
    escSymbols = sorted(escDict.keys())
    tokens = []
    phrases = []
    stack = []
    if treeLine.strip() != "":
        # Add tokens
        prevSplit = None
        tokenCount = 0
        splitCount = 0
        splits = treeLine.split()
        for split in splits:
            if split[0] != "(":
                tokenText = split
                while tokenText[-1] == ")":
                    tokenText = tokenText[:-1]
                    if tokenText[-1] == ")":  # this isn't the closing parenthesis for the current token
                        stackTop = stack.pop()
                        phrases.append((stackTop[0], tokenCount, stackTop[1]))
                origTokenText = tokenText
                for escSymbol in escSymbols:
                    tokenText = tokenText.replace(escSymbol, escDict[escSymbol])

                posText = prevSplit
                while posText[0] == "(":
                    posText = posText[1:]
                for escSymbol in escSymbols:
                    posText = posText.replace(escSymbol, escDict[escSymbol])
                tokens.append((tokenText, posText, origTokenText))
                tokenCount += 1
            elif splits[splitCount + 1][0] == "(":
                stack.append((tokenCount, split[1:]))
            prevSplit = split
            splitCount += 1
    return tokens, phrases
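
# Worked example (added illustration, traced by hand through readPenn):
#
#     tokens, phrases = readPenn("(S1 (S (NP (DT The) (NN cat)) (VP (VBZ sits))))")
#     # tokens  == [("The", "DT", "The"), ("cat", "NN", "cat"), ("sits", "VBZ", "sits")]
#     # phrases == [(0, 1, "NP"), (2, 2, "VP"), (0, 2, "S"), (0, 2, "S1")]
#
# Each token is a (text, POS, originalText) triple and each phrase a
# (firstTokenIndex, lastTokenIndex, label) triple, with escapes such as
# -LRB- already mapped back through escDict.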

def insertTokens(tokens, sentence, tokenization, idStem="bt_", errorNotes=None):
    tokenCount = 0
    start = 0
    prevStart = None
    for tokenText, posTag, origTokenText in tokens:
        sText = sentence.get("text")
        # Determine offsets
        cStart = sText.find(tokenText, start)
        #assert cStart != -1, (tokenText, tokens, posTag, start, sText)
        if cStart == -1:  # Try again with original text (sometimes escaping can remove correct text)
            cStart = sText.find(origTokenText, start)
        if cStart == -1 and prevStart != None:  # Try again from the previous position, sometimes the parser duplicates tokens
            cStart = sText.find(origTokenText, prevStart)
            if cStart != -1:
                start = prevStart
                print >> sys.stderr, "Token duplication", (tokenText, tokens, posTag, start, sText, errorNotes)
        if cStart == -1:
            print >> sys.stderr, "Token alignment error", (tokenText, tokens, posTag, start, sText, errorNotes)
            for subElement in [x for x in tokenization]:
                tokenization.remove(subElement)
            return False
        cEnd = cStart + len(tokenText)
        prevStart = start
        start = cStart + len(tokenText)
        # Make element
        token = ET.Element("token")
        token.set("id", idStem + str(tokenCount))
        token.set("text", tokenText)
        token.set("POS", posTag)
        token.set("charOffset", str(cStart) + "-" + str(cEnd))  # NOTE: check
        tokenization.append(token)
        tokenCount += 1
    return True
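
# Illustration (added, assuming a hypothetical sentence whose "text"
# attribute is "The cat sits" and the readPenn example above): the tokens
# are aligned by character search and appended to the tokenization as
#
#     <token id="bt_0" text="The" POS="DT" charOffset="0-3" />
#     <token id="bt_1" text="cat" POS="NN" charOffset="4-7" />
#     <token id="bt_2" text="sits" POS="VBZ" charOffset="8-12" />
#
# On an unrecoverable alignment failure the partial tokenization is cleared
# and the function returns False.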

def insertPhrases(phrases, parse, tokenElements, idStem="bp_"):
    count = 0
    phrases.sort()
    for phrase in phrases:
        phraseElement = ET.Element("phrase")
        phraseElement.set("type", phrase[2])
        phraseElement.set("id", idStem + str(count))
        phraseElement.set("begin", str(phrase[0]))
        phraseElement.set("end", str(phrase[1]))
        t1 = None
        t2 = None
        if phrase[0] < len(tokenElements):
            t1 = tokenElements[phrase[0]]
        if phrase[1] < len(tokenElements):
            t2 = tokenElements[phrase[1]]
        if t1 != None and t2 != None:
            phraseElement.set("charOffset", t1.get("charOffset").split("-")[0] + "-" + t2.get("charOffset").split("-")[-1])
        parse.append(phraseElement)
        count += 1
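
# Illustration (added, continuing the hypothetical example above): after
# sorting, the (0, 1, "NP") phrase becomes the first phrase element,
# spanning tokens bt_0..bt_1:
#
#     <phrase id="bp_0" type="NP" begin="0" end="1" charOffset="0-7" />
#
# charOffset is set only when both boundary tokens exist in tokenElements.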

def insertParse(sentence, treeLine, parseName="McCC", tokenizationName=None, makePhraseElements=True, extraAttributes={}, docId=None):
    # Find or create container elements
    analyses = setDefaultElement(sentence, "analyses")  # "sentenceanalyses"
    #tokenizations = setDefaultElement(sentenceAnalyses, "tokenizations")
    #parses = setDefaultElement(sentenceAnalyses, "parses")
    # Check that the parse does not exist
    for prevParse in analyses.findall("parse"):
        assert prevParse.get("parser") != parseName
    # Create a new parse element
    parse = ET.Element("parse")
    parse.set("parser", parseName)
    if tokenizationName == None:
        parse.set("tokenizer", parseName)
    else:
        parse.set("tokenizer", tokenizationName)
    analyses.insert(getPrevElementIndex(analyses, "parse"), parse)

    tokenByIndex = {}
    parse.set("pennstring", treeLine.strip())
    for attr in sorted(extraAttributes.keys()):
        parse.set(attr, extraAttributes[attr])
    if treeLine.strip() == "":
        return False
    else:
        tokens, phrases = readPenn(treeLine)
        # Get tokenization
        if tokenizationName == None:  # Parser-generated tokens
            for prevTokenization in analyses.findall("tokenization"):
                assert prevTokenization.get("tokenizer") != tokenizationName
            tokenization = ET.Element("tokenization")
            tokenization.set("tokenizer", parseName)
            for attr in sorted(extraAttributes.keys()):  # add the parser extra attributes to the parser-generated tokenization
                tokenization.set(attr, extraAttributes[attr])
            analyses.insert(getElementIndex(analyses, parse), tokenization)
            # Insert tokens to parse
            insertTokens(tokens, sentence, tokenization, errorNotes=(sentence.get("id"), docId))
        else:
            tokenization = getElementByAttrib(analyses, "tokenization", {"tokenizer": tokenizationName})
        # Insert phrases to parse
        if makePhraseElements:
            insertPhrases(phrases, parse, tokenization.findall("token"))
    return True
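
# Sketch of the resulting structure (added illustration): after a successful
# call such as
#
#     insertParse(sentence, treeLine, parseName="McCC")
#
# the sentence's <analyses> element contains a <tokenization tokenizer="McCC">
# holding the <token> elements, followed by a <parse parser="McCC"
# tokenizer="McCC" pennstring="..."> holding the <phrase> elements.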

def runBLLIPParser(input, output, tokenizer=False, pathBioModel=None):
    if tokenizer:
        print >> sys.stderr, "Running BLLIP parser with tokenization"
    else:
        print >> sys.stderr, "Running BLLIP parser without tokenization"
    #args = ["./parse-50best-McClosky.sh"]
    #return subprocess.Popen(args,
    #                        stdin=codecs.open(input, "rt", "utf-8"),
    #                        stdout=codecs.open(output, "wt", "utf-8"), shell=True)

    assert os.path.exists(pathBioModel), pathBioModel
    if tokenizer:
        firstStageArgs = ["first-stage/PARSE/parseIt", "-l999", "-N50", pathBioModel + "/parser/"]
    else:
        firstStageArgs = ["first-stage/PARSE/parseIt", "-l999", "-N50", "-K", pathBioModel + "/parser/"]
    secondStageArgs = ["second-stage/programs/features/best-parses", "-l", pathBioModel + "/reranker/features.gz", pathBioModel + "/reranker/weights.gz"]

    firstStage = subprocess.Popen(firstStageArgs,
                                  stdin=codecs.open(input, "rt", "utf-8"),
                                  stdout=subprocess.PIPE)
    secondStage = subprocess.Popen(secondStageArgs,
                                   stdin=firstStage.stdout,
                                   stdout=codecs.open(output, "wt", "utf-8"))
    return ProcessWrapper([firstStage, secondStage])
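
# Usage sketch (added; file names are hypothetical): the two stages form a
# pipeline equivalent to "parseIt ... | best-parses ...", and the relative
# program paths mean it must be started from the BLLIP installation directory:
#
#     process = runBLLIPParser("parser-input.txt", "parser-output.txt",
#                              tokenizer=True, pathBioModel=pathBioModel)
#
# The returned ProcessWrapper (from ProcessUtils) bundles both stages; in
# this module it is driven by runSentenceProcess, see parse() below.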

def getSentences(corpusRoot, requireEntities=False, skipIds=[], skipParsed=True):
    for sentence in corpusRoot.getiterator("sentence"):
        if sentence.get("id") in skipIds:
            print >> sys.stderr, "Skipping sentence", sentence.get("id")
            continue
        if requireEntities:
            if sentence.find("entity") == None:
                continue
        if skipParsed:
            if ETUtils.getElementByAttrib(sentence, "parse", {"parser": "McCC"}) != None:
                continue
        yield sentence
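
# Usage sketch (added): iterate over the sentences that still need a parse,
# e.g. to count them without modifying the corpus:
#
#     pending = sum(1 for s in getSentences(corpusRoot, requireEntities=False))
#
# Note that skipParsed only checks for an existing parse with parser="McCC",
# regardless of the parseName later passed to insertParse.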

def parse(input, output=None, tokenizationName=None, parseName="McCC", requireEntities=False, skipIds=[], skipParsed=True, timeout=600, makePhraseElements=True, debug=False, pathParser=None, pathBioModel=None, timestamp=True):
    global escDict
    print >> sys.stderr, "BLLIP parser"
    parseTimeStamp = time.strftime("%d.%m.%y %H:%M:%S")
    print >> sys.stderr, "BLLIP time stamp:", parseTimeStamp

    if pathParser == None:
        pathParser = Settings.BLLIP_PARSER_DIR
    print >> sys.stderr, "BLLIP parser at:", pathParser
    if pathBioModel == None:
        pathBioModel = Settings.MCCLOSKY_BIOPARSINGMODEL_DIR
    print >> sys.stderr, "Biomodel at:", pathBioModel
    if requireEntities:
        print >> sys.stderr, "Parsing only sentences with entities"

    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    # Write text to input file
    workdir = tempfile.mkdtemp()
    if debug:
        print >> sys.stderr, "BLLIP parser workdir", workdir
    infileName = os.path.join(workdir, "parser-input.txt")
    infile = codecs.open(infileName, "wt", "utf-8")
    numCorpusSentences = 0
    if tokenizationName == None or tokenizationName == "PARSED_TEXT":  # Parser does tokenization
        if tokenizationName == None:
            print >> sys.stderr, "Parser does the tokenization"
        else:
            print >> sys.stderr, "Parsing tokenized text"
        for sentence in getSentences(corpusRoot, requireEntities, skipIds, skipParsed):
            infile.write("<s> " + sentence.get("text") + " </s>\n")
            numCorpusSentences += 1
    else:  # Use existing tokenization
        print >> sys.stderr, "Using existing tokenization", tokenizationName
        for sentence in getSentences(corpusRoot, requireEntities, skipIds, skipParsed):
            tokenization = getElementByAttrib(sentence.find("analyses"), "tokenization", {"tokenizer": tokenizationName})
            assert tokenization.get("tokenizer") == tokenizationName
            s = ""
            for token in tokenization.findall("token"):
                s += token.get("text") + " "
            infile.write("<s> " + s + "</s>\n")
            numCorpusSentences += 1
    infile.close()

    # The subprocess pipeline is equivalent to the original parse shell script:
    #PARSERROOT=/home/smp/tools/McClosky-Charniak/reranking-parser
    #BIOPARSINGMODEL=/home/smp/tools/McClosky-Charniak/reranking-parser/biomodel
    #${PARSERROOT}/first-stage/PARSE/parseIt -K -l399 -N50 ${BIOPARSINGMODEL}/parser/ $* | ${PARSERROOT}/second-stage/programs/features/best-parses -l ${BIOPARSINGMODEL}/reranker/features.gz ${BIOPARSINGMODEL}/reranker/weights.gz

    # Run parser
    cwd = os.getcwd()
    os.chdir(pathParser)
    if tokenizationName == None:
        bllipOutput = runSentenceProcess(runBLLIPParser, pathParser, infileName, workdir, False, "BLLIPParser", "Parsing", timeout=timeout, processArgs={"tokenizer": True, "pathBioModel": pathBioModel})
    else:
        if tokenizationName == "PARSED_TEXT":  # The sentence strings are already tokenized
            tokenizationName = None
        bllipOutput = runSentenceProcess(runBLLIPParser, pathParser, infileName, workdir, False, "BLLIPParser", "Parsing", timeout=timeout, processArgs={"tokenizer": False, "pathBioModel": pathBioModel})
    os.chdir(cwd)

    treeFile = codecs.open(bllipOutput, "rt", "utf-8")
    print >> sys.stderr, "Inserting parses"
    # Add output to sentences
    failCount = 0
    for sentence in getSentences(corpusRoot, requireEntities, skipIds, skipParsed):
        treeLine = treeFile.readline()
        extraAttributes = {"source": "TEES"}  # parser was run through this wrapper
        if timestamp:
            extraAttributes["date"] = parseTimeStamp  # links the parse to the log file
        if not insertParse(sentence, treeLine, parseName, makePhraseElements=makePhraseElements, extraAttributes=extraAttributes):
            failCount += 1

    treeFile.close()
    # Remove work directory
    if not debug:
        shutil.rmtree(workdir)

    print >> sys.stderr, "Parsed", numCorpusSentences, "sentences (" + str(failCount) + " failed)"
    if failCount == 0:
        print >> sys.stderr, "All sentences were parsed successfully"
    else:
        print >> sys.stderr, "Warning, parsing failed for", failCount, "out of", numCorpusSentences, "sentences"
        print >> sys.stderr, "The \"pennstring\" attribute of these sentences is an empty string."
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
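
# Usage sketch (added; file names are hypothetical): parse an interaction XML
# corpus, letting the BLLIP parser do its own tokenization:
#
#     tree = parse("corpus.xml", output="corpus-parsed.xml", timestamp=True)
#
# The returned ElementTree can be passed on directly, e.g. to
# StanfordParser.convertXML as in the command-line entry point below.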

def insertParses(input, parsePath, output=None, parseName="McCC", tokenizationName=None, makePhraseElements=True, extraAttributes={}):
    """
    Insert pre-existing parses from Penn treebank files (one file per
    document, named <origId>.ptb or <origId>.pstree, optionally inside
    a .tar.gz archive) into the sentence elements of the corpus.
    """
    import tarfile
    from SentenceSplitter import openFile
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    print >> sys.stderr, "Inserting parses from", parsePath
    if parsePath.find(".tar.gz") != -1:
        tarFilePath, parsePath = parsePath.split(".tar.gz")
        tarFilePath += ".tar.gz"
        tarFile = tarfile.open(tarFilePath)
        if parsePath[0] == "/":
            parsePath = parsePath[1:]
    else:
        tarFile = None

    docCount = 0
    failCount = 0
    numCorpusSentences = 0
    sourceElements = [x for x in corpusRoot.getiterator("document")] + [x for x in corpusRoot.getiterator("section")]
    counter = ProgressCounter(len(sourceElements), "McCC Parse Insertion")
    for document in sourceElements:
        docCount += 1
        origId = document.get("pmid")
        if origId == None:
            origId = document.get("origId")
        origId = str(origId)
        counter.update(1, "Processing Documents (" + document.get("id") + "/" + origId + "): ")
        docId = document.get("id")
        if docId == None:
            docId = "CORPUS.d" + str(docCount)

        f = openFile(os.path.join(parsePath, origId + ".ptb"), tarFile)
        if f == None:  # file with BioNLP'11 extension not found, try BioNLP'09 extension
            f = openFile(os.path.join(parsePath, origId + ".pstree"), tarFile)
            if f == None:  # no parse found
                continue
        parseStrings = f.readlines()
        f.close()
        sentences = document.findall("sentence")
        numCorpusSentences += len(sentences)
        assert len(sentences) == len(parseStrings)
        # TODO: The following for-loop is the same as when used with a real parser,
        # and should be moved to its own function.
        for sentence, treeLine in zip(sentences, parseStrings):
            if not insertParse(sentence, treeLine, makePhraseElements=makePhraseElements, extraAttributes=extraAttributes, docId=origId):
                failCount += 1

    if tarFile != None:
        tarFile.close()

    print >> sys.stderr, "Inserted parses for", numCorpusSentences, "sentences (" + str(failCount) + " failed)"
    if failCount == 0:
        print >> sys.stderr, "All sentences have a parse"
    else:
        print >> sys.stderr, "Warning, a failed parse exists for", failCount, "out of", numCorpusSentences, "sentences"
        print >> sys.stderr, "The \"pennstring\" attribute of these sentences is an empty string."
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
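
# Usage sketch (added; paths are hypothetical): insert pre-made parses stored
# as one PTB file per document inside a tar.gz archive:
#
#     insertParses("corpus.xml", "parses.tar.gz/mccc", output="corpus-parsed.xml")
#
# The part of the path after ".tar.gz" is treated as a directory inside the
# archive.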

if __name__ == "__main__":
    from optparse import OptionParser, OptionGroup
    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(description="BLLIP parser wrapper")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in interaction xml format", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file in interaction xml format.")
    optparser.add_option("-t", "--tokenization", default=None, dest="tokenization", help="Name of tokenization element.")
    optparser.add_option("-s", "--stanford", default=False, action="store_true", dest="stanford", help="Run Stanford conversion.")
    optparser.add_option("--timestamp", default=False, action="store_true", dest="timestamp", help="Mark parses with a timestamp.")
    optparser.add_option("--pathParser", default=None, dest="pathParser", help="Path of the BLLIP parser installation directory.")
    optparser.add_option("--pathBioModel", default=None, dest="pathBioModel", help="Path of the McClosky biomedical parsing model directory.")
    group = OptionGroup(optparser, "Install Options", "")
    group.add_option("--install", default=None, action="store_true", dest="install", help="Install the BLLIP parser")
    group.add_option("--installDir", default=None, dest="installDir", help="Install directory")
    group.add_option("--downloadDir", default=None, dest="downloadDir", help="Install files download directory")
    group.add_option("--redownload", default=False, action="store_true", dest="redownload", help="Redownload install files")
    optparser.add_option_group(group)
    (options, args) = optparser.parse_args()

    if options.install:
        install(options.installDir, options.downloadDir, redownload=options.redownload)
    else:
        xml = parse(input=options.input, output=options.output, tokenizationName=options.tokenization, pathParser=options.pathParser, pathBioModel=options.pathBioModel, timestamp=options.timestamp)
        if options.stanford:
            StanfordParser.convertXML(parser="McClosky", input=xml, output=options.output)