1 import sys, os
2 import shutil
3 import subprocess
4 import tempfile
5 import tarfile
6 import codecs
7 from ProcessUtils import *
8 try:
9 import xml.etree.cElementTree as ET
10 except ImportError:
11 import cElementTree as ET
12 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)),"..")))
13 import Utils.ElementTreeUtils as ETUtils
14 import Utils.Settings as Settings
15 import Utils.Download as Download
16 import Utils.Settings as Settings
17 import Tool
18
19
20
21
22
23
24
25
26
27
# Penn Treebank escape tokens mapped back to the literal characters they
# stand for (bracket placeholders and LaTeX-style quotes).
escDict = {
    "-LRB-": "(",
    "-RRB-": ")",
    "-LCB-": "{",
    "-RCB-": "}",
    "-LSB-": "[",
    "-RSB-": "]",
    "``": "\"",
    "''": "\"",
}
36
def install(destDir=None, downloadDir=None, redownload=False, updateLocalSettings=False):
    """
    Download and install the Stanford Parser tools.

    destDir: installation directory (default Settings.DATAPATH/tools/).
    downloadDir: directory for the downloaded archive (default Settings.DATAPATH/tools/download/).
    redownload: if True, re-download the archive even if already present.
    updateLocalSettings: if True, store STANFORD_PARSER_DIR in local settings.
    """
    print >> sys.stderr, "Installing Stanford Parser"
    if downloadDir == None:
        downloadDir = os.path.join(Settings.DATAPATH, "tools/download/")
    if destDir == None:
        destDir = os.path.join(Settings.DATAPATH, "tools/")
    # BUG FIX: 'redownload' was accepted but never forwarded, so a forced
    # re-download request was silently ignored.
    items = Download.downloadAndExtract(Settings.URL["STANFORD_PARSER"], destDir, downloadDir, redownload=redownload)
    stanfordPath = Download.getTopDir(destDir, items)
    # Sanity-check the install by invoking the dependency converter once.
    Tool.finalizeInstall(["stanford-parser.jar"],
                         {"stanford-parser.jar":"java -cp stanford-parser.jar edu.stanford.nlp.trees.EnglishGrammaticalStructure"},
                         stanfordPath, {"STANFORD_PARSER_DIR":stanfordPath}, updateLocalSettings)
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# NOTE(review): the enclosing function header (original line 72) is missing
# from this chunk. Judging by the names used here and by the
# runSentenceProcess(runStanford, ...) call in convertXML, it is presumably
# "def runStanford(input, output, stanfordParserArgs):" -- confirm against the
# full file before editing. The surviving body line launches the Stanford
# converter as a subprocess with its stdout redirected to a UTF-8 file.
return subprocess.Popen(stanfordParserArgs + [input], stdout=codecs.open(output, "wt", "utf-8"))
78
79
# NOTE(review): the function header (original line 80, presumably
# "def getUnicode(string):" given the getUnicode(line) calls elsewhere in
# this file) is missing from this chunk -- confirm against the full file.
# Best-effort re-decode of a line that was read as latin-1 back into proper
# unicode; on any failure the input is returned unchanged.
try:
    string = string.encode('raw_unicode_escape').decode('utf-8')
except:  # NOTE(review): bare except -- also swallows KeyboardInterrupt etc.
    pass
return string
86
def addDependencies(outfile, parse, tokenByIndex=None, sentenceId=None, skipExtra=0):
    """
    Read one sentence's worth of Stanford dependency lines from 'outfile' and
    insert them as <dependency> elements into the 'parse' element.

    outfile: open file-like object with Stanford conversion output, one
             "depType(govWord-i, depWord-j)" line per dependency; sentences
             are separated by blank lines.
    parse: the parse XML element the new <dependency> elements go into.
    tokenByIndex: optional map {0-based token position: token element} used to
                  resolve the 1-based Stanford indices to existing token ids;
                  if None, synthetic "bt_<i>" ids are used instead.
    sentenceId: identifier used only in diagnostic messages.
    skipExtra: number of extra lines to consume when the first line read is blank.
    Returns the list of created <dependency> elements; returns [] when a token
    index cannot be resolved (the rest of the sentence's lines are drained).
    """
    global escDict
    escSymbols = sorted(escDict.keys())  # PTB escape tokens to undo in token text

    # Token texts are collected only for use in the diagnostic messages below.
    tokens = []
    for key in sorted(tokenByIndex):
        tokens.append(tokenByIndex[key].get("text"))

    depCount = 1
    line = outfile.readline()
    line = getUnicode(line)  # best-effort decode, see getUnicode
    deps = []
    # NOTE(review): when the first line is blank the while-loop below never
    # executes, so skipExtra only consumes separator lines of an empty parse.
    if line.strip() == "" and skipExtra > 0:
        for i in range(skipExtra):
            outfile.readline()
    while line.strip() != "":
        # Each line has the form: depType(govWord-govIndex, depWord-depIndex)
        depType, rest = line.strip()[:-1].split("(")
        t1, t2 = rest.split(", ")
        t1Word, t1Index = t1.rsplit("-", 1)
        for escSymbol in escSymbols:
            t1Word = t1Word.replace(escSymbol, escDict[escSymbol])
        # Strip any non-digit suffix from the index (e.g. the prime marker
        # Stanford appends to copied nodes).
        while not t1Index[-1].isdigit(): t1Index = t1Index[:-1]
        t1Index = int(t1Index)
        t2Word, t2Index = t2.rsplit("-", 1)
        for escSymbol in escSymbols:
            t2Word = t2Word.replace(escSymbol, escDict[escSymbol])
        while not t2Index[-1].isdigit(): t2Index = t2Index[:-1]
        t2Index = int(t2Index)

        # The artificial "root" dependency is not stored as an element.
        if depType != "root":
            dep = ET.Element("dependency")
            dep.set("id", "sd_" + str(depCount))
            alignmentError = False
            if tokenByIndex != None:
                # A missing token index is fatal for this sentence: discard
                # all dependencies and drain the remaining lines.
                if t1Index-1 not in tokenByIndex:
                    print >> sys.stderr, "Token not found", (t1Word, depCount, sentenceId)
                    deps = []
                    while line.strip() != "": line = outfile.readline()
                    break
                if t2Index-1 not in tokenByIndex:
                    print >> sys.stderr, "Token not found", (t2Word, depCount, sentenceId)
                    deps = []
                    while line.strip() != "": line = outfile.readline()
                    break
                # A text mismatch is non-fatal: the dependency element is
                # still inserted but excluded from the returned list, and the
                # first offending word is recorded on the parse element.
                if t1Word != tokenByIndex[t1Index-1].get("text"):
                    print >> sys.stderr, "Alignment error", (t1Word, tokenByIndex[t1Index-1].get("text"), t1Index-1, depCount, sentenceId, tokens)
                    alignmentError = True
                    if parse.get("stanfordAlignmentError") == None:
                        parse.set("stanfordAlignmentError", t1Word)
                if t2Word != tokenByIndex[t2Index-1].get("text"):
                    print >> sys.stderr, "Alignment error", (t2Word, tokenByIndex[t2Index-1].get("text"), t2Index-1, depCount, sentenceId, tokens)
                    alignmentError = True
                    if parse.get("stanfordAlignmentError") == None:
                        parse.set("stanfordAlignmentError", t2Word)
                dep.set("t1", tokenByIndex[t1Index-1].get("id"))
                dep.set("t2", tokenByIndex[t2Index-1].get("id"))
            else:
                # No token map: fall back to synthetic position-based ids.
                dep.set("t1", "bt_" + str(t1Index))
                dep.set("t2", "bt_" + str(t2Index))
            dep.set("type", depType)
            parse.insert(depCount-1, dep)
            depCount += 1
            if not alignmentError:
                deps.append(dep)
        line = outfile.readline()
        try:
            line = getUnicode(line)
        except:
            # Debugging aid: dump the raw line before re-raising, since
            # decoding failures here are otherwise hard to trace.
            print "Type", type(line)
            print "Repr", repr(line)
            print line
            raise
    return deps
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
def convertXML(parser, input, output=None, debug=False, reparse=False, stanfordParserDir=None, stanfordParserArgs=None):
    """
    Convert the Penn-tree strings of an interaction-XML corpus into Stanford
    dependencies by running the Stanford EnglishGrammaticalStructure tool.

    parser: name of the parse element whose "pennstring" attribute is converted.
    input: corpus file name or ElementTree object.
    output: if given, the modified corpus is also written to this file.
    debug: keep (and report) the temporary work directory instead of deleting it.
    reparse: remove and regenerate dependencies for already-converted sentences.
    stanfordParserDir / stanfordParserArgs: override tool location and command line.
    Returns the corpus ElementTree.
    """
    if stanfordParserDir == None:
        stanfordParserDir = Settings.STANFORD_PARSER_DIR
    if stanfordParserArgs == None:
        # -keepPunct and -CCprocessed match the collapsed-with-punctuation
        # dependency format expected by addDependencies.
        stanfordParserArgs = ["java", "-mx500m", "-cp", "stanford-parser.jar",
                              "edu.stanford.nlp.trees.EnglishGrammaticalStructure",
                              "-CCprocessed", "-keepPunct", "-treeFile"]
    print >> sys.stderr, "Running Stanford conversion"
    print >> sys.stderr, "Stanford tools at:", stanfordParserDir
    print >> sys.stderr, "Stanford tools arguments:", " ".join(stanfordParserArgs)
    # NOTE(review): 'time' is not imported at the top of this file; it is
    # presumably re-exported by "from ProcessUtils import *" -- confirm.
    parseTimeStamp = time.strftime("%d.%m.%y %H:%M:%S")
    print >> sys.stderr, "Stanford time stamp:", parseTimeStamp

    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    # All Penn trees are written to one temp file, converted in a single
    # external-process run, then read back sentence by sentence.
    workdir = tempfile.mkdtemp()
    if debug:
        print >> sys.stderr, "Stanford parser workdir", workdir
    stanfordInput = os.path.join(workdir, "input")
    stanfordInputFile = codecs.open(stanfordInput, "wt", "utf-8")

    # Pass 1: collect the Penn tree of every sentence that needs conversion.
    existingCount = 0
    for sentence in corpusRoot.getiterator("sentence"):
        if sentence.find("sentenceanalyses") != None: # old-style analysis container
            sentenceAnalyses = setDefaultElement(sentence, "sentenceanalyses")
            parses = setDefaultElement(sentenceAnalyses, "parses")
            parse = getElementByAttrib(parses, "parse", {"parser":parser})
        else:
            analyses = setDefaultElement(sentence, "analyses")
            parse = getElementByAttrib(analyses, "parse", {"parser":parser})
        if parse == None:
            continue
        if len(parse.findall("dependency")) > 0:
            if reparse: # remove existing stanford conversion
                for dep in parse.findall("dependency"):
                    parse.remove(dep)
                del parse.attrib["stanford"]
            else: # don't reparse
                existingCount += 1
                continue
        pennTree = parse.get("pennstring")
        if pennTree == None or pennTree == "":
            continue
        stanfordInputFile.write(pennTree + "\n")
    stanfordInputFile.close()
    if existingCount != 0:
        print >> sys.stderr, "Skipping", existingCount, "already converted sentences."

    # Run the converter on the collected trees; output is read as latin-1
    # with replacement to survive malformed bytes.
    stanfordOutput = runSentenceProcess(runStanford, stanfordParserDir, stanfordInput,
                                        workdir, True, "StanfordParser",
                                        "Stanford Conversion", timeout=600,
                                        outputArgs={"encoding":"latin1", "errors":"replace"},
                                        processArgs={"stanfordParserArgs":stanfordParserArgs})

    stanfordOutputFile = codecs.open(stanfordOutput, "rt", "latin1", "replace")

    # Pass 2: iterate the same sentences in the same order and attach the
    # dependencies read sequentially from the converter's output.
    noDepCount = 0
    failCount = 0
    sentenceCount = 0
    for document in corpusRoot.findall("document"):
        for sentence in document.findall("sentence"):
            if sentence.find("sentenceanalyses") != None: # old-style analysis container
                sentenceAnalyses = setDefaultElement(sentence, "sentenceanalyses")
                parses = setDefaultElement(sentenceAnalyses, "parses")
                parse = getElementByAttrib(parses, "parse", {"parser":parser})
            else:
                analyses = setDefaultElement(sentence, "analyses")
                parse = getElementByAttrib(analyses, "parse", {"parser":parser})
            if parse == None:
                # NOTE(review): 'analyses' is unbound here if the
                # "sentenceanalyses" branch was taken above -- confirm whether
                # old-format corpora can reach this line.
                parse = ET.SubElement(analyses, "parse")
                parse.set("parser", "None")
            if reparse:
                assert len(parse.findall("dependency")) == 0
            elif len(parse.findall("dependency")) > 0: # don't reparse
                continue
            pennTree = parse.get("pennstring")
            if pennTree == None or pennTree == "":
                parse.set("stanford", "no_penn")
                continue
            parse.set("stanfordSource", "TEES")
            parse.set("stanfordDate", parseTimeStamp)

            # Locate the tokenization matching this parse for index alignment.
            if sentence.find("analyses") != None:
                tokenization = getElementByAttrib(sentence.find("analyses"), "tokenization", {"tokenizer":parse.get("tokenizer")})
            else:
                tokenization = getElementByAttrib(sentence.find("sentenceanalyses").find("tokenizations"), "tokenization", {"tokenizer":parse.get("tokenizer")})
            assert tokenization != None
            count = 0
            tokenByIndex = {}
            for token in tokenization.findall("token"):
                tokenByIndex[count] = token
                count += 1

            origId = document.get("pmid")
            if origId == None:
                origId = document.get("origId")
            origId = str(origId)
            deps = addDependencies(stanfordOutputFile, parse, tokenByIndex, (sentence.get("id"), origId))
            if len(deps) == 0:
                parse.set("stanford", "no_dependencies")
                noDepCount += 1
                if parse.get("stanfordAlignmentError") != None:
                    failCount += 1
            else:
                parse.set("stanford", "ok")
                if parse.get("stanfordAlignmentError") != None:
                    failCount += 1
                    parse.set("stanford", "partial")
            sentenceCount += 1
    stanfordOutputFile.close()

    if not debug:
        shutil.rmtree(workdir)

    print >> sys.stderr, "Stanford conversion was done for", sentenceCount, "sentences,", noDepCount, "had no dependencies,", failCount, "failed"

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
335
def insertParse(sentence, stanfordOutputFile, parser, extraAttributes={}, skipExtra=0):
    """
    Read one sentence's dependencies from an open Stanford output file and
    attach them to the sentence's parse element, creating the parse element
    if it does not exist yet. Any previously stored dependencies are removed
    first. Always returns True.
    """
    analyses = setDefaultElement(sentence, "analyses")
    parse = getElementByAttrib(analyses, "parse", {"parser":parser})
    if parse == None:
        # No parse for this parser yet: create a placeholder element.
        parse = ET.SubElement(analyses, "parse")
        parse.set("parser", "None")

    # Clear out any dependencies inserted by an earlier run.
    oldDependencies = parse.findall("dependency")
    if len(oldDependencies) > 0:
        for oldDependency in oldDependencies:
            parse.remove(oldDependency)

    pennString = parse.get("pennstring")
    if pennString == None or pennString == "":
        parse.set("stanford", "no_penn")

    # Apply caller-supplied attributes in a deterministic order.
    for attrName in sorted(extraAttributes.keys()):
        parse.set(attrName, extraAttributes[attrName])

    # Map 0-based token positions to token elements for index alignment.
    tokenByIndex = {}
    tokenization = getElementByAttrib(sentence.find("analyses"), "tokenization", {"tokenizer":parse.get("tokenizer")})
    if tokenization != None:
        for position, token in enumerate(tokenization.findall("token")):
            tokenByIndex[position] = token

    deps = addDependencies(stanfordOutputFile, parse, tokenByIndex, (sentence.get("id"), sentence.get("origId")), skipExtra=skipExtra)
    parse.set("stanford", "no_dependencies" if len(deps) == 0 else "ok")
    return True
376
def insertParses(input, parsePath, output=None, parseName="McCC", extraAttributes={}, skipExtra=0):
    """
    Insert pre-computed Stanford dependency files into the parse elements of
    an interaction-XML corpus.

    input: corpus file name or ElementTree object.
    parsePath: directory -- or "<archive>.tar.gz/<subdir>" path inside a
               tar.gz archive -- containing one ".sd" or ".dep" file per
               document, named by the document's PMID/origId.
    output: if given, the modified corpus is also written to this file.
    parseName: value of the "parser" attribute of the parse elements to fill.
    extraAttributes: extra attributes to set on each processed parse element.
    skipExtra: passed through to addDependencies (extra lines to skip).
    Returns the corpus ElementTree.
    """
    # NOTE: docstring above replaces one copy-pasted from SentenceSplitter
    # that wrongly described sentence splitting.
    import tarfile
    from SentenceSplitter import openFile
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    print >> sys.stderr, "Inserting parses from", parsePath
    # Support paths that point inside a tar.gz archive: "<file>.tar.gz/<dir>".
    if parsePath.find(".tar.gz") != -1:
        tarFilePath, parsePath = parsePath.split(".tar.gz")
        tarFilePath += ".tar.gz"
        tarFile = tarfile.open(tarFilePath)
        if parsePath[0] == "/":
            parsePath = parsePath[1:]
    else:
        tarFile = None

    docCount = 0
    failCount = 0
    sentenceCount = 0
    docsWithStanford = 0
    sentencesCreated = 0
    sourceElements = [x for x in corpusRoot.getiterator("document")] + [x for x in corpusRoot.getiterator("section")]
    counter = ProgressCounter(len(sourceElements), "McCC Parse Insertion")
    for document in sourceElements:
        docCount += 1
        docId = document.get("id")
        origId = document.get("pmid")
        if origId == None:
            origId = document.get("origId")
        origId = str(origId)
        if docId == None:
            docId = "CORPUS.d" + str(docCount)

        # Dependency files may use either the ".sd" or the ".dep" extension.
        f = openFile(os.path.join(parsePath, origId + ".sd"), tarFile)
        if f == None:
            f = openFile(os.path.join(parsePath, origId + ".dep"), tarFile)
        if f != None:
            sentences = document.findall("sentence")
            for sentence in sentences:
                sentenceCount += 1
                counter.update(0, "Processing Documents ("+sentence.get("id")+"/" + origId + "): ")
                # BUG FIX: the caller-supplied extraAttributes were previously
                # discarded (a literal empty dict was passed instead).
                if not insertParse(sentence, f, parseName, extraAttributes=extraAttributes, skipExtra=skipExtra):
                    failCount += 1
            f.close()
        counter.update(1, "Processing Documents ("+document.get("id")+"/" + origId + "): ")

    if tarFile != None:
        tarFile.close()

    print >> sys.stderr, "Stanford conversion was inserted to", sentenceCount, "sentences"

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
443
444
if __name__=="__main__":
    # Command-line entry point: either install the Stanford tools or run the
    # dependency conversion over a corpus.
    import sys
    from optparse import OptionParser, OptionGroup

    # Optional speed-up on Python 2 interpreters that have Psyco installed.
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(description="Stanford Parser dependency converter wrapper")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in interaction xml format", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file in interaction xml format.")
    optparser.add_option("-p", "--parse", default=None, dest="parse", help="Name of parse element.")
    optparser.add_option("--debug", default=False, action="store_true", dest="debug", help="")
    optparser.add_option("--reparse", default=False, action="store_true", dest="reparse", help="")
    group = OptionGroup(optparser, "Install Options", "")
    # BUG FIX: help text said "Install BANNER" (copy-paste from another
    # tool wrapper); this option installs the Stanford Parser.
    group.add_option("--install", default=None, action="store_true", dest="install", help="Install Stanford Parser")
    group.add_option("--installDir", default=None, dest="installDir", help="Install directory")
    group.add_option("--downloadDir", default=None, dest="downloadDir", help="Install files download directory")
    group.add_option("--redownload", default=False, action="store_true", dest="redownload", help="Redownload install files")
    optparser.add_option_group(group)
    (options, args) = optparser.parse_args()

    if options.install:
        install(options.installDir, options.downloadDir, redownload=options.redownload)
    else:
        convertXML(input=options.input, output=options.output, parser=options.parse, debug=options.debug, reparse=options.reparse)
475