1 import sys,os
2 import time
3 import shutil
4 import subprocess
5 import tempfile
6 import codecs
7 sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/..")
8 try:
9 import xml.etree.cElementTree as ET
10 except ImportError:
11 import cElementTree as ET
12 import Utils.ElementTreeUtils as ETUtils
13 import Utils.Settings as Settings
14 import Utils.Download as Download
15 import Tool
16 import StanfordParser
17 from ProcessUtils import *
18
19
# Penn Treebank escape sequences mapped back to the literal characters
# they stand for in the parser's bracketed output.
escDict = {
    "-LRB-": "(",
    "-RRB-": ")",
    "-LCB-": "{",
    "-RCB-": "}",
    "-LSB-": "[",
    "-RSB-": "]",
    "``": "\"",
    "''": "\"",
}
28
def install(destDir=None, downloadDir=None, redownload=False, updateLocalSettings=False):
    """
    Download, compile and install the BLLIP reranking parser and the
    McClosky biomedical parsing model.

    destDir: installation directory (defaults to Settings.DATAPATH + "/tools/BLLIP")
    downloadDir: directory for downloaded archives (defaults to
        Settings.DATAPATH + "/tools/download")
    redownload: NOTE(review) -- appears unused: the downloadAndExtract call
        below hard-codes its fifth positional argument to False. Confirm the
        signature of Utils.Download.downloadAndExtract before wiring this
        parameter through.
    updateLocalSettings: if True, Tool.finalizeInstall records the install
        paths in the local settings file.
    """
    url = Settings.URL["BLLIP_SOURCE"]
    if downloadDir == None:
        downloadDir = os.path.join(Settings.DATAPATH) + "/tools/download"
    if destDir == None:
        destDir = Settings.DATAPATH + "/tools/BLLIP"
    items = Download.downloadAndExtract(url, destDir, downloadDir + "/bllip.zip", None, False)
    print >> sys.stderr, "Installing BLLIP parser"
    # Compiling the parser requires 'make' and 'flex' on the system path
    Tool.testPrograms("BLLIP parser", ["make", "flex"], {"flex":"flex --version"})
    parserPath = Download.getTopDir(destDir, items)
    cwd = os.getcwd()
    os.chdir(parserPath)  # the BLLIP makefiles must be invoked from the source root
    print >> sys.stderr, "Compiling first-stage parser"
    subprocess.call("make", shell=True)
    print >> sys.stderr, "Compiling second-stage parser"
    subprocess.call("make reranker", shell=True)
    os.chdir(cwd)
    print >> sys.stderr, "Installing the McClosky biomedical parsing model"
    url = "http://bllip.cs.brown.edu/download/bioparsingmodel-rel1.tar.gz"
    Download.downloadAndExtract(url, destDir, downloadDir, None)
    bioModelDir = os.path.abspath(destDir + "/biomodel")
    # Smoke-test both parser stages against the biomodel, then record the
    # install locations (optionally persisting them to local settings).
    Tool.finalizeInstall(["first-stage/PARSE/parseIt", "second-stage/programs/features/best-parses"],
                         {"first-stage/PARSE/parseIt":"first-stage/PARSE/parseIt " + bioModelDir + "/parser/ < /dev/null",
                          "second-stage/programs/features/best-parses":"second-stage/programs/features/best-parses -l " + bioModelDir + "/reranker/features.gz " + bioModelDir + "/reranker/weights.gz < /dev/null"},
                         parserPath, {"BLLIP_PARSER_DIR":os.path.abspath(parserPath),
                         "MCCLOSKY_BIOPARSINGMODEL_DIR":bioModelDir}, updateLocalSettings)
56
    # NOTE(review): this is the body of readPenn(treeLine); its def line is
    # outside this view. Indentation/nesting was reconstructed from a
    # whitespace-mangled source -- verify against version control.
    global escDict
    # PTB escape sequences (e.g. "-LRB-") to map back to literal characters
    escSymbols = sorted(escDict.keys())
    tokens = []   # collected (tokenText, posTag, origTokenText) triples
    phrases = []  # collected (beginTokenIndex, endTokenIndex, phraseType) triples
    stack = []    # currently open phrases as (beginTokenIndex, phraseType)
    if treeLine.strip() != "":
        prevSplit = None
        tokenCount = 0
        splitCount = 0
        splits = treeLine.split()
        for split in splits:
            if split[0] != "(":  # leaf: token text, possibly followed by phrase-closing ")"s
                tokenText = split
                while tokenText[-1] == ")":  # strip trailing parentheses; each closes a phrase
                    tokenText = tokenText[:-1]
                    if tokenText[-1] == ")":
                        # NOTE(review): with this reconstructed nesting the
                        # final closing parenthesis does not pop the stack --
                        # confirm this matches the original behavior.
                        stackTop = stack.pop()
                        phrases.append( (stackTop[0], tokenCount, stackTop[1]) )
                origTokenText = tokenText
                for escSymbol in escSymbols:
                    tokenText = tokenText.replace(escSymbol, escDict[escSymbol])
                # The preceding split carries the POS label, e.g. "(NN"
                posText = prevSplit
                while posText[0] == "(":
                    posText = posText[1:]
                for escSymbol in escSymbols:
                    posText = posText.replace(escSymbol, escDict[escSymbol])
                tokens.append( (tokenText, posText, origTokenText) )
                tokenCount += 1
            elif splits[splitCount + 1][0] == "(":  # phrase opening: next split is not a leaf
                stack.append( (tokenCount, split[1:]) )
            prevSplit = split
            splitCount += 1
    return tokens, phrases
93
def insertTokens(tokens, sentence, tokenization, idStem="bt_", errorNotes=None):
    """
    Align parser tokens with the sentence text and append <token> elements
    (id, text, POS, charOffset) to the given tokenization element.

    tokens: (tokenText, posTag, origTokenText) triples as produced by readPenn
    sentence: interaction-XML sentence element whose "text" attribute is the
        alignment target
    tokenization: element that receives the created <token> children
    idStem: prefix for generated token ids ("bt_0", "bt_1", ...)
    errorNotes: extra context included in alignment error messages

    Returns True on success; on an alignment failure all tokens added so far
    are removed from the tokenization and False is returned.
    """
    tokenCount = 0
    start = 0         # search position in the sentence text
    prevStart = None  # search position used for the previous token
    for tokenText, posTag, origTokenText in tokens:
        sText = sentence.get("text")
        # First try to find the unescaped token text
        cStart = sText.find(tokenText, start)
        if cStart == -1:
            # Fall back to the original (escaped) token text
            cStart = sText.find(origTokenText, start)
        if cStart == -1 and prevStart != None:
            # Last resort: re-search from the previous token's position;
            # a hit here means the parser duplicated a token.
            cStart = sText.find(origTokenText, prevStart)
            if cStart != -1:
                start = prevStart
                print >> sys.stderr, "Token duplication", (tokenText, tokens, posTag, start, sText, errorNotes)
        if cStart == -1:
            # Alignment failed: roll back every token added to this
            # tokenization so no partial result is left behind.
            print >> sys.stderr, "Token alignment error", (tokenText, tokens, posTag, start, sText, errorNotes)
            for subElement in [x for x in tokenization]:
                tokenization.remove(subElement)
            return False
        # NOTE(review): the end offset is computed from the unescaped text even
        # when the match was found via origTokenText -- confirm intended.
        cEnd = cStart + len(tokenText)
        prevStart = start
        start = cStart + len(tokenText)
        token = ET.Element("token")
        token.set("id", idStem + str(tokenCount))
        token.set("text", tokenText)
        token.set("POS", posTag)
        token.set("charOffset", str(cStart) + "-" + str(cEnd))
        tokenization.append(token)
        tokenCount += 1
    return True
127
    # NOTE(review): this is the body of insertPhrases(phrases, parse,
    # tokenElements, idStem=...); the def line is outside this view.
    count = 0
    # phrases are (beginTokenIndex, endTokenIndex, type) tuples; sort for a
    # deterministic, document-ordered output
    phrases.sort()
    for phrase in phrases:
        phraseElement = ET.Element("phrase")
        phraseElement.set("type", phrase[2])
        phraseElement.set("id", idStem + str(count))
        phraseElement.set("begin", str(phrase[0]))
        phraseElement.set("end", str(phrase[1]))
        t1 = None
        t2 = None
        # Map token indices to token elements, guarding against indices that
        # fall past the end of the tokenization.
        if phrase[0] < len(tokenElements):
            t1 = tokenElements[phrase[0]]
        if phrase[1] < len(tokenElements):
            t2 = tokenElements[phrase[1]]
        if t1 != None and t2 != None:
            # Character span: first token's start offset to last token's end offset
            phraseElement.set("charOffset", t1.get("charOffset").split("-")[0] + "-" + t2.get("charOffset").split("-")[-1])
        parse.append(phraseElement)
        count += 1
147
148
149
def insertParse(sentence, treeLine, parseName="McCC", tokenizationName = None, makePhraseElements=True, extraAttributes=None, docId=None):
    """
    Insert one Penn-treebank parse line into a sentence's analyses element.

    sentence: interaction-XML sentence element
    treeLine: one bracketed parse tree as a string; an empty/whitespace line
        marks a failed parse
    parseName: value of the "parser" attribute for the new <parse> element
    tokenizationName: name of an existing tokenization to reuse; if None,
        a new tokenization is created from the parse's leaves
    makePhraseElements: if True, add <phrase> elements for the parse
    extraAttributes: extra attributes copied onto the parse (and any new
        tokenization) elements; defaults to no extra attributes
    docId: document id used only in error messages

    Returns True if a parse was inserted, False for an empty treeLine (the
    <parse> element is still created, with an empty "pennstring").
    """
    # BUG FIX: was a mutable default argument (extraAttributes={}), which is
    # shared across calls; use a None sentinel instead.
    if extraAttributes == None:
        extraAttributes = {}
    # Get or create the analyses container
    analyses = setDefaultElement(sentence, "analyses")
    # Refuse to insert a duplicate parse with the same parser name
    for prevParse in analyses.findall("parse"):
        assert prevParse.get("parser") != parseName
    parse = ET.Element("parse")
    parse.set("parser", parseName)
    if tokenizationName == None:
        parse.set("tokenizer", parseName)
    else:
        parse.set("tokenizer", tokenizationName)
    analyses.insert(getPrevElementIndex(analyses, "parse"), parse)
    # The full bracketed tree is preserved verbatim in the "pennstring" attribute
    parse.set("pennstring", treeLine.strip())
    for attr in sorted(extraAttributes.keys()):
        parse.set(attr, extraAttributes[attr])
    if treeLine.strip() == "":
        return False  # parse failed for this sentence
    else:
        tokens, phrases = readPenn(treeLine)
        if tokenizationName == None:
            # NOTE(review): this assert compares against tokenizationName,
            # which is None in this branch, so it can only fire if an existing
            # tokenization has no "tokenizer" attribute; comparing against
            # parseName may have been intended.
            for prevTokenization in analyses.findall("tokenization"):
                assert prevTokenization.get("tokenizer") != tokenizationName
            tokenization = ET.Element("tokenization")
            tokenization.set("tokenizer", parseName)
            for attr in sorted(extraAttributes.keys()):
                tokenization.set(attr, extraAttributes[attr])
            # Keep the tokenization next to its parse element
            analyses.insert(getElementIndex(analyses, parse), tokenization)
            insertTokens(tokens, sentence, tokenization, errorNotes=(sentence.get("id"), docId))
        else:
            # Reuse the caller-specified, pre-existing tokenization
            tokenization = getElementByAttrib(analyses, "tokenization", {"tokenizer":tokenizationName})
        if makePhraseElements:
            insertPhrases(phrases, parse, tokenization.findall("token"))
    return True
192
def runBLLIPParser(input, output, tokenizer=False, pathBioModel=None):
    """
    Launch the two-stage BLLIP pipeline (first-stage parser piped into the
    reranker) on the given input file, writing best parses to output.

    input: path of the file with one sentence per line
    output: path for the reranked parse trees
    tokenizer: if True the parser tokenizes the text itself; if False the
        input is assumed pre-tokenized (-K flag)
    pathBioModel: path of the McClosky biomodel directory (must exist)

    Returns a ProcessWrapper around both running stages.
    """
    if tokenizer:
        print >> sys.stderr, "Running BLLIP parser with tokenization"
    else:
        print >> sys.stderr, "Running BLLIP parser without tokenization"
    assert os.path.exists(pathBioModel), pathBioModel
    # First stage: -l999 raises the sentence length limit, -N50 keeps the
    # 50-best parses for reranking, -K skips tokenization.
    firstStageArgs = ["first-stage/PARSE/parseIt", "-l999", "-N50"]
    if not tokenizer:
        firstStageArgs.append("-K")
    firstStageArgs.append(pathBioModel + "/parser/")
    # Second stage: rerank the n-best list with the biomodel's features/weights
    secondStageArgs = ["second-stage/programs/features/best-parses",
                       "-l",
                       pathBioModel + "/reranker/features.gz",
                       pathBioModel + "/reranker/weights.gz"]
    firstStage = subprocess.Popen(firstStageArgs,
                                  stdin=codecs.open(input, "rt", "utf-8"),
                                  stdout=subprocess.PIPE)
    secondStage = subprocess.Popen(secondStageArgs,
                                   stdin=firstStage.stdout,
                                   stdout=codecs.open(output, "wt", "utf-8"))
    return ProcessWrapper([firstStage, secondStage])
217
def getSentences(corpusRoot, requireEntities=False, skipIds=[], skipParsed=True):
    """
    Generator over the corpus's <sentence> elements, filtering out sentences
    whose id is in skipIds, sentences without entities (when requireEntities
    is True), and sentences that already have a McCC parse (when skipParsed
    is True).
    """
    for sentence in corpusRoot.getiterator("sentence"):
        sentenceId = sentence.get("id")
        if sentenceId in skipIds:
            print >> sys.stderr, "Skipping sentence", sentenceId
            continue
        if requireEntities and sentence.find("entity") == None:
            continue
        if skipParsed and ETUtils.getElementByAttrib(sentence, "parse", {"parser":"McCC"}) != None:
            continue
        yield sentence
230
def parse(input, output=None, tokenizationName=None, parseName="McCC", requireEntities=False, skipIds=[], skipParsed=True, timeout=600, makePhraseElements=True, debug=False, pathParser=None, pathBioModel=None, timestamp=True):
    """
    Parse an interaction-XML corpus with the BLLIP parser and insert the
    resulting parses (and optionally tokenizations/phrases) into the XML.

    input: corpus file path or ElementTree object
    output: if given, the updated corpus is written here
    tokenizationName: None = let the parser tokenize; "PARSED_TEXT" = parse
        the raw sentence text without tokenization; any other value = use
        that existing tokenization
    parseName: name recorded on the inserted <parse> elements
    requireEntities/skipIds/skipParsed: sentence filters, see getSentences
    timeout: per-run timeout passed to runSentenceProcess
    debug: keep the temporary working directory and print its path
    pathParser/pathBioModel: override the locations from Settings
    timestamp: if True, stamp each parse with the run's date

    Returns the corpus ElementTree.
    """
    global escDict
    print >> sys.stderr, "BLLIP parser"
    parseTimeStamp = time.strftime("%d.%m.%y %H:%M:%S")
    print >> sys.stderr, "BLLIP time stamp:", parseTimeStamp
    # Resolve tool locations from local settings when not given explicitly
    if pathParser == None:
        pathParser = Settings.BLLIP_PARSER_DIR
    print >> sys.stderr, "BLLIP parser at:", pathParser
    if pathBioModel == None:
        pathBioModel = Settings.MCCLOSKY_BIOPARSINGMODEL_DIR
    print >> sys.stderr, "Biomodel at:", pathBioModel
    if requireEntities:
        print >> sys.stderr, "Parsing only sentences with entities"
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    # Write the parser input (one "<s> ... </s>" line per sentence) into a
    # temporary working directory.
    workdir = tempfile.mkdtemp()
    if debug:
        print >> sys.stderr, "BLLIP parser workdir", workdir
    infileName = os.path.join(workdir, "parser-input.txt")
    infile = codecs.open(infileName, "wt", "utf-8")
    numCorpusSentences = 0
    if tokenizationName == None or tokenizationName == "PARSED_TEXT":
        # Use the raw sentence text as parser input
        if tokenizationName == None:
            print >> sys.stderr, "Parser does the tokenization"
        else:
            print >> sys.stderr, "Parsing tokenized text"
        for sentence in getSentences(corpusRoot, requireEntities, skipIds, skipParsed):
            infile.write("<s> " + sentence.get("text") + " </s>\n")
            numCorpusSentences += 1
    else:
        # Reconstruct the input from an existing tokenization's tokens
        print >> sys.stderr, "Using existing tokenization", tokenizationName
        for sentence in getSentences(corpusRoot, requireEntities, skipIds, skipParsed):
            tokenization = getElementByAttrib(sentence.find("analyses"), "tokenization", {"tokenizer":tokenizationName})
            assert tokenization.get("tokenizer") == tokenizationName
            s = ""
            for token in tokenization.findall("token"):
                s += token.get("text") + " "
            infile.write("<s> " + s + "</s>\n")
            numCorpusSentences += 1
    infile.close()
    # The BLLIP binaries are invoked with paths relative to the parser
    # install directory, so run from there.
    cwd = os.getcwd()
    os.chdir(pathParser)
    if tokenizationName == None:
        bllipOutput = runSentenceProcess(runBLLIPParser, pathParser, infileName, workdir, False, "BLLIPParser", "Parsing", timeout=timeout, processArgs={"tokenizer":True, "pathBioModel":pathBioModel})
    else:
        if tokenizationName == "PARSED_TEXT":
            # From here on treat the text as parser-tokenized output
            tokenizationName = None
        bllipOutput = runSentenceProcess(runBLLIPParser, pathParser, infileName, workdir, False, "BLLIPParser", "Parsing", timeout=timeout, processArgs={"tokenizer":False, "pathBioModel":pathBioModel})
    os.chdir(cwd)
    treeFile = codecs.open(bllipOutput, "rt", "utf-8")
    print >> sys.stderr, "Inserting parses"
    # Re-iterate the same filtered sentences so output lines stay aligned
    # with the sentences that were written to the parser input.
    failCount = 0
    for sentence in getSentences(corpusRoot, requireEntities, skipIds, skipParsed):
        treeLine = treeFile.readline()
        extraAttributes={"source":"TEES"}
        if timestamp:
            extraAttributes["date"] = parseTimeStamp
        if not insertParse(sentence, treeLine, parseName, makePhraseElements=makePhraseElements, extraAttributes=extraAttributes):
            failCount += 1
    treeFile.close()
    if not debug:
        shutil.rmtree(workdir)
    print >> sys.stderr, "Parsed", numCorpusSentences, "sentences (" + str(failCount) + " failed)"
    if failCount == 0:
        print >> sys.stderr, "All sentences were parsed succesfully"
    else:
        print >> sys.stderr, "Warning, parsing failed for", failCount, "out of", numCorpusSentences, "sentences"
        print >> sys.stderr, "The \"pennstring\" attribute of these sentences has an empty string."
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
325
def insertParses(input, parsePath, output=None, parseName="McCC", tokenizationName = None, makePhraseElements=True, extraAttributes={}):
    """
    Insert pre-existing parse trees from files into an interaction-XML corpus.

    For each document (or section), a file named <origId>.ptb (or .pstree) is
    read from parsePath -- optionally from inside a .tar.gz archive -- and its
    lines are inserted one-per-sentence via insertParse. The previous
    docstring here described sentence splitting and was a copy-paste error.

    input: corpus file path or ElementTree object
    parsePath: directory, or "<archive>.tar.gz/<subpath>" to read from a tar
    output: if given, the updated corpus is written here

    Returns the corpus ElementTree.
    """
    import tarfile
    from SentenceSplitter import openFile
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    print >> sys.stderr, "Inserting parses from", parsePath
    if parsePath.find(".tar.gz") != -1:
        # Split "<archive>.tar.gz/<inner path>" into archive and member path
        tarFilePath, parsePath = parsePath.split(".tar.gz")
        tarFilePath += ".tar.gz"
        tarFile = tarfile.open(tarFilePath)
        if parsePath[0] == "/":
            parsePath = parsePath[1:]
    else:
        tarFile = None
    docCount = 0
    failCount = 0
    docsWithSentences = 0
    numCorpusSentences = 0
    sentencesCreated = 0
    sourceElements = [x for x in corpusRoot.getiterator("document")] + [x for x in corpusRoot.getiterator("section")]
    counter = ProgressCounter(len(sourceElements), "McCC Parse Insertion")
    for document in sourceElements:
        docCount += 1
        # Prefer the PubMed id, fall back to the generic original id
        origId = document.get("pmid")
        if origId == None:
            origId = document.get("origId")
        origId = str(origId)
        counter.update(1, "Processing Documents ("+document.get("id")+"/" + origId + "): ")
        docId = document.get("id")
        if docId == None:
            docId = "CORPUS.d" + str(docCount)
        # Try the .ptb extension first, then .pstree; skip the document if
        # neither parse file exists.
        f = openFile(os.path.join(parsePath, origId + ".ptb"), tarFile)
        if f == None:
            f = openFile(os.path.join(parsePath, origId + ".pstree"), tarFile)
            if f == None:
                continue
        parseStrings = f.readlines()
        f.close()
        sentences = document.findall("sentence")
        numCorpusSentences += len(sentences)
        # The parse file must contain exactly one tree line per sentence
        assert len(sentences) == len(parseStrings)
        for sentence, treeLine in zip(sentences, parseStrings):
            if not insertParse(sentence, treeLine, makePhraseElements=makePhraseElements, extraAttributes=extraAttributes, docId=origId):
                failCount += 1
    if tarFile != None:
        tarFile.close()
    print >> sys.stderr, "Inserted parses for", numCorpusSentences, "sentences (" + str(failCount) + " failed)"
    if failCount == 0:
        print >> sys.stderr, "All sentences have a parse"
    else:
        print >> sys.stderr, "Warning, a failed parse exists for", failCount, "out of", numCorpusSentences, "sentences"
        print >> sys.stderr, "The \"pennstring\" attribute of these sentences has an empty string."
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
398
if __name__=="__main__":
    # Command-line entry point: parse a corpus with BLLIP, or install the
    # parser and biomodel with --install.
    import sys
    from optparse import OptionParser, OptionGroup

    # Psyco (a Python 2 JIT compiler) is used when available but is optional
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(description="BLLIP parser wrapper")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in interaction xml format", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file in interaction xml format.")
    optparser.add_option("-t", "--tokenization", default=None, dest="tokenization", help="Name of tokenization element.")
    optparser.add_option("-s", "--stanford", default=False, action="store_true", dest="stanford", help="Run stanford conversion.")
    optparser.add_option("--timestamp", default=False, action="store_true", dest="timestamp", help="Mark parses with a timestamp.")
    optparser.add_option("--pathParser", default=None, dest="pathParser", help="")
    optparser.add_option("--pathBioModel", default=None, dest="pathBioModel", help="")
    group = OptionGroup(optparser, "Install Options", "")
    # BUG FIX: the help text said "Install BANNER" -- a copy-paste from the
    # BANNER wrapper; this option installs the BLLIP parser.
    group.add_option("--install", default=None, action="store_true", dest="install", help="Install BLLIP parser")
    group.add_option("--installDir", default=None, dest="installDir", help="Install directory")
    group.add_option("--downloadDir", default=None, dest="downloadDir", help="Install files download directory")
    group.add_option("--redownload", default=False, action="store_true", dest="redownload", help="Redownload install files")
    optparser.add_option_group(group)
    (options, args) = optparser.parse_args()

    if options.install:
        install(options.installDir, options.downloadDir, redownload=options.redownload)
    else:
        xml = parse(input=options.input, output=options.output, tokenizationName=options.tokenization, pathParser=options.pathParser, pathBioModel=options.pathBioModel, timestamp=options.timestamp)
        if options.stanford:
            # Optionally derive Stanford dependencies from the BLLIP parses
            import StanfordParser
            StanfordParser.convertXML(parser="McClosky", input=xml, output=options.output)
434