1 import sys, os
2 import shutil
3 import subprocess
4 import tempfile
5 import tarfile
6 import codecs
7 from ProcessUtils import *
8 try:
9 import xml.etree.cElementTree as ET
10 except ImportError:
11 import cElementTree as ET
12 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)),"..")))
13 import Utils.ElementTreeUtils as ETUtils
14 import Utils.Settings as Settings
15 import Utils.Download as Download
16 import Utils.Settings as Settings
17 import Tool
18
19
20
21
22
23
24
25
26
27
# Penn Treebank escape tokens mapped back to the literal characters they
# stand for (bracket placeholders and LaTeX-style quotes).
escDict = {
    "-LRB-": "(",
    "-RRB-": ")",
    "-LCB-": "{",
    "-RCB-": "}",
    "-LSB-": "[",
    "-RSB-": "]",
    "``": "\"",
    "''": "\"",
}
36
def install(destDir=None, downloadDir=None, redownload=False, updateLocalSettings=False):
    """
    Download and install the Stanford Parser tools.

    destDir: installation directory (default Settings.DATAPATH/tools/).
    downloadDir: directory for the downloaded archive (default Settings.DATAPATH/tools/download/).
    redownload: if True, re-download the archive even if already present.
    updateLocalSettings: if True, store STANFORD_PARSER_DIR in local settings.
    """
    print >> sys.stderr, "Installing Stanford Parser"
    if downloadDir == None:
        downloadDir = os.path.join(Settings.DATAPATH, "tools/download/")
    if destDir == None:
        destDir = os.path.join(Settings.DATAPATH, "tools/")
    # BUG FIX: 'redownload' was accepted but never forwarded, so a forced
    # re-download request was silently ignored.
    items = Download.downloadAndExtract(Settings.URL["STANFORD_PARSER"], destDir, downloadDir, redownload=redownload)
    stanfordPath = Download.getTopDir(destDir, items)
    # Sanity-check the install by invoking the dependency converter once.
    Tool.finalizeInstall(["stanford-parser.jar"],
                         {"stanford-parser.jar":"java -cp stanford-parser.jar edu.stanford.nlp.trees.EnglishGrammaticalStructure"},
                         stanfordPath, {"STANFORD_PARSER_DIR":stanfordPath}, updateLocalSettings)
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# NOTE(review): the enclosing function header (original line 72) is missing
# from this chunk. Judging by the names used here and by the
# runSentenceProcess(runStanford, ...) call in convertXML, it is presumably
# "def runStanford(input, output, stanfordParserArgs):" -- confirm against the
# full file before editing. The surviving body line launches the Stanford
# converter as a subprocess with its stdout redirected to a UTF-8 file.
return subprocess.Popen(stanfordParserArgs + [input], stdout=codecs.open(output, "wt", "utf-8"))
78
79
# NOTE(review): the function header (original line 80, presumably
# "def getUnicode(string):" given the getUnicode(line) calls elsewhere in
# this file) is missing from this chunk -- confirm against the full file.
# Best-effort re-decode of a line that was read as latin-1 back into proper
# unicode; on any failure the input is returned unchanged.
try:
    string = string.encode('raw_unicode_escape').decode('utf-8')
except:  # NOTE(review): bare except -- also swallows KeyboardInterrupt etc.
    pass
return string
86
def addDependencies(outfile, parse, tokenByIndex=None, sentenceId=None, skipExtra=0):
    """
    Read one sentence's worth of Stanford dependency lines from 'outfile' and
    insert them as <dependency> elements into the 'parse' element.

    outfile: open file-like object with Stanford conversion output, one
             "depType(govWord-i, depWord-j)" line per dependency; sentences
             are separated by blank lines.
    parse: the parse XML element the new <dependency> elements go into.
    tokenByIndex: optional map {0-based token position: token element} used to
                  resolve the 1-based Stanford indices to existing token ids;
                  if None, synthetic "bt_<i>" ids are used instead.
    sentenceId: identifier used only in diagnostic messages.
    skipExtra: number of extra lines to consume when the first line read is blank.
    Returns the list of created <dependency> elements; returns [] when a token
    index cannot be resolved (the rest of the sentence's lines are drained).
    """
    global escDict
    escSymbols = sorted(escDict.keys())  # PTB escape tokens to undo in token text

    # Token texts are collected only for use in the diagnostic messages below.
    tokens = []
    for key in sorted(tokenByIndex):
        tokens.append(tokenByIndex[key].get("text"))

    depCount = 1
    line = outfile.readline()
    line = getUnicode(line)  # best-effort decode, see getUnicode
    deps = []
    # NOTE(review): when the first line is blank the while-loop below never
    # executes, so skipExtra only consumes separator lines of an empty parse.
    if line.strip() == "" and skipExtra > 0:
        for i in range(skipExtra):
            outfile.readline()
    while line.strip() != "":
        # Each line has the form: depType(govWord-govIndex, depWord-depIndex)
        depType, rest = line.strip()[:-1].split("(")
        t1, t2 = rest.split(", ")
        t1Word, t1Index = t1.rsplit("-", 1)
        for escSymbol in escSymbols:
            t1Word = t1Word.replace(escSymbol, escDict[escSymbol])
        # Strip any non-digit suffix from the index (e.g. the prime marker
        # Stanford appends to copied nodes).
        while not t1Index[-1].isdigit(): t1Index = t1Index[:-1]
        t1Index = int(t1Index)
        t2Word, t2Index = t2.rsplit("-", 1)
        for escSymbol in escSymbols:
            t2Word = t2Word.replace(escSymbol, escDict[escSymbol])
        while not t2Index[-1].isdigit(): t2Index = t2Index[:-1]
        t2Index = int(t2Index)

        # The artificial "root" dependency is not stored as an element.
        if depType != "root":
            dep = ET.Element("dependency")
            dep.set("id", "sd_" + str(depCount))
            alignmentError = False
            if tokenByIndex != None:
                # A missing token index is fatal for this sentence: discard
                # all dependencies and drain the remaining lines.
                if t1Index-1 not in tokenByIndex:
                    print >> sys.stderr, "Token not found", (t1Word, depCount, sentenceId)
                    deps = []
                    while line.strip() != "": line = outfile.readline()
                    break
                if t2Index-1 not in tokenByIndex:
                    print >> sys.stderr, "Token not found", (t2Word, depCount, sentenceId)
                    deps = []
                    while line.strip() != "": line = outfile.readline()
                    break
                # A text mismatch is non-fatal: the dependency element is
                # still inserted but excluded from the returned list, and the
                # first offending word is recorded on the parse element.
                if t1Word != tokenByIndex[t1Index-1].get("text"):
                    print >> sys.stderr, "Alignment error", (t1Word, tokenByIndex[t1Index-1].get("text"), t1Index-1, depCount, sentenceId, tokens)
                    alignmentError = True
                    if parse.get("stanfordAlignmentError") == None:
                        parse.set("stanfordAlignmentError", t1Word)
                if t2Word != tokenByIndex[t2Index-1].get("text"):
                    print >> sys.stderr, "Alignment error", (t2Word, tokenByIndex[t2Index-1].get("text"), t2Index-1, depCount, sentenceId, tokens)
                    alignmentError = True
                    if parse.get("stanfordAlignmentError") == None:
                        parse.set("stanfordAlignmentError", t2Word)
                dep.set("t1", tokenByIndex[t1Index-1].get("id"))
                dep.set("t2", tokenByIndex[t2Index-1].get("id"))
            else:
                # No token map: fall back to synthetic position-based ids.
                dep.set("t1", "bt_" + str(t1Index))
                dep.set("t2", "bt_" + str(t2Index))
            dep.set("type", depType)
            parse.insert(depCount-1, dep)
            depCount += 1
            if not alignmentError:
                deps.append(dep)
        line = outfile.readline()
        try:
            line = getUnicode(line)
        except:
            # Debugging aid: dump the raw line before re-raising, since
            # decoding failures here are otherwise hard to trace.
            print "Type", type(line)
            print "Repr", repr(line)
            print line
            raise
    return deps
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
def convertXML(parser, input, output=None, debug=False, reparse=False, stanfordParserDir=None, stanfordParserArgs=None):
    """
    Convert the Penn-tree strings of an interaction-XML corpus into Stanford
    dependencies by running the Stanford EnglishGrammaticalStructure tool.

    parser: name of the parse element whose "pennstring" attribute is converted.
    input: corpus file name or ElementTree object.
    output: if given, the modified corpus is also written to this file.
    debug: keep (and report) the temporary work directory instead of deleting it.
    reparse: remove and regenerate dependencies for already-converted sentences.
    stanfordParserDir / stanfordParserArgs: override tool location and command line.
    Returns the corpus ElementTree.
    """
    if stanfordParserDir == None:
        stanfordParserDir = Settings.STANFORD_PARSER_DIR
    if stanfordParserArgs == None:
        # -keepPunct and -CCprocessed match the collapsed-with-punctuation
        # dependency format expected by addDependencies.
        stanfordParserArgs = ["java", "-mx500m", "-cp", "stanford-parser.jar",
                              "edu.stanford.nlp.trees.EnglishGrammaticalStructure",
                              "-CCprocessed", "-keepPunct", "-treeFile"]
    print >> sys.stderr, "Running Stanford conversion"
    print >> sys.stderr, "Stanford tools at:", stanfordParserDir
    print >> sys.stderr, "Stanford tools arguments:", " ".join(stanfordParserArgs)
    # NOTE(review): 'time' is not imported at the top of this file; it is
    # presumably re-exported by "from ProcessUtils import *" -- confirm.
    parseTimeStamp = time.strftime("%d.%m.%y %H:%M:%S")
    print >> sys.stderr, "Stanford time stamp:", parseTimeStamp

    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    # All Penn trees are written to one temp file, converted in a single
    # external-process run, then read back sentence by sentence.
    workdir = tempfile.mkdtemp()
    if debug:
        print >> sys.stderr, "Stanford parser workdir", workdir
    stanfordInput = os.path.join(workdir, "input")
    stanfordInputFile = codecs.open(stanfordInput, "wt", "utf-8")

    # Pass 1: collect the Penn tree of every sentence that needs conversion.
    existingCount = 0
    for sentence in corpusRoot.getiterator("sentence"):
        if sentence.find("sentenceanalyses") != None: # old-style analysis container
            sentenceAnalyses = setDefaultElement(sentence, "sentenceanalyses")
            parses = setDefaultElement(sentenceAnalyses, "parses")
            parse = getElementByAttrib(parses, "parse", {"parser":parser})
        else:
            analyses = setDefaultElement(sentence, "analyses")
            parse = getElementByAttrib(analyses, "parse", {"parser":parser})
        if parse == None:
            continue
        if len(parse.findall("dependency")) > 0:
            if reparse: # remove existing stanford conversion
                for dep in parse.findall("dependency"):
                    parse.remove(dep)
                del parse.attrib["stanford"]
            else: # don't reparse
                existingCount += 1
                continue
        pennTree = parse.get("pennstring")
        if pennTree == None or pennTree == "":
            continue
        stanfordInputFile.write(pennTree + "\n")
    stanfordInputFile.close()
    if existingCount != 0:
        print >> sys.stderr, "Skipping", existingCount, "already converted sentences."

    # Run the converter on the collected trees; output is read as latin-1
    # with replacement to survive malformed bytes.
    stanfordOutput = runSentenceProcess(runStanford, stanfordParserDir, stanfordInput,
                                        workdir, True, "StanfordParser",
                                        "Stanford Conversion", timeout=600,
                                        outputArgs={"encoding":"latin1", "errors":"replace"},
                                        processArgs={"stanfordParserArgs":stanfordParserArgs})

    stanfordOutputFile = codecs.open(stanfordOutput, "rt", "latin1", "replace")

    # Pass 2: iterate the same sentences in the same order and attach the
    # dependencies read sequentially from the converter's output.
    noDepCount = 0
    failCount = 0
    sentenceCount = 0
    for document in corpusRoot.findall("document"):
        for sentence in document.findall("sentence"):
            if sentence.find("sentenceanalyses") != None: # old-style analysis container
                sentenceAnalyses = setDefaultElement(sentence, "sentenceanalyses")
                parses = setDefaultElement(sentenceAnalyses, "parses")
                parse = getElementByAttrib(parses, "parse", {"parser":parser})
            else:
                analyses = setDefaultElement(sentence, "analyses")
                parse = getElementByAttrib(analyses, "parse", {"parser":parser})
            if parse == None:
                # NOTE(review): 'analyses' is unbound here if the
                # "sentenceanalyses" branch was taken above -- confirm whether
                # old-format corpora can reach this line.
                parse = ET.SubElement(analyses, "parse")
                parse.set("parser", "None")
            if reparse:
                assert len(parse.findall("dependency")) == 0
            elif len(parse.findall("dependency")) > 0: # don't reparse
                continue
            pennTree = parse.get("pennstring")
            if pennTree == None or pennTree == "":
                parse.set("stanford", "no_penn")
                continue
            parse.set("stanfordSource", "TEES")
            parse.set("stanfordDate", parseTimeStamp)

            # Locate the tokenization matching this parse for index alignment.
            if sentence.find("analyses") != None:
                tokenization = getElementByAttrib(sentence.find("analyses"), "tokenization", {"tokenizer":parse.get("tokenizer")})
            else:
                tokenization = getElementByAttrib(sentence.find("sentenceanalyses").find("tokenizations"), "tokenization", {"tokenizer":parse.get("tokenizer")})
            assert tokenization != None
            count = 0
            tokenByIndex = {}
            for token in tokenization.findall("token"):
                tokenByIndex[count] = token
                count += 1

            origId = document.get("pmid")
            if origId == None:
                origId = document.get("origId")
            origId = str(origId)
            deps = addDependencies(stanfordOutputFile, parse, tokenByIndex, (sentence.get("id"), origId))
            if len(deps) == 0:
                parse.set("stanford", "no_dependencies")
                noDepCount += 1
                if parse.get("stanfordAlignmentError") != None:
                    failCount += 1
            else:
                parse.set("stanford", "ok")
                if parse.get("stanfordAlignmentError") != None:
                    failCount += 1
                    parse.set("stanford", "partial")
            sentenceCount += 1
    stanfordOutputFile.close()

    if not debug:
        shutil.rmtree(workdir)

    print >> sys.stderr, "Stanford conversion was done for", sentenceCount, "sentences,", noDepCount, "had no dependencies,", failCount, "failed"

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
335
def insertParse(sentence, stanfordOutputFile, parser, extraAttributes={}, skipExtra=0):
    """
    Read one sentence's dependencies from an open Stanford output file and
    attach them to the sentence's parse element, creating the parse element
    if it does not exist yet. Any previously stored dependencies are removed
    first. Always returns True.
    """
    analyses = setDefaultElement(sentence, "analyses")
    parse = getElementByAttrib(analyses, "parse", {"parser":parser})
    if parse == None:
        # No parse for this parser yet: create a placeholder element.
        parse = ET.SubElement(analyses, "parse")
        parse.set("parser", "None")

    # Clear out any dependencies inserted by an earlier run.
    oldDependencies = parse.findall("dependency")
    if len(oldDependencies) > 0:
        for oldDependency in oldDependencies:
            parse.remove(oldDependency)

    pennString = parse.get("pennstring")
    if pennString == None or pennString == "":
        parse.set("stanford", "no_penn")

    # Apply caller-supplied attributes in a deterministic order.
    for attrName in sorted(extraAttributes.keys()):
        parse.set(attrName, extraAttributes[attrName])

    # Map 0-based token positions to token elements for index alignment.
    tokenByIndex = {}
    tokenization = getElementByAttrib(sentence.find("analyses"), "tokenization", {"tokenizer":parse.get("tokenizer")})
    if tokenization != None:
        for position, token in enumerate(tokenization.findall("token")):
            tokenByIndex[position] = token

    deps = addDependencies(stanfordOutputFile, parse, tokenByIndex, (sentence.get("id"), sentence.get("origId")), skipExtra=skipExtra)
    parse.set("stanford", "no_dependencies" if len(deps) == 0 else "ok")
    return True
376
def insertParses(input, parsePath, output=None, parseName="McCC", extraAttributes={}, skipExtra=0):
    """
    Insert pre-computed Stanford dependency files into the parse elements of
    an interaction-XML corpus.

    input: corpus file name or ElementTree object.
    parsePath: directory -- or "<archive>.tar.gz/<subdir>" path inside a
               tar.gz archive -- containing one ".sd" or ".dep" file per
               document, named by the document's PMID/origId.
    output: if given, the modified corpus is also written to this file.
    parseName: value of the "parser" attribute of the parse elements to fill.
    extraAttributes: extra attributes to set on each processed parse element.
    skipExtra: passed through to addDependencies (extra lines to skip).
    Returns the corpus ElementTree.
    """
    # NOTE: docstring above replaces one copy-pasted from SentenceSplitter
    # that wrongly described sentence splitting.
    import tarfile
    from SentenceSplitter import openFile
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    print >> sys.stderr, "Inserting parses from", parsePath
    # Support paths that point inside a tar.gz archive: "<file>.tar.gz/<dir>".
    if parsePath.find(".tar.gz") != -1:
        tarFilePath, parsePath = parsePath.split(".tar.gz")
        tarFilePath += ".tar.gz"
        tarFile = tarfile.open(tarFilePath)
        if parsePath[0] == "/":
            parsePath = parsePath[1:]
    else:
        tarFile = None

    docCount = 0
    failCount = 0
    sentenceCount = 0
    docsWithStanford = 0
    sentencesCreated = 0
    sourceElements = [x for x in corpusRoot.getiterator("document")] + [x for x in corpusRoot.getiterator("section")]
    counter = ProgressCounter(len(sourceElements), "McCC Parse Insertion")
    for document in sourceElements:
        docCount += 1
        docId = document.get("id")
        origId = document.get("pmid")
        if origId == None:
            origId = document.get("origId")
        origId = str(origId)
        if docId == None:
            docId = "CORPUS.d" + str(docCount)

        # Dependency files may use either the ".sd" or the ".dep" extension.
        f = openFile(os.path.join(parsePath, origId + ".sd"), tarFile)
        if f == None:
            f = openFile(os.path.join(parsePath, origId + ".dep"), tarFile)
        if f != None:
            sentences = document.findall("sentence")
            for sentence in sentences:
                sentenceCount += 1
                counter.update(0, "Processing Documents ("+sentence.get("id")+"/" + origId + "): ")
                # BUG FIX: the caller-supplied extraAttributes were previously
                # discarded (a literal empty dict was passed instead).
                if not insertParse(sentence, f, parseName, extraAttributes=extraAttributes, skipExtra=skipExtra):
                    failCount += 1
            f.close()
        counter.update(1, "Processing Documents ("+document.get("id")+"/" + origId + "): ")

    if tarFile != None:
        tarFile.close()

    print >> sys.stderr, "Stanford conversion was inserted to", sentenceCount, "sentences"

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
443
444
if __name__=="__main__":
    # Command-line entry point: either install the Stanford tools or run the
    # dependency conversion over a corpus.
    import sys
    from optparse import OptionParser, OptionGroup

    # Optional speed-up on Python 2 interpreters that have Psyco installed.
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(description="Stanford Parser dependency converter wrapper")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in interaction xml format", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file in interaction xml format.")
    optparser.add_option("-p", "--parse", default=None, dest="parse", help="Name of parse element.")
    optparser.add_option("--debug", default=False, action="store_true", dest="debug", help="")
    optparser.add_option("--reparse", default=False, action="store_true", dest="reparse", help="")
    group = OptionGroup(optparser, "Install Options", "")
    # BUG FIX: help text said "Install BANNER" (copy-paste from another
    # tool wrapper); this option installs the Stanford Parser.
    group.add_option("--install", default=None, action="store_true", dest="install", help="Install Stanford Parser")
    group.add_option("--installDir", default=None, dest="installDir", help="Install directory")
    group.add_option("--downloadDir", default=None, dest="downloadDir", help="Install files download directory")
    group.add_option("--redownload", default=False, action="store_true", dest="redownload", help="Redownload install files")
    optparser.add_option_group(group)
    (options, args) = optparser.parse_args()

    if options.install:
        install(options.installDir, options.downloadDir, redownload=options.redownload)
    else:
        convertXML(input=options.input, output=options.output, parser=options.parse, debug=options.debug, reparse=options.reparse)
475