1 parse__version__ = "$Revision: 1.3 $"
2
3 import sys,os
4 import time, datetime
5 import sys
6 try:
7 import xml.etree.cElementTree as ET
8 except ImportError:
9 import cElementTree as ET
10 sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/..")
11 import Utils.ElementTreeUtils as ETUtils
12
13 import shutil
14 import subprocess
15 import tempfile
16 import codecs
17
18 import Utils.Settings as Settings
19 import Utils.Download as Download
20 import Tool
21
22
25
def install(destDir=None, downloadDir=None, redownload=False, compile=False, javaHome=None, updateLocalSettings=False):
    """
    Download BANNER (optionally building it from source) and finalize the install.

    destDir -- root directory of the installation; BANNER ends up in
               destDir + "/tools/BANNER". Defaults to Settings.DATAPATH.
    downloadDir -- directory for downloaded archives. Defaults to
                   "tools/download" under Settings.DATAPATH.
    redownload -- passed on to Download.downloadAndExtract.
    compile -- if true, fetch the source distribution and build it with ANT,
               otherwise fetch the precompiled distribution.
    javaHome -- value of JAVA_HOME for the ANT build; None or a blank string
                leaves the environment unchanged.
    updateLocalSettings -- passed on to Tool.finalizeInstall.
    """
    print >> sys.stderr, "Installing BANNER"
    if downloadDir is None:
        downloadDir = os.path.join(Settings.DATAPATH, "tools/download")
    if destDir is None:
        destDir = Settings.DATAPATH
    bannerDir = destDir + "/tools/BANNER"
    if compile:
        Download.downloadAndExtract(Settings.URL["BANNER_SOURCE"], bannerDir, downloadDir + "/banner.tar.gz", "trunk", False, redownload=redownload)
        print >> sys.stderr, "Compiling BANNER with ANT"
        Tool.testPrograms("BANNER", ["ant"], {"ant":"ant -version"})
        # Run ANT with an argument list and an explicit working directory
        # instead of a concatenated shell string, so that paths containing
        # spaces or shell metacharacters cannot break (or alter) the command.
        antEnv = None # None makes the child inherit the current environment
        if javaHome is not None and javaHome.strip() != "":
            antEnv = dict(os.environ)
            antEnv["JAVA_HOME"] = javaHome
        try:
            exitCode = subprocess.call(["ant", "-f", "build_ext.xml"], cwd=bannerDir, env=antEnv)
        except OSError:
            # ANT missing or build directory not accessible. The build stays
            # best-effort, but the failure is no longer silent.
            print >> sys.stderr, "Warning, could not run ANT:", sys.exc_info()[1]
        else:
            if exitCode != 0:
                print >> sys.stderr, "Warning, ANT exited with code", exitCode
    else:
        print >> sys.stderr, "Downloading precompiled BANNER"
        Download.downloadAndExtract(Settings.URL["BANNER_COMPILED"], destDir + "/tools", downloadDir, redownload=redownload)
    Tool.finalizeInstall([], None, bannerDir, {"BANNER_DIR":bannerDir}, updateLocalSettings)
45
46
47
48
49
50
def makeConfigXML(workdir, bannerDir, oldVersion=True):
    """
    Write the BANNER configuration file into workdir and return its path.

    workdir -- directory that receives "banner_config.xml" and the empty
               placeholder files "empty.eval" and "output.txt". All of
               BANNER's input and output file names point into it.
    bannerDir -- BANNER installation directory; the model, lemmatiser data,
                 POS tagger data and dictionary are referenced under it.
    oldVersion -- if false, the mention type, overlap and dictionary tagger
                  settings are added as well.

    NOTE(review): the def line and the indentation of this function were
    missing from the reviewed source. The signature is reconstructed from the
    call makeConfigXML(workdir, bannerPath, oldVersion) in run(); the default
    of oldVersion, and the "tagging" section belonging to the
    "not oldVersion" branch, are assumptions to confirm.
    """
    def addSettings(parent, settings):
        # One child element per (tag, text) pair, in the given order
        for tag, value in settings:
            ET.SubElement(parent, tag).text = value

    conf = ET.Element("banner-configuration")
    banner = ET.SubElement(conf, "banner")
    evalElement = ET.SubElement(banner, "eval")
    addSettings(evalElement, [("datasetName", "banner.eval.dataset.BC2GMDataset")])

    dataset = ET.SubElement(evalElement, "dataset")
    addSettings(dataset, [
        ("sentenceFilename", workdir + "/input.txt"),
        ("mentionTestFilename", workdir + "/empty.eval"),
        ("mentionAlternateFilename", workdir + "/empty.eval"),
    ])
    # Both mention files refer to this empty placeholder
    codecs.open(os.path.join(workdir, "empty.eval"), "wt", "utf-8").close()

    addSettings(evalElement, [
        ("idInputFilename", workdir + "/ids.txt"),
        ("rawInputFilename", workdir + "/raw.txt"),
        ("trainingInputFilename", workdir + "/training.txt"),
        ("outputFilename", workdir + "/output.txt"),
        ("inContextAnalysisFilename", workdir + "/contextAnalysis.html"),
        ("mentionFilename", workdir + "/mention.txt"),
        ("modelFilename", bannerDir + "/output/model_BC2GM.bin"),
        ("lemmatiserDataDirectory", bannerDir + "/nlpdata/lemmatiser"),
        ("posTaggerDataDirectory", bannerDir + "/nlpdata/tagger"),
        ("posTagger", "dragon.nlp.tool.HeppleTagger"),
        ("tokenizer", "banner.tokenization.SimpleTokenizer"),
        ("useParenthesisPostProcessing", "true"),
        ("useLocalAbbreviationPostProcessing", "true"),
        ("useNumericNormalization", "true"),
        ("tagFormat", "IOB"),
        ("crfOrder", "2"),
    ])
    # The output file is created empty up front
    codecs.open(os.path.join(workdir, "output.txt"), "wt", "utf-8").close()

    if not oldVersion:
        addSettings(evalElement, [
            ("mentionTypes", "Required"),
            ("sameTypeOverlapOption", "Exception"),
            ("differentTypeOverlapOption", "Exception"),
            ("dictionaryTagger", "banner.tagging.dictionary.DictionaryTagger"),
        ])
        tagging = ET.SubElement(banner, "tagging")
        dictionary = ET.SubElement(tagging, "dictionary")
        dictionaryTagger = ET.SubElement(dictionary, "DictionaryTagger")
        addSettings(dictionaryTagger, [
            ("filterContainedMentions", "true"),
            ("normalizeMixedCase", "false"),
            ("normalizeDigits", "false"),
            ("canonize", "false"),
            ("generate2PartVariations", "true"),
            ("dropEndParentheticals", "false"),
            ("dictionaryFile", bannerDir + "/dict/single.txt"),
            ("dictionaryType", "GENE"),
        ])

    configPath = workdir + "/banner_config.xml"
    ETUtils.write(conf, configPath)
    return configPath
101
def makeEntityElements(beginOffset, endOffset, text, splitNewlines=False, elementName="entity"):
    """
    Build the XML element(s) for one BANNER entity.

    beginOffset, endOffset -- span of the entity in text; endOffset is
                              inclusive (the span is text[beginOffset:endOffset+1]).
    text -- the text the offsets refer to.
    splitNewlines -- if true, the span is cut at "\n" characters and each
                     non-blank piece becomes an element of its own.
    elementName -- tag of the created elements.

    Leading and trailing whitespace is left out of each element. The
    "charOffset" attribute is "begin-end" with an exclusive end, and "text" is
    the matching slice of text. When "charOffset" differs from the incoming
    "beginOffset-endOffset" string, the latter is kept in "origBANNEROffset".
    The "id" attribute is set to None here; the caller must assign the real id.

    Returns a list of elements, empty if the span is only whitespace.
    """
    originalOffset = str(beginOffset) + "-" + str(endOffset)
    span = text[beginOffset:endOffset + 1]
    if splitNewlines:
        pieces = span.split("\n")
    else:
        pieces = [span]

    elements = []
    pieceStart = beginOffset
    for piece in pieces:
        pieceEnd = pieceStart + len(piece)
        if piece.strip() != "":
            # Move both ends inwards past surrounding whitespace
            leading = len(piece) - len(piece.lstrip())
            trailing = len(piece) - len(piece.rstrip())
            trimmedStart = pieceStart + leading
            trimmedEnd = pieceEnd - trailing
            charOffset = str(trimmedStart) + "-" + str(trimmedEnd)

            element = ET.Element(elementName)
            element.set("id", None) # placeholder, the caller sets the id
            element.set("charOffset", charOffset)
            if charOffset != originalOffset:
                element.set("origBANNEROffset", originalOffset)
            element.set("type", "Protein")
            element.set("isName", "True")
            element.set("source", "BANNER")
            entityText = text[trimmedStart:trimmedEnd]
            element.set("text", entityText)
            assert entityText in text, (entityText, text)
            elements.append(element)
        # The next piece starts after the newline that ended this one
        pieceStart = pieceEnd + 1
    return elements
139
def fixOffset(origBannerEntity, bannerEntityText, begin, end, sentenceText, verbose=False):
    """
    Align an entity offset reported by BANNER with the sentence text.

    origBannerEntity -- the original BANNER output for the entity, used only
                        in messages.
    bannerEntityText -- the entity text as reported by BANNER.
    begin, end -- the reported offsets. Only begin is used for the alignment,
                  the end is recomputed from the length of bannerEntityText.
    sentenceText -- the text in which the entity is looked for.
    verbose -- if true, a message about the alignment is printed to stderr.

    If the text at begin is not the entity text, positions at increasing
    distance are tried, the later position first at each distance, until the
    entity text is found.

    Returns the tuple (begin, end) of the aligned span, with an inclusive end.
    Raises AssertionError if the entity text can't be found in sentenceText.
    """
    origEnd = end
    end = begin + len(bannerEntityText)
    assert len(sentenceText[begin:end]) == len(bannerEntityText), (origBannerEntity, sentenceText[begin:end], begin, end, sentenceText)
    slippage = 0
    if bannerEntityText != sentenceText[begin:end]:
        found = False
        # No match can be further away than the length of the text, so the
        # search is bounded and terminates also when the entity is missing.
        for distance in range(1, len(sentenceText) + 1):
            if sentenceText[begin+distance:end+distance] == bannerEntityText:
                slippage = distance
                found = True
                break
            # A negative slice start would wrap around to the end of the text
            # and could produce a false match with a negative offset.
            if begin - distance >= 0 and sentenceText[begin-distance:end-distance] == bannerEntityText:
                slippage = -distance
                found = True
                break
        if not found:
            raise AssertionError((origBannerEntity, bannerEntityText, sentenceText[begin:end], begin, end, sentenceText))
    if verbose:
        print >> sys.stderr, "Fixed BANNER entity,", str(origBannerEntity) + ", slippage", slippage, "end diff", origEnd - end
    return begin + slippage, end + slippage - 1
162
def _insertEntities(sentence, bannerId, entities, nextIndex, sentenceHasEntities, counts):
    """
    Append the elements made for one BANNER entity to a sentence element.

    Each element gets the id "<sentence id>.e<index>", with the index running
    from nextIndex. sentenceHasEntities and counts are updated in place.
    Returns the next free index.
    """
    counts["nonSplit"] += 1
    if len(entities) > 1:
        counts["split"] += 1
    for ent in entities:
        ent.set("id", sentence.get("id") + ".e" + str(nextIndex))
        sentence.append(ent)
        if not sentenceHasEntities[bannerId]:
            counts["sentences"] += 1
            sentenceHasEntities[bannerId] = True
        counts["total"] += 1
        nextIndex += 1
    return nextIndex

def run(input, output=None, elementName="entity", processElement="document", splitNewlines=False, debug=False, bannerPath=None, trovePath=None):
    """
    Run BANNER on a corpus and insert the detected entities into it.

    input -- the corpus, anything accepted by ETUtils.ETFromObj.
    output -- if not None, the modified corpus is written here.
    elementName -- tag of the inserted entity elements.
    processElement -- tag of the elements whose "text" attribute is given to
                      BANNER and which receive the entity elements.
    splitNewlines -- if true, entities are split at newlines.
    debug -- if true, the temporary work directory is not removed.
    bannerPath -- BANNER directory, defaults to Settings.BANNER_DIR.
    trovePath -- path to the Trove library, defaults to
                 Settings.JAVA_TROVE_PATH. Used only when the BANNER
                 installation has no "uima" entry in its lib directory.

    Returns the corpus ElementTree.
    """
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    # Write the text of each processed element as one line for BANNER and
    # remember which element each running id stands for
    workdir = tempfile.mkdtemp()
    if debug:
        print >> sys.stderr, "BANNER work directory at", workdir
    sDict = {}
    sentenceHasEntities = {}
    sCount = 0
    infile = codecs.open(os.path.join(workdir, "input.txt"), "wt", "utf-8")
    try:
        for sentence in corpusRoot.getiterator(processElement):
            bannerId = "U" + str(sCount)
            sDict[bannerId] = sentence
            sentenceHasEntities[bannerId] = False
            # Each text must stay on a single line. Line breaks are replaced
            # one-to-one with spaces so that character offsets are preserved.
            # NOTE(review): the original applied the "\n" replacement twice;
            # the second one is assumed to have been meant for "\r".
            infile.write(bannerId + " " + sentence.get("text").replace("\n", " ").replace("\r", " ") + "\n")
            sCount += 1
    finally:
        infile.close()

    # Define the classpath for java
    if bannerPath == None:
        bannerPath = Settings.BANNER_DIR
    libPath = "/lib/"
    assert os.path.exists(bannerPath + libPath + "banner.jar"), bannerPath
    # An "uima" entry in the lib directory marks the new BANNER version
    oldVersion = "uima" not in os.listdir(bannerPath + libPath)
    classPath = bannerPath + "/bin"
    classPath += ":" + bannerPath + libPath + "*"
    if oldVersion:
        if trovePath == None:
            trovePath = Settings.JAVA_TROVE_PATH
        assert os.path.exists(trovePath), trovePath
        classPath += ":" + trovePath
        print >> sys.stderr, "Trove library at", trovePath

    config = makeConfigXML(workdir, bannerPath, oldVersion)

    # Run BANNER in its own directory. The directory is given to the child
    # process only, so the working directory of this process never changes,
    # not even when BANNER fails.
    print >> sys.stderr, "Running BANNER", bannerPath
    if oldVersion:
        args = ["java", "-cp", classPath, "banner.eval.TestModel", config]
    else:
        args = ["java", "-cp", classPath, "banner.eval.BANNER", "test", config]
    print >> sys.stderr, "BANNER command:", " ".join(args)
    startTime = time.time()
    exitCode = subprocess.call(args, cwd=bannerPath)
    assert exitCode == 0, exitCode
    print >> sys.stderr, "BANNER time:", str(datetime.timedelta(seconds=time.time()-startTime))

    # Put sentences in dictionary
    counts = {"nonSplit":0, "split":0, "sentences":0, "total":0}

    print >> sys.stderr, "Inserting entities"
    if oldVersion:
        # Old output format: one line of "token|tag" items per input line,
        # with the matching ids one per line in ids.txt
        outfile = codecs.open(os.path.join(workdir, "output.txt"), "rt", "utf-8")
        idfile = codecs.open(os.path.join(workdir, "ids.txt"), "rt", "utf-8")
        try:
            for line in outfile:
                bannerId = idfile.readline().strip()
                sentence = sDict[bannerId]
                sText = sentence.get("text")
                searchFrom = 0
                entityCount = 0
                beginOffset = None
                prevEnd = None
                for split in line.strip().split():
                    tokenText, tag = split.rsplit("|", 1)
                    # Determine offsets by aligning the BANNER tokens to the original text
                    cStart = sText.find(tokenText, searchFrom)
                    assert cStart != -1, (tokenText, tag, sText, line)
                    cEnd = cStart + len(tokenText) - 1
                    searchFrom = cStart + len(tokenText)

                    if tag != "O":
                        if beginOffset == None:
                            beginOffset = cStart
                        prevEnd = cEnd
                    elif beginOffset != None:
                        # The first "O" token after tagged tokens ends the entity
                        entities = makeEntityElements(beginOffset, prevEnd, sText, splitNewlines, elementName)
                        assert len(entities) > 0
                        entityCount = _insertEntities(sentence, bannerId, entities, entityCount, sentenceHasEntities, counts)
                        beginOffset = None
                if beginOffset != None:
                    # An entity reaching the last token of the line has no
                    # following "O" token and would otherwise be dropped
                    entities = makeEntityElements(beginOffset, prevEnd, sText, splitNewlines, elementName)
                    assert len(entities) > 0
                    entityCount = _insertEntities(sentence, bannerId, entities, entityCount, sentenceHasEntities, counts)
        finally:
            outfile.close()
            idfile.close()
    else:
        # New output format: one "id|begin end|text" line per entity
        sentenceEntityCount = {}
        mentionfile = codecs.open(os.path.join(workdir, "mention.txt"), "rt", "utf-8")
        try:
            for line in mentionfile:
                # Split at the first two separators only, the entity text may contain "|"
                bannerId, offsets, word = line.strip().split("|", 2)
                offsets = offsets.split()
                sentence = sDict[bannerId]
                begin, end = fixOffset(line.strip(), word, int(offsets[0]), int(offsets[1]), sentence.get("text"))
                entities = makeEntityElements(begin, end, sentence.get("text"), splitNewlines, elementName)
                entityText = "\n".join([x.get("text") for x in entities])
                assert entityText == word, (entityText, word, bannerId, [begin, end], sentence.get("id"), sentence.get("text"))
                assert len(entities) > 0, (line.strip(), sentence.get("text"))
                sentenceEntityCount[bannerId] = _insertEntities(sentence, bannerId, entities, sentenceEntityCount.get(bannerId, 0), sentenceHasEntities, counts)
        finally:
            mentionfile.close()

    print >> sys.stderr, "BANNER found", counts["nonSplit"], "entities in", counts["sentences"], processElement + "-elements",
    # sCount is the number of all processed elements
    print >> sys.stderr, "(" + str(sCount - counts["sentences"]) + " have no entities)"
    print >> sys.stderr, "New", elementName + "-elements:", counts["total"], "(Split", counts["split"], "BANNER entities with newlines)"

    # Remove work directory
    if not debug:
        shutil.rmtree(workdir)
    else:
        print >> sys.stderr, "BANNER working directory for debugging at", workdir

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
346
if __name__=="__main__":
    # Command line interface: either installs BANNER (--install) or runs it
    # on the corpus given with -i.
    from optparse import OptionParser, OptionGroup

    # Use Psyco if it is available
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(description="BANNER named entity recognizer wrapper")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in Interaction XML format", metavar="FILE")
    optparser.add_option("--inputCorpusName", default="PMC11", dest="inputCorpusName", help="")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file in Interaction XML format.")
    optparser.add_option("-e", "--elementName", default="entity", dest="elementName", help="BANNER created element tag in Interaction XML")
    optparser.add_option("-p", "--processElement", default="sentence", dest="processElement", help="input element tag (usually \"sentence\" or \"document\")")
    optparser.add_option("-s", "--split", default=False, action="store_true", dest="splitNewlines", help="Split BANNER entities at newlines")
    optparser.add_option("--debug", default=False, action="store_true", dest="debug", help="Preserve temporary working directory")
    optparser.add_option("--pathBANNER", default=None, dest="pathBANNER", help="")
    optparser.add_option("--pathTrove", default=None, dest="pathTrove", help="")
    group = OptionGroup(optparser, "Install", "")
    group.add_option("--install", default=None, action="store_true", dest="install", help="Install BANNER")
    group.add_option("--installDir", default=None, dest="installDir", help="Install directory")
    group.add_option("--downloadDir", default=None, dest="downloadDir", help="Install files download directory")
    group.add_option("--javaHome", default=None, dest="javaHome", help="JAVA_HOME setting for ANT, used when compiling BANNER")
    group.add_option("--redownload", default=False, action="store_true", dest="redownload", help="Redownload install files")
    optparser.add_option_group(group)
    (options, args) = optparser.parse_args()

    if options.install:
        install(options.installDir, options.downloadDir, javaHome=options.javaHome, redownload=options.redownload)
    else:
        # Without an input the path checks below would fail with a traceback,
        # so report the missing option as a usage error instead
        if options.input is None:
            optparser.error("no input corpus defined (use -i/--input, or --install to install BANNER)")
        # A directory or a .tar.gz package is converted from the ST-format first
        if os.path.isdir(options.input) or options.input.endswith(".tar.gz"):
            print >> sys.stderr, "Converting ST-format"
            import STFormat.ConvertXML
            import STFormat.STTools
            options.input = STFormat.ConvertXML.toInteractionXML(STFormat.STTools.loadSet(options.input), options.inputCorpusName)
        print >> sys.stderr, "Running BANNER"
        run(input=options.input, output=options.output, elementName=options.elementName,
            processElement=options.processElement, splitNewlines=options.splitNewlines, debug=options.debug,
            bannerPath=options.pathBANNER, trovePath=options.pathTrove)
390