TEES.Tools.BANNER

1 parse__version__ = "$Revision: 1.3 $" 2 3 import sys,os 4 import time, datetime 5 import sys 6 try: 7 import xml.etree.cElementTree as ET 8 except ImportError: 9 import cElementTree as ET 10 sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/..") 11 import Utils.ElementTreeUtils as ETUtils 12 13 import shutil 14 import subprocess 15 import tempfile 16 import codecs 17 18 import Utils.Settings as Settings 19 import Utils.Download as Download 20 import Tool 21 #bannerDir = Settings.BANNER_DIR 22

23 -def test(progDir):

24 return True

25

def install(destDir=None, downloadDir=None, redownload=False, compile=False, javaHome=None, updateLocalSettings=False):
    """
    Download BANNER (precompiled, or as source to be built with ANT) and
    finalize the installation through Tool.finalizeInstall.

    destDir -- installation root, BANNER ends up in destDir + "/tools/BANNER"
               (defaults to Settings.DATAPATH)
    downloadDir -- where downloaded archives are stored
                   (defaults to "tools/download" under Settings.DATAPATH)
    redownload -- passed on to Download.downloadAndExtract
    compile -- if True, fetch the source package and build it with ANT
               instead of fetching the precompiled package
    javaHome -- optional JAVA_HOME for the ANT build, None or blank means
                the current environment is used unchanged
    updateLocalSettings -- passed on to Tool.finalizeInstall
    """
    print >> sys.stderr, "Installing BANNER"
    if downloadDir is None:
        downloadDir = os.path.join(Settings.DATAPATH, "tools/download")
    if destDir is None:
        destDir = Settings.DATAPATH
    bannerDir = destDir + "/tools/BANNER"
    if compile:
        Download.downloadAndExtract(Settings.URL["BANNER_SOURCE"], bannerDir, downloadDir + "/banner.tar.gz", "trunk", False, redownload=redownload)
        print >> sys.stderr, "Compiling BANNER with ANT"
        Tool.testPrograms("BANNER", ["ant"], {"ant":"ant -version"})
        # Run ANT with an argument list, a working directory and an explicit
        # environment instead of a concatenated shell string, so that paths
        # containing spaces or shell metacharacters are handled correctly.
        antEnv = None # None = inherit the current environment
        if javaHome is not None and javaHome.strip() != "":
            antEnv = dict(os.environ)
            antEnv["JAVA_HOME"] = javaHome #/usr/lib/jvm/java-6-openjdk
        exitCode = subprocess.call(["ant", "-f", "build_ext.xml"], cwd=bannerDir, env=antEnv)
        if exitCode != 0:
            # The build result was previously ignored; keep going as before, but say so
            print >> sys.stderr, "Warning, ANT exited with code", exitCode
    else:
        print >> sys.stderr, "Downloading precompiled BANNER"
        Download.downloadAndExtract(Settings.URL["BANNER_COMPILED"], destDir + "/tools", downloadDir, redownload=redownload)
    # NOTE(review): the source listing had lost its indentation; finalizing is
    # assumed to apply to both the compiled and the precompiled install.
    Tool.finalizeInstall([], None, bannerDir, {"BANNER_DIR":bannerDir}, updateLocalSettings)

45 46 # Newer versions of BANNER don't need trove 47 #print >> sys.stderr, "Downloading Java trove library" 48 #url = Settings.URL["BANNER_SOURCE"] 49 #Download.downloadAndExtract(url, destDir + "/tools/trove/", downloadDir) 50

def makeConfigXML(workdir, bannerDir, oldVersion=True):
    """
    Write the BANNER configuration file into workdir.

    Also creates the empty files "empty.eval" and "output.txt" in workdir,
    which are referenced from the configuration.

    workdir -- directory for BANNER's input, output and configuration files
    bannerDir -- BANNER installation directory (model, NLP data, dictionary)
    oldVersion -- if False, the overlap and mention type options are added

    Returns the path of the written "banner_config.xml".
    """
    def addSettings(parent, settings):
        # One child element per (tag, text) pair, in the given order
        for tag, value in settings:
            ET.SubElement(parent, tag).text = value

    conf = ET.Element("banner-configuration")
    banner = ET.SubElement(conf, "banner")
    evalElem = ET.SubElement(banner, "eval") # not called "eval", to leave the builtin alone
    addSettings(evalElem, [("datasetName", "banner.eval.dataset.BC2GMDataset")])
    # Dataset
    dataset = ET.SubElement(evalElem, "dataset")
    addSettings(dataset, [
        ("sentenceFilename", workdir + "/input.txt"),
        ("mentionTestFilename", workdir + "/empty.eval"),
        ("mentionAlternateFilename", workdir + "/empty.eval")])
    codecs.open(os.path.join(workdir, "empty.eval"), "wt", "utf-8").close()
    # More eval level stuff
    addSettings(evalElem, [
        ("idInputFilename", workdir + "/ids.txt"),
        ("rawInputFilename", workdir + "/raw.txt"),
        ("trainingInputFilename", workdir + "/training.txt"),
        ("outputFilename", workdir + "/output.txt")])
    codecs.open(os.path.join(workdir, "output.txt"), "wt", "utf-8").close()
    addSettings(evalElem, [
        ("inContextAnalysisFilename", workdir + "/contextAnalysis.html"),
        ("mentionFilename", workdir + "/mention.txt"),
        ("modelFilename", bannerDir + "/output/model_BC2GM.bin"),
        ("lemmatiserDataDirectory", bannerDir + "/nlpdata/lemmatiser"),
        ("posTaggerDataDirectory", bannerDir + "/nlpdata/tagger"),
        ("posTagger", "dragon.nlp.tool.HeppleTagger"),
        ("tokenizer", "banner.tokenization.SimpleTokenizer"),
        ("useParenthesisPostProcessing", "true"),
        ("useLocalAbbreviationPostProcessing", "true"),
        ("useNumericNormalization", "true"),
        ("tagFormat", "IOB"),
        ("crfOrder", "2")])
    if not oldVersion:
        # NOTE(review): the source listing had lost its indentation; these three
        # options are assumed to be the whole body of this condition.
        addSettings(evalElem, [
            ("mentionTypes", "Required"),
            ("sameTypeOverlapOption", "Exception"),
            ("differentTypeOverlapOption", "Exception")])
    addSettings(evalElem, [("dictionaryTagger", "banner.tagging.dictionary.DictionaryTagger")])
    # End eval element
    tagging = ET.SubElement(banner, "tagging")
    dictionary = ET.SubElement(tagging, "dictionary")
    dictionaryTagger = ET.SubElement(dictionary, "DictionaryTagger")
    addSettings(dictionaryTagger, [
        ("filterContainedMentions", "true"),
        ("normalizeMixedCase", "false"),
        ("normalizeDigits", "false"),
        ("canonize", "false"),
        ("generate2PartVariations", "true"),
        ("dropEndParentheticals", "false"),
        ("dictionaryFile", bannerDir + "/dict/single.txt"),
        ("dictionaryType", "GENE")])
    # Write to file
    configPath = workdir + "/banner_config.xml"
    ETUtils.write(conf, configPath)
    return configPath

101

def makeEntityElements(beginOffset, endOffset, text, splitNewlines=False, elementName="entity"):
    """
    Build the XML element(s) for one BANNER mention located in text.

    beginOffset, endOffset -- span of the mention in text, endOffset is
                              inclusive (the span is text[beginOffset:endOffset+1])
    text -- the text the offsets refer to
    splitNewlines -- if True, the span is cut at "\\n" characters and each
                     non-blank piece becomes its own element
    elementName -- tag of the created elements

    Leading and trailing whitespace is excluded from each element. The
    "charOffset" attribute is written as begin-end where end is the slice end
    used for the "text" attribute. If it differs from the incoming
    "beginOffset-endOffset" string, that string is kept as "origBANNEROffset".

    NOTE! The "id" attribute is set to None here, the caller must define it.

    Returns a list of elements, empty if the span has only whitespace.
    """
    sourceSpan = "-".join((str(beginOffset), str(endOffset)))
    covered = text[beginOffset:endOffset + 1]
    pieces = [covered]
    if splitNewlines:
        pieces = covered.split("\n") # TODO should support also other newlines
    created = []
    pieceStart = beginOffset
    for piece in pieces:
        pieceEnd = pieceStart + len(piece)
        if piece.strip() != "":
            # Shrink the span so that it excludes surrounding whitespace
            trimmedStart = pieceStart + (len(piece) - len(piece.lstrip()))
            trimmedEnd = pieceEnd - (len(piece) - len(piece.rstrip()))
            trimmedSpan = str(trimmedStart) + "-" + str(trimmedEnd)
            mentionText = text[trimmedStart:trimmedEnd]
            element = ET.Element(elementName)
            element.set("id", None) # this should crash the XML writing, if id isn't later redefined
            element.set("charOffset", trimmedSpan)
            if trimmedSpan != sourceSpan:
                element.set("origBANNEROffset", sourceSpan)
            for key, value in (("type", "Protein"), ("isName", "True"), ("source", "BANNER")):
                element.set(key, value)
            element.set("text", mentionText)
            assert mentionText in text, (mentionText, text)
            created.append(element)
        pieceStart = pieceEnd + 1 # +1 for the newline
    return created

139

def fixOffset(origBannerEntity, bannerEntityText, begin, end, sentenceText, verbose=False):
    """
    Locate the real position of a BANNER mention in the sentence text.

    The BANNER offsets appear to refer to text from which all whitespace has
    been removed, so the mention is searched for by sliding the window
    [begin, begin + len(bannerEntityText)) forwards and backwards, one
    character further on each round, the forward position being tried first.

    origBannerEntity -- the original BANNER output line, used only in messages
    bannerEntityText -- the text of the mention
    begin -- begin offset reported by BANNER
    end -- end offset reported by BANNER, used only in the verbose message
    sentenceText -- the text in which the mention is searched for
    verbose -- if True, the result is reported to stderr

    Returns a (begin, end) tuple, where end is inclusive.
    Raises AssertionError if the mention is not in the sentence.
    """
    origEnd = end
    end = begin + len(bannerEntityText) # the end offset seems random, let's take the length from the begin-one
    assert len(sentenceText[begin:end]) == len(bannerEntityText), (origBannerEntity, sentenceText[begin:end], begin, end, sentenceText)
    slippage = 0
    found = sentenceText[begin:end] == bannerEntityText
    if not found:
        # A shift longer than the sentence cannot match, so the search is bounded
        # and a missing mention ends in the assertion below instead of a hang.
        for distance in range(1, len(sentenceText) + 1):
            if sentenceText[begin + distance:end + distance] == bannerEntityText:
                slippage = distance
                found = True
                break
            # A negative slice index would wrap around to the end of the string
            # and produce a bogus match, so only real positions are tried.
            if begin - distance >= 0 and sentenceText[begin - distance:end - distance] == bannerEntityText:
                slippage = -distance
                found = True
                break
    assert found, (origBannerEntity, bannerEntityText, sentenceText[begin:end], begin, end, sentenceText)
    if verbose:
        print >> sys.stderr, "Fixed BANNER entity,", str(origBannerEntity) + ", slippage", slippage, "end diff", origEnd - end
    return begin + slippage, end + slippage - 1

162

def _attachEntities(sentence, bannerId, entities, nextIndex, sentenceHasEntities, counts):
    """
    Give the elements of one BANNER mention their ids, append them to the
    sentence and update the statistics. Returns the next free entity index.
    """
    counts["mentions"] += 1
    if len(entities) > 1:
        counts["split"] += 1
    for ent in entities:
        ent.set("id", sentence.get("id") + ".e" + str(nextIndex))
        sentence.append(ent)
        if not sentenceHasEntities[bannerId]:
            counts["sentences"] += 1
            sentenceHasEntities[bannerId] = True
        counts["elements"] += 1
        nextIndex += 1
    return nextIndex

def _insertTaggedOutput(workdir, sDict, sentenceHasEntities, counts, splitNewlines, elementName):
    """
    Read the token|tag lines of "output.txt" (with the matching ids in
    "ids.txt") and add an element for each run of tokens not tagged "O".
    """
    outfile = codecs.open(os.path.join(workdir, "output.txt"), "rt", "utf-8")
    try:
        idfile = codecs.open(os.path.join(workdir, "ids.txt"), "rt", "utf-8")
        try:
            for line in outfile:
                bannerId = idfile.readline().strip()
                sentence = sDict[bannerId]
                sText = sentence.get("text")
                start = 0
                entityCount = 0
                beginOffset = None
                prevEnd = None
                for split in line.strip().split():
                    tokenText, tag = split.rsplit("|", 1)
                    # Determine offsets by aligning BANNER-generated tokens to original text
                    cStart = sText.find(tokenText, start)
                    assert cStart != -1, (tokenText, tag, sText, line)
                    cEnd = cStart + len(tokenText) - 1
                    start = cStart + len(tokenText)
                    if tag == "O":
                        if beginOffset is not None: # the previous token ended an entity
                            entities = makeEntityElements(beginOffset, prevEnd, sText, splitNewlines, elementName)
                            assert len(entities) > 0
                            entityCount = _attachEntities(sentence, bannerId, entities, entityCount, sentenceHasEntities, counts)
                            beginOffset = None
                    elif beginOffset is None:
                        beginOffset = cStart
                    prevEnd = cEnd
                if beginOffset is not None:
                    # The line ended inside an entity. Without this the last
                    # entity of such a line would be silently dropped.
                    entities = makeEntityElements(beginOffset, prevEnd, sText, splitNewlines, elementName)
                    assert len(entities) > 0
                    entityCount = _attachEntities(sentence, bannerId, entities, entityCount, sentenceHasEntities, counts)
        finally:
            idfile.close()
    finally:
        outfile.close()

def _insertMentions(workdir, sDict, sentenceHasEntities, counts, splitNewlines, elementName):
    """
    Read the id|offsets|text lines of "mention.txt" and add an element for
    each mention, after correcting its offsets with fixOffset.
    """
    sentenceEntityCount = {}
    mentionfile = codecs.open(os.path.join(workdir, "mention.txt"), "rt", "utf-8")
    try:
        for line in mentionfile:
            # At most two splits, so that a "|" inside the mention text is kept
            bannerId, offsets, word = line.strip().split("|", 2)
            offsets = offsets.split()
            sentence = sDict[bannerId]
            sText = sentence.get("text")
            offsets[0], offsets[1] = fixOffset(line.strip(), word, int(offsets[0]), int(offsets[1]), sText)
            entities = makeEntityElements(int(offsets[0]), int(offsets[1]), sText, splitNewlines, elementName)
            entityText = "\n".join([x.get("text") for x in entities])
            assert entityText == word, (entityText, word, bannerId, offsets, sentence.get("id"), sText)
            assert len(entities) > 0, (line.strip(), sText)
            sentenceEntityCount[bannerId] = _attachEntities(sentence, bannerId, entities, sentenceEntityCount.get(bannerId, 0), sentenceHasEntities, counts)
    finally:
        mentionfile.close()

def run(input, output=None, elementName="entity", processElement="document", splitNewlines=False, debug=False, bannerPath=None, trovePath=None):
    """
    Run BANNER on the texts of a corpus and insert the detected mentions
    into the corpus as new elements.

    input -- the corpus, anything accepted by ETUtils.ETFromObj
    output -- if not None, the modified corpus is written to this file
    elementName -- tag of the elements created for the mentions
    processElement -- tag of the elements whose "text" attribute is processed
    splitNewlines -- if True, mentions are cut at newlines (see makeEntityElements)
    debug -- if True, the temporary work directory is not removed
    bannerPath -- BANNER directory (defaults to Settings.BANNER_DIR)
    trovePath -- trove library, used only when the BANNER lib directory has no
                 "uima" entry (defaults to Settings.JAVA_TROVE_PATH)

    Returns the corpus ElementTree.
    Raises AssertionError if BANNER is not found, exits with a non-zero
    code, or its output cannot be aligned with the corpus text.
    """
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    # Write text to input file and put sentences in dictionary
    workdir = tempfile.mkdtemp()
    if debug:
        print >> sys.stderr, "BANNER work directory at", workdir
    sDict = {}
    sentenceHasEntities = {}
    sCount = 0
    infile = codecs.open(os.path.join(workdir, "input.txt"), "wt", "utf-8")
    try:
        for sentence in corpusRoot.getiterator(processElement):
            bannerId = "U" + str(sCount)
            # BANNER separates its input on newlines. Replacing all \r and \n
            # characters with spaces preserves the character offsets.
            infile.write(bannerId + " " + sentence.get("text").replace("\n", " ").replace("\r", " ") + "\n")
            sDict[bannerId] = sentence
            sentenceHasEntities[bannerId] = False
            sCount += 1
    finally:
        infile.close()

    # Define classpath for java
    if bannerPath is None:
        bannerPath = Settings.BANNER_DIR
    libPath = "/lib/"
    assert os.path.exists(bannerPath + libPath + "banner.jar"), bannerPath
    # A "uima" entry in the lib directory selects the newer BANNER interface
    oldVersion = "uima" not in os.listdir(bannerPath + libPath)
    classPath = bannerPath + "/bin" + ":" + bannerPath + libPath + "*"
    if oldVersion:
        if trovePath is None:
            trovePath = Settings.JAVA_TROVE_PATH
        assert os.path.exists(trovePath), trovePath
        classPath += ":" + trovePath # ":/usr/share/java/trove.jar"
        print >> sys.stderr, "Trove library at", trovePath

    config = makeConfigXML(workdir, bannerPath, oldVersion)

    # Run parser
    print >> sys.stderr, "Running BANNER", bannerPath
    if oldVersion: # old version
        args = ["java", "-cp", classPath, "banner.eval.TestModel", config]
    else:
        args = ["java", "-cp", classPath, "banner.eval.BANNER", "test", config]
    print >> sys.stderr, "BANNER command:", " ".join(args)
    startTime = time.time()
    cwd = os.getcwd()
    os.chdir(bannerPath)
    try:
        exitCode = subprocess.call(args)
    finally:
        os.chdir(cwd) # restore the working directory also if BANNER can't be started
    assert exitCode == 0, exitCode
    print >> sys.stderr, "BANNER time:", str(datetime.timedelta(seconds=time.time()-startTime))

    # Read BANNER results
    counts = {"mentions":0, "split":0, "sentences":0, "elements":0}
    print >> sys.stderr, "Inserting entities"
    if oldVersion:
        _insertTaggedOutput(workdir, sDict, sentenceHasEntities, counts, splitNewlines, elementName)
    else:
        _insertMentions(workdir, sDict, sentenceHasEntities, counts, splitNewlines, elementName)

    print >> sys.stderr, "BANNER found", counts["mentions"], "entities in", counts["sentences"], processElement + "-elements",
    print >> sys.stderr, "(" + str(sCount - counts["sentences"]) + " have no entities)"
    print >> sys.stderr, "New", elementName + "-elements:", counts["elements"], "(Split", counts["split"], "BANNER entities with newlines)"

    # Remove work directory
    if not debug:
        shutil.rmtree(workdir)
    else:
        print >> sys.stderr, "BANNER working directory for debugging at", workdir

    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree

if __name__=="__main__":
    # Command line interface: runs BANNER on a corpus, or installs BANNER
    import sys

    from optparse import OptionParser, OptionGroup
    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(description="BANNER named entity recognizer wrapper")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in Interaction XML format", metavar="FILE")
    optparser.add_option("--inputCorpusName", default="PMC11", dest="inputCorpusName", help="")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file in Interaction XML format.")
    optparser.add_option("-e", "--elementName", default="entity", dest="elementName", help="BANNER created element tag in Interaction XML")
    optparser.add_option("-p", "--processElement", default="sentence", dest="processElement", help="input element tag (usually \"sentence\" or \"document\")")
    optparser.add_option("-s", "--split", default=False, action="store_true", dest="splitNewlines", help="Split BANNER entities at newlines")
    optparser.add_option("--debug", default=False, action="store_true", dest="debug", help="Preserve temporary working directory")
    optparser.add_option("--pathBANNER", default=None, dest="pathBANNER", help="")
    optparser.add_option("--pathTrove", default=None, dest="pathTrove", help="")
    group = OptionGroup(optparser, "Install", "")
    group.add_option("--install", default=None, action="store_true", dest="install", help="Install BANNER")
    group.add_option("--installDir", default=None, dest="installDir", help="Install directory")
    group.add_option("--downloadDir", default=None, dest="downloadDir", help="Install files download directory")
    group.add_option("--javaHome", default=None, dest="javaHome", help="JAVA_HOME setting for ANT, used when compiling BANNER")
    group.add_option("--redownload", default=False, action="store_true", dest="redownload", help="Redownload install files")
    optparser.add_option_group(group)
    (options, args) = optparser.parse_args()

    if not options.install:
        if options.input is None:
            # Without this the missing option shows up as an exception from the
            # path checks below. error() prints the usage message and exits.
            optparser.error("No input corpus defined (use -i/--input)")
        if os.path.isdir(options.input) or options.input.endswith(".tar.gz"):
            print >> sys.stderr, "Converting ST-format"
            import STFormat.ConvertXML
            import STFormat.STTools
            options.input = STFormat.ConvertXML.toInteractionXML(STFormat.STTools.loadSet(options.input), options.inputCorpusName)
        print >> sys.stderr, "Running BANNER"
        run(input=options.input, output=options.output, elementName=options.elementName,
            processElement=options.processElement, splitNewlines=options.splitNewlines, debug=options.debug,
            bannerPath=options.pathBANNER, trovePath=options.pathTrove)
    else:
        install(options.installDir, options.downloadDir, javaHome=options.javaHome, redownload=options.redownload)

Source Code for Module TEES.Tools.BANNER