  1  parse__version__ = "$Revision: 1.3 $" 
  3  import sys,os 
  4  import time, datetime 
  5  import sys 
  6  try: 
  7      import xml.etree.cElementTree as ET 
  8  except ImportError: 
  9      import cElementTree as ET 
 10  sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/..") 
 11  import Utils.ElementTreeUtils as ETUtils 
 13  import shutil 
 14  import subprocess 
 15  import tempfile 
 16  import codecs 
 18  import Utils.Settings as Settings 
 19  import Utils.Download as Download 
 20  import Tool 
 21  #bannerDir = Settings.BANNER_DIR 
def test(progDir):
    """
    Report whether the BANNER installation in ``progDir`` is usable.

    No check is actually performed: ``progDir`` is ignored and the result
    is always ``True``.
    """
    return True
def install(destDir=None, downloadDir=None, redownload=False, compile=False, javaHome=None, updateLocalSettings=False):
    """
    Download BANNER (optionally building it from source) and register it.

    destDir -- installation root, BANNER ends up in destDir + "/tools/BANNER"
               (default: Settings.DATAPATH)
    downloadDir -- where downloaded archives are stored
                   (default: "tools/download" under Settings.DATAPATH)
    redownload -- passed on to Download.downloadAndExtract
    compile -- if True, fetch the source distribution and build it with ANT,
               otherwise fetch the precompiled distribution
    javaHome -- JAVA_HOME for the ANT build, ignored when None or blank
    updateLocalSettings -- passed on to Tool.finalizeInstall
    """
    print >> sys.stderr, "Installing BANNER"
    if downloadDir is None:
        downloadDir = os.path.join(Settings.DATAPATH, "tools/download")
    if destDir is None:
        destDir = Settings.DATAPATH
    bannerDir = destDir + "/tools/BANNER"
    if compile:
        Download.downloadAndExtract(Settings.URL["BANNER_SOURCE"], bannerDir, downloadDir + "/banner.tar.gz", "trunk", False, redownload=redownload)
        print >> sys.stderr, "Compiling BANNER with ANT"
        Tool.testPrograms("BANNER", ["ant"], {"ant":"ant -version"})
        # Run ANT directly in the BANNER directory instead of through a
        # "cd ...; ant ..." shell string: paths with spaces or shell
        # metacharacters work, and a missing directory raises an error
        # instead of letting ANT run in the wrong place.
        antEnv = None # None makes the child inherit the current environment
        if javaHome is not None and javaHome.strip() != "":
            #/usr/lib/jvm/java-6-openjdk
            antEnv = dict(os.environ)
            antEnv["JAVA_HOME"] = os.path.expanduser(javaHome)
        exitCode = subprocess.call(["ant", "-f", "build_ext.xml"], cwd=bannerDir, env=antEnv)
        if exitCode != 0:
            # The installation still continues, as before, but a failed build is no longer silent
            print >> sys.stderr, "Warning, ANT build exited with code", exitCode
    else:
        print >> sys.stderr, "Downloading precompiled BANNER"
        Download.downloadAndExtract(Settings.URL["BANNER_COMPILED"], destDir + "/tools", downloadDir, redownload=redownload)
    Tool.finalizeInstall([], None, bannerDir, {"BANNER_DIR":bannerDir}, updateLocalSettings)

    # Newer versions of BANNER don't need trove
    #print >> sys.stderr, "Downloading Java trove library"
    #url = Settings.URL["BANNER_SOURCE"]
    #Download.downloadAndExtract(url, destDir + "/tools/trove/", downloadDir)
def makeConfigXML(workdir, bannerDir, oldVersion=True):
    """
    Write the BANNER configuration file into workdir and return its path.

    workdir -- directory that receives "banner_config.xml"; all input and
               output file names in the configuration point into it
    bannerDir -- BANNER installation directory, used for the model, the NLP
                 data directories and the dictionary file
    oldVersion -- when False, three additional settings (mentionTypes and
                  the two overlap options) are written

    As a side effect "empty.eval" and "output.txt" are created in workdir
    as empty files (existing files are truncated).
    """
    def addSettings(parent, settings):
        # One child element per (tag, text) pair, in the given order
        for tag, value in settings:
            ET.SubElement(parent, tag).text = value

    root = ET.Element("banner-configuration")
    bannerElem = ET.SubElement(root, "banner")

    # The eval element: dataset first, then the eval level settings
    evalElem = ET.SubElement(bannerElem, "eval")
    addSettings(evalElem, [("datasetName", "banner.eval.dataset.BC2GMDataset")])
    addSettings(ET.SubElement(evalElem, "dataset"), [
        ("sentenceFilename", workdir + "/input.txt"),
        ("mentionTestFilename", workdir + "/empty.eval"),
        ("mentionAlternateFilename", workdir + "/empty.eval"),
    ])
    evalSettings = [
        ("idInputFilename", workdir + "/ids.txt"),
        ("rawInputFilename", workdir + "/raw.txt"),
        ("trainingInputFilename", workdir + "/training.txt"),
        ("outputFilename", workdir + "/output.txt"),
        ("inContextAnalysisFilename", workdir + "/contextAnalysis.html"),
        ("mentionFilename", workdir + "/mention.txt"),
        ("modelFilename", bannerDir + "/output/model_BC2GM.bin"),
        ("lemmatiserDataDirectory", bannerDir + "/nlpdata/lemmatiser"),
        ("posTaggerDataDirectory", bannerDir + "/nlpdata/tagger"),
        ("posTagger", "dragon.nlp.tool.HeppleTagger"),
        ("tokenizer", "banner.tokenization.SimpleTokenizer"),
        ("useParenthesisPostProcessing", "true"),
        ("useLocalAbbreviationPostProcessing", "true"),
        ("useNumericNormalization", "true"),
        ("tagFormat", "IOB"),
        ("crfOrder", "2"),
    ]
    if not oldVersion:
        evalSettings.extend([
            ("mentionTypes", "Required"),
            ("sameTypeOverlapOption", "Exception"),
            ("differentTypeOverlapOption", "Exception"),
        ])
    evalSettings.append(("dictionaryTagger", "banner.tagging.dictionary.DictionaryTagger"))
    addSettings(evalElem, evalSettings)

    # Files named in the configuration that are created empty up front
    for emptyName in ("empty.eval", "output.txt"):
        codecs.open(os.path.join(workdir, emptyName), "wt", "utf-8").close()

    # The tagging element with the dictionary tagger settings
    taggingElem = ET.SubElement(bannerElem, "tagging")
    dictionaryElem = ET.SubElement(taggingElem, "dictionary")
    addSettings(ET.SubElement(dictionaryElem, "DictionaryTagger"), [
        ("filterContainedMentions", "true"),
        ("normalizeMixedCase", "false"),
        ("normalizeDigits", "false"),
        ("canonize", "false"),
        ("generate2PartVariations", "true"),
        ("dropEndParentheticals", "false"),
        ("dictionaryFile", bannerDir + "/dict/single.txt"),
        ("dictionaryType", "GENE"),
    ])

    # Write to file
    configPath = workdir + "/banner_config.xml"
    ETUtils.write(root, configPath)
    return configPath
def makeEntityElements(beginOffset, endOffset, text, splitNewlines=False, elementName="entity"):
    """
    Build the elements for one BANNER mention.

    beginOffset, endOffset -- span of the mention in text, endOffset is the
                              index of the last character (inclusive)
    text -- the text the offsets refer to
    splitNewlines -- if True, the span is cut at every "\\n" and each piece
                     becomes its own element
    elementName -- tag of the created elements

    Returns a list of elements, one per piece that is not pure whitespace.
    Leading and trailing whitespace is trimmed from each piece. The
    "charOffset" attribute is "begin-end" with end one past the last
    character; when it differs from the incoming "beginOffset-endOffset"
    string the latter is stored as "origBANNEROffset".

    NOTE! The "id" attribute is set to None here and must be defined by the
    caller, otherwise writing the XML should fail.
    """
    reportedSpan = "-".join([str(beginOffset), str(endOffset)])
    covered = text[beginOffset:endOffset + 1]
    if splitNewlines:
        pieces = covered.split("\n") # TODO should support also other newlines
    else:
        pieces = [covered]

    created = []
    pieceBegin = beginOffset
    for piece in pieces:
        pieceEnd = pieceBegin + len(piece)
        if piece.strip() != "":
            # Shrink the span so that it excludes surrounding whitespace
            trimmedBegin = pieceBegin + (len(piece) - len(piece.lstrip()))
            trimmedEnd = pieceEnd - (len(piece) - len(piece.rstrip()))
            trimmedSpan = "-".join([str(trimmedBegin), str(trimmedEnd)])
            element = ET.Element(elementName)
            element.set("id", None) # placeholder, see the note in the docstring
            element.set("charOffset", trimmedSpan)
            if trimmedSpan != reportedSpan:
                element.set("origBANNEROffset", reportedSpan)
            for key, value in (("type", "Protein"),
                               ("isName", "True"),
                               ("source", "BANNER"),
                               ("text", text[trimmedBegin:trimmedEnd])):
                element.set(key, value)
            assert element.get("text") in text, (element.get("text"), text)
            created.append(element)
        pieceBegin = pieceEnd + 1 # +1 skips the newline between two pieces
    return created
def fixOffset(origBannerEntity, bannerEntityText, begin, end, sentenceText, verbose=False):
    """
    Find the real position of a BANNER mention in the sentence text.

    The BANNER offsets appear to refer to text from which all whitespace has
    been removed, so begin is only used as the starting point of a search:
    windows of len(bannerEntityText) characters are compared at growing
    distances from begin, the forward window before the backward one, until
    one of them equals bannerEntityText.

    origBannerEntity -- the original BANNER output, used only in messages
    bannerEntityText -- text of the mention
    begin -- begin offset reported by BANNER
    end -- end offset reported by BANNER, used only for the verbose message
    sentenceText -- text in which the mention is searched
    verbose -- if True, the result is reported to stderr

    Returns (begin, end) of the mention in sentenceText, where end is the
    index of the last character. Raises AssertionError if the window at
    begin does not fit into sentenceText or if the mention is not found.
    """
    origEnd = end
    end = begin + len(bannerEntityText) # the end offset seems random, let's take the length from the begin-one
    assert len(sentenceText[begin:end]) == len(bannerEntityText), (origBannerEntity, sentenceText[begin:end], begin, end, sentenceText)
    slippage = 0
    found = sentenceText[begin:end] == bannerEntityText
    # Beyond this distance neither window fits inside sentenceText, so the
    # search can stop instead of looping forever on a missing mention
    maxDistance = max(begin, len(sentenceText) - end)
    distance = 0
    while not found and distance < maxDistance:
        distance += 1
        if sentenceText[begin + distance:end + distance] == bannerEntityText:
            slippage = distance
            found = True
        # A negative start index would make the slice wrap around to the end
        # of the text, so the backward window is only tried while it is valid
        elif begin - distance >= 0 and sentenceText[begin - distance:end - distance] == bannerEntityText:
            slippage = -distance
            found = True
    assert found, (origBannerEntity, bannerEntityText, sentenceText[begin:end], begin, end, sentenceText)
    if verbose:
        print >> sys.stderr, "Fixed BANNER entity,", str(origBannerEntity) + ", slippage", slippage, "end diff", origEnd - end
    return begin + slippage, end + slippage - 1
def run(input, output=None, elementName="entity", processElement="document", splitNewlines=False, debug=False, bannerPath=None, trovePath=None):
    """
    Run BANNER on a corpus and add the names it finds as new elements.

    input -- the corpus, loaded with ETUtils.ETFromObj
    output -- if not None, the processed corpus is written here
    elementName -- tag of the elements created for the BANNER entities
    processElement -- tag of the elements whose "text" attribute is tagged;
                      the new elements are appended to these elements
    splitNewlines -- passed to makeEntityElements, splits entities at newlines
    debug -- if True, the temporary work directory is kept
    bannerPath -- BANNER directory (default: Settings.BANNER_DIR)
    trovePath -- trove library for the classpath of old BANNER versions
                 (default: Settings.JAVA_TROVE_PATH)

    A BANNER installation without a "uima" entry in its lib directory is
    treated as an old version: the trove library is added to the classpath,
    "banner.eval.TestModel" is run and "output.txt"/"ids.txt" are parsed.
    Otherwise "banner.eval.BANNER test" is run and "mention.txt" is parsed.

    Returns the corpus ElementTree. Raises AssertionError if banner.jar or
    the trove library is missing, if BANNER exits with a non-zero code or
    if its output cannot be aligned with the text.
    """
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    # Write text to input file, one processed element per line. The "U<n>"
    # ids given here are used later to map the BANNER results back.
    workdir = tempfile.mkdtemp()
    if debug:
        print >> sys.stderr, "BANNER work directory at", workdir
    sDict = {}
    infile = codecs.open(os.path.join(workdir, "input.txt"), "wt", "utf-8")
    try:
        for sentence in corpusRoot.getiterator(processElement):
            bannerId = "U" + str(len(sDict))
            sDict[bannerId] = sentence
            # Both kinds of line break characters are replaced with a space so
            # that the text stays on one line and the character offsets are kept
            infile.write(bannerId + " " + sentence.get("text").replace("\r", " ").replace("\n", " ") + "\n")
    finally:
        infile.close()
    sCount = len(sDict)

    # Define classpath for java
    if bannerPath is None:
        bannerPath = Settings.BANNER_DIR
    libPath = "/lib/"
    assert os.path.exists(bannerPath + libPath + "banner.jar"), bannerPath
    oldVersion = "uima" not in os.listdir(bannerPath + libPath)
    classPath = bannerPath + "/bin"
    classPath += ":" + bannerPath + libPath + "*"
    if oldVersion:
        if trovePath is None:
            trovePath = Settings.JAVA_TROVE_PATH
        assert os.path.exists(trovePath), trovePath
        classPath += ":" + trovePath # ":/usr/share/java/trove.jar"
        print >> sys.stderr, "Trove library at", trovePath

    config = makeConfigXML(workdir, bannerPath, oldVersion)

    # Run parser
    print >> sys.stderr, "Running BANNER", bannerPath
    if oldVersion: # old version
        args = ["java", "-cp", classPath, "banner.eval.TestModel", config]
    else:
        args = ["java", "-cp", classPath, "banner.eval.BANNER", "test", config]
    print >> sys.stderr, "BANNER command:", " ".join(args)
    cwd = os.getcwd()
    os.chdir(bannerPath)
    startTime = time.time()
    try:
        exitCode = subprocess.call(args)
    finally:
        os.chdir(cwd) # restore the working directory also when BANNER cannot be run
    assert exitCode == 0, exitCode
    print >> sys.stderr, "BANNER time:", str(datetime.timedelta(seconds=time.time()-startTime))

    counts = {"nonSplit":0, "splitEvents":0, "totalEntities":0}
    elementCounts = {} # BANNER id -> number of elements added to that sentence

    def addEntities(bannerId, sentence, entities):
        # Give ids to the elements of one BANNER entity, append them to the
        # sentence and update the statistics
        counts["nonSplit"] += 1
        if len(entities) > 1:
            counts["splitEvents"] += 1
        for ent in entities:
            index = elementCounts.get(bannerId, 0)
            ent.set("id", sentence.get("id") + ".e" + str(index))
            sentence.append(ent)
            elementCounts[bannerId] = index + 1
            counts["totalEntities"] += 1

    # TODO: mention.txt appears to contain predicted entities directly
    # To be able to feed BANNER documents (or poorly chopped sentences)
    # one should probably remove newlines, as BANNER separates its input
    # on newlines. Replacing all \r and \n characters should preserve the
    # character offsets.

    # Read BANNER results
    print >> sys.stderr, "Inserting entities"
    if oldVersion:
        outfile = codecs.open(os.path.join(workdir, "output.txt"), "rt", "utf-8")
        idfile = codecs.open(os.path.join(workdir, "ids.txt"), "rt", "utf-8")
        try:
            # Add output to sentences, the n:th output line belongs to the n:th id
            for line in outfile:
                bannerId = idfile.readline().strip()
                sentence = sDict[bannerId]
                sText = sentence.get("text")
                start = 0
                beginOffset = None
                prevEnd = None
                for split in line.strip().split():
                    tokenText, tag = split.rsplit("|", 1)
                    # Determine offsets by aligning BANNER-generated tokens to original text
                    cStart = sText.find(tokenText, start)
                    assert cStart != -1, (tokenText, tag, sText, line)
                    cEnd = cStart + len(tokenText) - 1
                    start = cStart + len(tokenText)
                    if tag == "O":
                        if beginOffset is not None:
                            entities = makeEntityElements(beginOffset, prevEnd, sText, splitNewlines, elementName)
                            assert len(entities) > 0
                            addEntities(bannerId, sentence, entities)
                            beginOffset = None
                    else:
                        # NOTE(review): consecutive entity tokens are always joined, also
                        # when the tag would start a new entity -- confirm this is intended
                        if beginOffset is None:
                            beginOffset = cStart
                        prevEnd = cEnd
                if beginOffset is not None:
                    # The line ended inside an entity: without this the
                    # entity covering the last token would be lost
                    entities = makeEntityElements(beginOffset, prevEnd, sText, splitNewlines, elementName)
                    assert len(entities) > 0
                    addEntities(bannerId, sentence, entities)
        finally:
            outfile.close()
            idfile.close()
    else:
        mentionfile = codecs.open(os.path.join(workdir, "mention.txt"), "rt", "utf-8")
        try:
            for line in mentionfile:
                # Split at the first two separators only, the entity text may contain a "|"
                bannerId, offsets, word = line.strip().split("|", 2)
                offsets = offsets.split()
                sentence = sDict[bannerId]
                offsets[0], offsets[1] = fixOffset(line.strip(), word, int(offsets[0]), int(offsets[1]), sentence.get("text"))
                entities = makeEntityElements(int(offsets[0]), int(offsets[1]), sentence.get("text"), splitNewlines, elementName)
                entityText = "\n".join([x.get("text") for x in entities])
                assert entityText == word, (entityText, word, bannerId, offsets, sentence.get("id"), sentence.get("text"))
                assert len(entities) > 0, (line.strip(), sentence.get("text"))
                addEntities(bannerId, sentence, entities)
        finally:
            mentionfile.close()

    sentencesWithEntities = len(elementCounts)
    print >> sys.stderr, "BANNER found", counts["nonSplit"], "entities in", sentencesWithEntities, processElement + "-elements",
    print >> sys.stderr, "(" + str(sCount - sentencesWithEntities) + " have no entities)"
    print >> sys.stderr, "New", elementName + "-elements:", counts["totalEntities"], "(Split", counts["splitEvents"], "BANNER entities with newlines)"

    # Remove work directory
    if not debug:
        shutil.rmtree(workdir)
    else:
        print >> sys.stderr, "BANNER working directory for debugging at", workdir

    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
# Command line interface: tags a corpus with BANNER, or installs BANNER
# when --install is given.
if __name__=="__main__":
    import sys

    from optparse import OptionParser, OptionGroup
    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(description="BANNER named entity recognizer wrapper")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in Interaction XML format", metavar="FILE")
    optparser.add_option("--inputCorpusName", default="PMC11", dest="inputCorpusName", help="")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file in Interaction XML format.")
    optparser.add_option("-e", "--elementName", default="entity", dest="elementName", help="BANNER created element tag in Interaction XML")
    optparser.add_option("-p", "--processElement", default="sentence", dest="processElement", help="input element tag (usually \"sentence\" or \"document\")")
    optparser.add_option("-s", "--split", default=False, action="store_true", dest="splitNewlines", help="Split BANNER entities at newlines")
    optparser.add_option("--debug", default=False, action="store_true", dest="debug", help="Preserve temporary working directory")
    optparser.add_option("--pathBANNER", default=None, dest="pathBANNER", help="")
    optparser.add_option("--pathTrove", default=None, dest="pathTrove", help="")
    group = OptionGroup(optparser, "Install", "")
    group.add_option("--install", default=None, action="store_true", dest="install", help="Install BANNER")
    group.add_option("--installDir", default=None, dest="installDir", help="Install directory")
    group.add_option("--downloadDir", default=None, dest="downloadDir", help="Install files download directory")
    group.add_option("--javaHome", default=None, dest="javaHome", help="JAVA_HOME setting for ANT, used when compiling BANNER")
    group.add_option("--redownload", default=False, action="store_true", dest="redownload", help="Redownload install files")
    optparser.add_option_group(group)
    (options, args) = optparser.parse_args()

    if not options.install:
        if options.input is None:
            # Without this check the path tests below fail on None with a traceback;
            # error() prints the usage message and exits
            optparser.error("no input corpus defined (use -i/--input, or --install to install BANNER)")
        # A directory or a .tar.gz package is taken to be in the ST-format
        # and is converted to Interaction XML before running BANNER
        if os.path.isdir(options.input) or options.input.endswith(".tar.gz"):
            print >> sys.stderr, "Converting ST-format"
            import STFormat.ConvertXML
            import STFormat.STTools
            options.input = STFormat.ConvertXML.toInteractionXML(STFormat.STTools.loadSet(options.input), options.inputCorpusName)
        print >> sys.stderr, "Running BANNER"
        run(input=options.input, output=options.output, elementName=options.elementName,
            processElement=options.processElement, splitNewlines=options.splitNewlines, debug=options.debug,
            bannerPath=options.pathBANNER, trovePath=options.pathTrove)
    else:
        install(options.installDir, options.downloadDir, javaHome=options.javaHome, redownload=options.redownload)