
Source Code for Module TEES.Tools.GeniaSentenceSplitter

__version__ = "$Revision: 1.7 $"

import sys, os
import shutil
import subprocess
import tempfile
import codecs
thisPath = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.abspath(os.path.join(thisPath, "..")))
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import cElementTree as ET
import Utils.ElementTreeUtils as ETUtils
import Utils.Range as Range
import Tool
import Utils.Settings as Settings
from Utils.ProgressCounter import ProgressCounter
import Utils.Download as Download

def install(destDir=None, downloadDir=None, redownload=False, updateLocalSettings=False):
    """
    Download, extract and compile the GENIA Sentence Splitter, and
    optionally store its location in the TEES local settings.
    """
    print >> sys.stderr, "Installing GENIA Sentence Splitter"
    if downloadDir == None:
        downloadDir = os.path.join(Settings.DATAPATH, "tools/download/")
    if destDir == None:
        destDir = os.path.join(Settings.DATAPATH, "tools/geniass")
    Download.downloadAndExtract(Settings.URL["GENIA_SENTENCE_SPLITTER"], destDir, downloadDir, "geniass")
    # Check that the build tools are available before compiling
    Tool.testPrograms("Genia Sentence Splitter", ["make", "ruby"])
    cwd = os.getcwd()
    os.chdir(destDir)
    print >> sys.stderr, "Compiling Genia Sentence Splitter"
    subprocess.call("make", shell=True)
    os.chdir(cwd)
    Tool.finalizeInstall(["./run_geniass.sh"],
                         {"./run_geniass.sh":"./run_geniass.sh README /dev/null " + Settings.RUBY_PATH},
                         destDir, {"GENIA_SENTENCE_SPLITTER_DIR":destDir}, updateLocalSettings)

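For illustration, the installer can also be invoked programmatically before first use. A minimal sketch, assuming TEES is importable as the TEES package and that Settings.DATAPATH points at a writable data directory (both depend on the local setup):

    from TEES.Tools import GeniaSentenceSplitter
    # Download and compile into the default DATAPATH/tools/geniass location,
    # then record GENIA_SENTENCE_SPLITTER_DIR in the local settings file
    GeniaSentenceSplitter.install(updateLocalSettings=True)
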
def moveElements(document):
    """
    Move entity and interaction elements from the document element into
    the sentence elements that contain them, remapping their ids and
    character offsets to be sentence-relative.
    """
    entMap = {}
    entSentence = {}
    entSentenceIndex = {}
    sentences = document.findall("sentence")
    sentenceCount = 0
    for sentence in sentences:
        sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        # Move entities
        entCount = 0
        for entity in document.findall("entity"):
            entityOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
            if Range.overlap(sentenceOffset, entityOffset):
                document.remove(entity)
                sentence.append(entity)
                entityId = entity.get("id")
                entityIdLastPart = entityId.rsplit(".", 1)[-1]
                if entityIdLastPart.startswith("e"):
                    entity.set("id", sentence.get("id") + "." + entityIdLastPart)
                    entMap[entityId] = sentence.get("id") + "." + entityIdLastPart
                else:
                    entity.set("docId", entityId)
                    entity.set("id", sentence.get("id") + ".e" + str(entCount))
                    entMap[entityId] = sentence.get("id") + ".e" + str(entCount)
                entSentence[entityId] = sentence
                entSentenceIndex[entityId] = sentenceCount
                newEntityOffset = (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0])
                entity.set("origOffset", entity.get("charOffset"))
                entity.set("charOffset", str(newEntityOffset[0]) + "-" + str(newEntityOffset[1]))
                entCount += 1
        sentenceCount += 1
    # Move interactions to the earlier of their two argument sentences
    intCount = 0
    for interaction in document.findall("interaction"):
        if entSentenceIndex[interaction.get("e1")] < entSentenceIndex[interaction.get("e2")]:
            targetSentence = entSentence[interaction.get("e1")]
        else:
            targetSentence = entSentence[interaction.get("e2")]
        document.remove(interaction)
        targetSentence.append(interaction)
        interaction.set("id", targetSentence.get("id") + ".i" + str(intCount))
        interaction.set("e1", entMap[interaction.get("e1")])
        interaction.set("e2", entMap[interaction.get("e2")])
        intCount += 1

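The offset arithmetic above shifts each entity from document coordinates to sentence coordinates. A worked example with hypothetical offsets: an entity at document-level charOffset "120-128" inside a sentence at "100-180" becomes sentence-relative "20-28":

    sentenceOffset = (100, 180)
    entityOffset = (120, 128)
    newEntityOffset = (entityOffset[0] - sentenceOffset[0],
                       entityOffset[1] - sentenceOffset[0])
    print newEntityOffset  # (20, 28); the original "120-128" is kept in origOffset
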
def makeSentence(text, begin, end, prevSentence=None, prevEnd=None):
    # Make sentence element
    e = ET.Element("sentence")
    e.set("text", text[begin:end])
    e.set("charOffset", str(begin) + "-" + str(end)) # NOTE: check
    # Set tail string for previous sentence
    if prevSentence != None and begin - prevEnd > 1:
        prevSentence.set("tail", text[prevEnd+1:begin])
    # Set head string for first sentence in document
    if begin > 0 and prevSentence == None:
        e.set("head", text[0:begin])
    assert "\n" not in e.get("text"), e.get("text")
    assert "\r" not in e.get("text"), e.get("text")
    return e

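A minimal usage sketch with a hypothetical two-sentence document string, showing how consecutive calls produce end-exclusive charOffsets and store the inter-sentence gap as the previous sentence's tail:

    text = "First sentence. Second sentence."
    s1 = makeSentence(text, 0, 15)
    s2 = makeSentence(text, 16, 32, prevSentence=s1, prevEnd=14)
    print s1.get("text"), s1.get("charOffset")  # First sentence. 0-15
    print repr(s1.get("tail"))                  # ' ' (the gap before s2)
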
def makeSentences(input, output=None, removeText=False, postProcess=True, debug=False):
    """
    Run the GENIA Sentence Splitter

    Divide the text in the "text" attributes of document and section
    elements into sentence elements. The sentence elements are
    inserted into their respective parent elements.
    """
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    print >> sys.stderr, "Running GENIA Sentence Splitter", Settings.GENIA_SENTENCE_SPLITTER_DIR,
    if postProcess:
        print >> sys.stderr, "(Using post-processing)"
    else:
        print >> sys.stderr, "(No post-processing)"
    docCount = 0
    sentencesCreated = 0
    redivideCount = 0
    emptySentenceCount = 0
    sourceElements = [x for x in corpusRoot.getiterator("document")] + [x for x in corpusRoot.getiterator("section")]
    counter = ProgressCounter(len(sourceElements), "GeniaSentenceSplitter")
    counter.showMilliseconds = True
    # Create working directory
    workdir = tempfile.mkdtemp()
    for document in sourceElements:
        counter.update(1, "Splitting Documents ("+document.get("id")+"): ")
        docId = document.get("id")
        if docId == None:
            docId = "CORPUS.d" + str(docCount)
        docTag = "-" + str(docCount)
        assert document.find("sentence") == None
        text = document.get("text")
        if text == None or text.strip() == "":
            continue
        # Write the document text to a workfile
        workfile = codecs.open(os.path.join(workdir, "sentence-splitter-input.txt"+docTag), "wt", "utf-8")
        # The codecs module must be given unicode objects, otherwise it will
        # try to automatically decode the byte stream as ASCII (see
        # http://themoritzfamily.com/python-encodings-and-unicode.html).
        # Earlier unicode errors here were caused by STTools reading unicode
        # ST-format files as ASCII and producing an ASCII interaction XML, so
        # at this point text can be written as-is: the output file is unicode
        # and decoding with the correct codec is handled earlier in the pipeline.
        workfile.write(text)
        workfile.close()
        # Run the sentence splitter
        assert os.path.exists(Settings.GENIA_SENTENCE_SPLITTER_DIR + "/run_geniass.sh"), Settings.GENIA_SENTENCE_SPLITTER_DIR
        args = [Settings.GENIA_SENTENCE_SPLITTER_DIR + "/run_geniass.sh", os.path.join(workdir, "sentence-splitter-input.txt"+docTag), os.path.join(workdir, "sentence-splitter-output.txt"+docTag), Settings.RUBY_PATH]
        p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
        if stdout != "":
            print >> sys.stderr, stdout
        # Only echo stderr output that differs from a normal geniass run
        if stderr != 'Extracting events.roading model file.\nstart classification.\n':
            print >> sys.stderr, stderr
        if postProcess:
            ppIn = codecs.open(os.path.join(workdir, "sentence-splitter-output.txt"+docTag), "rt", "utf-8")
            ppOut = codecs.open(os.path.join(workdir, "sentence-splitter-output-postprocessed.txt"+docTag), "wt", "utf-8")
            subprocess.call(["perl", os.path.join(os.path.dirname(os.path.abspath(__file__)), "geniass-postproc.pl")], stdin=ppIn, stdout=ppOut)
            ppIn.close()
            ppOut.close()
            # Read the post-processed sentences
            workfile = codecs.open(os.path.join(workdir, "sentence-splitter-output-postprocessed.txt"+docTag), "rt", "utf-8")
        else:
            workfile = codecs.open(os.path.join(workdir, "sentence-splitter-output.txt"+docTag), "rt", "utf-8")
        # Align the split sentences consecutively to the original text to
        # determine their character offsets
        docIndex = 0
        sentenceCount = 0
        sentenceBeginIndex = -1
        prevSentence = None
        prevEndIndex = None
        prevText = None
        for sText in workfile.readlines():
            sText = sText.strip() # The text of the sentence
            if sText == "":
                emptySentenceCount += 1
                continue
            for i in range(len(sText)):
                if sText[i].isspace():
                    assert sText[i] not in ["\n", "\r"]
                    continue
                while text[docIndex].isspace():
                    if text[docIndex] in ["\n", "\r"] and sentenceBeginIndex != -1:
                        # A line break inside a sentence: redivide the sentence here
                        redivideCount += 1
                        prevSentence = makeSentence(text, sentenceBeginIndex, docIndex, prevSentence, prevEndIndex)
                        prevSentence.set("id", docId + ".s" + str(sentenceCount))
                        prevSentence.set("redevided", "True")
                        sentencesCreated += 1
                        sentenceCount += 1
                        prevEndIndex = docIndex-1
                        sentenceBeginIndex = -1
                        document.append(prevSentence)
                    docIndex += 1
                assert sText[i] == text[docIndex], (text, sText, prevText, sText[i:i+10], text[docIndex:docIndex+10], (i, docIndex), sentenceBeginIndex)
                if sentenceBeginIndex == -1:
                    sentenceBeginIndex = docIndex
                docIndex += 1
            prevText = sText
            if sentenceBeginIndex != -1:
                prevSentence = makeSentence(text, sentenceBeginIndex, docIndex, prevSentence, prevEndIndex)
                prevSentence.set("id", docId + ".s" + str(sentenceCount))
                prevEndIndex = docIndex-1
                sentenceBeginIndex = -1
                sentencesCreated += 1
                sentenceCount += 1
                document.append(prevSentence)
        # Add possible tail for the last sentence
        if prevEndIndex < len(text) - 1 and prevSentence != None:
            assert prevSentence.get("tail") == None, prevSentence.get("tail")
            prevSentence.set("tail", text[prevEndIndex+1:])
        # Remove the original text attribute
        if removeText:
            del document.attrib["text"]
        # Move elements from the document element into the new sentences
        moveElements(document)
        docCount += 1

    print >> sys.stderr, "Sentence splitting created", sentencesCreated, "sentences"
    print >> sys.stderr, "Redivided", redivideCount, "sentences"
    if emptySentenceCount > 0:
        print >> sys.stderr, "Warning,", emptySentenceCount, "empty sentences"

    if debug:
        print >> sys.stderr, "Work directory preserved for debugging at", workdir
    else:
        # Remove work directory
        shutil.rmtree(workdir)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree

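In a pipeline the function is typically given an interaction XML file or an already parsed tree. A minimal sketch, assuming GENIA_SENTENCE_SPLITTER_DIR and RUBY_PATH are set in the local settings and "corpus.xml" is a placeholder input file:

    # Split every document/section text into <sentence> elements,
    # write the result, and inspect the new sentence ids
    tree = makeSentences("corpus.xml", output="corpus-sentences.xml", postProcess=True)
    for sentence in tree.getroot().getiterator("sentence"):
        print sentence.get("id"), sentence.get("charOffset")
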
if __name__=="__main__":
    import sys
    from optparse import OptionParser, OptionGroup
    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using it"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(description="GENIA Sentence Splitter wrapper")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in interaction xml format", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file in interaction xml format.")
    optparser.add_option("-p", "--postprocess", default=False, action="store_true", dest="postprocess", help="Run postprocessor")
    group = OptionGroup(optparser, "Install Options", "")
    group.add_option("--install", default=None, action="store_true", dest="install", help="Install the GENIA Sentence Splitter")
    group.add_option("--installDir", default=None, dest="installDir", help="Install directory")
    group.add_option("--downloadDir", default=None, dest="downloadDir", help="Install files download directory")
    group.add_option("--redownload", default=False, action="store_true", dest="redownload", help="Redownload install files")
    optparser.add_option_group(group)
    (options, args) = optparser.parse_args()

    if not options.install:
        makeSentences(input=options.input, output=options.output, removeText=False, postProcess=options.postprocess)
    else:
        install(options.installDir, options.downloadDir, redownload=options.redownload)
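
The same two modes are available from the command line; a hypothetical session (file names are placeholders):

    python GeniaSentenceSplitter.py --install
    python GeniaSentenceSplitter.py -i corpus.xml -o corpus-sentences.xml -p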