TEES.Tools.SentenceSplitter

1 __version__ = "$Revision: 1.1 $" 2 3 import sys,os 4 import sys 5 try: 6 import xml.etree.cElementTree as ET 7 except ImportError: 8 import cElementTree as ET 9 10 import shutil 11 import subprocess 12 import tempfile 13 import codecs 14 import tarfile 15 16 from GeniaSentenceSplitter import moveElements 17 18 sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/..") 19 import Utils.ElementTreeUtils as ETUtils 20 from Utils.ProgressCounter import ProgressCounter 21

def openFile(path, tarFile=None):
    """
    Open a tokenization file for reading, either from the file system
    or from an already opened tar archive.

    path -- file system path of the file, or the member name inside
            the archive when tarFile is given.
    tarFile -- an open tarfile.TarFile to read from, or None to read
               from the file system.

    Returns a readable file object, or None when no regular file with
    that name exists. Closing the returned object is the caller's
    responsibility.
    """
    if tarFile is not None:
        try:
            # getmember() raises KeyError for names not in the archive;
            # extractfile() itself returns None for non-file members.
            return tarFile.extractfile(tarFile.getmember(path))
        except KeyError:
            return None
    # isfile() rather than exists(): a directory with the expected name
    # must count as "not found" instead of making open() raise.
    if os.path.isfile(path):
        return open(path, "rt")
    return None

32

def makeSentences(input, tokenizationPath, output=None, removeText=False, escDict=None):
    """
    Divide text in the "text" attributes of document and section
    elements into sentence elements. These sentence elements are
    inserted into their respective parent elements.

    The sentence boundaries come from existing tokenization files, one
    per document, named "<pmid or origId>.tok" or, failing that,
    "<pmid or origId>.tokenized". Elements that already contain a
    sentence child are left untouched; elements with empty text or
    without a tokenization file are skipped.

    input -- the corpus, passed as is to ETUtils.ETFromObj.
    tokenizationPath -- directory containing the tokenization files. If
                        the path contains ".tar.gz", the part up to and
                        including the first ".tar.gz" is opened as an
                        archive and the remainder is used as the
                        directory inside it.
    output -- if not None, the corpus root is written there with
              ETUtils.write.
    removeText -- if True, the "text" attribute is removed from each
                  element that was split.
    escDict -- replacements applied to the tokenized sentence text
               before alignment (passed on to alignSentences).

    Returns the corpus element tree.
    """
    if escDict is None:
        escDict = {}
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    print >> sys.stderr, "Inserting tokenizations from", tokenizationPath
    tarFile = None
    if tokenizationPath.find(".tar.gz") != -1:
        # Split at the first ".tar.gz" only, so that a later occurrence
        # in the inner path cannot break the two-way unpacking.
        tarFilePath, tokenizationPath = tokenizationPath.split(".tar.gz", 1)
        tarFilePath += ".tar.gz"
        tarFile = tarfile.open(tarFilePath)
        # startswith() also copes with an empty inner path (files at
        # the archive root), where indexing [0] would raise IndexError.
        if tokenizationPath.startswith("/"):
            tokenizationPath = tokenizationPath[1:]

    docCount = 0
    docsWithSentences = 0
    sentencesCreated = 0
    try:
        sourceElements = list(corpusRoot.getiterator("document")) + list(corpusRoot.getiterator("section"))
        counter = ProgressCounter(len(sourceElements), "Sentence Splitting")
        for document in sourceElements:
            docCount += 1
            origId = document.get("pmid")
            if origId is None:
                origId = document.get("origId")
            origId = str(origId)
            # Resolve the fallback id before building the progress
            # message, which must not concatenate None.
            docId = document.get("id")
            if docId is None:
                docId = "CORPUS.d" + str(docCount)
            counter.update(1, "Splitting Documents ("+docId+"/" + origId + "): ")
            if document.find("sentence") is not None: # existing sentence split
                docsWithSentences += 1
                continue
            text = document.get("text")
            if text is None or text.strip() == "":
                continue

            # Try the BioNLP'11 extension first, then the BioNLP'09 one
            f = None
            for extension in (".tok", ".tokenized"):
                f = openFile(os.path.join(tokenizationPath, origId + extension), tarFile)
                if f is not None:
                    break
            if f is None: # no tokenization found
                continue
            try:
                sentencesCreated += alignSentences(document, f.readlines(), escDict)
            finally:
                f.close()

            # Remove original text
            if removeText:
                # "text" is an attribute; del document["text"] would be
                # child-index deletion and raise TypeError.
                del document.attrib["text"]
            # Move elements from document element to sentences
            moveElements(document)
            docsWithSentences += 1
    finally:
        # Release the archive even when alignment fails
        if tarFile is not None:
            tarFile.close()

    print >> sys.stderr, "Sentence splitting created", sentencesCreated, "sentences"
    print >> sys.stderr, docsWithSentences, "/", docCount, "documents have sentences"

    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree

102

def alignSentences(document, sentenceTexts, escDict=None):
    """
    Align pre-split sentences to the "text" attribute of document and
    append one "sentence" child element per non-empty sentence.

    Each sentence is matched token by token (whitespace split) against
    the text, moving strictly forward, so only whitespace may separate
    consecutive tokens. Every sentence element gets the attributes
    "text", "charOffset" ("start-end", end exclusive) and "id"
    (document id + ".s" + running number). Whitespace before the first
    sentence is stored in its "head" attribute, whitespace between two
    sentences in the "tail" attribute of the earlier one, and whatever
    follows the last sentence in that sentence's "tail".

    document -- element with "text" and "id" attributes; modified in
                place.
    sentenceTexts -- iterable of sentence strings, e.g. the lines of a
                     tokenization file. Empty lines are skipped with a
                     warning.
    escDict -- mapping of substrings to replace in each sentence before
               alignment (applied in sorted key order); None means no
               replacements.

    Returns the number of sentence elements created.
    Raises AssertionError if a token cannot be found in the remaining
    text, or if non-whitespace text lies between two tokens.
    """
    if escDict is None:
        escDict = {}
    text = document.get("text")
    docId = document.get("id")
    cEnd = 0 # end of the latest aligned token; the next search starts here
    sentenceCount = 0
    prevSentence = None # latest created element; None before the first one
    prevSText = None
    for sText in sentenceTexts:
        sText = sText.strip() # The text of the sentence
        for key in sorted(escDict):
            sText = sText.replace(key, escDict[key])
        if sText == "":
            # write() rather than the print statement, so the function
            # behaves the same under Python 2 and Python 3
            sys.stderr.write("Warning, empty sentence in %s\n" % (docId,))
            continue
        sentenceStart = None
        gap = "" # whitespace between the previous sentence and this one
        for sToken in sText.split():
            # Find the starting point of the token in the text. This
            # point must be after previous tokens and sentences
            cStart = text.find(sToken, cEnd)
            # Explicit raises instead of assert statements, so the
            # checks are not stripped when running with -O
            if cStart == -1:
                raise AssertionError((text, sText, sToken, cEnd))
            if text[cEnd:cStart].strip() != "": # only whitespace may separate tokens
                separator = "-----------------------------"
                report = [separator,
                          "text: %s" % (text,),
                          "text[cEnd:cStart+1]: %s" % (text[cEnd:cStart+1],),
                          "prevSText: %s" % (prevSText,),
                          "sText: %s" % (sText,),
                          "sToken: %s" % (sToken,),
                          "start: %s" % (cEnd,),
                          separator]
                sys.stderr.write("\n".join(report) + "\n")
                raise AssertionError("non-whitespace text between tokens in %s" % (docId,))
            if sentenceStart is None: # first token of the sentence
                sentenceStart = cStart
                gap = text[cEnd:cStart]
            cEnd = cStart + len(sToken) # end position is determined by length
        # make sentence element
        e = ET.Element("sentence")
        if gap != "":
            if prevSentence is None:
                # Nothing precedes the first sentence, so the leading
                # whitespace becomes its head (and only its head)
                e.set("head", gap)
            else:
                prevSentence.set("tail", gap)
        e.set("text", text[sentenceStart:cEnd])
        e.set("charOffset", str(sentenceStart) + "-" + str(cEnd)) # NOTE: check
        e.set("id", docId + ".s" + str(sentenceCount))
        document.append(e) # add sentence to parent element
        prevSentence = e
        prevSText = sText
        sentenceCount += 1
    # Set the tail of the last sentence after the loop: counting input
    # lines would miss it whenever an empty line was skipped.
    if prevSentence is not None:
        prevSentence.set("tail", text[cEnd:])
    return sentenceCount

# Command line entry point: insert an existing sentence splitting into
# an interaction XML corpus.
if __name__=="__main__":
    from optparse import OptionParser
    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(description="For inserting an existing sentence splitting")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in interaction xml format", metavar="FILE")
    optparser.add_option("-t", "--tokenizationPath", default=None, dest="tokenizationPath", help="Tokenization path", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file in interaction xml format.")
    (options, args) = optparser.parse_args()

    # makeSentences calls string methods on the tokenization path, so a
    # missing option is reported as a usage error instead of surfacing
    # as an AttributeError on None.
    if options.tokenizationPath is None:
        optparser.error("the tokenization path (-t/--tokenizationPath) is required")

    makeSentences(input=options.input, tokenizationPath=options.tokenizationPath, output=options.output, removeText=False)

Source Code for Module TEES.Tools.SentenceSplitter