1 __version__ = "$Revision: 1.7 $"
2
3 import sys,os
4 import shutil
5 import subprocess
6 import tempfile
7 import codecs
8 thisPath = os.path.dirname(os.path.abspath(__file__))
9 sys.path.append(os.path.abspath(os.path.join(thisPath,"..")))
10 try:
11 import xml.etree.cElementTree as ET
12 except ImportError:
13 import cElementTree as ET
14 import Utils.ElementTreeUtils as ETUtils
15 import Utils.Range as Range
16 import Tool
17 import Utils.Settings as Settings
18 from Utils.ProgressCounter import ProgressCounter
19 import Utils.Download as Download
20
def install(destDir=None, downloadDir=None, redownload=False, updateLocalSettings=False):
    """
    Download, extract and compile the GENIA Sentence Splitter.

    @param destDir: installation directory (default: DATAPATH/tools/geniass)
    @param downloadDir: directory for downloaded packages (default: DATAPATH/tools/download/)
    @param redownload: NOTE(review): accepted but never used -- presumably meant to be
        passed through to Download.downloadAndExtract to force a fresh download; confirm
        its signature before wiring it up
    @param updateLocalSettings: if True, save GENIA_SENTENCE_SPLITTER_DIR to local settings
    """
    print >> sys.stderr, "Installing GENIA Sentence Splitter"
    if downloadDir is None:
        downloadDir = os.path.join(Settings.DATAPATH, "tools/download/")
    if destDir is None:
        destDir = os.path.join(Settings.DATAPATH, "tools/geniass")
    Download.downloadAndExtract(Settings.URL["GENIA_SENTENCE_SPLITTER"], destDir, downloadDir, "geniass")
    print >> sys.stderr, "Compiling GENIA Sentence Splitter"
    Tool.testPrograms("Genia Sentence Splitter", ["make", "ruby"])
    # Build in the installation directory, then restore the original working directory
    cwd = os.getcwd()
    os.chdir(destDir)
    print >> sys.stderr, "Compiling Genia Sentence Splitter"
    subprocess.call("make", shell=True)
    os.chdir(cwd)
    # Verify the install by splitting the README file, and register the install directory
    Tool.finalizeInstall(["./run_geniass.sh"],
                         {"./run_geniass.sh":"./run_geniass.sh README /dev/null " + Settings.RUBY_PATH},
                         destDir, {"GENIA_SENTENCE_SPLITTER_DIR":destDir}, updateLocalSettings)
38
def moveElements(document):
    """
    Move document-level entity and interaction elements into their sentences.

    Each entity is attached to the first sentence whose character span overlaps
    it, its id is renamed to a sentence-scoped id, and its charOffset is made
    relative to the sentence. Each interaction is attached to the sentence of
    whichever of its two entities appears in the earlier sentence, and its
    e1/e2 references are remapped to the new entity ids.

    NOTE: the "def" header line was missing from the mangled source (embedded
    numbering jumped from 38 to 40); it is restored here, matching the call
    moveElements(document) in makeSentences.

    @param document: a document (or section) Element that already contains
        sentence elements with charOffset attributes
    """
    entMap = {} # old entity id -> new sentence-scoped entity id
    entSentence = {} # old entity id -> sentence element it was moved into
    entSentenceIndex = {} # old entity id -> index of that sentence
    sentences = document.findall("sentence")
    sentenceCount = 0
    for sentence in sentences:
        sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        # Move entities overlapping this sentence
        entCount = 0
        for entity in document.findall("entity"):
            entityOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
            if Range.overlap(sentenceOffset, entityOffset):
                document.remove(entity)
                sentence.append(entity)
                entityId = entity.get("id")
                entityIdLastPart = entityId.rsplit(".", 1)[-1]
                if entityIdLastPart.startswith("e"):
                    # Id already has an entity-style last part; keep it under the new sentence id
                    entity.set("id", sentence.get("id") + "." + entityIdLastPart)
                    entMap[entityId] = sentence.get("id") + "." + entityIdLastPart
                else:
                    # Preserve the original id and assign a fresh sentence-scoped one
                    entity.set("docId", entityId)
                    entity.set("id", sentence.get("id") + ".e" + str(entCount))
                    entMap[entityId] = sentence.get("id") + ".e" + str(entCount)
                entSentence[entityId] = sentence
                entSentenceIndex[entityId] = sentenceCount
                # Make the entity offset relative to the beginning of its sentence
                newEntityOffset = (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0])
                entity.set("origOffset", entity.get("charOffset"))
                entity.set("charOffset", str(newEntityOffset[0]) + "-" + str(newEntityOffset[1]))
                entCount += 1
        sentenceCount += 1
    # Move interactions into the sentence of their earlier entity
    intCount = 0
    for interaction in document.findall("interaction"):
        if entSentenceIndex[interaction.get("e1")] < entSentenceIndex[interaction.get("e2")]:
            targetSentence = entSentence[interaction.get("e1")]
        else:
            targetSentence = entSentence[interaction.get("e2")]
        document.remove(interaction)
        targetSentence.append(interaction)
        interaction.set("id", targetSentence.get("id") + ".i" + str(intCount))
        interaction.set("e1", entMap[interaction.get("e1")])
        interaction.set("e2", entMap[interaction.get("e2")])
        intCount += 1
83
def makeSentence(text, begin, end, prevSentence=None, prevEnd=None):
    """
    Build a sentence Element covering text[begin:end].

    @param text: the full document text
    @param begin: sentence start offset (inclusive)
    @param end: sentence end offset (exclusive)
    @param prevSentence: the previous sentence Element, or None for the first sentence
    @param prevEnd: offset of the last character of the previous sentence
    @return: a new "sentence" Element with "text" and "charOffset" attributes;
        also sets "tail" on prevSentence (inter-sentence gap) or "head" on the
        new element (text preceding the first sentence)
    """
    e = ET.Element("sentence")
    e.set("text", text[begin:end])
    e.set("charOffset", str(begin) + "-" + str(end))
    # If there is a gap of more than one character, store it as the previous sentence's tail
    if prevSentence is not None and begin - prevEnd > 1:
        prevSentence.set("tail", text[prevEnd+1:begin])
    # Text preceding the first sentence becomes that sentence's head
    if begin > 0 and prevSentence is None:
        e.set("head", text[0:begin])
    # Sentences must never span line breaks (makeSentences redivides on them)
    assert "\n" not in e.get("text"), e.get("text")
    assert "\r" not in e.get("text"), e.get("text")
    return e
98
def makeSentences(input, output=None, removeText=False, postProcess=True, debug=False):
    """
    Run GENIA Sentence Splitter

    Divide text in the "text" attributes of document and section
    elements into sentence elements. These sentence elements are
    inserted into their respective parent elements.

    @param input: corpus file name or ElementTree object
    @param output: optional file name for writing the processed corpus
    @param removeText: if True, delete the source element's "text" attribute after splitting
    @param postProcess: if True, pipe the splitter output through geniass-postproc.pl
    @param debug: if True, preserve the temporary work directory for inspection
    @return: the processed corpus as an ElementTree object
    """
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    print >> sys.stderr, "Running GENIA Sentence Splitter", Settings.GENIA_SENTENCE_SPLITTER_DIR,
    if postProcess:
        print >> sys.stderr, "(Using post-processing)"
    else:
        print >> sys.stderr, "(No post-processing)"
    docCount = 0
    sentencesCreated = 0
    redivideCount = 0
    emptySentenceCount = 0
    # Both document and section elements may carry splittable text
    sourceElements = [x for x in corpusRoot.getiterator("document")] + [x for x in corpusRoot.getiterator("section")]
    counter = ProgressCounter(len(sourceElements), "GeniaSentenceSplitter")
    counter.showMilliseconds = True
    # Temporary directory for the external splitter's input/output files
    workdir = tempfile.mkdtemp()
    for document in sourceElements:
        counter.update(1, "Splitting Documents ("+document.get("id")+"): ")
        docId = document.get("id")
        if docId is None:
            docId = "CORPUS.d" + str(docCount)
        docTag = "-" + str(docCount) # unique suffix for this document's work files
        assert document.find("sentence") is None # element must not already be split
        text = document.get("text")
        if text is None or text.strip() == "":
            continue

        # Write the document text to a work file for the external splitter
        workfile = codecs.open(os.path.join(workdir, "sentence-splitter-input.txt"+docTag), "wt", "utf-8")
        workfile.write(text)
        workfile.close()

        # Run the external splitter script
        assert os.path.exists(Settings.GENIA_SENTENCE_SPLITTER_DIR + "/run_geniass.sh"), Settings.GENIA_SENTENCE_SPLITTER_DIR
        args = [Settings.GENIA_SENTENCE_SPLITTER_DIR + "/run_geniass.sh", os.path.join(workdir, "sentence-splitter-input.txt"+docTag), os.path.join(workdir, "sentence-splitter-output.txt"+docTag), Settings.RUBY_PATH]
        p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
        if stdout != "":
            print >> sys.stderr, stdout
        # This exact stderr string is the splitter's normal chatter; show anything else
        if stderr != 'Extracting events.roading model file.\nstart classification.\n':
            print >> sys.stderr, stderr

        if postProcess:
            # Refine the splitter output with the bundled Perl post-processor
            ppIn = codecs.open(os.path.join(workdir, "sentence-splitter-output.txt"+docTag), "rt", "utf-8")
            ppOut = codecs.open(os.path.join(workdir, "sentence-splitter-output-postprocessed.txt"+docTag), "wt", "utf-8")
            subprocess.call(["perl", os.path.join(os.path.dirname(os.path.abspath(__file__)), "geniass-postproc.pl")], stdin=ppIn, stdout=ppOut)
            ppIn.close()
            ppOut.close()
            workfile = codecs.open(os.path.join(workdir, "sentence-splitter-output-postprocessed.txt"+docTag), "rt", "utf-8")
        else:
            workfile = codecs.open(os.path.join(workdir, "sentence-splitter-output.txt"+docTag), "rt", "utf-8")

        # Align each splitter output line (one sentence per line) with the original
        # text, so that sentence elements get exact character offsets
        sentenceCount = 0
        docIndex = 0 # current position in the original document text
        sentenceBeginIndex = -1 # start offset of the open sentence, -1 = none open
        prevSentence = None
        prevEndIndex = None
        prevText = None
        for sText in workfile.readlines():
            sText = sText.strip()
            if sText == "":
                emptySentenceCount += 1
                continue
            for i in range(len(sText)):
                if sText[i].isspace():
                    # The splitter output line may contain internal spaces but no line breaks
                    assert sText[i] not in ["\n", "\r"]
                    continue
                # Skip whitespace in the original text to find the matching character
                while text[docIndex].isspace():
                    if text[docIndex] in ["\n", "\r"] and sentenceBeginIndex != -1:
                        # A line break in the original always ends a sentence,
                        # even in the middle of a splitter-produced sentence
                        redivideCount += 1
                        prevSentence = makeSentence(text, sentenceBeginIndex, docIndex, prevSentence, prevEndIndex)
                        prevSentence.set("id", docId + ".s" + str(sentenceCount))
                        prevSentence.set("redevided", "True") # NOTE: typo kept for downstream compatibility
                        sentencesCreated += 1
                        sentenceCount += 1
                        prevEndIndex = docIndex-1
                        sentenceBeginIndex = -1
                        document.append(prevSentence)
                    docIndex += 1
                # Non-whitespace characters must match one-to-one between the
                # splitter output and the original text
                assert sText[i] == text[docIndex], (text, sText, prevText, sText[i:i+10], text[docIndex:docIndex+10], (i, docIndex), sentenceBeginIndex)
                if sentenceBeginIndex == -1:
                    sentenceBeginIndex = docIndex
                docIndex += 1
            prevText = sText
            if sentenceBeginIndex != -1:
                # Close the sentence at the end of the splitter's line
                prevSentence = makeSentence(text, sentenceBeginIndex, docIndex, prevSentence, prevEndIndex)
                prevSentence.set("id", docId + ".s" + str(sentenceCount))
                prevEndIndex = docIndex-1
                sentenceBeginIndex = -1
                sentencesCreated += 1
                sentenceCount += 1
                document.append(prevSentence)
        workfile.close() # was left open in the original (resource leak)

        # Add possible tail for the last sentence
        if prevEndIndex < len(text) - 1 and prevSentence is not None:
            assert prevSentence.get("tail") is None, prevSentence.get("tail")
            prevSentence.set("tail", text[prevEndIndex+1:])

        if removeText:
            # BUGFIX: was "del document['text']", which raises TypeError for
            # Element objects (item deletion takes an integer child index);
            # the attribute must be removed from the attrib dictionary
            del document.attrib["text"]
        # Move entities and interactions from document level into their sentences
        moveElements(document)
        docCount += 1

    print >> sys.stderr, "Sentence splitting created", sentencesCreated, "sentences"
    print >> sys.stderr, "Redivided", redivideCount, "sentences"
    if emptySentenceCount > 0:
        print >> sys.stderr, "Warning,", emptySentenceCount, "empty sentences"

    if debug:
        print >> sys.stderr, "Work directory preserved for debugging at", workdir
    else:
        shutil.rmtree(workdir)

    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
249
if __name__=="__main__":
    from optparse import OptionParser, OptionGroup

    # Use the Psyco JIT if available (optional Python 2 speed-up)
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(description="GENIA Sentence Splitter wrapper")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in interaction xml format", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file in interaction xml format.")
    optparser.add_option("-p", "--postprocess", default=False, action="store_true", dest="postprocess", help="Run postprocessor")
    group = OptionGroup(optparser, "Install Options", "")
    # BUGFIX: help text said "Install BANNER" -- copied from another tool wrapper
    group.add_option("--install", default=None, action="store_true", dest="install", help="Install GENIA Sentence Splitter")
    group.add_option("--installDir", default=None, dest="installDir", help="Install directory")
    group.add_option("--downloadDir", default=None, dest="downloadDir", help="Install files download directory")
    group.add_option("--redownload", default=False, action="store_true", dest="redownload", help="Redownload install files")
    optparser.add_option_group(group)
    (options, args) = optparser.parse_args()

    if not options.install:
        makeSentences(input=options.input, output=options.output, removeText=False, postProcess=options.postprocess)
    else:
        install(options.installDir, options.downloadDir, redownload=options.redownload)
278