1 __version__ = "$Revision: 1.1 $"
2
3 import sys,os
4 import sys
5 try:
6 import xml.etree.cElementTree as ET
7 except ImportError:
8 import cElementTree as ET
9
10 import shutil
11 import subprocess
12 import tempfile
13 import codecs
14 import tarfile
15
16 from GeniaSentenceSplitter import moveElements
17
18 sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/..")
19 import Utils.ElementTreeUtils as ETUtils
20 from Utils.ProgressCounter import ProgressCounter
21
def openFile(path, tarFile):
    """
    Open a tokenization file for reading, either from a tar archive or
    from the file system.

    path: name of the archive member when tarFile is given, otherwise
          a file system path
    tarFile: an open tarfile.TarFile to read from, or None to read
             from the file system

    Returns a readable file object, or None if the archive has no such
    member / the path does not exist.
    """
    if tarFile is not None:
        try:
            return tarFile.extractfile(tarFile.getmember(path))
        except KeyError:
            # getmember() signals a missing member with KeyError
            return None
    if os.path.exists(path):
        return open(path, "rt")
    return None
32
def makeSentences(input, tokenizationPath, output=None, removeText=False, escDict={}):
    """
    Divide text in the "text" attributes of document and section
    elements into sentence elements, using an existing sentence
    splitting. These sentence elements are inserted into their
    respective parent elements.

    For each document/section element the splitting is read from
    "<pmid or origId>.tok" (or, if that is not found, from
    "<pmid or origId>.tokenized") under tokenizationPath. Elements that
    have no text or no tokenization file are left unchanged; elements
    that already contain a sentence are only counted.

    input: the corpus, in any form accepted by ETUtils.ETFromObj
    tokenizationPath: directory holding the tokenization files. If it
        contains ".tar.gz", the part up to and including ".tar.gz" is
        opened as a tar archive and the remainder is the directory
        inside that archive.
    output: if not None, the corpus is written there with ETUtils.write
    removeText: if True, the "text" attribute is removed from every
        element that was split
    escDict: string replacements applied to the tokenized sentences,
        passed on to alignSentences (never modified here)

    Returns the corpus tree obtained from ETUtils.ETFromObj.
    """
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    print >> sys.stderr, "Inserting tokenizations from", tokenizationPath
    tarFile = None
    if tokenizationPath.find(".tar.gz") != -1:
        # Split only at the first ".tar.gz": the rest is the path inside the archive
        tarFilePath, tokenizationPath = tokenizationPath.split(".tar.gz", 1)
        tarFile = tarfile.open(tarFilePath + ".tar.gz")
        # startswith() also copes with an empty inner path ("corpus.tar.gz")
        if tokenizationPath.startswith("/"):
            tokenizationPath = tokenizationPath[1:]

    docCount = 0
    docsWithSentences = 0
    sentencesCreated = 0
    sourceElements = list(corpusRoot.getiterator("document")) + list(corpusRoot.getiterator("section"))
    counter = ProgressCounter(len(sourceElements), "Sentence Splitting")
    try:
        for document in sourceElements:
            docCount += 1
            origId = document.get("pmid")
            if origId is None:
                origId = document.get("origId")
            origId = str(origId)
            docId = document.get("id")
            if docId is None:
                docId = "CORPUS.d" + str(docCount)
            # Use the fallback id so that a missing "id" cannot break the message
            counter.update(1, "Splitting Documents (" + docId + "/" + origId + "): ")

            if document.find("sentence") is not None:
                # Already split, nothing to insert
                docsWithSentences += 1
                continue
            text = document.get("text")
            if text is None or text.strip() == "":
                continue

            f = openFile(os.path.join(tokenizationPath, origId + ".tok"), tarFile)
            if f is None:
                # Fall back to the alternative file extension
                f = openFile(os.path.join(tokenizationPath, origId + ".tokenized"), tarFile)
            if f is None:
                # No tokenization available for this element
                continue
            try:
                sentencesCreated += alignSentences(document, f.readlines(), escDict)
            finally:
                f.close()

            if removeText:
                # "text" is an XML attribute; del document["text"] would index children
                del document.attrib["text"]

            # NOTE(review): presumably moves the element's existing children
            # into the new sentences -- confirm against GeniaSentenceSplitter
            moveElements(document)
            docsWithSentences += 1
    finally:
        # Release the archive even if a document fails to align
        if tarFile is not None:
            tarFile.close()

    print >> sys.stderr, "Sentence splitting created", sentencesCreated, "sentences"
    print >> sys.stderr, docsWithSentences, "/", docCount, "documents have sentences"

    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
102
def alignSentences(document, sentenceTexts, escDict=None):
    """
    Align pre-split sentences to the "text" attribute of a document and
    append one "sentence" element per non-empty sentence to it.

    Each sentence is split on whitespace and its tokens are searched in
    the text consecutively. Every sentence element gets the attributes
    "text", "charOffset" ("<begin>-<end>", end exclusive) and "id"
    (document id + ".s" + running number). Text between two sentences is
    stored as "tail" of the earlier one, text before the first sentence
    as its "head", and text after the last sentence as its "tail".

    document: element with "text" and "id" attributes
    sentenceTexts: sequence of sentence strings, one sentence per item
    escDict: optional mapping; each key found in a sentence is replaced
        with its value before alignment

    Returns the number of sentence elements created.
    Raises AssertionError if a token is not found in the text or if
    non-whitespace text would be skipped between two tokens.
    """
    if escDict is None:
        escDict = {}
    text = document.get("text")
    start = 0 # position from which the next token is searched
    cEnd = 0 # end offset of the most recently aligned token
    sentenceCount = 0
    head = None
    sentenceStart = None
    prevSentence = None # no sentence exists yet when the first one is aligned
    prevSText = None # only used in the diagnostic dump below
    escKeys = sorted(escDict.keys())

    for sText in sentenceTexts:
        sText = sText.strip()
        for key in escKeys:
            sText = sText.replace(key, escDict[key])
        if sText == "":
            print >> sys.stderr, "Warning, empty sentence in", document.get("id")
            continue
        isFirst = True
        for sToken in sText.split():
            cStart = text.find(sToken, start)
            assert cStart != -1, (text, sText, sToken, start)
            # Only whitespace may lie between two consecutive tokens
            if text[cEnd:cStart].strip() != "":
                print >> sys.stderr, "-----------------------------"
                print >> sys.stderr, "text:", text
                print >> sys.stderr, "text[cEnd:cStart+1]:", text[cEnd:cStart+1]
                print >> sys.stderr, "prevSText:", prevSText
                print >> sys.stderr, "sText:", sText
                print >> sys.stderr, "sToken:", sToken
                print >> sys.stderr, "start:", start
                print >> sys.stderr, "-----------------------------"
                assert False

            if isFirst:
                sentenceStart = cStart
                # The gap before this sentence is the tail of the previous one
                if prevSentence is not None and cStart != start:
                    prevSentence.set("tail", text[start:cStart])
                # cEnd is still 0 only before the very first token of the document
                if cEnd == 0 and cStart != 0:
                    head = text[cEnd:cStart]
            cEnd = cStart + len(sToken)
            start = cEnd
            isFirst = False

        e = ET.Element("sentence")
        if head is not None:
            e.set("head", head)
            head = None # the head belongs to the first sentence only
        e.set("text", text[sentenceStart:cEnd])
        e.set("charOffset", str(sentenceStart) + "-" + str(cEnd))
        e.set("id", document.get("id") + ".s" + str(sentenceCount))
        document.append(e)
        prevSentence = e
        sentenceCount += 1
        prevSText = sText

    # Done after the loop so that skipped empty sentences cannot prevent
    # the remaining text from being stored on the last sentence
    if prevSentence is not None and cEnd <= len(text):
        prevSentence.set("tail", text[cEnd:])
    return sentenceCount
162
163
if __name__=="__main__":
    # Command line entry point: insert an existing sentence splitting into a corpus
    import sys

    from optparse import OptionParser

    # Psyco is optional; the script runs the same way without it
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(description="For inserting an existing sentence splitting")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in interaction xml format", metavar="FILE")
    optparser.add_option("-t", "--tokenizationPath", default=None, dest="tokenizationPath", help="Tokenization path", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file in interaction xml format.")
    (options, args) = optparser.parse_args()

    # makeSentences calls string methods on the tokenization path, so stop
    # here with a usage message instead of failing after the corpus is loaded
    if options.tokenizationPath is None:
        optparser.error("No tokenization path defined (-t/--tokenizationPath)")

    makeSentences(input=options.input, tokenizationPath=options.tokenizationPath, output=options.output, removeText=False)
183