1 import sys, os, codecs, time, signal
2 sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/..")
3 from Utils.ProgressCounter import ProgressCounter
4
5 try:
6 import xml.etree.cElementTree as ET
7 except ImportError:
8 import cElementTree as ET
9
11 """
12 Killing a process spawned by a shell is not really possible (at least in Python).
13 This becomes a problem if a tool requires multiple (e.g. piped) processes to be
14 run. With ProcessWrapper, all processes can be called directly from Python so
15 that their ids are known and they can be killed if they hang. A ProcessWrapper can
16 be passed as a parameter to ProcessUtils functions in place of a subprocess.Popen
17 object.
18 """
20 self.processes = processes
21
37
38
40 """
41 If any subprocess is running, returns None (not finished).
42 """
43 for process in self.processes:
44 if process.poll() == None:
45 return None
46 return "FINISHED"
47
def waitForProcess(process, numCorpusSentences, measureByGap, outputFile, counterName, updateMessage, timeout=None):
    """
    Waits for a process to finish, and tracks the number of sentences it writes
    to its output file. If no new sentence appears for longer than the timeout,
    the process is considered stalled and is killed.

    Parameters:
    process -- object with poll() and kill() methods; poll() must return None
        for as long as the process is still running
    numCorpusSentences -- expected number of sentences; used as the total of
        the progress counter and returned unchanged
    measureByGap -- if true, only empty lines are counted as sentences,
        otherwise every line is counted
    outputFile -- (path, keywordArguments) pair; the keyword arguments are
        passed on to codecs.open when the output file is read
    counterName -- name given to the ProgressCounter
    updateMessage -- prefix of the message shown on each progress update
    timeout -- maximum number of seconds without progress before the process
        is killed, or None to never kill it

    Returns the tuple (numSentences, numCorpusSentences), where numSentences is
    the count from the last read of the output file (0 if it never existed).
    """
    # For this many seconds after the start the stall timer is always reset,
    # even if the output file exists but nothing has been written to it yet.
    maxStartupTime = 600
    counter = ProgressCounter(numCorpusSentences, counterName)
    counter.showMilliseconds = True
    # Defined before the loop so that the return value exists even when the
    # process exits without ever creating the output file.
    numSentences = 0
    prevNumSentences = 0  # sentence count seen on the previous pass
    finalCheckLeft = True  # one extra pass is made after the process has exited
    processStatus = None  # result of poll(); None means "still running"
    prevTime = time.time()
    startTime = time.time()

    while processStatus is None or finalCheckLeft:
        if processStatus is not None:
            # The process has exited: this is the single final pass.
            finalCheckLeft = False
        if os.path.exists(outputFile[0]):
            # Recount the sentences currently in the output file.
            # Mode "r" instead of "rt": codecs.open appends "b" whenever an
            # encoding is given, and Python 3 rejects the resulting "rtb".
            numSentences = 0
            with codecs.open(outputFile[0], "r", **outputFile[1]) as f:
                for line in f:
                    if not measureByGap or line.strip() == "":
                        numSentences += 1

            progress = numSentences - prevNumSentences
            if progress != 0:
                counter.update(progress, updateMessage + ": ")
            if finalCheckLeft:
                if time.time() - startTime < maxStartupTime or progress != 0:
                    # Still starting up, or progress was made: restart the stall timer.
                    prevTime = time.time()
                else:
                    elapsedTime = time.time() - prevTime
                    if timeout is not None and elapsedTime > timeout:
                        sys.stderr.write("Process timed out (" + str(elapsedTime) + " vs. " + str(timeout) + ")\n")
                        sys.stderr.write("Killing process\n")
                        # The loop keeps running until poll() reports the exit.
                        process.kill()
                prevNumSentences = numSentences
        else:
            # No output file yet: do not let the stall timer run.
            prevTime = time.time()
        if finalCheckLeft:
            # Sleep on every normal pass, including while the output file is
            # still missing, so that waiting never turns into a busy loop.
            time.sleep(1)
        processStatus = process.poll()

    counter.markFinished()
    return (numSentences, numCorpusSentences)

99
101 """
102 Make a subset of the input data from "fromLine" to end of input file.
103 """
104 newInput = os.path.join(workdir, "input-from-" + str(fromLine))
105 newInputFile = codecs.open(newInput, "wt", "utf-8")
106
107 inputFile = codecs.open(input, "rt", "utf-8")
108 lineCount = -1
109 for line in inputFile:
110 lineCount += 1
111 if lineCount < fromLine:
112 continue
113 newInputFile.write(line)
114 inputFile.close()
115 newInputFile.close()
116 return newInput
117
def mergeOutput(dir, numCorpusSentences, measureByGap, outputArgs=None):
    """
    Merge output files (multiple files may have been created if program failed on a sentence)

    Every file in the directory whose name contains "output-from" is taken as
    a partial output; the integer after the last "-" of its name is the index
    of its first sentence. The files are concatenated in order of that index
    into the file "merged-output" in the same directory. Wherever sentences
    are missing (before the start of the next partial output, or before the
    end of the corpus) one empty line is written per missing sentence.

    Parameters:
    dir -- directory containing the partial outputs; "merged-output" is
        written here
    numCorpusSentences -- total number of sentences the merged output is
        padded up to
    measureByGap -- if true, only empty lines are counted as sentences,
        otherwise every line is counted
    outputArgs -- keyword arguments passed on to codecs.open for both reading
        and writing (None means no extra arguments)

    Returns the number of empty lines that were written for missing sentences.
    """
    if outputArgs is None:
        outputArgs = {}

    # (index of first sentence, filename) pairs in sentence order
    outputs = sorted(
        (int(filename.rsplit("-", 1)[-1]), filename)
        for filename in os.listdir(dir)
        if "output-from" in filename
    )

    missingSentences = 0
    numSentences = 0
    # Modes "r"/"w" instead of "rt"/"wt": codecs.open appends "b" whenever an
    # encoding is given, and Python 3 rejects a mode such as "wtb".
    with codecs.open(os.path.join(dir, "merged-output"), "w", **outputArgs) as mergedOutput:
        for index, (_, filename) in enumerate(outputs):
            with codecs.open(os.path.join(dir, filename), "r", **outputArgs) as f:
                for line in f:
                    mergedOutput.write(line)
                    if not measureByGap or line.strip() == "":
                        numSentences += 1
            if index + 1 < len(outputs):
                # Fill the gap up to the first sentence of the next partial output.
                padding = outputs[index + 1][0] - numSentences
                if padding > 0:
                    mergedOutput.write("\n" * padding)
                    numSentences += padding
                    missingSentences += padding
        # Fill up to the end of the corpus. This is done after the loop so that
        # it also happens when there are no partial outputs at all; in that case
        # every sentence is missing.
        padding = numCorpusSentences - numSentences
        if padding > 0:
            mergedOutput.write("\n" * padding)
            missingSentences += padding
    return missingSentences

158
160 """
161 Return the sentence count to which this process reached by counting
162 the sentences in the output file.
163 """
164 if subsetFileName.find("-from-") == -1:
165 return 0
166 numSentences = getLines(subsetFileName, measureByGap)
167 subsetPos = int(subsetFileName.rsplit("-", 1)[-1])
168 return subsetPos + numSentences
169
171 """
172 Number of sentences in the file, measured either in lines, or by empty "gap" lines
173 """
174 numSentences = 0
175 f = codecs.open(filename, "rt", "utf-8")
176 for line in f:
177 if measureByGap:
178 if line.strip() == "":
179 numSentences += 1
180 else:
181 numSentences += 1
182 f.close()
183 return numSentences
184
def runSentenceProcess(launchProcess, programDir, input, workdir, measureByGap, counterName, updateMessage, timeout=None, processArgs=None, outputArgs=None):
    """
    Runs a process on input sentences, and in case of problems skips one sentence and
    reruns the process on the remaining ones.

    Parameters:
    launchProcess -- callable invoked as launchProcess(input, output, **processArgs);
        its return value is handed to waitForProcess as the process to wait for
    programDir -- directory that is made the current working directory while
        the process runs
    input -- input file, read as UTF-8, one sentence per line
    workdir -- directory in which the "output-from-N" files and the final
        "merged-output" file are written
    measureByGap -- passed on to waitForProcess, getSubsetEndPos and mergeOutput
    counterName, updateMessage, timeout -- passed on to waitForProcess
    processArgs -- extra keyword arguments for launchProcess (None means none)
    outputArgs -- keyword arguments used for opening the output files; an
        "encoding" of "utf-8" is added if none is given. The caller's
        dictionary is not modified.

    Returns the absolute path of the merged output file.
    """
    def countLines(path):
        # Mode "r" instead of "rt": codecs.open appends "b" whenever an
        # encoding is given, and Python 3 rejects the resulting "rtb".
        with codecs.open(path, "r", "utf-8") as f:
            return sum(1 for _ in f)

    # Work on private copies so that neither the caller's dictionaries nor a
    # shared default value are ever modified.
    processArgs = dict(processArgs or {})
    outputArgs = dict(outputArgs or {})
    outputArgs.setdefault("encoding", "utf-8")

    # Resolve both paths before changing the working directory, so that they
    # refer to the same location during the run, in mergeOutput and in the
    # returned path.
    input = os.path.abspath(input)
    workdir = os.path.abspath(workdir)
    origInput = input
    numCorpusSentences = countLines(input)

    cwd = os.getcwd()
    os.chdir(programDir)
    try:
        startLine = 0
        while True:
            inputLines = countLines(input)
            output = os.path.join(workdir, "output-from-" + str(startLine))
            process = launchProcess(input, output, **processArgs)
            result = waitForProcess(process, inputLines, measureByGap, (output, outputArgs), counterName, updateMessage, timeout)
            if result[0] == result[1]:
                # Everything in the current input was processed.
                break
            # The process stopped early: skip the sentence it stopped at.
            gap = 1
            startLine = getSubsetEndPos(output, measureByGap) + gap
            if startLine >= numCorpusSentences:
                break
            sys.stderr.write("Process failed for sentence " + str(startLine - gap) + ", rerunning from sentence " + str(startLine) + "\n")
            input = makeSubset(origInput, workdir, startLine)
    finally:
        # Restore the working directory even if launching or waiting failed.
        os.chdir(cwd)

    numMissedSentences = mergeOutput(workdir, numCorpusSentences, measureByGap, outputArgs=outputArgs)
    if numMissedSentences == 0:
        sys.stderr.write("Processed succesfully all sentences\n")
    else:
        sys.stderr.write("Warning, processing failed for " + str(numMissedSentences) + " out of " + str(numCorpusSentences) + " sentences\n")
    return os.path.abspath(os.path.join(workdir, "merged-output"))

235
237 index = 0
238 for e in parent:
239 if e == element:
240 return index
241 index += 1
242 return -1
243
245 index = 0
246 elemIndex = -1
247 for element in parent:
248 if element.tag == eTag:
249 elemIndex = index
250 index += 1
251 return elemIndex
252
254 for element in parent.getiterator():
255 if element.tag == tag:
256 found = True
257 for k, v in attDict.iteritems():
258 if element.get(k) != v:
259 found = False
260 if found:
261 return element
262 return None
263
265 element = parent.find(name)
266 if element == None:
267 element = ET.Element(name)
268 parent.append(element)
269 return element
270