from optparse import OptionParser
try:
    import xml.etree.cElementTree as ElementTree
except ImportError:
    import cElementTree as ElementTree
import gzip
import sys
import os
import re
import string
sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/..")
import Utils.ElementTreeUtils as ETUtils
from Utils.ProgressCounter import ProgressCounter
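
# Splits tokens that contain embedded protein/gene names (e.g. splitting
# "actin-binding" around the name "actin"), rewriting one parse and its
# tokenization of an xml-formatted analysis file accordingly.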

# the prefix used for the ids of the newly created split tokens
tokenIdPrefix = "st_"

# the default name of the new split tokenization
splitTokenizationName = "split"

# the default name of the new parse
newParseName = "split_parse"

# the dependency types used to connect the parts of split tokens
splitHyphenDepName = "hyphen"
splitSlashDepName = "slash"
splitParensDepName = "appos"
splitDefaultDepName = "dep"

def addTokenization(tokenization, sentence, sentenceId):
    toks = sentence.find("sentenceanalyses/tokenizations")
    if toks == None:
        toks = sentence.find("analyses")
    assert toks != None, "Missing <tokenizations> in sentence %s" % sentenceId

    # check that the tokenization doesn't already exist
    for t in toks.getiterator("tokenization"):
        if t.get("tokenizer") is not None:
            assert t.get("tokenizer") != tokenization, "Split tokenization '%s' already exists in sentence %s!" % (tokenization, sentenceId)

    # add the new tokenization element
    newTok = ElementTree.SubElement(toks, "tokenization")
    newTok.attrib["tokenizer"] = tokenization

    return newTok

def getTokenization(tokenization, sentence, sentenceId, remove=False):
    analyses = sentence.find("analyses")
    if analyses == None:
        return None
    for t in analyses.findall("tokenization"):
        if t.get("tokenizer") == tokenization:
            if remove:
                analyses.remove(t)
            return t
    return None

def addParse(parse, tokenization, sentence, sentenceId):
    # check that the parse doesn't already exist
    for p in sentence.getiterator("parse"):
        if p.get("parser") is not None:
            assert p.get("parser") != parse, "New parse '%s' already exists in sentence %s!" % (parse, sentenceId)

    newParse = ElementTree.SubElement(sentence.find("analyses"), "parse")
    newParse.attrib["parser"] = parse
    newParse.attrib["tokenizer"] = tokenization
    return newParse

def getParse(parse, tokenization, sentence, sentenceId, remove=False):
    # first try the old-style direct path
    parsePath = "sentenceanalyses/parses/"+parse
    found = sentence.find(parsePath)
    if found is not None:
        return found

    # then look under the parses / analyses elements
    parses = sentence.find("sentenceanalyses/parses")
    if parses == None:
        parses = sentence.find("analyses")
    assert parses is not None, "ERROR: missing parses for sentence %s" % sentenceId

    for p in parses.getiterator("parse"):
        if p.get("parser") == parse:
            assert p.get("tokenizer") == tokenization, "ERROR: tokenization/parse mismatch: parse %s has tokenizer %s, not %s" % (parse, p.get("tokenizer"), tokenization)
            if remove:
                parses.remove(p)
            return p

    return None

class Token:
    def __init__(self, id, origId, pos, charOffset, text):
        self.id = id
        self.origId = origId
        self.pos = pos
        self.charOffset = charOffset
        self.text = text
        # the charOffset of the original token this one was split from
        self.splitFromOffset = None

        # head token and dependency type, filled in by resolveHeads()
        self.head = None
        self.depType = None

    def isPunct(self):
        # True if the token text consists of punctuation characters only
        return [t for t in self.text if t not in string.punctuation] == []
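
# A Token whose text is all punctuation (e.g. "-" or "/") reports
# isPunct() == True; resolveHeads() below skips such parts when
# assigning heads.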

def cutPoints(tokStart, tokEnd, entityOffsets):
    # Minimal reconstruction (the original body was elided): return, in
    # sorted order, the offsets strictly inside the token span
    # [tokStart, tokEnd] at which the token must be cut so that no part
    # crosses an entity boundary.
    cuts = set()
    for start, end in entityOffsets:
        # cut immediately before an entity starting inside the token
        if tokStart < start <= tokEnd:
            cuts.add(start)
        # cut immediately after an entity ending inside the token
        if tokStart <= end < tokEnd:
            cuts.add(end + 1)
    return sorted(cuts)
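
# For example, for a token spanning 8-19 with an entity at 10-15 (both
# ranges inclusive, as built in splitTokens() below), cutPoints(8, 19,
# [(10, 15)]) gives [10, 16]: one cut where the entity starts and one
# just past its end.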

def resolveHeads(splitParts, logFile=None):
    # nothing to resolve unless the token was actually split
    if len(splitParts) < 2:
        return

    for i, tok in enumerate(splitParts):
        # look up the neighboring parts
        prevTok = None
        if i-1 >= 0:
            prevTok = splitParts[i-1]
        nextTok = None
        if i+1 < len(splitParts):
            nextTok = splitParts[i+1]
        nextNextTok = None
        if i+2 < len(splitParts):
            nextNextTok = splitParts[i+2]

        # punctuation parts never get a head of their own
        if tok.isPunct():
            continue

        if (nextTok is not None and nextTok.text in ["-", "/", "("] and
            nextNextTok is not None and not nextNextTok.isPunct()):

            # "A-B": A modifies the following head B
            if nextTok.text == "-":
                tok.head = nextNextTok
                tok.depType = splitHyphenDepName

            # "A/B": B depends on the preceding head A
            elif nextTok.text == "/":
                nextNextTok.head = tok
                nextNextTok.depType = splitSlashDepName

            # "A(B": B depends on the preceding head A
            elif nextTok.text == "(":
                nextNextTok.head = tok
                nextNextTok.depType = splitParensDepName

    # attach any remaining headless parts to the first headless one
    headLess = []
    for tok in splitParts:
        if tok.isPunct():
            continue
        if tok.head is None:
            headLess.append(tok)
    joinedText = " ".join([t.text for t in splitParts])
    if len(headLess) == 0:
        if logFile != None:
            logFile.write("NOTE: no head candidates for " + joinedText + "\n")
    if len(headLess) > 1:
        if logFile != None:
            logFile.write("NOTE: failed to resolve unique \"head\" for " + joinedText + ": " + " ".join([t.text for t in headLess]) + "\n")

    for h in headLess[1:]:
        h.head = headLess[0]
        h.depType = splitDefaultDepName
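
# For example, the parts ["actin", "-", "binding"] resolve so that "actin"
# depends on "binding" via a "hyphen" dependency, while in the hypothetical
# split ["p50", "/", "p65"] the part "p65" depends on "p50" via "slash".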

def splitTokens(tokenization, sentence, logFile=None):
    # include the original id (if any) in the sentence id used for logging
    sentenceId = sentence.get("id")
    if sentence.get("origId") != None:
        sentenceId += "/" + sentence.get("origId")
    splitTokens = []

    # collect the character offsets of the name entities in the sentence
    entityOffsets = []
    for entity in sentence.getiterator("entity"):
        if entity.get("isName") == "False":
            continue
        offsets = entity.get("charOffset")
        assert offsets is not None, "Missing charOffset!"

        for offset in offsets.split(","):
            m = re.match(r'^(\d+)-(\d+)$', offset)
            assert m, "Failed to parse charOffset '%s'" % offset

            # store the offsets as inclusive (start, end) pairs
            start, end = int(m.group(1)), int(m.group(2)) - 1
            entityOffsets.append((start, end))

    seqId = 0
    nextId = "%s%d" % (tokenIdPrefix, seqId)

    for token in tokenization.getiterator("token"):
        text = token.get("text")
        origId = token.get("id")
        POS = token.get("POS")
        off = token.get("charOffset")

        m = re.match(r'^(\d+)-(\d+)$', off)
        assert m, "Failed to parse token charOffset '%s'" % off
        tokStart, tokEnd = int(m.group(1)), int(m.group(2)) - 1

        # find the offsets at which the token crosses entity boundaries
        cuts = cutPoints(tokStart, tokEnd, entityOffsets)

        # additionally cut around single "-" and "/" characters adjacent
        # to a cut point, so that the separator becomes a part of its own
        newCuts = set(cuts)
        for cut in cuts:
            cutOffset = cut - tokStart
            firstPart, lastPart = text[:cutOffset], text[cutOffset:]

            if (lastPart[0] in ["-", "/"] and
                len(lastPart) >= 2 and lastPart[1] not in string.punctuation):
                newCuts.add(cut+1)

            if (firstPart[-1] in ["-", "/"] and
                len(firstPart) >= 2 and firstPart[-2] not in string.punctuation):
                newCuts.add(cut-1)

        cuts = sorted(list(newCuts))

        # split the token text at the cut offsets
        parts = []
        startOffset = 0
        for cut in cuts:
            cutOffset = cut - tokStart
            parts.append(text[startOffset:cutOffset])
            startOffset = cutOffset
        parts.append(text[startOffset:])

        if len(parts) > 1:
            if logFile != None:
                logFile.write("Token %s in sentence %s: cut '%s' into %d parts: " % (origId, sentenceId, text, len(parts)) + " ".join(["'%s'" % p for p in parts]) + "\n")

        # sanity check
        assert text == "".join(parts), "INTERNAL ERROR: token parts don't add up to original!"

        # create a Token for each of the parts
        currentOffset = tokStart
        splitParts = []
        for part in parts:
            tOff = "%d-%d" % (currentOffset, currentOffset + len(part))

            t = Token(nextId, origId, POS, tOff, part)
            t.splitFromOffset = off
            splitParts.append(t)
            splitTokens.append(t)

            currentOffset += len(part)
            seqId += 1
            nextId = "%s%d" % (tokenIdPrefix, seqId)

        # determine the heads and dependency types connecting the parts
        resolveHeads(splitParts, logFile)

    return splitTokens
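
# For example (values illustrative only), a token "actin-binding" whose
# "actin" span is covered by an entity is cut into the parts "actin", "-"
# and "binding", which become new tokens with ids st_0, st_1, st_2, with
# "actin" headed by "binding" via a "hyphen" dependency.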

def addTokensToTree(tokens, element):
    for t in tokens:
        newToken = ElementTree.SubElement(element, "token")
        newToken.set("id", t.id)
        newToken.set("text", t.text)
        newToken.set("POS", t.pos)
        newToken.set("charOffset", t.charOffset)
        # record where split tokens came from
        if t.splitFromOffset != None and t.splitFromOffset != t.charOffset:
            newToken.set("splitFrom", t.splitFromOffset)
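
# The added tokens then look like, e.g. (attribute values illustrative only):
#   <token id="st_0" text="actin" POS="NN" charOffset="0-5" splitFrom="0-13"/>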


def mainFunc(input, output=None, parseName="McCC", tokenizationName=None, newParseName=None, newTokenizationName=None, logFileName=None, removeOld=True):
    print >> sys.stderr, "Protein Name Splitter"
    if logFileName != None:
        print >> sys.stderr, "Writing log to", logFileName
        logFile = open(logFileName, "wt")
    else:
        logFile = None

    # load the input XML
    tree = ETUtils.ETFromObj(input)

    # by default, the tokenization is assumed to have the same name as the parse
    if tokenizationName == None:
        tokenizationName = parseName

    root = tree.getroot()

    sentences = [x for x in root.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "Split Protein Names")
    counter.showMilliseconds = True
    missingTokCount = 0
    for sentence in sentences:
        sId = sentence.get("id")
        counter.update(1, "Splitting names ("+sId+"): ")

        # skip sentences that lack the tokenization to modify
        tok = getTokenization(tokenizationName, sentence, sId, remove=removeOld)
        if tok == None:
            missingTokCount += 1
            continue

        parse = getParse(parseName, tokenizationName, sentence, sId, remove=removeOld)
        assert parse is not None, "Missing parse '%s' in sentence %s!" % (parseName, sId)

        split = splitTokens(tok, sentence, logFile)

        # name the new tokenization and parse: reuse the old names when the
        # originals are removed, otherwise prefix them with "split-"
        if removeOld:
            if newTokenizationName == None:
                newTokenizationName = tok.get("tokenizer")
            if newParseName == None:
                newParseName = parse.get("parser")
        else:
            if newTokenizationName == None:
                newTokenizationName = "split-" + tok.get("tokenizer")
            if newParseName == None:
                newParseName = "split-" + parse.get("parser")

        # add the split tokenization, copying over any attributes of the
        # old one that the new one doesn't define
        splittok = addTokenization(newTokenizationName, sentence, sId)
        addTokensToTree(split, splittok)
        for a in tok.attrib:
            if splittok.get(a) == None:
                splittok.set(a, tok.get(a))

        # map each original token id to the id of the split token that
        # should take over its dependencies
        tokenIdMap = {}
        for t in split:
            if t.head:
                # follow the head chain to its root
                head = t.head
                while head.head is not None:
                    assert head.head != t, "Cyclic heads"
                    head = head.head
                tokenIdMap[t.origId] = head.id
            else:
                # map headless parts to themselves, preferring
                # non-punctuation parts over punctuation ones
                if t.origId not in tokenIdMap or not t.isPunct():
                    tokenIdMap[t.origId] = t.id

        # add the new parse, copying over attributes of the old one
        newparse = addParse(newParseName, newTokenizationName, sentence, sId)
        for a in parse.attrib:
            if newparse.get(a) == None:
                newparse.set(a, parse.get(a))
        newparse.set("ProteinNameSplitter", "True")
        splittok.set("ProteinNameSplitter", "True")

        # copy the dependencies of the old parse, remapping the token ids
        depSeqId = 0
        for d in parse.getiterator("dependency"):
            t1, t2, dType = d.get("t1"), d.get("t2"), d.get("type")
            assert t1 in tokenIdMap and t2 in tokenIdMap, "INTERNAL ERROR"

            dep = ElementTree.SubElement(newparse, "dependency")
            dep.set("t1", tokenIdMap[t1])
            dep.set("t2", tokenIdMap[t2])
            dep.set("type", dType)
            dep.set("id", "sd_%d" % depSeqId)
            depSeqId += 1

        # add the dependencies connecting the parts of the split tokens
        for t in [tok for tok in split if tok.head is not None]:
            dep = ElementTree.SubElement(newparse, "dependency")
            dep.set("t1", t.head.id)
            dep.set("t2", t.id)
            dep.set("type", t.depType)
            dep.set("split", "PNS")
            dep.set("id", "spd_%d" % depSeqId)
            depSeqId += 1

        # carry over the phrases of the old parse
        for phrase in parse.getiterator("phrase"):
            newparse.append(phrase)

    print >> sys.stderr, "Tokenization missing from", missingTokCount, "sentences"

    if logFile != None:
        logFile.close()

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(tree, output)
    return tree


if __name__=="__main__":
    optParser = OptionParser(usage="%prog [OPTIONS]\nModifies one parse and associated tokenization to split (some) hyphenated\nwords, e.g. \"actin-binding\".")
    optParser.add_option("-f", "--analysisFile", dest="file", metavar="FILE", default=None, help="Path to the xml-formatted analysis file")
    optParser.add_option("-o", "--output", dest="output", metavar="FILE", default=None, help="Path to the output file")
    optParser.add_option("-p", "--parse", dest="parse", default=None, help="Name of the parse to modify")
    optParser.add_option("-t", "--tokenization", dest="tokenization", default=None, help="Name of the tokenization to modify")
    optParser.add_option("-s", "--splittokenization", dest="splittokenization", default=splitTokenizationName, help="Name of the new split tokenization to create")
    optParser.add_option("-n", "--newparse", dest="newparse", default=newParseName, help="Name of the new parse to create")
    optParser.add_option("-l", "--logFile", dest="logFileName", default=None, help="Log file for the splitter messages")
    (options, args) = optParser.parse_args()

    if (options.file is None or options.parse is None or
        options.tokenization is None):
        print >> sys.stderr, "The -f, -p and -t options are mandatory."
        optParser.print_help()
        sys.exit(1)

    # pass the new names as keyword arguments: in mainFunc's signature
    # newParseName precedes newTokenizationName, so positional passing
    # would swap them
    mainFunc(options.file, options.output, options.parse, options.tokenization,
             newParseName=options.newparse,
             newTokenizationName=options.splittokenization,
             logFileName=options.logFileName)
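
# Example invocation (file and script names hypothetical; "McCC" is the
# default parse name):
#   python ProteinNameSplitter.py -f corpus.xml -o corpus-split.xml \
#       -p McCC -t McCC -l split.log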