Source Code for Module TEES.Utils.ProteinNameSplitter

from optparse import OptionParser
try:
    import xml.etree.cElementTree as ElementTree
except ImportError:
    import cElementTree as ElementTree
import gzip
import sys
import os
import re
import string
sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/..")
import Utils.ElementTreeUtils as ETUtils
from Utils.ProgressCounter import ProgressCounter

# the prefix to use for split token ids
tokenIdPrefix = "st_"

# the default name of the new tokenization
splitTokenizationName = "split"

# the default name of the new parse
newParseName = "split_parse"

# the special dependency types to connect split tokens with
splitHyphenDepName  = "hyphen"
splitSlashDepName   = "slash"
splitParensDepName  = "appos"
splitDefaultDepName = "dep"

# returns a cElementTree element corresponding to a new tokenization
# in the given sentence element.
def addTokenization(tokenization, sentence, sentenceId):
    toks = sentence.find("sentenceanalyses/tokenizations")
    if toks == None:
        toks = sentence.find("analyses")
    assert toks != None, "Missing <tokenizations> in sentence %s" % sentenceId

    # # assume new-style if there's at least one <tokenization> with
    # # a "tokenizer" attribute. Also check duplicates.
    # isNew = False
    for t in toks.getiterator("tokenization"):
        if t.get("tokenizer") is not None:
            assert t.get("tokenizer") != tokenization, "Split tokenization '%s' already exists in sentence %s!" % (tokenization, sentenceId)
            # isNew = True

    # add the tokenization.
    # if isNew:
    newTok = ElementTree.SubElement(toks, "tokenization")
    newTok.attrib["tokenizer"] = tokenization
    # else:
    #     assert toks.find(tokenization) is None, "Split tokenization '%s' already exists in sentence %s!" % (tokenization, sentenceId)
    #     newTok = ElementTree.SubElement(toks, tokenization)

    return newTok

# returns a cElementTree element corresponding to the given tokenization
# in the given sentence element.
def getTokenization(tokenization, sentence, sentenceId, remove=False):
    analyses = sentence.find("analyses")
    if analyses == None:
        return None
    for t in analyses.findall("tokenization"):
        if t.get("tokenizer") == tokenization:
            if remove:
                analyses.remove(t)
            return t
    return None

# returns a cElementTree element corresponding to a new parse in the
# given sentence element.
def addParse(parse, tokenization, sentence, sentenceId):
    for p in sentence.getiterator("parse"):
        if p.get("parser") is not None:
            assert p.get("parser") != parse, "New parse '%s' already exists in sentence %s!" % (parse, sentenceId)

    newParse = ElementTree.SubElement(sentence.find("analyses"), "parse")
    newParse.attrib["parser"] = parse
    newParse.attrib["tokenizer"] = tokenization
    return newParse

# returns a cElementTree element corresponding to the given parse
# in the given sentence element. Also checks that the parse is created
# for the given tokenization.
def getParse(parse, tokenization, sentence, sentenceId, remove=False):
    # first try old-style format, then new.
    parsePath = "sentenceanalyses/parses/"+parse
    found = sentence.find(parsePath)

    if found is not None:
        return found

    # then try new-style
    parses = sentence.find("sentenceanalyses/parses")
    if parses == None:
        parses = sentence.find("analyses")
    assert parses is not None, "ERROR: missing parses for sentence %s" % sentenceId

    for p in parses.getiterator("parse"):
        if p.get("parser") == parse:
            assert p.get("tokenizer") == tokenization, "ERROR: tokenization/parse mismatch: parse %s has tokenizer %s, not %s" % (parse, p.get("tokenizer"), tokenization)
            if remove:
                parses.remove(p)
            return p

    return None

# represents a token in the analysis XML.
class Token:
    def __init__(self, id, origId, pos, charOffset, text):
        self.id = id
        self.origId = origId
        self.pos = pos
        self.charOffset = charOffset
        self.text = text
        self.splitFromOffset = None

        # these oddities are used in re-connecting split tokens
        self.head = None
        self.depType = None

    def isPunct(self):
        return [t for t in self.text if t not in string.punctuation] == []

# given token start and end offsets and a list of (start,end) spans
# of entities, returns a list of the points at which the token would
# need to be "cut" to give parts that divide cleanly into entities.
def cutPoints(tokStart, tokEnd, entityOffsets):
    cutPoints = set()

    for start, end in entityOffsets:
        if start > tokStart and start <= tokEnd:
            # must be cut at the start of the entity
            cutPoints.add(start)

        if end >= tokStart and end < tokEnd:
            # must be cut after the end of the entity
            cutPoints.add(end+1)

    # "postprocess" the proposed cuts to remove all instances where a
    # cut would break an entity. This is to protect against e.g.
    # "H2A" in "H2A and 2B" from being cut (rather meaninglessly) into
    # "H" and "2A" to match annotated entities "H2A" and "H2B".
    for cut in cutPoints.copy():
        for start, end in entityOffsets:
            if cut > start and cut <= end:
                try:
                    cutPoints.remove(cut)
                except KeyError:
                    print >> sys.stderr, "!"

    return sorted(list(cutPoints))
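
# Illustrative sketch, not part of the original module: with inclusive
# (start,end) character spans, a token "H2A/H2B" covering characters 0-6
# and entities "H2A" at (0,2) and "H2B" at (4,6) needs cuts on both sides
# of the slash:
#
#     cutPoints(0, 6, [(0, 2), (4, 6)])   # returns [3, 4]
#
# A proposed cut that would land inside an entity span is discarded by the
# postprocessing loop above, which is what keeps e.g. "H2A" from being cut
# into "H" + "2A".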

# heuristically determines which of the given parts of what was originally
# a single token should be considered the "head" of the split token parts.
# Sets Token.head and Token.depType.
def resolveHeads(splitParts, logFile=None):
    # if there's only one part, there's nothing to resolve
    if len(splitParts) < 2:
        return

    # since tokens may be split at multiple places for various
    # reasons, start by first marking "head" locally, determining
    # for each split which of the tokens is the head. This will
    # then be further resolved transitively.
    for i, tok in enumerate(splitParts):
        # may need to refer to these
        prevTok = None
        if i-1 >= 0:
            prevTok = splitParts[i-1]
        nextTok = None
        if i+1 < len(splitParts):
            nextTok = splitParts[i+1]
        nextNextTok = None
        if i+2 < len(splitParts):
            nextNextTok = splitParts[i+2]

        # ignore all-punctuation tokens
        if tok.isPunct():
            continue

        # not a good idea --- these may resolve other heads in turn.
        # # ignore tokens for which the head has been already
        # # determined
        # if tok.head is not None:
        #     assert tok.depType is not None
        #     continue

        # if the next token is a hyphen or slash (etc.) and the one after
        # it is not punctuation, we can resolve this bit.
        if (nextTok is not None and nextTok.text in ["-", "/", "("] and
            nextNextTok is not None and not nextNextTok.isPunct()):
            # for the hyphen case, the latter non-punct token is
            # the head
            if nextTok.text == "-":
                tok.head = nextNextTok
                tok.depType = splitHyphenDepName

            # for slashes, the preceding token is assumed the head
            elif nextTok.text == "/":
                nextNextTok.head = tok
                nextNextTok.depType = splitSlashDepName

            # same for parens
            elif nextTok.text == "(":
                nextNextTok.head = tok
                nextNextTok.depType = splitParensDepName

    # if all but one non-punctuation token have a head, all is OK
    headLess = []
    for tok in splitParts:
        if tok.isPunct():
            continue
        if tok.head is None:
            headLess.append(tok)
    joinedText = " ".join([t.text for t in splitParts])
    if len(headLess) == 0:
        if logFile != None:
            logFile.write("NOTE: no head candidates for " + joinedText + "\n")
    if len(headLess) > 1:
        if logFile != None:
            logFile.write("NOTE: failed to resolve unique \"head\" for " + joinedText + ": " + " ".join([t.text for t in headLess]) + "\n")
    # assume the first candidate is the head, connect the others there.
    for h in headLess[1:]:
        h.head = headLess[0]
        h.depType = splitDefaultDepName
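
# Illustrative sketch, not part of the original module (token ids and offsets
# are made up): for the parts "actin", "-", "binding" the hyphen rule marks
# "binding" as the head of "actin" with the dependency type "hyphen", while
# for "p50", "/", "p65" the slash rule marks "p50" as the head of "p65" with
# the type "slash".
#
#     parts = [Token("st_0", "t0", "NN", "0-5", "actin"),
#              Token("st_1", "t0", "NN", "5-6", "-"),
#              Token("st_2", "t0", "NN", "6-13", "binding")]
#     resolveHeads(parts)
#     # parts[0].head is parts[2]; parts[0].depType == "hyphen"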

# splits the <token>s in the given tokenization, attempting to split them
# so that each entity has its own token. Returns a list of Token objects
# representing the new split ones.
def splitTokens(tokenization, sentence, logFile=None):
    # store the tokens for the new split tokenization here
    sentenceId = sentence.get("id")
    if sentence.get("origId") != None:
        sentenceId += "/" + sentence.get("origId")
    splitTokens = []

    # get the character offsets of entities, and turn them into a list
    # of (from,to) tuples.
    entityOffsets = []
    for entity in sentence.getiterator("entity"):
        if entity.get("isName") != None and entity.get("isName") == "False":
            continue
        offsets = entity.get("charOffset")
        assert offsets is not None, "Missing charOffset!"
        # format is "NUM-NUM(,NUM-NUM)+". split by commas, parse ranges
        for offset in offsets.split(","):
            m = re.match(r'^(\d+)-(\d+)$', offset)
            assert m, "Failed to parse charOffset '%s'" % offset
            #start, end = int(m.group(1)), int(m.group(2))
            start, end = int(m.group(1)), int(m.group(2)) - 1
            entityOffsets.append((start,end))

    seqId = 0#1
    nextId = "%s%d" % (tokenIdPrefix, seqId)

    for token in tokenization.getiterator("token"):

        text = token.get("text")
        origId = token.get("id")
        POS = token.get("POS")
        off = token.get("charOffset")

        # parse the token offset
        m = re.match(r'^(\d+)-(\d+)$', off)
        assert m, "Failed to parse token charOffset '%s'" % off
        #tokStart, tokEnd = int(m.group(1)), int(m.group(2))
        tokStart, tokEnd = int(m.group(1)), int(m.group(2)) - 1

        # determine points at which the token must be cut
        cuts = cutPoints(tokStart, tokEnd, entityOffsets)

        # go through the cuts, possibly adding more to further break e.g.
        # "actin" "-binding" into "actin" "-" "binding".
        newCuts = set(cuts)
        for cut in cuts:
            cutOffset = cut - tokStart
            firstPart, lastPart = text[:cutOffset], text[cutOffset:]

            # extra cut immediately after a cut followed by a hyphen,
            # slash etc. that precedes a non-punctuation character.
            if (lastPart[0] in ["-", "/"] and
                len(lastPart) >= 2 and lastPart[1] not in string.punctuation):
                newCuts.add(cut+1)

            # same in reverse (sort of).
            if (firstPart[-1] in ["-", "/"] and
                len(firstPart) >= 2 and firstPart[-2] not in string.punctuation):
                newCuts.add(cut-1)

        cuts = sorted(list(newCuts))

        parts = []
        startOffset = 0
        for cut in cuts:
            cutOffset = cut - tokStart
            parts.append(text[startOffset:cutOffset])
            startOffset = cutOffset
        parts.append(text[startOffset:])

        if len(parts) > 1:
            # debug
            if logFile != None:
                logFile.write("Token %s in sentence %s: cut '%s' into %d parts: " % (origId, sentenceId, text, len(parts)) + " ".join(["'%s'" % p for p in parts]) + "\n")
            #print >> sys.stderr, "Token %s in sentence %s: cut '%s' into %d parts:" % (origId, sentenceId, text, len(parts)), " ".join(["'%s'" % p for p in parts])
            pass

        # sanity check
        assert text == "".join(parts), "INTERNAL ERROR: token parts don't add up to original!"


        # create a token for each part. For now, don't assign the
        # "head"; this will be determined later.
        currentOffset = tokStart
        splitParts = []
        for part in parts:
            #tOff = "%d-%d" % (currentOffset, currentOffset + len(part)-1)
            tOff = "%d-%d" % (currentOffset, currentOffset + len(part))

            t = Token(nextId, origId, POS, tOff, part)
            t.splitFromOffset = off
            splitParts.append(t)
            splitTokens.append(t)

            currentOffset += len(part)
            seqId += 1
            nextId = "%s%d" % (tokenIdPrefix, seqId)


        resolveHeads(splitParts, logFile)

    return splitTokens
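
# Illustrative sketch, not part of the original module (element names follow
# the Interaction XML layout the code above expects; ids, offsets and the POS
# tag are made up). A hyphenated token is split around an annotated entity;
# note that charOffset end positions are exclusive here and are converted to
# inclusive form inside splitTokens:
#
#     sentence = ElementTree.fromstring(
#         '<sentence id="s0">'
#         '<entity id="s0.e0" isName="True" charOffset="0-5" text="actin"/>'
#         '</sentence>')
#     tokenization = ElementTree.fromstring(
#         '<tokenization tokenizer="McCC">'
#         '<token id="t0" POS="NN" charOffset="0-13" text="actin-binding"/>'
#         '</tokenization>')
#     parts = splitTokens(tokenization, sentence)
#     print [(t.id, t.text, t.charOffset) for t in parts]
#     # [('st_0', 'actin', '0-5'), ('st_1', '-', '5-6'), ('st_2', 'binding', '6-13')]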

# writes the given Tokens as <token>s into the given ElementTree element.
def addTokensToTree(tokens, element):
    for t in tokens:
        newToken = ElementTree.SubElement(element, "token")
        newToken.set("id", t.id)
        newToken.set("text", t.text)
        newToken.set("POS", t.pos)
        newToken.set("charOffset", t.charOffset)
        if t.splitFromOffset != None and t.splitFromOffset != t.charOffset:
            newToken.set("splitFrom", t.splitFromOffset)
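
# Illustrative sketch, not part of the original module (attribute values are
# made up): for the "actin" part of a split "actin-binding" token, the
# element written here would serialize roughly as
#
#     <token id="st_0" text="actin" POS="NN" charOffset="0-5" splitFrom="0-13"/>
#
# splitFrom records the charOffset of the original, unsplit token and is only
# added when it differs from the part's own charOffset.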

#def indent(elem, level=0):
#    """Stolen from Antti's code stolen from Jari's code"""
#    i = "\n" + level*"  "
#    if len(elem):
#        if not elem.text or not elem.text.strip():
#            elem.text = i + "  "
#        for e in elem:
#            indent(e, level+1)
#            if not e.tail or not e.tail.strip():
#                e.tail = i
#    if level and (not elem.tail or not elem.tail.strip()):
#        elem.tail = i

def mainFunc(input, output=None, parseName="McCC", tokenizationName=None, newParseName=None, newTokenizationName=None, logFileName=None, removeOld=True):
    print >> sys.stderr, "Protein Name Splitter"
    if logFileName != None:
        print >> sys.stderr, "Writing log to", logFileName
        logFile = open(logFileName, "wt")
    else:
        logFile = None
    #if input.endswith(".gz"):
    #    inFile = gzip.GzipFile(input)
    #else:
    #    inFile = open(input)
    tree = ETUtils.ETFromObj(input)

    if tokenizationName == None:
        tokenizationName = parseName

    #tree = ElementTree.parse(inFile)
    root = tree.getroot()

    sentences = [x for x in root.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "Split Protein Names")
    counter.showMilliseconds = True
    missingTokCount = 0
    for sentence in sentences:
        sId = sentence.get("id")
        counter.update(1, "Splitting names ("+sId+"): ")

        tok = getTokenization(tokenizationName, sentence, sId, remove=removeOld)
        if tok == None:
            missingTokCount += 1
            continue

        assert tok is not None, "Missing tokenization '%s' in sentence %s!" % (tokenizationName, sId)

        parse = getParse(parseName, tokenizationName, sentence, sId, remove=removeOld)
        assert parse is not None, "Missing parse '%s' in sentence %s!" % (parseName, sId)

        split = splitTokens(tok, sentence, logFile)

        # Default names
        if removeOld:
            if newTokenizationName == None:
                newTokenizationName = tok.get("tokenizer")
            if newParseName == None:
                newParseName = parse.get("parser")
        else:
            if newTokenizationName == None:
                newTokenizationName = "split-" + tok.get("tokenizer")
            if newParseName == None:
                newParseName = "split-" + parse.get("parser")

        # add a new tokenization with the split tokens.
        splittok = addTokenization(newTokenizationName, sentence, sId)
        addTokensToTree(split, splittok)
        for a in tok.attrib:
            if splittok.get(a) == None:
                splittok.set(a, tok.get(a))
        #splittok.set("split-")

        # make a mapping from original to split token ids. Store the
        # head token when given.
        tokenIdMap = {}
        for t in split:
            if t.head:
                head = t.head
                # traverse
                while head.head is not None:
                    assert head.head != t, "Cyclic heads"
                    head = head.head

                # should match (nah, punctuation problems)
                # assert t.origId not in tokenIdMap or tokenIdMap[t.origId] == head.id, "Head conflict"
                tokenIdMap[t.origId] = head.id
            else:
                # only allow overwrite of existing entry if the current token
                # is not punctuation.
                if t.origId not in tokenIdMap or not t.isPunct():
                    tokenIdMap[t.origId] = t.id

        # make a copy of the specified parse that refers to the split tokens
        # instead of the originals.
        newparse = addParse(newParseName, newTokenizationName, sentence, sId)
        for a in parse.attrib:
            if newparse.get(a) == None:
                newparse.set(a, parse.get(a))
        newparse.set("ProteinNameSplitter", "True")
        splittok.set("ProteinNameSplitter", "True")

        depSeqId = 0 #1
        for d in parse.getiterator("dependency"):
            t1, t2, dType = d.get("t1"), d.get("t2"), d.get("type")
            assert t1 in tokenIdMap and t2 in tokenIdMap, "INTERNAL ERROR"

            dep = ElementTree.SubElement(newparse, "dependency")
            dep.set("t1", tokenIdMap[t1])
            dep.set("t2", tokenIdMap[t2])
            dep.set("type", dType)
            dep.set("id", "sd_%d" % depSeqId)
            depSeqId += 1

        # Add in new dependencies between the split parts.
        for t in [tok for tok in split if tok.head is not None]:
            dep = ElementTree.SubElement(newparse, "dependency")
            dep.set("t1", t.head.id)
            dep.set("t2", t.id)
            dep.set("type", t.depType)
            dep.set("split", "PNS")
            dep.set("id", "spd_%d" % depSeqId)
            depSeqId += 1

        for phrase in parse.getiterator("phrase"):
            newparse.append(phrase)

        # debugging
        #print >> sys.stderr, "NEW DEP IN", sId

    print >> sys.stderr, "Tokenization missing from", missingTokCount, "sentences"

    #indent(root)
    if logFile != None:
        logFile.close()

    # debugging
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(tree, output)
    return tree
    #else:
    #    tree.write(options.output)

if __name__=="__main__":
    optParser = OptionParser(usage="%prog [OPTIONS]\nModifies one parse and associated tokenization to split (some) hyphenated\nwords, e.g. \"actin-binding\".")
    optParser.add_option("-f", "--analysisFile", dest="file", metavar="FILE", default=None, help="Path to the xml-formatted analysis file")
    optParser.add_option("-o", "--output", dest="output", metavar="FILE", default=None, help="Path to the xml-formatted output file")
    optParser.add_option("-p", "--parse", dest="parse", default=None, help="Name of the parse to modify")
    optParser.add_option("-t", "--tokenization", dest="tokenization", default=None, help="Name of the tokenization to modify")
    optParser.add_option("-s", "--splittokenization", dest="splittokenization", default=splitTokenizationName, help="Name of the new split tokenization to create")
    optParser.add_option("-n", "--newparse", dest="newparse", default=newParseName, help="Name of the new parse to create")
    optParser.add_option("-l", "--logFile", dest="logFileName", default=None, help="Log for the splitter messages")
    (options, args) = optParser.parse_args()

    if (options.file is None or options.parse is None or
        options.tokenization is None):
        print >> sys.stderr, "The -f, -p and -t options are mandatory."
        optParser.print_help()
        sys.exit(1)

    mainFunc(options.file, options.output, options.parse, options.tokenization,
             newParseName=options.newparse, newTokenizationName=options.splittokenization,
             logFileName=options.logFileName)
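
# Illustrative usage sketches, not part of the original module; the file
# names are hypothetical.
#
# From Python, split names in a corpus that has a "McCC" parse/tokenization
# and write the modified tree to a new file:
#
#     tree = mainFunc("corpus.xml", output="corpus-split.xml",
#                     parseName="McCC", tokenizationName="McCC",
#                     logFileName="split.log")
#
# From the command line (the -f, -p and -t options are mandatory):
#
#     python ProteinNameSplitter.py -f corpus.xml -o corpus-split.xml \
#         -p McCC -t McCC -l split.log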