Package TEES :: Package Utils :: Package STFormat :: Module STTools
[hide private]

Source Code for Module TEES.Utils.STFormat.STTools

  1  import sys, os 
  2  import codecs 
  3  import Validate 
  4   
  5  #def compareOffsets(a, b): 
  6  #    if a.charBegin != b.charBegin: 
  7  #        if a.charBegin < b.charBegin: 
  8  #            return -1 
  9  #        else: 
 10  #            return 1 
 11  #    else: 
 12  #        if a.charEnd < b.charEnd: 
 13  #            return -1 
 14  #        elif a.charEnd == b.charEnd: 
 15  #            return 0 
 16  #        else: 
 17  #            return 1 
 18  #    return 0  
 19   
class Document:
    """One Shared Task document: its raw text plus all annotation layers."""

    def __init__(self):
        """Create an empty document with nothing loaded yet."""
        # Identification and raw text
        self.id = None
        self.text = None
        # Annotation layers, filled in by the loaders
        for layer in ("proteins", "triggers", "events", "relations"):
            setattr(self, layer, [])
        # Corpus bookkeeping
        self.dataSet = None
        self.license = None
class Annotation:
    """Generic ST-format annotation: a protein/word, trigger, event,
    relation or dependency, distinguished by which fields are filled in."""

    def __init__(self, id = None, type = None, text=None, trigger=None, arguments=None):
        self.id = id                 # e.g. "T1", "E2", "R3"
        self.type = type             # e.g. "Protein", "Binding", "Coref"
        self.text = text             # surface string (text-bound annotations)
        self.charBegin = -1          # character offsets into the document text
        self.charEnd = -1
        self.alternativeOffsets = []
        self.equiv = []              # group of elements that are equivalent
        self.trigger = trigger       # trigger annotation (events)
        # The caller-supplied argument list is aliased, not copied
        self.arguments = [] if arguments == None else arguments
        self.sites = []
        self.speculation = None      # modifier id when the event is speculated
        self.negation = None         # modifier id when the event is negated
        self.fileType = None         # "a1" or "a2", set by the loaders
        # Optional confidence scores
        self.triggerScores = None
        self.unmergingScores = None
        self.speculationScores = None
        self.negationScores = None

    def isNegated(self):
        """True if a Negation modifier points at this event."""
        return self.negation is not None

    def isSpeculative(self):
        """True if a Speculation modifier points at this event."""
        return self.speculation is not None

    def isName(self):
        """True for named-entity types, as opposed to event triggers."""
        return self.type in ("Protein", "Gene")

    def __repr__(self):
        """Short debugging representation showing the annotation id."""
        return "<Ann NO-ID>" if self.id == None else "<Ann " + self.id + ">"
def getStatistics(documents, printStats=True, statSeparator="\n"):
    """Count corpus statistics for a loaded (or loadable) document set.

    documents may be a document list or a path string, in which case
    loadSet is called on it first. Returns a defaultdict(int) with totals,
    per-document histograms, per-event-type counters (plus -spec/-neg for
    modified events and -parent for events nesting other events) and one
    "args-..." counter per argument-combination signature.
    """
    from collections import defaultdict
    import types
    if type(documents) in types.StringTypes:
        documents = loadSet(documents)

    stats = defaultdict(int)
    for doc in documents:
        stats["total-docs"] += 1
        stats["total-events"] += len(doc.events)
        stats["total-relations"] += len(doc.relations)
        stats["total-proteins"] += len(doc.proteins)
        # Histograms: e.g. "doc-events-3" counts documents with 3 events
        stats["doc-events-" + str(len(doc.events))] += 1
        stats["doc-relations-" + str(len(doc.relations))] += 1
        stats["doc-proteins-" + str(len(doc.proteins))] += 1
        for ev in doc.events:
            stats["events-" + ev.type] += 1
            if ev.speculation != None:
                stats["events-" + ev.type + "-spec"] += 1
            if ev.negation != None:
                stats["events-" + ev.type + "-neg"] += 1
            roleCounts = defaultdict(int)
            nestsEvents = False
            for arg in ev.arguments:
                role = arg[0]
                if arg[2] != None: # argument carries a site annotation
                    role += "(" + arg[2].type + ")"
                if not arg[1].isName():
                    nestsEvents = True # target is not a named entity
                roleCounts[role] += 1
            if nestsEvents:
                stats["events-" + ev.type + "-parent"] += 1
            signature = "-".join([str(key) + "_" + str(roleCounts[key]) for key in sorted(roleCounts.keys())])
            stats["args-" + ev.type + "-" + signature] += 1
    if printStats:
        print >> sys.stderr, "Event Statistics:"
        print >> sys.stderr, statSeparator.join([str(key) + ":" + str(stats[key]) for key in sorted(stats.keys())])
    return stats
def readTAnnotation(string, readScores=False):
    """Parse a text-bound annotation line ("T" entity/trigger or "W" word).

    The format is "<id>\t<type> <begin> <end>\t<text>"; when readScores is
    True a fourth tab-separated confidence column is read as well. Returns
    a new Annotation with integer character offsets.
    """
    assert string[0] == "T" or string[0] == "W", string
    columns = string.strip().split("\t")
    ann = Annotation()
    ann.id = columns[0]
    ann.text = columns[2]
    if readScores:
        # Confidence column is only present in ".scores" variants
        ann.triggerScores = columns[3]
    ann.type, begin, end = columns[1].split()
    ann.charBegin = int(begin)
    ann.charEnd = int(end)
    return ann
def readStarAnnotation(string, proteins):
    """Parse a "*"-prefixed Equiv line and cross-link equivalent entities.

    Every member of the Equiv group gets each other member appended to its
    .equiv list (skipping duplicates). Payloads that do not start with
    "Equiv" are silently ignored.
    """
    assert string[0] == "*", string
    star, payload = string.strip().split("\t")
    groups = []
    if payload.find("Equiv") == 0:
        tokens = payload.split(" ")
        assert tokens[0] == "Equiv"
        groups.append(tokens[1:])
    if len(groups) > 0:
        byId = {}
        for entity in proteins:
            byId[entity.id] = entity
        for group in groups:
            for first in group:
                for second in group:
                    if first == second:
                        continue
                    if byId[second] not in byId[first].equiv:
                        byId[first].equiv.append(byId[second])
def readEvent(string, sitesAreArguments=False, readScores=False):
    """Parse an event ("E") line into an Annotation.

    Format: "<id>\t<type>:<triggerId> <role>:<targetId> ...". Each argument
    is stored as a mutable list [role, targetId, siteId, (scores...,)
    siteScore] whose site slots are filled from Site/CSite entries below;
    the id strings are resolved to objects later by loadRelOrA2.
    ann.trigger is left as the trigger's id string (or None). When
    readScores is set, "key=value" confidence columns are read into
    unmergingScores and the per-argument score slots.
    """
    string = string.strip()
    ann = Annotation()
    ann.id, rest = string.split("\t")
    args = rest.split()
    trigger = args[0]
    args = args[1:]
    # The first token is "<type>" or "<type>:<triggerId>", optionally
    # followed by a "key=value" unmerging score column
    splits = trigger.split(":")
    ann.type = splits[0]
    ann.trigger = None
    if len(splits) > 1:
        if "=" not in splits[1]:
            ann.trigger = splits[1]
        elif readScores:
            ann.unmergingScores = splits[1]
    if len(splits) > 2 and readScores:
        assert "=" in splits[2]
        ann.unmergingScores = splits[2]
#    if len(splits) == 2: # (splits, trigger, string)
#        ann.type, ann.trigger = splits[0], splits[1]
#    else:
#        ann.type = splits[0]
#        ann.trigger = None
    argMap = {}
    #print string
    for arg in args:
        argTuple = arg.split(":")
        argScores = []
        if len(argTuple) > 2:
            if readScores:
                argScores = argTuple[2:]
            argTuple = argTuple[:2]
        argTuple += [None] + argScores + [None] # room for the site
        # In the Shared Task Annotation, the word Site can mean a site, or then again not,
        # because the same term Site is used also for a Site that is not a Site, but just
        # a "Site"-type argument for a SiteOf event in the BI-task, which may, or may not
        # (didn't check), have also actual Sites.
        if sitesAreArguments or argTuple[0].find("Site") == -1 or ann.type == "SiteOf": # not a site or SiteOf-type event
            origArgName = argTuple[0]
            if argTuple[0].find("Theme") != -1: # multiple themes are numbered
                argTuple[0] = "Theme" #["Theme", argTuple[1], None]
            assert origArgName != "" # extra whitespace caused errors with splitting, splitting fixed
            argMap[origArgName] = argTuple
            ann.arguments.append( argTuple )
            if "Site" in argTuple[0]:
                # Site kept as a real argument: renamed so it cannot collide
                # with the site slots handled below
                assert argTuple[0] == "Site"
                argTuple[0] = "SiteArg"
    #print argMap
    # Second pass: any token skipped above is a Site/CSite that must be
    # attached to the site slot of its Theme<n>/Cause argument
    if len(argMap.keys()) != len(args): # We have sites
        for arg in args:
            argTuple = arg.split(":")
            if "Site" in argTuple[0]:
                if argTuple[0] == "CSite":
                    target = "Cause"
                else:
                    target = "Theme" + argTuple[0][4:]
                if target not in argMap: # a single theme is not numbered
                    assert "Theme" in target
                    assert "Theme" in argMap
                    target = "Theme"
                argMap[target][2] = argTuple[1]
                if readScores and len(argTuple) > 2:
                    argMap[target][4] = argTuple[2]
    return ann
def readRAnnotation(string, readScores=False):
    """Parse a relation ("R") line into an Annotation.

    Arguments become [role, targetId, None] lists (extended with a score
    column when readScores is set); ids are resolved to objects later by
    loadRelOrA2. A third tab-separated column — Coref relations only —
    lists connected protein ids in brackets, stored as "Connected" args.
    """
    string = string.strip()
    columns = string.split("\t")
    ann = Annotation()
    ann.id = columns[0]
    tokens = columns[1].split()
    ann.type = tokens[0]
    for token in tokens[1:]:
        parts = token.split(":")
        if readScores and len(parts) > 2:
            ann.arguments.append([parts[0], parts[1], None, parts[2], None])
        else:
            ann.arguments.append([parts[0], parts[1], None])
    if len(columns) == 3:
        # Only Coref relations carry a "[T1, T2, ...]" target-protein column
        assert ann.type == "Coref"
        assert columns[2][0] == "[" and columns[2][-1] == "]", (string, columns)
        for protId in columns[2][1:-1].split(","):
            ann.arguments.append(["Connected", protId.strip(), None])
    return ann
def readDependencyAnnotation(string):
    """Parse a syntactic dependency line ("<id> <type> W<x> W<y>").

    Both argument tokens must be word ids (prefix "W"); they are stored as
    ("Word", id) tuples and resolved to word objects later by loadA1.
    """
    string = string.strip()
    depId, depType, firstWord, secondWord = string.split()
    assert firstWord[0] == "W" and secondWord[0] == "W", string
    ann = Annotation()
    ann.id = depId
    ann.type = depType
    ann.arguments = [("Word", firstWord), ("Word", secondWord)]
    return ann
def loadA1(filename):
    """Load an .a1 file: proteins plus optional words and dependencies.

    The file is processed in several passes by line prefix so that all
    proteins exist before the "*" Equiv lines that reference them. Every
    line must match a known prefix or the count assertion fires. Returns
    (proteins, words, dependencies) with word-id references in the
    dependencies replaced by the word objects themselves.
    """
    handle = codecs.open(filename, "rt", "utf-8")
    lines = handle.readlines()
    proteins = []
    words = []
    dependencies = []
    processed = 0
    # Pass 1: text-bound protein annotations
    for line in lines:
        if line[0] == "T":
            proteins.append(readTAnnotation(line))
            processed += 1
    # Pass 2: Equiv groups (need the proteins parsed above)
    for line in lines:
        if line[0] == "*":
            readStarAnnotation(line, proteins)
            processed += 1
    # Pass 3: word annotations (syntactic layer)
    for line in lines:
        if line[0] == "W":
            words.append(readTAnnotation(line))
            processed += 1
    # Pass 4: in a1 files an "R" line is a syntactic dependency
    for line in lines:
        if line[0] == "R":
            dependencies.append(readDependencyAnnotation(line))
            processed += 1
    assert processed == len(lines), lines # every line must be consumed
    handle.close()
    # Tag each annotation with its source file type
    for ann in proteins + words + dependencies:
        ann.fileType = "a1"
    # Resolve dependency word ids into the word objects
    if len(words) > 0:
        wordById = {}
        for word in words:
            wordById[word.id] = word
        for dep in dependencies:
            for i in range(len(dep.arguments)):
                role, wordId = dep.arguments[i]
                dep.arguments[i] = (role, wordById[wordId])
    return proteins, words, dependencies
def loadRelOrA2(filename, proteins, sitesAreArguments=False, readScores=False):
    """Load an .a2 (events) or .rel (relations) file against known proteins.

    When readScores is set and "<filename>.scores" exists, that scored
    variant is read instead. Lines are handled in passes by prefix:
    "T" triggers, "E" events, "R" relations, "M" Speculation/Negation
    modifiers, "*" Equiv groups. Trigger/event id strings inside events and
    relations are then resolved to the Annotation objects themselves.
    Returns (triggers, events, relations).
    """
    if readScores and os.path.exists(filename + ".scores"):
        #f = open(filename + ".scores", "rt")
        f = codecs.open(filename + ".scores", "rt", "utf-8")
    else:
        #f = open(filename, "rt")
        f = codecs.open(filename, "rt", "utf-8")
    triggers = []
    # triggerMap resolves both protein and trigger ids (they share "T" ids)
    triggerMap = {}
    for protein in proteins:
        triggerMap[protein.id] = protein
    events = []
    eventMap = {}
    relations = []
    lines = f.readlines()
    f.close()
    count = 0
    for line in lines:
        if line[0] == "T":
            triggers.append( readTAnnotation(line, readScores=readScores) )
            triggerMap[triggers[-1].id] = triggers[-1]
            count += 1
    for line in lines:
        if line[0] == "E":
            events.append( readEvent(line, sitesAreArguments, readScores=readScores) )
            eventMap[events[-1].id] = events[-1]
            count += 1
    for line in lines:
        if line[0] == "R":
            relations.append(readRAnnotation(line, readScores=readScores))
            # NOTE: Temporarily treating relations as events to get equiv-resolution
            # working
            #events.append(readRAnnotation(line))
            count += 1
    for line in lines:
        if line[0] == "M":
            # Modifier line: "M<n>\tSpeculation|Negation <eventId>" with an
            # optional score column in ".scores" files
            if not readScores:
                mId, rest = line.strip().split("\t")
                mScore = None
            else:
                mId, rest, mScore = line.strip().split("\t")
            mType, eventId = rest.split()
            assert mType in ["Speculation", "Negation"]
            if mType == "Speculation":
                eventMap[eventId].speculation = mId
                eventMap[eventId].speculationScores = mScore
            elif mType == "Negation":
                eventMap[eventId].negation = mId
                eventMap[eventId].negationScores = mScore
            count += 1
    for line in lines:
        if line[0] == "*":
            readStarAnnotation(line, proteins + triggers)
            count += 1
    assert count == len(lines), lines # check that all lines were processed

    # Mark source file type
    for ann in triggers + events + relations:
        ann.fileType = "a2"
    # Build links
    for event in events:
        #print event.id
        if event.trigger != None:
            event.trigger = triggerMap[event.trigger]
            # Move scores from event to trigger
            # NOTE(review): if several events share one trigger, each event
            # overwrites the trigger's scores in turn — confirm intended
            event.trigger.unmergingScores = event.unmergingScores
            event.trigger.negationScores = event.negationScores
            event.trigger.speculationScores = event.speculationScores
            event.unmergingScores = None
            event.negationScores = None
            event.speculationScores = None

        # Resolve argument target/site ids into objects ("T" = trigger or
        # protein, "E" = nested event)
        for i in range(len(event.arguments)):
            arg = event.arguments[i]
            if arg[1][0] == "T":
                if arg[2] != None:
                    #event.arguments[i] = (arg[0], triggerMap[arg[1]], triggerMap[arg[2]])
                    event.arguments[i][1] = triggerMap[arg[1]]
                    event.arguments[i][2] = triggerMap[arg[2]]
                else:
                    #event.arguments[i] = (arg[0], triggerMap[arg[1]], None)
                    event.arguments[i][1] = triggerMap[arg[1]]
            elif arg[1][0] == "E":
                assert arg[2] == None, (filename, event.id, arg, event.arguments) # no sites on events
                #event.arguments[i] = (arg[0], eventMap[arg[1]], None)
                event.arguments[i][1] = eventMap[arg[1]]
    # Build links
    for relation in relations:
        for i in range(len(relation.arguments)):
            arg = relation.arguments[i]
            if arg[1][0] == "T":
                if arg[2] != None:
                    #relation.arguments[i] = (arg[0], triggerMap[arg[1]], triggerMap[arg[2]])
                    relation.arguments[i][1] = triggerMap[arg[1]]
                    relation.arguments[i][2] = triggerMap[arg[2]]
                else:
#                    if not triggerMap.has_key(arg[1]): # NOTE! hack for CO bugs
#                        relation.arguments = relation.arguments[0:i]
#                        if len(relation.arguments) == 1: # NOTE! hack
#                            relations = []
#                        break
                    #relation.arguments[i] = (arg[0], triggerMap[arg[1]], None)
                    relation.arguments[i][1] = triggerMap[arg[1]]

    return triggers, events, relations
405 -def loadText(filename):
406 #f = open(filename) 407 f = codecs.open(filename, "rt", "utf-8") 408 text = f.read() 409 f.close() 410 return text
411
def load(id, dir, loadA2=True, sitesAreArguments=False, a2Tag="a2", readScores=False):
    """Load one document's annotations from dir by document id.

    Reads <id>.a1 (proteins, words, dependencies) if present; then, unless
    loadA2 is False, the event file <id>.<a2Tag>, falling back to <id>.rel.
    Missing files simply yield empty lists. Returns the 6-tuple
    (proteins, words, dependencies, triggers, events, relations).
    """
    id = str(id)
    a1Path = os.path.join(dir, id + ".a1")
    if os.path.exists(a1Path):
        proteins, words, dependencies = loadA1(a1Path)
    else:
        proteins = []
        words = []
        dependencies = []
    if not loadA2:
        # BUGFIX: previously returned empty lists for words and dependencies
        # even though they had just been parsed from the a1 file; the loadA2
        # flag only concerns the a2/rel layers.
        return proteins, words, dependencies, [], [], []
    a2Path = os.path.join(dir, id + "." + a2Tag)
    relPath = os.path.join(dir, id + ".rel")
    triggers = []
    events = []
    relations = []
    if os.path.exists(a2Path):
        triggers, events, relations = loadRelOrA2(a2Path, proteins, sitesAreArguments, readScores=readScores)
    elif os.path.exists(relPath):
        triggers, events, relations = loadRelOrA2(relPath, proteins, sitesAreArguments, readScores=readScores)
    return proteins, words, dependencies, triggers, events, relations
def loadSet(path, setName=None, level="a2", sitesAreArguments=False, a2Tag="a2", readScores=False):
    """Load a document set from a directory, a .tar.gz package or a .txt file.

    level selects how much is loaded per document: "txt" (text only), "a1"
    or "a2". Document ids are taken from the *.txt filenames found in the
    directory. An optional LICENSE file is attached to every document.
    Returns a list of Document objects; temporary extraction directories
    are removed before returning.
    """
    assert level in ["txt", "a1", "a2"]
    if path.endswith(".tar.gz"):
        import tempfile
        import tarfile
        import shutil
        dir = tempfile.mkdtemp()
        f = tarfile.open(path, "r")
        f.extractall(dir)
        # Check if compressed directory is included in the package, like in the ST'11 corpus files
        compressedFilePath = os.path.join(dir, os.path.basename(path)[:-len(".tar.gz")])
        if not os.path.exists(compressedFilePath): # at least CO training set has a different dirname inside the tarfile
            compressedFilePath = compressedFilePath.rsplit("_", 1)[0]
            print >> sys.stderr, "Package name directory does not exist, trying", compressedFilePath
        if os.path.exists(compressedFilePath):
            print >> sys.stderr, "Reading document set from compressed filename directory", compressedFilePath
            # NOTE(review): dir is repointed at the inner directory, so the
            # cleanup below removes only it — the enclosing temp dir appears
            # to be left behind; confirm
            dir = compressedFilePath
        f.close()
    elif path.endswith(".txt"):
        # Single text file: stage it alone in a temporary directory
        import tempfile
        import shutil
        dir = tempfile.mkdtemp()
        shutil.copy2(path, os.path.join(dir, os.path.basename(path)))
    else:
        dir = path

    ids = set()
    documents = []
    license = None
    if os.path.exists(os.path.join(dir, "LICENSE")):
        licenseFile = open(os.path.join(dir, "LICENSE"), "rt")
        license = "".join(licenseFile.readlines())
        licenseFile.close()
    # Every *.txt file defines one document id
    for filename in os.listdir(dir):
        if filename.endswith(".txt"):
            ids.add(filename.split(".")[0])
    for id in sorted(list(ids)):
        #print "Loading", id
        doc = Document()
        doc.id = id
        if not level == "txt":
            try:
                doc.proteins, doc.words, doc.dependencies, doc.triggers, doc.events, doc.relations = load(str(id), dir, level=="a2", sitesAreArguments, a2Tag=a2Tag, readScores=readScores)
            except:
                # Report which document failed, then let the error propagate
                print >> sys.stderr, "Exception reading document", id, "from", dir
                raise
        doc.text = loadText( os.path.join(dir, str(id) + ".txt") )
        doc.dataSet = setName
        doc.license = license
        documents.append(doc)

    # Remove the temporary extraction/staging directory, if one was made
    if dir != path:
        shutil.rmtree(dir)
    return documents
def writeSet(documents, output, resultFileTag="a2", debug=False, task=2, validate=True, writeScores=False):
    """Write a document set in ST format to a directory or .tar.gz package.

    Each document is optionally validated (Validate.allValidate) and then
    written via write(); its text goes to <id>.txt. When output ends with
    ".tar.gz" the files are first written to "<output>-temp" and then
    packaged, after which the temp directory is removed. Validation counts
    are accumulated in a defaultdict and printed at the end.
    """
    from collections import defaultdict
    import shutil
    counts = defaultdict(int)

    # Normalize the output path and pick the directory to write into
    while output.endswith("/"):
        output = output[:-1]
    if output.endswith(".tar.gz"):
        outdir = output + "-temp"
    else:
        outdir = output
    if os.path.exists(outdir):
        shutil.rmtree(outdir)

    if not validate:
        print "Warning! No validation."
    for doc in documents:
        if validate:
            if debug: print >> sys.stderr, "Validating", doc.id
            Validate.allValidate(doc, counts, task, verbose=debug)
        #doc.proteins.sort(cmp=compareOffsets)
        #doc.triggers.sort(cmp=compareOffsets)
        if debug: print >> sys.stderr, "Writing", doc.id
        write(doc.id, outdir, doc.proteins, doc.triggers, doc.events, doc.relations, resultFileTag, counts, task=task, writeScores=writeScores)
        # Write text file
        #out = open(os.path.join(outdir, str(doc.id) + ".txt"), "wt")
        out = codecs.open(os.path.join(outdir, str(doc.id) + ".txt"), "wt", "utf-8")
        out.write(doc.text)
        out.close()
    if output.endswith(".tar.gz"):
        package(outdir, output, ["a1", "txt", resultFileTag, resultFileTag+".scores"])
        shutil.rmtree(outdir)
    print counts
def getMaxId(annotations):
    """Return the largest numeric id suffix among the annotations (0 if none).

    Annotations with id None are skipped; all other ids must be a prefix
    letter followed by digits (e.g. "T12").
    """
    highest = 0
    for ann in annotations:
        if ann.id == None:
            continue
        assert ann.id[1:].isdigit(), ann.id
        highest = max(highest, int(ann.id[1:]))
    return highest
def updateIds(annotations, minId=0):
    """Assign fresh ids if any annotation in the list lacks one.

    Numbering is consecutive, starting at max(highest existing id + 1,
    minId). The prefix depends on the annotation: "T" for text-bound
    entries (no arguments and no trigger), "R" for the known relation
    types, "E" for everything else.
    NOTE(review): when even one id is missing, ALL ids are rewritten.
    """
    relationTypes = ["Subunit-Complex", "Protein-Component", "Coref", "Renaming",
                     "SR-subunitof", "SR-equivto", "SR-partof", "SR-memberof"]
    needsIds = False
    for ann in annotations:
        if ann.id == None:
            needsIds = True
            break
    if not needsIds:
        return
    nextId = max(getMaxId(annotations) + 1, minId)
    for ann in annotations:
        if len(ann.arguments) == 0 and ann.trigger == None:
            ann.id = "T" + str(nextId)  # text-bound annotation
        elif ann.type in relationTypes:
            ann.id = "R" + str(nextId)  # relation
        else:
            ann.id = "E" + str(nextId)  # event
        nextId += 1
def writeTAnnotation(proteins, out, writeScores, idStart=0):
    """Write text-bound annotations (proteins or triggers) in ST format.

    Each line is "<id>\t<type> <begin> <end>\t<text>"; newlines inside the
    text are escaped as "&#10;". When writeScores is set, a confidence
    column is appended with ":" characters rewritten to "=". Missing ids
    are generated first, numbering from idStart.
    """
    updateIds(proteins, idStart)
    for entity in proteins:
        assert entity.id[0] == "T", (entity.id, entity.text)
        offsets = entity.type + " " + str(entity.charBegin) + " " + str(entity.charEnd)
        out.write(entity.id + "\t" + offsets + "\t")
        if entity.text == None:
            out.write(str(entity.text))  # literally writes "None"
        else:
            out.write(entity.text.replace("\n", "&#10;").replace("\r", "&#10;"))
        if writeScores and entity.triggerScores != None:
            out.write("\t" + entity.triggerScores.replace(":", "="))
        out.write("\n")
def getDuplicatesMapping(eventLines):
    """Return {duplicateEventId: canonicalEventId} for identical event lines.

    eventLines is a sequence of (eventId, lineText) pairs. The evaluation
    tools reject output containing duplicate events, but also reject output
    where a nested duplicate was removed outright — so the workaround is to
    keep the first event for each distinct line text and remap references
    to later, identical events onto it.
    """
    canonicalByLine = {}
    duplicates = {}
    for eventId, lineText in eventLines:
        if lineText in canonicalByLine:
            duplicates[eventId] = canonicalByLine[lineText]
        else:
            canonicalByLine[lineText] = eventId
    return duplicates
579 580 #def removeDuplicates(): 581 # for e1 in events[:]: 582 # for e2 in events[:]: 583 # if e1 == e2: 584 # continue 585 # if e1.trigger == e2.trigger and len(e1.arguments) == len(e2.arguments): 586 # for arg1 in zip(e1.arguments, e2.arguments) 587
def writeEvents(events, out, counts, task, writeScores=False):
    """Serialize events (or relations) as ST "E" lines, plus "M" modifier
    lines for speculation/negation, onto the stream out.

    Repeated argument roles are numbered (Theme, Theme2, ...); site objects
    attached in argument slot [2] are emitted as (C)Site columns unless
    task == 1; Coref "Target" arguments are gathered into a trailing
    "[T1, T2]" column. Missing event ids are generated first. counts is
    unused here; it is kept for call compatibility.
    """
    updateIds(events)
    mCounter = 1
    eventLines = []
    nestedEvents = set()
    for event in events:
        eventLine = ""
        #out.write(event.id + "\t")
        # Event id part ############################
        trigger = event.trigger
        if trigger == None:
            eventLine += event.type
        else:
            eventLine += trigger.type + ":" + trigger.id
            if writeScores and event.trigger.unmergingScores != None:
                eventLine += ":" + event.trigger.unmergingScores.replace(":", "=")
        # Argument part #############################
        typeCounts = {}
        # Count arguments
        targetProteins = set()
        for arg in event.arguments:
            argType = arg[0]
            if argType == "Target" and event.type == "Coref":
                targetProteins.add(arg[1].id)
            else:
                if not typeCounts.has_key(argType):
                    typeCounts[argType] = 0
                typeCounts[argType] += 1
        # Determine which arguments need numbering
        #for key in typeCounts.keys():
        #    if typeCounts[key] <= 1:
        #        del typeCounts[key]
        # Write arguments
        currTypeCounts = {}
        for key in typeCounts.keys():
            currTypeCounts[key] = 0
        for arg in event.arguments:
            argType = arg[0]
            if argType == "Target" and event.type == "Coref":
                continue
            assert arg[1].id != None, (event.id, event.arguments, arg)
            currTypeCounts[argType] += 1
            # Roles occurring more than once get a running number suffix
            if typeCounts[argType] > 1:
                eventLine += " " + argType + str(currTypeCounts[argType]) + ":" + arg[1].id
            else:
                eventLine += " " + argType + ":" + arg[1].id
            if writeScores and len(arg) > 3 and arg[3] != None:
                eventLine += ":" + arg[3].replace(":", "=")

            # keep track of nesting
            if arg[1].id[0] == "E":
                nestedEvents.add(arg[1].id)

        # Reset type counts for writing sites
        currTypeCounts = {}
        for key in typeCounts.keys():
            currTypeCounts[key] = 0
        # Write sites
        for arg in event.arguments:
            if task == 1:
                continue

            if arg[2] == None:
                continue

            #if arg[2].id in ["T18", "T19"]:
            #    print arg
            #    out.write("XXX")
            #    print event.type

            # limit sites to accepted event types
            # Todo! This should be done in validate
            #if event.type not in ["Binding", "Phosphorylation", "Positive_regulation", "Negative_regulation", "Regulation"]:
            #    continue

            argType = arg[0]
            if argType == "Target" and event.type == "Coref":
                continue
            currTypeCounts[argType] += 1

            # Sites on Cause arguments are written as "CSite"
            sitePrefix = ""
            if argType.find("Cause") != -1:
                sitePrefix = "C"
            if typeCounts[argType] > 1:
                eventLine += " " + sitePrefix + "Site" + str(currTypeCounts[argType]) + ":" + arg[2].id
            else:
                eventLine += " " + sitePrefix + "Site" + ":" + arg[2].id
            if writeScores and len(arg) > 4 and arg[4] != None:
                eventLine += ":" + arg[4].replace(":", "=")

        # Write Coref targets
        if len(targetProteins) > 0:
            eventLine += "\t[" + ", ".join(sorted(list(targetProteins))) + "]"

        eventLine += "\n"

        # Write task 3
        if event.speculation != None:
            eventLine += "M" + str(mCounter) + "\t" + "Speculation " + str(event.id)
            if writeScores and event.trigger != None and event.trigger.speculationScores != None:
                eventLine += "\t" + event.trigger.speculationScores.replace(":", "=")
            eventLine += "\n"
            mCounter += 1
        if event.negation != None:
            eventLine += "M" + str(mCounter) + "\t" + "Negation " + str(event.id)
            if writeScores and event.trigger != None and event.trigger.negationScores != None:
                eventLine += "\t" + event.trigger.negationScores.replace(":", "=")
            eventLine += "\n"
            mCounter += 1

        eventLines.append( [event.id, eventLine] )

    # Write ignoring duplicates
    #duplicateMap = getDuplicatesMapping(eventLines)
    #seenLines = set()
    for eventLineTuple in eventLines:
        out.write(eventLineTuple[0] + "\t" + eventLineTuple[1])
706 707 # if eventLineTuple[1] not in seenLines: 708 # eventLine = eventLineTuple[1] + " " 709 # for key in sorted(duplicateMap.keys()): 710 # eventLine = eventLine.replace(key, duplicateMap[key]) 711 # out.write(eventLineTuple[0] + "\t" + eventLine) 712 # seenLines.add(eventLineTuple[1]) 713 # Write task 3 714 #for event in events: 715 # if event.negation != None: 716
def write(id, dir, proteins, triggers, events, relations, resultFileTag="a2", counts=None, debug=False, task=2, writeScores=False):
    """Write one document's annotations: <id>.a1 plus <id>.<resultFileTag>.

    Trigger numbering restarts after the highest protein id so the two
    files share one "T" id space. When writeScores is set, a parallel
    "<...>.scores" file with confidence columns is written as well.
    counts and task are passed through to writeEvents.
    """
    id = str(id)
    if debug:
        print id
    if not os.path.exists(dir):
        os.makedirs(dir)

    #updateIds(proteins)
    #updateIds(triggers, getMaxId(stDoc.proteins) + 1)
    #updateIds(events)
    #updateIds(relations)

    if proteins != None:
        out = codecs.open(os.path.join(dir, id + ".a1"), "wt", "utf-8")
        writeTAnnotation(proteins, out, False)
        out.close()
    resultFile = codecs.open(os.path.join(dir, id + "." + resultFileTag), "wt", "utf-8")
    writeTAnnotation(triggers, resultFile, False, getMaxId(proteins) + 1)
    if writeScores:
        resultScoresFile = codecs.open(os.path.join(dir, id + "." + resultFileTag + ".scores"), "wt", "utf-8")
        writeTAnnotation(triggers, resultScoresFile, True, getMaxId(proteins) + 1)
    if len(events) > 0:
        if debug: print >> sys.stderr, "Writing events"
        writeEvents(events, resultFile, counts, task, writeScores=False)
        if writeScores:
            writeEvents(events, resultScoresFile, counts, task, writeScores=True)
    if len(relations) > 0:
        if debug: print >> sys.stderr, "Writing relations"
        writeEvents(relations, resultFile, counts, task)
        if writeScores:
            writeEvents(relations, resultScoresFile, counts, task, writeScores=True)
    resultFile.close()
    if writeScores:
        resultScoresFile.close()
def package(sourceDir, outputFile, includeTags=["a2", "a2.scores"]):
    """Pack the files in sourceDir whose names end with one of includeTags
    into a gzipped tar archive at outputFile.

    The function chdirs into sourceDir while adding files so that the
    archive members carry relative names; the working directory is restored
    and the archive closed even if adding a file fails (the original left
    both dangling on error).
    """
    import tarfile
    allFiles = os.listdir(sourceDir)
    tarFiles = []
    for fileName in allFiles:
        for tag in includeTags:
            if fileName.endswith(tag):
                tarFiles.append(fileName)
                break
    packageFile = tarfile.open(outputFile, "w:gz")
    tempCwd = os.getcwd()
    os.chdir(sourceDir) # so members are stored with relative paths
    try:
        for fileName in tarFiles:
            packageFile.add(fileName)
        #if "final" in outputFile:
        #    packageFile.add("/home/jari/data/BioNLP11SharedTask/resources/questionnaire.txt", "questionnaire.txt")
    finally:
        # Always restore the caller's working directory and flush the archive
        os.chdir(tempCwd)
        packageFile.close()
# Command-line entry point: read an ST-format document set and write it back
# out (with validation), mainly for round-trip testing of the reader/writer.
if __name__=="__main__":
    import sys
    from optparse import OptionParser
    # Import Psyco if available (optional Python 2 JIT; purely a speed-up)
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    # NOTE(review): the -s/--sentences and -r/--origIds options are parsed
    # but never read below
    optparser = OptionParser(usage="%prog [options]\nST format input and output.")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in interaction xml format", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file in interaction xml format.")
    optparser.add_option("-t", "--outputTag", default="a2", dest="outputTag", help="a2 file extension.")
    optparser.add_option("-s", "--sentences", default=False, action="store_true", dest="sentences", help="Write each sentence to its own document")
    optparser.add_option("-r", "--origIds", default=False, action="store_true", dest="origIds", help="Use stored original ids (can cause problems with duplicates).")
    optparser.add_option("-a", "--task", default=2, type="int", dest="task", help="1 or 2")
    optparser.add_option("-d", "--debug", default=False, action="store_true", dest="debug", help="Verbose output.")
    (options, args) = optparser.parse_args()

    assert options.input != options.output
    documents = loadSet(options.input, "GE", level="a2", sitesAreArguments=False, a2Tag="a2", readScores=False)
    writeSet(documents, options.output, resultFileTag=options.outputTag, debug=options.debug, task=options.task, validate=True, writeScores=False)

#if __name__=="__main__":
#    # Import Psyco if available
#    try:
#        import psyco
#        psyco.full()
#        print >> sys.stderr, "Found Psyco, using"
#    except ImportError:
#        print >> sys.stderr, "Psyco not installed"
#
#    #proteins, triggers, events = load(1335418, "/home/jari/biotext/tools/TurkuEventExtractionSystem-1.0/data/evaluation-data/evaluation-tools-devel-gold")
#    #write(1335418, "/home/jari/data/temp", proteins, triggers, events )
#
#    p = "/home/jari/data/BioNLP09SharedTask/bionlp09_shared_task_development_data_rev1"
#    documents = loadSet(p)
#    writeSet(documents, "/home/jari/data/temp/testSTTools")