
Source Code for Module TEES.Utils.STFormat.ConvertXML

  1  import sys, os 
  2  thisPath = os.path.dirname(os.path.abspath(__file__)) 
  3  sys.path.append(os.path.abspath(os.path.join(thisPath,"../.."))) 
  4  from STTools import * 
  5  import xml.etree.cElementTree as ET 
  6  import Utils.ElementTreeUtils as ETUtils 
  7  import Utils.Range as Range 
  8   
  9  #def compareArguments(a, b): 
 10  #    if a[0] == "Cause": 
 11  #        return 1 
 12  #    elif b[0] == "Cause": 
 13  #        return -1 
 14  #    return 0 
 15   
 16  def toInteractionXML(documents, corpusName="GENIA", output=None):
 17      corpusRoot = ET.Element("corpus")
 18      corpusRoot.set("source", corpusName)
 19      docCounter = 0
 20      for doc in documents:
 21          docEl = ET.Element("document")
 22          docId = corpusName + ".d" + str(docCounter)
 23          docEl.set("id", docId)
 24          docCounter += 1
 25          #docEl.set("pmid", str(doc.id))
 26          docEl.set("origId", str(doc.id))
 27          docEl.set("text", doc.text)
 28          if doc.dataSet != None:
 29              docEl.set("set", doc.dataSet)
 30          corpusRoot.append(docEl)
 31          # If this is a sentence, make one
 32          isSentence = len(doc.words) > 0
 33          if isSentence:
 34              sentEl = ET.SubElement(docEl, "sentence")
 35              sentEl.set("id", docId + ".s0")
 36              sentEl.set("text", doc.text)
 37              sentEl.set("charOffset", "0-" + str(len(doc.text)))
 38              docId = sentEl.get("id") # hack to get all subelements here
 39              docEl = sentEl # hack to get all subelements here
 40          # Write triggers and entities
 41          elCounter = 0
 42          triggerToEvents = {}
 43          for trigger in doc.triggers:
 44              triggerId = trigger.id
 45              triggerToEvents[triggerId] = []
 46              for event in doc.events:
 47                  if event.trigger == trigger:
 48                      triggerToEvents[triggerId].append(event.id)
 49              if len(triggerToEvents[triggerId]) == 0:
 50                  triggerToEvents[triggerId].append(trigger.id)
 51          tMap = {}
 52          eventMap = {}
 53          for event in doc.events:
 54              eventMap[event.id] = event
 55          for protein in doc.proteins:
 56              entEl = ET.Element("entity")
 57              protId = docId + ".e" + str(elCounter)
 58              entEl.set("id", protId)
 59              entEl.set("origId", str(doc.id) + "." + str(protein.id))
 60              entEl.set("text", protein.text)
 61              entEl.set("charOffset", str(protein.charBegin) + "-" + str(protein.charEnd))
 62              if len(protein.alternativeOffsets) > 0:
 63                  altOffs = []
 64                  for ao in protein.alternativeOffsets:
 65                      altOffs.append( str(ao[0]) + "-" + str(ao[1]-1) )
 66                  entEl.set("altOffset", ",".join(altOffs))
 67              entEl.set("type", protein.type)
 68              assert protein.fileType in ["a1", "a2"], protein.fileType
 69              if protein.fileType == "a1": #protein.isName():
 70                  entEl.set("isName", "True")
 71              else:
 72                  entEl.set("isName", "False")
 73              elCounter += 1
 74              docEl.append(entEl)
 75              assert not tMap.has_key(protId)
 76              tMap[protein.id] = protId
 77          for protein in doc.triggers:
 78              for eventId in triggerToEvents[protein.id]: # Write duplicate triggers
 79                  entEl = ET.Element("entity")
 80                  protId = docId + ".e" + str(elCounter)
 81                  entEl.set("id", protId)
 82                  entEl.set("origId", str(doc.id) + "." + str(protein.id))
 83                  entEl.set("text", protein.text)
 84                  entEl.set("charOffset", str(protein.charBegin) + "-" + str(protein.charEnd))
 85                  if len(protein.alternativeOffsets) > 0:
 86                      altOffs = []
 87                      for ao in protein.alternativeOffsets:
 88                          altOffs.append( str(ao[0]) + "-" + str(ao[1]-1) )
 89                      entEl.set("altOffset", ",".join(altOffs))
 90                  entEl.set("type", protein.type)
 91                  assert protein.fileType in ["a1", "a2"], protein.fileType
 92                  if protein.fileType == "a1": #protein.isName():
 93                      entEl.set("isName", "True")
 94                  else:
 95                      entEl.set("isName", "False")
 96                  # Add negation and speculation
 97                  if eventId in eventMap and eventMap[eventId].negation != None:
 98                      entEl.set("negation", "True")
 99                  if eventId in eventMap and eventMap[eventId].speculation != None:
100                      entEl.set("speculation", "True")
101                  elCounter += 1
102                  docEl.append(entEl)
103                  assert not tMap.has_key(protId)
104                  tMap[eventId] = protId
105          # Pre-define XML interaction ids
106          elCounter = 0
107          # Write events
108          for event in doc.events:
109              if event.trigger == None: # triggerless event (simple pairwise interaction)
110                  assert len(event.arguments) >= 2, (event.id, event.type, event.arguments)
111                  a1 = event.arguments[0]
112                  a2 = event.arguments[1]
113                  intEl = ET.Element("interaction")
114                  intEl.set("directed", "True")
115                  intEl.set("id", docId + ".i" + str(elCounter))
116                  elCounter += 1
117                  intEl.set("origId", str(doc.id) + "." + str(event.id))
118                  intEl.set("e1", tMap[a1[1].id])
119                  intEl.set("e2", tMap[a2[1].id])
120                  #intEl.set("type", event.type)
121                  #intEl.set("argTypes", a1[0] + "/" + a2[0])
122                  intEl.set("type", event.type + "(" + a1[0] + "/" + a2[0] + ")")
123                  docEl.append(intEl)
124              else:
125                  argCount = 0
126                  for arg in event.arguments:
127                      intEl = ET.Element("interaction")
128                      intEl.set("directed", "True")
129                      intEl.set("id", docId + ".i" + str(elCounter))
130                      elCounter += 1
131                      intEl.set("origId", str(doc.id) + "." + str(event.id) + "." + str(argCount))
132                      #intEl.set("e1", tMap[event.trigger.id])
133                      intEl.set("e1", tMap[event.id])
134                      if arg[1].trigger != None:
135                          #intEl.set("e2", tMap[arg[1].trigger.id])
136                          intEl.set("e2", tMap[arg[1].id])
137                      else:
138                          intEl.set("e2", tMap[arg[1].id])
139                      intEl.set("type", arg[0])
140                      docEl.append(intEl)
141                      argCount += 1
142                      # Add site
143                      if arg[2] != None:
144                          intEl = ET.Element("interaction")
145                          intEl.set("directed", "True")
146                          intEl.set("id", docId + ".i" + str(elCounter))
147                          elCounter += 1
148                          intEl.set("origId", str(doc.id) + "." + str(event.id) + "." + str(argCount) + ".site")
149                          intEl.set("e1", tMap[arg[2].id]) # "Entity"-type entity is the source
150                          assert arg[2].type == "Entity"
151                          intEl.set("e2", tMap[arg[1].id]) # "Protein"-type entity is the target
152                          assert arg[1].type in ["Protein", "Gene", "Chemical", "Organism", "Regulon-operon", "Two-component-system"], (arg[1].type, doc.id, doc.dataSet, event.id)
153                          intEl.set("type", "Site")
154                          docEl.append(intEl)
155                          argCount += 1
156          # Write relations
157          for relation in doc.relations:
158              assert len(relation.arguments) >= 2, (relation.id, relation.type, relation.arguments)
159              a1 = relation.arguments[0]
160              a2 = relation.arguments[1]
161  #            if a1[0] == "Arg2":
162  #                temp = a1
163  #                a1 = a2
164  #                a2 = temp
165              assert a1[0] == "Arg1" or a1[0] == "Former" or a1[0] == "Anaphora", (a1, relation.arguments)
166              assert a2[0] == "Arg2" or a2[0] == "New" or a2[0] == "Antecedent", (a2, relation.arguments)
167              intEl = ET.Element("interaction")
168              intEl.set("directed", "True")
169              intEl.set("id", docId + ".i" + str(elCounter))
170              elCounter += 1
171              intEl.set("origId", str(doc.id) + "." + str(relation.id))
172              intEl.set("e1", tMap[a1[1].id])
173              intEl.set("e2", tMap[a2[1].id])
174              intEl.set("type", relation.type)
175              docEl.append(intEl)
176              if len(relation.arguments) > 2:
177                  assert relation.type == "Coref", (relation.id, docId, relation.type)
178                  for connProt in relation.arguments[2:]:
179                      intEl = ET.Element("interaction")
180                      intEl.set("directed", "True")
181                      intEl.set("id", docId + ".i" + str(elCounter))
182                      elCounter += 1
183                      intEl.set("origId", str(doc.id) + "." + str(relation.id))
184                      intEl.set("e1", tMap[a2[1].id]) # link proteins to antecedent
185                      intEl.set("e2", tMap[connProt[1].id])
186                      intEl.set("type", "Target")
187                      docEl.append(intEl)
188              #docEl.append(intEl) # adding original intEl after extra argument loop broke everything
189          if isSentence:
190              sentAnalysesEl = ET.SubElement(sentEl, "analyses")
191              #parsesEl = ET.SubElement(sentAnalysesEl, "parses")
192              parseEl = ET.SubElement(sentAnalysesEl, "parse")
193              #tokenizationsEl = ET.SubElement(sentAnalysesEl, "tokenizations")
194              tokenizationEl = ET.SubElement(sentAnalysesEl, "tokenization")
195              parseEl.set("parser", "gold")
196              parseEl.set("tokenizer", "gold")
197              tokenizationEl.set("tokenizer", "gold")
198              tokenMap = {}
199              for word in doc.words:
200                  tokEl = ET.SubElement(tokenizationEl, "token")
201                  tokEl.set("id", word.id)
202                  tokEl.set("text", word.text)
203                  tokEl.set("POS", "None")
204                  tokEl.set("charOffset", str(word.charBegin) + "-" + str(word.charEnd))
205                  tokenMap[word.id] = tokEl
206              for dep in doc.dependencies:
207                  depEl = ET.SubElement(parseEl, "dependency")
208                  depEl.set("id", dep.id)
209                  depEl.set("type", dep.type)
210                  assert len(dep.arguments) == 2
211                  depEl.set("t1", dep.arguments[0][1].id)
212                  depEl.set("t2", dep.arguments[1][1].id)
213                  if dep.type.find(":") != -1:
214                      word1Type, word2Type = dep.type.split("(")[0].split(":")[-1].split("-")
215                      tokenMap[dep.arguments[0][1].id].set("POS", word1Type)
216                      tokenMap[dep.arguments[1][1].id].set("POS", word2Type)
217  
218      if output != None:
219          print >> sys.stderr, "Writing output to", output
220          ETUtils.write(corpusRoot, output)
221      return ET.ElementTree(corpusRoot)
222  
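Usage sketch (editor's note, not part of the module source): toInteractionXML expects the STTools Document objects provided by the "from STTools import *" import above. A minimal call, modelled on the commented-out example at the end of this file, might look as follows; the corpus path is a placeholder, not a path shipped with TEES.

    # Load a BioNLP Shared Task corpus with STTools and convert it to Interaction XML.
    # "/path/to/BioNLP-ST-corpus" is a hypothetical location.
    documents = loadSet("/path/to/BioNLP-ST-corpus")
    xml = toInteractionXML(documents, corpusName="GENIA", output="/path/to/corpus.xml")
    # The converted corpus is also returned as an ElementTree, so it can be processed further in memory.
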
223  def toSTFormat(input, output=None, outputTag="a2", useOrigIds=False, debug=False, task=2, validate=True, writeScores=False):
224      print >> sys.stderr, "Loading corpus", input
225      corpusTree = ETUtils.ETFromObj(input)
226      print >> sys.stderr, "Corpus file loaded"
227      corpusRoot = corpusTree.getroot()
228  
229      nonEntitySiteCount = 0
230      documents = []
231      for document in corpusRoot.findall("document"):
232          stDoc = Document()
233          stDoc.proteins = []
234          stDoc.triggers = []
235          stDoc.events = []
236          stDoc.relations = []
237          stDoc.id = document.get("pmid")
238          if stDoc.id == None:
239              stDoc.id = document.get("origId")
240          stDoc.text = ""
241          documents.append(stDoc)
242          eMap = {}
243          tMap = {}
244          siteMap = {}
245          siteScores = {}
246          sites = []
247          sentenceOffsets = {}
248          for sentence in document.findall("sentence"):
249              head = sentence.get("head")
250              if head != None:
251                  stDoc.text += head
252              stDoc.text += sentence.get("text")
253              tail = sentence.get("tail")
254              if tail != None:
255                  stDoc.text += tail
256              sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
257              sentenceOffsets[sentence.get("id")] = sentenceOffset
258              if stDoc.id == None:
259                  stDoc.id = sentence.get("origId").rsplit(".", 1)[0]
260          entityElementMap = {} # for task 3
261          for entity in document.getiterator("entity"):
262              eType = entity.get("type")
263              if eType == "neg":
264                  continue
265              entityElementMap[entity.get("id")] = entity
266              entityOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
267              ann = Annotation()
268              ann.type = eType
269              if useOrigIds:
270                  entityOrigId = entity.get("origId")
271                  if entityOrigId != None and entityOrigId.find(".") != -1: # fix gluing of doc and ann id
272                      entityOrigId = entityOrigId.rsplit(".",1)[-1]
273                  if entityOrigId != None:
274                      if entityOrigId[0] == "E": # a special id denoting a numbered, but triggerless event
275                          ann.eventId = entityOrigId
276                          ann.id = None
277                      else:
278                          ann.id = entityOrigId
279              ann.text = entity.get("text")
280              assert entityOffset[1] - entityOffset[0] in [len(ann.text), len(ann.text) - 1], (ann.text, entityOffset)
281              ann.charBegin = entityOffset[0]
282              ann.charEnd = entityOffset[0] + len(ann.text) # entityOffset[1] + 1
283              idStem = entity.get("id").split(".e", 1)[0]
284              if sentenceOffsets.has_key(idStem):
285                  sentenceOffset = sentenceOffsets[idStem]
286                  ann.charBegin += sentenceOffset[0]
287                  ann.charEnd += sentenceOffset[0]
288              if entity.get("speculation") == "True":
289                  ann.speculation = True
290              if entity.get("negation") == "True":
291                  ann.negation = True
292              if entity.get("isName") == "True":
293                  # Remember to use original id for names!
294                  if entity.get("origId") != None:
295                      ann.id = entity.get("origId").rsplit(".", 1)[-1]
296                      assert ann.id[0].isupper(), ann.id
297                      for c in ann.id[1:]:
298                          assert c.isdigit(), ann.id
299                  stDoc.proteins.append(ann)
300                  # The part below is dangerous, and incompatibilities should be handled rather
301                  # by not converting to the shared task format when it cannot be done
302                  #if entity.get("origId") != None:
303                  #    # Attempt to process origId, assuming it corresponds to the BioNLP Shared Task format
304                  #    nonNamedEntityOrigId = entity.get("origId").rsplit(".", 1)[-1]
305                  #    if len(nonNamedEntityOrigId) > 1 and nonNamedEntityOrigId[0].isupper() and nonNamedEntityOrigId[1:].isdigit():
306                  #        ann.id = nonNamedEntityOrigId
307                  #stDoc.proteins.append(ann)
308              else:
309                  found = False # prevent duplicate triggers
310                  for trigger in stDoc.triggers:
311                      if trigger.charBegin == ann.charBegin and trigger.charEnd == ann.charEnd and \
312                         trigger.text == ann.text and trigger.type == ann.type:
313                          found = True
314                          ann = trigger
315                          break
316                  if not found:
317                      stDoc.triggers.append(ann)
318              assert entity.get("id") != None
319              tMap[entity.get("id")] = ann
320              if entity.get("type") == "Process": # these can have 0 interactions
321                  event = Annotation()
322                  event.trigger = ann
323                  event.type = event.trigger.type
324                  eMap[entity.get("id")] = event
325                  if entityElementMap[entity.get("id")].get("speculation") == "True":
326                      event.speculation = True
327                  if entityElementMap[entity.get("id")].get("negation") == "True":
328                      event.negation = True
329                  stDoc.events.append(event)
330              # Add confidence scores
331              ann.triggerScores = entity.get("predictions")
332              ann.unmergingScores = entity.get("umStrength")
333              ann.speculationScores = entity.get("modPred")
334              ann.negationScores = entity.get("modPred")
335          # First map Coref proteins
336          corefProtMap = {}
337          for interaction in document.getiterator("interaction"):
338              intType = interaction.get("type")
339              if intType == "Target":
340                  e1 = interaction.get("e1")
341                  e2 = interaction.get("e2")
342                  if not tMap.has_key(e2):
343                      print >> sys.stderr, "Warning, no trigger for Coref Protein Target"
344                      continue
345                  e2 = tMap[e2]
346                  if not corefProtMap.has_key(e1):
347                      corefProtMap[e1] = []
348                  if not e2 in corefProtMap[e1]:
349                      corefProtMap[e1].append(e2)
350          # Then process all interactions
351          for interaction in document.getiterator("interaction"):
352              intType = interaction.get("type")
353              if intType == "neg" or intType == "Target":
354                  continue # Targets have already been put into a dictionary
355              #elif intType in ["Site", "Gene_expression", "Transcription", "Protein_catabolism", "Localization", "Binding", "Phosphorylation", "Positive_regulation", "Negative_regulation", "Regulation"]:
356              #elif intType in ["Site", "Gene_expression", "Transcription", "Protein_catabolism", "Localization", "Binding", "Phosphorylation", "Positive_regulation", "Negative_regulation", "Regulation",
357              #                 "InputAssociation", "InputProcess", "InputInhibitor", "OutputProcess"]:
358              if "/" in intType and "(" in intType: # BI-task
359                  eventType, argTypes = intType.split("(")
360                  arg1Type, arg2Type = argTypes[:-1].split("/")
361                  event = Annotation()
362                  event.trigger = None # triggerless event (same as relation)
363                  event.type = eventType
364                  event.arguments.append([arg1Type, interaction.get("e1"), None])
365                  event.arguments.append([arg2Type, interaction.get("e2"), None])
366                  if event.arguments[0][0] == "SiteArg": # convert back to actual sites
367                      event.arguments[0][0] = "Site"
368                  if event.arguments[1][0] == "SiteArg": # convert back to actual sites
369                      event.arguments[1][0] = "Site"
370                  #event.speculation = entityElementMap[e1].get("speculation")
371                  #event.negation = entityElementMap[e1].get("negation")
372                  stDoc.events.append(event)
373              elif intType not in ["Protein-Component", "Subunit-Complex", "Renaming", "Coref", "SR-subunitof", "SR-equivto", "SR-partof", "SR-memberof"]:
374                  #if intType == "Site" and tMap[interaction.get("e1")].type == "Entity":
375                  if intType == "Site":
376                      # These sites are real sites (i.e. task 2 sites).
377                      # Other sites are just arguments called "site"
378                      #sites.append(interaction)
379                      siteMap[interaction.get("e2")] = tMap[interaction.get("e1")]
380                      siteScores[interaction.get("e2")] = interaction.get("predictions")
381                  else:
382                      e1 = interaction.get("e1")
383                      if eMap.has_key(e1): # event has already been created
384                          event = eMap[e1] # eMap lists events by their trigger ids
385                      else:
386                          eventType = tMap[interaction.get("e1")].type
387                          if eventType != "Entity": # "Entity"-type entities are never event roots
388                              event = Annotation()
389                              event.trigger = tMap[interaction.get("e1")]
390                              event.type = event.trigger.type
391                              if hasattr(event.trigger, "eventId"):
392                                  event.id = event.trigger.eventId
393                              eMap[e1] = event
394                              if entityElementMap[e1].get("speculation") == "True":
395                                  event.speculation = True
396                              if entityElementMap[e1].get("negation") == "True":
397                                  event.negation = True
398                              stDoc.events.append(event)
399                          else:
400                              event = None
401                      if event != None:
402                          arg = [interaction.get("type"), interaction.get("e2"), None, interaction.get("predictions")]
403                          if arg[0] == "SiteArg": # convert back to actual sites
404                              arg[0] = "Site"
405                              if arg[3] != None: # Convert also prediction strengths
406                                  arg[3] = arg[3].replace("SiteArg", "Site")
407                          event.arguments.append(arg)
408              else: # interaction is a relation
409                  rel = Annotation()
410                  rel.type = interaction.get("type")
411                  e1 = interaction.get("e1")
412                  e2 = interaction.get("e2")
413                  relScores = interaction.get("predictions")
414                  #assert rel.type == "Protein-Component" or rel.type == "Subunit-Complex" or rel.type == "Renaming", (rel.type, stDoc.id, interaction.get("id"))
415                  if rel.type == "Protein-Component" or rel.type == "Subunit-Complex":
416                      rel.arguments.append(["Arg1", tMap[e1], None, relScores])
417                      rel.arguments.append(["Arg2", tMap[e2], None, relScores])
418                  elif rel.type == "Renaming":
419                      rel.arguments.append(["Former", tMap[e1], None, relScores])
420                      rel.arguments.append(["New", tMap[e2], None, relScores])
421                  elif rel.type == "Coref":
422                      rel.arguments.append(["Anaphora", tMap[e1], None, relScores])
423                      rel.arguments.append(["Antecedent", tMap[e2], None, relScores])
424                      # Add protein arguments'
425                      if corefProtMap.has_key(e2):
426                          for prot in corefProtMap[e2]:
427                              rel.arguments.append(["Target", prot, None])
428                  elif rel.type.startswith("SR-"):
429                      rel.arguments.append(["Arg1", tMap[e1], None, relScores])
430                      rel.arguments.append(["Arg2", tMap[e2], None, relScores])
431                  else:
432                      assert False, (rel.type, stDoc.id, interaction.get("id"))
433                  stDoc.relations.append(rel)
434          # Map argument targets
435          for event in stDoc.events:
436              for arg in event.arguments[:]:
437                  if arg[1] == None:
438                      assert False
439                      continue
440                  id = arg[1]
441                  if eMap.has_key(id):
442                      arg[1] = eMap[id]
443                  elif tMap.has_key(id):
444                      arg[1] = tMap[id]
445                      ## Remove Entity-type triggers if they are Regulation-arguments
446                      #if "egulation" in event.type and tMap[id].type != "Protein":
447                      #    event.arguments.remove(arg)
448                  # add sites
449                  if siteMap.has_key(id):
450                      if siteMap[id].type == "Entity":
451                          assert id not in eMap
452                          assert id in tMap
453                          arg[2] = siteMap[id]
454                          if id in siteScores and siteScores[id] != None:
455                              while len(arg) < 5:
456                                  arg += [None]
457                              assert arg[4] == None
458                              arg[4] = siteScores[id]
459                      else:
460                          nonEntitySiteCount += 1
461                          #assert siteMap[id].type == "Entity", (stDoc.id, event.id, id, siteMap[id].id, siteMap[id].type)
462  #        # Remove eventless triggers
463  #        triggersToKeep = []
464  #        for trigger in stDoc.triggers:
465  #            if trigger.type == "Entity":
466  #                triggersToKeep.append(trigger)
467  #            else:
468  #                for event in stDoc.events:
469  #                    if event.trigger == trigger:
470  #                        triggersToKeep.append(trigger)
471  #                        break
472  #        stDoc.triggers = triggersToKeep
473          # Sort arguments
474          #for eKey in sorted(eMap.keys()):
475          #    event = eMap[eKey]
476          #    event.arguments.sort(cmp=compareArguments)
477          # Create STFormat ids
478          #updateIds(stDoc.proteins)
479          #updateIds(stDoc.triggers, getMaxId(stDoc.proteins) + 1)
480          #updateIds(stDoc.events)
481          #updateIds(stDoc.relations)
482  
483      if nonEntitySiteCount > 0:
484          print >> sys.stderr, "Warning, discarded", nonEntitySiteCount, "non-entity sites"
485  
486      if output != None:
487          print >> sys.stderr, "Writing output to", output
488          writeSet(documents, output, resultFileTag=outputTag, debug=debug, task=task, validate=validate, writeScores=writeScores)
489      return documents
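Usage sketch (editor's note, not part of the module source): toSTFormat reverses the conversion, reading Interaction XML and writing BioNLP ST files through writeSet. The input may be a file path or an already-parsed ElementTree, as the __main__ block below demonstrates; the paths here are placeholders.

    # Convert an Interaction XML corpus back to ST format, writing files with the "a2" extension.
    documents = toSTFormat("/path/to/corpus.xml", output="/path/to/st-output", outputTag="a2", task=2)
    # The list of STTools Document objects is returned whether or not output was written.
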
490  
491  #def toSTFormatSentences(input, output=None, outputTag="a2"):
492  #    print >> sys.stderr, "Loading corpus", input
493  #    corpusTree = ETUtils.ETFromObj(input)
494  #    print >> sys.stderr, "Corpus file loaded"
495  #    corpusRoot = corpusTree.getroot()
496  #
497  #    documents = []
498  #    for document in corpusRoot.findall("document"):
499  #        sentenceCount = 0
500  #        for sentence in document.findall("sentence"):
501  #            stDoc = Document()
502  #            stDoc.proteins = []
503  #            stDoc.triggers = []
504  #            stDoc.events = []
505  #            stDoc.relations = []
506  #            stDoc.id = document.get("origId") + ".s" + str(sentenceCount)
507  #            stDoc.text = sentence.get("text") #""
508  #            tail = sentence.get("tail")
509  #            if tail != None:
510  #                stDoc.text += tail
511  #            documents.append(stDoc)
512  #            eMap = {}
513  #            tMap = {}
514  #            sites = []
515  #            sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
516  ##            sentenceOffsets = {}
517  ##            for sentence in document.findall("sentence"):
518  ##                stDoc.text += sentence.get("text")
519  ##                tail = sentence.get("tail")
520  ##                if tail != None:
521  ##                    stDoc.text += tail
522  ##                sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
523  ##                sentenceOffsets[sentence.get("id")] = sentenceOffset
524  #            for entity in sentence.getiterator("entity"):
525  #                eType = entity.get("type")
526  #                if eType == "neg":
527  #                    continue
528  #                entityOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
529  #                ann = Annotation()
530  #                ann.type = eType
531  #                ann.text = entity.get("text")
532  #                ann.charBegin = entityOffset[0]
533  #                ann.charEnd = entityOffset[1] + 1
534  #                idStem = entity.get("id").rsplit(".", 1)[0]
535  #                if sentenceOffsets.has_key(idStem):
536  #                    #sentenceOffset = sentenceOffsets[idStem]
537  #                    ann.charBegin += sentenceOffset[0]
538  #                    ann.charEnd += sentenceOffset[0]
539  #                if entity.get("speculation") == "True":
540  #                    ann.speculation = True
541  #                if entity.get("negation") == "True":
542  #                    ann.negation = True
543  #                if entity.get("isName") == "True":
544  #                    stDoc.proteins.append(ann)
545  #                else:
546  #                    stDoc.triggers.append(ann)
547  #                tMap[entity.get("id")] = ann
548  #            # First map Coref proteins
549  #            corefProtMap = {}
550  #            for interaction in sentence.getiterator("interaction"):
551  #                intType = interaction.get("type")
552  #                if intType == "Target":
553  #                    e1 = interaction.get("e1")
554  #                    e2 = interaction.get("e2")
555  #                    if not tMap.has_key(e2):
556  #                        print >> sys.stderr, "Warning, no trigger for Coref Protein Target"
557  #                        continue
558  #                    e2 = tMap[e2]
559  #                    if not corefProtMap.has_key(e1):
560  #                        corefProtMap[e1] = []
561  #                    if not e2 in corefProtMap[e1]:
562  #                        corefProtMap[e1].append(e2)
563  #            # Then process all interactions
564  #            for interaction in sentence.getiterator("interaction"):
565  #                intType = interaction.get("type")
566  #                if intType == "neg" or intType == "Target":
567  #                    continue # Targets have already been put into a dictionary
568  #                elif intType in ["Site", "Gene_expression", "Transcription", "Protein_catabolism", "Localization", "Binding", "Phosphorylation", "Positive_regulation", "Negative_regulation", "Regulation"]:
569  #                    if intType == "Site":
570  #                        sites.append(interaction)
571  #                    else:
572  #                        e1 = interaction.get("e1")
573  #                        if eMap.has_key(e1):
574  #                            event = eMap[e1]
575  #                        else:
576  #                            event = Annotation()
577  #                            event.trigger = tMap[interaction.get("e1")]
578  #                            eMap[e1] = event
579  #                            stDoc.events.append(event)
580  #                        arg = [interaction.get("type"), interaction.get("e2"), None]
581  #                        event.arguments.append(arg)
582  #                else: # interaction is a relation
583  #                    rel = Annotation()
584  #                    rel.type = interaction.get("type")
585  #                    e1 = interaction.get("e1")
586  #                    e2 = interaction.get("e2")
587  #                    #assert rel.type == "Protein-Component" or rel.type == "Subunit-Complex" or rel.type == "Renaming", (rel.type, stDoc.id, interaction.get("id"))
588  #                    if rel.type == "Protein-Component" or rel.type == "Subunit-Complex":
589  #                        rel.arguments.append(["Arg1", tMap[e1], None])
590  #                        rel.arguments.append(["Arg2", tMap[e2], None])
591  #                    elif rel.type == "Renaming":
592  #                        rel.arguments.append(["Former", tMap[e1], None])
593  #                        rel.arguments.append(["New", tMap[e2], None])
594  #                    elif rel.type == "Coref":
595  #                        rel.arguments.append(["Anaphora", tMap[e1], None])
596  #                        rel.arguments.append(["Antecedent", tMap[e2], None])
597  #                        # Add protein arguments'
598  #                        if corefProtMap.has_key(e2):
599  #                            for prot in corefProtMap[e2]:
600  #                                rel.arguments.append(["Target", prot, None])
601  #                    else:
602  #                        assert False, (rel.type, stDoc.id, interaction.get("id"))
603  #                    stDoc.relations.append(rel)
604  #            # Map argument targets
605  #            for eKey in sorted(eMap.keys()):
606  #                event = eMap[eKey]
607  #                for arg in event.arguments:
608  #                    if tMap.has_key(arg[2]):
609  #                        arg[2] = tMap[arg2]
610  #                    else:
611  #                        arg[2] = eMap[arg2]
612  #            # Create STFormat ids
613  #            updateIds(stDoc.proteins)
614  #            updateIds(stDoc.triggers, getMaxId(stDoc.proteins) + 1)
615  #            updateIds(stDoc.events)
616  #            updateIds(stDoc.relations)
617  #            sentenceCount += 1
618  #
619  #    if output != None:
620  #        print >> sys.stderr, "Writing output to", output
621  #        writeSet(documents, output, resultFileTag=outputTag)
622  #    return documents
623  
624  if __name__=="__main__":
625      import sys
626      from optparse import OptionParser
627      # Import Psyco if available
628      try:
629          import psyco
630          psyco.full()
631          print >> sys.stderr, "Found Psyco, using"
632      except ImportError:
633          print >> sys.stderr, "Psyco not installed"
634  
635      optparser = OptionParser(description="Conversion between BioNLP ST format and Interaction XML")
636      optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in interaction xml format", metavar="FILE")
637      optparser.add_option("-o", "--output", default=None, dest="output", help="Output file in interaction xml format.")
638      optparser.add_option("-t", "--outputTag", default="a2", dest="outputTag", help="a2 file extension.")
639      optparser.add_option("-s", "--sentences", default=False, action="store_true", dest="sentences", help="Write each sentence to its own document")
640      optparser.add_option("-r", "--origIds", default=False, action="store_true", dest="origIds", help="Use stored original ids (can cause problems with duplicates).")
641      optparser.add_option("-a", "--task", default=2, type="int", dest="task", help="1 or 2")
642      optparser.add_option("-d", "--debug", default=False, action="store_true", dest="debug", help="Verbose output.")
643      (options, args) = optparser.parse_args()
644  
645      if options.input[-4:] == ".xml":
646          print >> sys.stderr, "Loading XML"
647          xml = ETUtils.ETFromObj(options.input)
648          if options.sentences:
649              print >> sys.stderr, "Converting to ST Format (sentences)"
650              toSTFormatSentences(xml, options.output, options.outputTag, options.origIds)
651          else:
652              print >> sys.stderr, "Converting to ST Format"
653              toSTFormat(xml, options.output, options.outputTag, options.origIds, debug=options.debug, task=options.task)
654  
655  
656  #if __name__=="__main__":
657  #    # Import Psyco if available
658  #    try:
659  #        import psyco
660  #        psyco.full()
661  #        print >> sys.stderr, "Found Psyco, using"
662  #    except ImportError:
663  #        print >> sys.stderr, "Psyco not installed"
664  #
665  #    #proteins, triggers, events = load(1335418, "/home/jari/biotext/tools/TurkuEventExtractionSystem-1.0/data/evaluation-data/evaluation-tools-devel-gold")
666  #    #write(1335418, "/home/jari/data/temp", proteins, triggers, events )
667  #
668  #    #p = "/home/jari/data/BioNLP09SharedTask/bionlp09_shared_task_development_data_rev1"
669  #    p = "/home/jari/data/BioNLP11SharedTask/BioNLP-ST_2011_Entity_Relations_development_data"
670  #    print "Loading documents"
671  #    documents = loadSet(p)
672  #    print "Writing XML"
673  #    xml = toInteractionXML(documents, "GENIA", "/home/jari/data/temp/new-devel.xml")
674  #    print "Converting back"
675  #    toSTFormat(xml, "/home/jari/data/temp/new-devel-stformat")
676  
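Command-line sketch (editor's note, not part of the module source): the __main__ block above exposes the converters through optparse. A hypothetical invocation converting an Interaction XML file to ST format could be:

    python ConvertXML.py -i corpus.xml -o /path/to/st-output -t a2 -a 2 -d

Note that the -s option calls toSTFormatSentences, which in this version of the module exists only inside the commented-out block, so selecting it would raise a NameError.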