Package TEES :: Package Utils :: Package STFormat :: Module Validate
[hide private]

Source Code for Module TEES.Utils.STFormat.Validate

  1  from collections import defaultdict 
  2   
def validateREL(documents):
    """Sanity-check documents produced for the REL task.

    For every document a warning is written to stderr if it contains any
    events, and each of its relations is asserted to have exactly two
    arguments.

    documents -- iterable of objects exposing ``events`` and ``relations``
                 sequences; each relation exposes an ``arguments`` sequence.

    Raises AssertionError for a relation whose argument count is not 2.
    """
    # Imported locally: at module level ``sys`` is only imported inside the
    # ``__main__`` guard, so it is not available when this module is imported.
    import sys

    for document in documents:
        if len(document.events) > 0:
            sys.stderr.write("Warning, events for REL task\n")
        # The relations belong to the document being checked (the bare name
        # ``relations`` was never defined anywhere).
        for relation in document.relations:
            assert len(relation.arguments) == 2, relation
10
def compareEvents(e1, e2):
    """Return True when two events are duplicates of each other.

    Events match when their ``type`` and ``trigger`` are equal, they have the
    same number of arguments, and the arguments agree pairwise (in order) on
    their first three fields.
    """
    if not (e1.type == e2.type):
        return False
    if not (e1.trigger == e2.trigger):
        return False
    if not (len(e1.arguments) == len(e2.arguments)):
        return False
    for left, right in zip(e1.arguments, e2.arguments):
        # Only fields 0..2 take part in the comparison.
        if any(left[field] != right[field] for field in (0, 1, 2)):
            return False
    return True
19
def removeDuplicates(events):
    """Return a new list of events with duplicates (per compareEvents) removed.

    The first occurrence of each group of equal events is kept. Arguments of
    the surviving events that pointed at a removed duplicate are re-targeted
    (in place) to the kept event.

    Since removed events cause nesting events' arguments to be remapped,
    some of these nesting events may in turn become duplicates. The pass is
    therefore repeated until it removes nothing.

    events -- list of hashable event objects with ``arguments`` lists whose
              items are mutable sequences (item[1] is the target, item[2]
              is asserted to be None when the target is remapped).

    Raises AssertionError if a remapped argument has a non-None item[2].
    """
    while True:
        # Map every duplicate to the earliest event it is equal to.
        replaceWith = {}
        for i, e1 in enumerate(events[:-1]):
            for e2 in events[i + 1:]:
                # Skip events already assigned to a group, and never treat
                # an object as a duplicate of itself.
                if e2 is e1 or e2 in replaceWith:
                    continue
                if compareEvents(e1, e2):
                    replaceWith[e2] = e1

        # Remove events and remap arguments
        kept = []
        for event in events:
            if event in replaceWith:
                continue
            for arg in event.arguments:
                if arg[1] in replaceWith:
                    assert arg[2] is None, arg
                    arg[1] = replaceWith[arg[1]]
            kept.append(event)

        numRemoved = len(events) - len(kept)
        events = kept
        if numRemoved == 0:
            return events
66
def getBISuperType(eType):
    """Map an entity type name to "ProteinEntity" or "GeneEntity".

    Returns None for any type that belongs to neither group.
    """
    superTypes = (
        ("ProteinEntity", ("GeneProduct", "Protein", "ProteinFamily", "PolymeraseComplex")),
        ("GeneEntity", ("Gene", "GeneFamily", "GeneComplex", "Regulon", "Site", "Promoter")),
    )
    for superType, members in superTypes:
        if eType in members:
            return superType
    return None
74
def isIDCore(eType):
    """Return True if eType is one of the five ID core entity type names."""
    coreTypes = (
        "Protein",
        "Regulon-operon",
        "Two-component-system",
        "Chemical",
        "Organism",
    )
    return eType in coreTypes
77
def isIDTask(proteins):
    """Return True if any item's ``type`` is an ID-only entity type.

    The ID-only types checked are "Regulon-operon", "Two-component-system"
    and "Chemical". An empty input yields False.
    """
    idOnlyTypes = ("Regulon-operon", "Two-component-system", "Chemical")
    return any(protein.type in idOnlyTypes for protein in proteins)
83 84 # Enforce type-specific limits
def validate(events, simulation=False, verbose=False, docId=None): #, taskIsID=None):
    """Enforce type-specific argument constraints on a list of events.

    Every event is checked against the GE, BB, BI and EPI rule groups below.
    Illegal arguments are removed from ``event.arguments`` in place, and
    events that end up violating a rule are collected for removal. Arguments
    of the surviving events that point at a removed event are dropped too, so
    the whole pass is repeated until it removes no more events.

    events     -- list of event objects exposing ``id``, ``type`` and a
                  mutable ``arguments`` list. Each argument is a mutable
                  sequence: item 0 is the role name, item 1 the target
                  object (with ``type`` and ``trigger``), item 2 an optional
                  site object; item 4, when present, is cleared together
                  with the site (presumably a site-related value -- TODO
                  confirm against the code that builds the arguments).
    simulation -- if True, verbose output is forced on, no event is removed
                  from the returned list and only a single pass is run.
                  Argument lists are still modified in place.
    verbose    -- if True, print a "VAL:" line for each rule that fires.
    docId      -- document identifier, used only as a prefix in the output.

    Returns a tuple ``(events, removeCounts)`` where ``removeCounts`` is a
    defaultdict(int) keyed by the type of each removed event and by
    ``"site-removed-from-<event type>"`` for each cleared site.

    Raises AssertionError when an argument has a site whose type is not
    "Entity", or when a "PartOf" event does not have exactly two arguments.

    NOTE(review): the nesting of this function was reconstructed from a
    flattened listing. ``docId = str(docId)`` is placed at function level and
    ``if len(arg) > 4`` inside the site check -- confirm both against the
    upstream source.
    """
    #assert taskIsID != None

    numRemoved = 0
    removeCounts = defaultdict(int)
    # totalRemoved is accumulated below but never read or returned.
    totalRemoved = 0
    if simulation:
        verbose = True
    # docId is concatenated with "." in every verbose message, so it has to
    # be a string even when the default None was passed.
    docId = str(docId)
    # Since removed events cause nesting events' arguments to be remapped,
    # some of these nesting events may in turn become duplicates. Loop until
    # all such duplicates are removed.
    firstLoop = True
    while(numRemoved > 0 or firstLoop):
        firstLoop = False
        toRemove = set()
        for event in events:
            # Check arguments
            for arg in event.arguments[:]:
                #if arg[1].type == "Entity":
                #    print "arg[1] == Entity"
                #    if not verbose:
                #        assert False, arg
                if arg[2] != None and arg[2].type != "Entity":
                    print "arg[2] != Entity:", arg[2].type
                    #if not verbose:
                    if verbose: print "VAL:", docId + "." + str(event.id), "Warning, non-entity type arg[2]"
                    assert False, arg
            # GE-regulation rules
            # The substring test matches "Regulation" as well as
            # "Positive_regulation" / "Negative_regulation".
            if "egulation" in event.type:
                typeCounts = {"Cause":0, "Theme":0}
                for arg in event.arguments[:]:
                    if arg[0] not in typeCounts or not (isIDCore(arg[1].type) or arg[1].trigger != None):
                        # if arg[1] has no trigger, this means that arg[1] is a trigger for
                        # which no event was predicted
                        event.arguments.remove(arg)
                        if verbose: print "VAL:", docId + "." + str(event.id), "Removed", event.type, "event argument of type", arg[0], arg
                    else:
                        typeCounts[arg[0]] += 1
                if typeCounts["Theme"] == 0:# and not taskIsID:
                    toRemove.add(event)
                    if verbose: print "VAL:", docId + "." + str(event.id), "(P/N/R)egulation with no themes"
                if len(event.arguments) == 0:
                    toRemove.add(event)
                    if verbose: print "VAL:", docId + "." + str(event.id), "(P/N/R)egulation with no arguments"
            elif event.type != "Catalysis": # The three regulations and catalysis are the only events that can have a cause
                for arg in event.arguments[:]:
                    if arg[0] == "Cause":
                        event.arguments.remove(arg)
                        if verbose: print "VAL:", docId + "." + str(event.id), event.type, "with", arg[0], "arg of type", arg[1].type
            # Remove illegal arguments (GE=Only a protein can be a Theme for a non-regulation event)
            if event.type in ["Gene_expression", "Transcription"]:
                for arg in event.arguments[:]:
                    if arg[0] == "Theme" and arg[1].type not in ["Protein", "Regulon-operon"]:
                        event.arguments.remove(arg)
                        if verbose: print "VAL:", docId + "." + str(event.id), event.type, "with", arg[0], "arg of type", arg[1].type
            if event.type in ["Protein_catabolism", "Phosphorylation"]:
                for arg in event.arguments[:]:
                    if arg[0] == "Theme" and arg[1].type not in ["Protein"]:
                        event.arguments.remove(arg)
                        if verbose: print "VAL:", docId + "." + str(event.id), event.type, "with", arg[0], "arg of type", arg[1].type
            if event.type in ["Localization", "Binding"]:
                for arg in event.arguments[:]:
                    if arg[0] == "Theme" and arg[1].type not in ["Protein", "Regulon-operon", "Two-component-system", "Chemical", "Organism"]:
                        event.arguments.remove(arg)
                        if verbose: print "VAL:", docId + "." + str(event.id), event.type, "with", arg[0], "arg of type", arg[1].type
            # Check non-regulation events
            if event.type in ["Gene_expression", "Transcription", "Protein_catabolism", "Phosphorylation", "Localization", "Binding"]:
                themeCount = 0
                for arg in event.arguments:
                    if arg[0] == "Theme":
                        themeCount += 1
                if themeCount == 0:
                    # A theme-less Localization that still has arguments is only
                    # removed if one of them is a ToLoc/AtLoc argument.
                    if event.type == "Localization" and len(event.arguments) > 0: # Also used in BB
                        for arg in event.arguments:
                            if arg[0] in ["ToLoc", "AtLoc"]: # GE-task Localization
                                if verbose: print "VAL:", docId + "." + str(event.id), event.type, "with no themes"
                                toRemove.add(event)
                                break
                    else:
                        toRemove.add(event)
                        if verbose: print "VAL:", docId + "." + str(event.id), event.type, "with no themes"
            # Filter sites from GE events that can't have them (moved from STTools.writeEvents)
            if event.type not in ["Binding", "Phosphorylation"]: # also ["Positive_regulation", "Negative_regulation", "Regulation"]
                for arg in event.arguments:
                    if arg[2] != None:
                        if verbose: print "VAL:", docId + "." + str(event.id), event.type, "with", arg[0], "arg of type", arg[1].type, "with a site."
                        removeCounts["site-removed-from-" + event.type] += 1
                        arg[2] = None
                        if len(arg) > 4:
                            arg[4] = None
            # check non-binding events
            if event.type != "Binding":
                themeCount = 0
                for arg in event.arguments:
                    if arg[0] == "Theme":
                        themeCount += 1
                if themeCount > 1:
                    toRemove.add(event)
                    if verbose: print "VAL:", docId + "." + str(event.id), "Non-binding event", event.type, "with", themeCount, "themes"
            if event.type == "Process":
                for arg in event.arguments[:]:
                    if arg[0] != "Participant":
                        event.arguments.remove(arg)
                        if verbose: print "VAL:", docId + "." + str(event.id), "Non-participant argument", arg[0], "for", event.type
                    elif not isIDCore(arg[1].type):
                        event.arguments.remove(arg)
                        if verbose: print "VAL:", docId + "." + str(event.id), arg[0], "argument with target", arg[1].type
            if event.type == "PartOf": # BB
                assert len(event.arguments) == 2
                # BB
                if event.arguments[0][1].type != "Host":
                    toRemove.add(event)
                    if verbose: print "VAL:", docId + "." + str(event.id), event.type, "with arg 1 of type", event.arguments[0][1].type
                if event.arguments[1][1].type != "HostPart":
                    toRemove.add(event)
                    if verbose: print "VAL:", docId + "." + str(event.id), event.type, "with arg 2 of type", event.arguments[1][1].type
            if event.type == "Localization": # BB and others
                for arg in event.arguments:
                    if arg[0] == "Bacterium" and arg[1].type != "Bacterium":
                        if verbose: print "VAL:", docId + "." + str(event.id), event.type, "with", arg[0], "arg of type", arg[1].type
                        toRemove.add(event)

            # BI-rules
            # Each BI event type constrains the types of its first and second
            # argument targets; any violation removes the whole event.
            if len(event.arguments) == 2:
                arg1Type = event.arguments[0][1].type
                arg1SuperType = getBISuperType(arg1Type)
                arg2Type = event.arguments[1][1].type
                arg2SuperType = getBISuperType(arg2Type)
                if event.type == "RegulonDependence":
                    if arg1Type != "Regulon": toRemove.add(event)
                    if arg2SuperType not in ["GeneEntity", "ProteinEntity"]: toRemove.add(event)
                elif event.type == "BindTo":
                    if arg1SuperType != "ProteinEntity": toRemove.add(event)
                    if arg2Type not in ["Site", "Promoter", "Gene", "GeneComplex"]: toRemove.add(event)
                elif event.type == "TranscriptionFrom":
                    if arg1Type not in ["Transcription", "Expression"]: toRemove.add(event)
                    if arg2Type not in ["Site", "Promoter"]: toRemove.add(event)
                elif event.type == "RegulonMember":
                    if arg1Type != "Regulon": toRemove.add(event)
                    if arg2SuperType not in ["GeneEntity", "ProteinEntity"]: toRemove.add(event)
                elif event.type == "SiteOf":
                    if arg1Type != "Site": toRemove.add(event)
                    if not (arg2Type in ["Site", "Promoter"] or arg2SuperType == "GeneEntity"): toRemove.add(event)
                elif event.type == "TranscriptionBy":
                    if arg1Type != "Transcription": toRemove.add(event)
                    if arg2SuperType != "ProteinEntity": toRemove.add(event)
                elif event.type == "PromoterOf":
                    if arg1Type != "Promoter": toRemove.add(event)
                    if arg2SuperType not in ["ProteinEntity", "GeneEntity"]: toRemove.add(event)
                elif event.type == "PromoterDependence":
                    if arg1Type != "Promoter": toRemove.add(event)
                    if arg2SuperType not in ["ProteinEntity", "GeneEntity"]: toRemove.add(event)
                elif event.type == "ActionTarget":
                    if arg1Type not in ["Action", "Expression", "Transcription"]: toRemove.add(event)
                elif event.type == "Interaction":
                    if arg1SuperType not in ["ProteinEntity", "GeneEntity"]: toRemove.add(event)
                    if arg2SuperType not in ["ProteinEntity", "GeneEntity"]: toRemove.add(event)
            # BI-task implicit rules (not defined in documentation, discovered by evaluator complaining)
            # arg1Type / arg2Type are the values computed in the BI-rules block
            # above; the argument list is not modified in between.
            if len(event.arguments) == 2:
                # Evaluator says: "SEVERE: role Target does not allow entity of type Site".
                # This is not actually true, because if you check this for all Target-arguments, and
                # remove such events, performance decreases for the gold-data. But what can you do,
                # the evaluator keeps complaining, and won't process the data. The "solution" is to
                # remove from Target/Site-checking those classes which reduce performance on gold data.
                if event.type not in ["BindTo", "SiteOf"]:
                    if arg1Type == "Site" and event.arguments[0][0] == "Target":
                        if verbose: print "VAL:", docId + "." + str(event.id), "Removing illegal Target-argument from event", event.type
                        toRemove.add(event)
                    if arg2Type == "Site" and event.arguments[1][0] == "Target":
                        if verbose: print "VAL:", docId + "." + str(event.id), "Removing illegal Target-argument from event", event.type
                        toRemove.add(event)
            # EPI-specific rules
            if event.type in ["Dephosphorylation",
                              "Hydroxylation",
                              "Dehydroxylation",
                              "Ubiquitination",
                              "Deubiquitination",
                              "DNA_methylation",
                              "DNA_demethylation",
                              "Glycosylation",
                              "Deglycosylation",
                              "Acetylation",
                              "Deacetylation",
                              "Methylation",
                              "Demethylation",
                              "Catalysis"]:
                eventType = event.type
                # Filter arguments
                for arg in event.arguments[:]:
                    if arg[2] != None and eventType == "Catalysis": # No task 2 for Catalysis
                        arg[2] = None
                    if arg[0] in ["Theme", "Cause"] and (arg[1].trigger == None and arg[1].type not in ["Protein", "Entity"]): # Suspicious, trigger as argument
                        event.arguments.remove(arg)
                    elif arg[0] == "Cause" and (arg[1].type != "Protein" or eventType != "Catalysis"):
                        event.arguments.remove(arg)
                    elif arg[0] == "Theme":
                        # Catalysis rejects Entity/Protein themes; every other
                        # EPI event accepts only Protein themes.
                        if eventType == "Catalysis":
                            if arg[1].type in ["Entity", "Protein"]:
                                event.arguments.remove(arg)
                        elif arg[1].type != "Protein":
                            event.arguments.remove(arg)
                    elif arg[0] == "Sidechain" and eventType not in ["Glycosylation", "Deglycosylation"]:
                        event.arguments.remove(arg)
                    elif arg[0] == "Contextgene" and (eventType not in ["Acetylation", "Deacetylation", "Methylation", "Demethylation"] or arg[1].type != "Protein"):
                        event.arguments.remove(arg)
                # Count remaining arguments
                typeCounts = {"Cause":0, "Theme":0}
                for arg in event.arguments:
                    if arg[0] in typeCounts:
                        typeCounts[arg[0]] += 1
                # Make choices
                if typeCounts["Theme"] == 0:
                    toRemove.add(event)
                    if verbose: print "VAL:", docId + "." + str(event.id), "EPI-event with no themes"
                if len(event.arguments) == 0:
                    toRemove.add(event)
                    if verbose: print "VAL:", docId + "." + str(event.id), "EPI-event with no arguments"

        # Remove events and remap arguments
        if not simulation:
            kept = []
            for event in events:
                if event not in toRemove:
                    # Drop arguments whose target event is being removed.
                    for arg in event.arguments[:]:
                        if arg[1] in toRemove:
                            event.arguments.remove(arg)
                    kept.append(event)
                else:
                    removeCounts[event.type] += 1
            numRemoved = len(events) - len(kept)
            totalRemoved += numRemoved
            events = kept
        else:
            # Nothing is removed in simulation mode, so end after one pass.
            numRemoved = 0
    return events, removeCounts
def removeUnusedTriggers(document):
    """Drop triggers that no event or relation refers to.

    A trigger is kept if it equals the ``trigger`` of some event/relation, or
    appears as item 1 or item 2 of one of their arguments. The surviving
    triggers, in their original order, are stored back in
    ``document.triggers``.
    """
    def _refersTo(annotation, trigger):
        # True if the annotation uses the trigger directly or via an argument.
        if annotation.trigger == trigger:
            return True
        return any(arg[1] == trigger or arg[2] == trigger
                   for arg in annotation.arguments)

    usedTriggers = []
    for trigger in document.triggers:
        if any(_refersTo(annotation, trigger)
               for annotation in document.events + document.relations):
            usedTriggers.append(trigger)
    document.triggers = usedTriggers
341
def allValidate(document, counts, task, verbose=False):
    """Run the full clean-up pipeline on one document and record statistics.

    In order: constraint validation, duplicate removal, task-specific argument
    and entity removal, and finally removal of unused triggers. The document
    is modified in place and the number of removed items per step is added to
    ``counts`` (which must support ``+=`` on missing keys).
    """
    # Constraint validation
    eventsBefore = len(document.events)
    validEvents, invalidCounts = validate(document.events, verbose=verbose, docId=document.id) #, taskIsID=isIDTask(document.proteins))
    document.events = validEvents
    for reason, amount in invalidCounts.items():
        counts["invalid-" + reason] += amount
    counts["validation-removed"] += eventsBefore - len(document.events)

    # Duplicates
    eventsBefore = len(document.events)
    document.events = removeDuplicates(document.events)
    counts["duplicates-removed"] += eventsBefore - len(document.events)

    # Task-dependent filtering
    removeArguments(document, task, counts)
    removeEntities(document, task, counts)

    # triggers
    triggersBefore = len(document.triggers)
    removeUnusedTriggers(document)
    counts["unused-triggers-removed"] += triggersBefore - len(document.triggers)
357
def removeArguments(document, task, counts):
    """For task 1, strip the task-2 arguments from all events of a document.

    Arguments whose role is Site, AtLoc, ToLoc, Sidechain or Contextgene are
    removed in place and counted under ``counts["t2-arguments-removed"]``.
    Nothing happens for any other task value.
    """
    if task != 1:
        return
    task2Roles = ["Site", "AtLoc", "ToLoc", "Sidechain", "Contextgene"]
    for event in document.events:
        # Collect first, then remove, so the list is not changed while scanned.
        unwanted = [arg for arg in event.arguments if arg[0] in task2Roles]
        for arg in unwanted:
            event.arguments.remove(arg)
            counts["t2-arguments-removed"] += 1
366
def removeEntities(document, task, counts):
    """For task 1, drop all triggers of type "Entity" from a document.

    "Entity"-entities are not used in task 1, so they can be removed then.
    Each removed trigger is counted under ``counts["t2-entities-removed"]``;
    for any other task value the document is left untouched.
    """
    if task != 1:
        return
    survivors = []
    for candidate in document.triggers:
        if not candidate.type == "Entity":
            survivors.append(candidate)
            continue
        counts["t2-entities-removed"] += 1
    document.triggers = survivors
if __name__=="__main__":
    # Command-line entry point: load a document set, then write it back out
    # with validation enabled.
    import sys
    import STTools
    from optparse import OptionParser
    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        sys.stderr.write("Found Psyco, using\n")
    except ImportError:
        sys.stderr.write("Psyco not installed\n")

    optparser = OptionParser(description="Validate BioNLP'11 event constraints")
    optparser.add_option("-i", "--input", default=None, dest="input", help="", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="")
    optparser.add_option("--debug", default=False, action="store_true", dest="debug", help="")
    optparser.add_option("--noScores", default=False, action="store_true", dest="noScores", help="")
    (options, args) = optparser.parse_args()

    # The default output name is derived from the input name, so a missing
    # input must be reported here instead of failing on None + str below.
    if options.input is None:
        optparser.error("no input given (use -i/--input)")
    if options.output is None:
        options.output = options.input + "-validated.tar.gz"
    sys.stderr.write("Reading documents\n")
    documents = STTools.loadSet(options.input, readScores=(not options.noScores))
    sys.stderr.write("Writing documents\n")
    STTools.writeSet(documents, options.output, validate=True, writeScores=(not options.noScores), task=2, debug=options.debug)