1 from collections import defaultdict
2
# NOTE(review): the `def` line that owns this body is not visible in this
# excerpt; `documents` is presumably its parameter -- confirm against the
# full file.
for document in documents:
    # Only warns (does not fail) when a document carries events.
    if len(document.events) > 0:
        # NOTE(review): in the visible code `sys` is imported only inside the
        # `__main__` guard, so this line looks like it raises NameError when
        # the module is imported rather than run -- confirm.
        print >> sys.stderr, "Warning, events for REL task"
    # NOTE(review): `relations` is not defined anywhere in the visible code;
    # presumably `document.relations` was intended -- confirm.
    for relation in relations:
        # Every relation must have exactly two arguments.
        assert len(relation.arguments) == 2
        pass
10
def compareEvents(e1, e2):
    """Return True when two events are duplicates of each other.

    Two events count as duplicates when they have the same type, the same
    trigger, the same number of arguments, and every pair of arguments at
    the same position agrees on its first three fields (indices 0, 1, 2).

    Args:
        e1, e2: objects exposing `.type`, `.trigger` and `.arguments`, where
            `.arguments` is a sequence of indexable argument records.

    Returns:
        bool: True if the events match as described, False otherwise.
    """
    # Cheap header comparison first; argument records are only inspected when
    # type, trigger and argument count all agree.
    if not (e1.type == e2.type
            and e1.trigger == e2.trigger
            and len(e1.arguments) == len(e2.arguments)):
        return False
    # Arguments are compared position by position, so the same arguments in a
    # different order do not count as a match.
    return not any(
        arg1[0] != arg2[0] or arg1[1] != arg2[1] or arg1[2] != arg2[2]
        for arg1, arg2 in zip(e1.arguments, e2.arguments)
    )
19
def removeDuplicates(events):
    """Remove duplicate events and re-point arguments at the kept copy.

    For every group of events that `compareEvents` reports as equal, the
    earliest one in `events` is kept and the others are dropped. Arguments of
    surviving events whose target (index 1) was dropped are re-pointed at the
    kept event. Re-pointing can make two previously different events equal,
    so the whole pass repeats until a pass removes nothing.

    Args:
        events: list of hashable event objects exposing `.arguments`, a list
            of mutable argument records (index 1 = target, index 2 = site).

    Returns:
        list: the surviving events, in their original relative order. The
        input list itself is not modified, but argument records of surviving
        events are edited in place.

    Raises:
        AssertionError: if an argument that targets a removed duplicate also
            carries a non-None value at index 2.
    """
    firstLoop = True
    numRemoved = 0
    while numRemoved > 0 or firstLoop:
        firstLoop = False

        # Map each duplicate to the earliest event it is equal to. An event
        # already recorded as a duplicate is never re-assigned.
        replaceWith = {}
        for i, mainEvent in enumerate(events):
            for other in events[i + 1:]:
                if compareEvents(mainEvent, other) and other not in replaceWith:
                    replaceWith[other] = mainEvent

        # Drop the duplicates and remap arguments of the events that stay.
        kept = []
        for event in events:
            if event in replaceWith:
                continue
            for arg in event.arguments:
                if arg[1] in replaceWith:
                    assert arg[2] is None
                    arg[1] = replaceWith[arg[1]]
            kept.append(event)

        numRemoved = len(events) - len(kept)
        events = kept
    return events
66
def getBISuperType(eType):
    """Map an entity type name to its super type.

    Args:
        eType: entity type name.

    Returns:
        "ProteinEntity" or "GeneEntity" for the type names listed below,
        None for every other value.
    """
    if eType in ("GeneProduct", "Protein", "ProteinFamily", "PolymeraseComplex"):
        return "ProteinEntity"
    if eType in ("Gene", "GeneFamily", "GeneComplex", "Regulon", "Site", "Promoter"):
        return "GeneEntity"
    return None
74
def isIDCore(eType):
    """Return True if `eType` is one of the five listed core entity types.

    Args:
        eType: entity type name.

    Returns:
        bool: True for "Protein", "Regulon-operon", "Two-component-system",
        "Chemical" or "Organism"; False for anything else.
    """
    return eType in ("Protein", "Regulon-operon", "Two-component-system", "Chemical", "Organism")
77
# NOTE(review): the `def` line that owns this body is not visible in this
# excerpt; `proteins` is presumably its parameter -- confirm against the
# full file.
# Yields True as soon as one element of `proteins` has one of the three
# listed types, and False when none does (including an empty sequence).
for protein in proteins:
    if protein.type in ["Regulon-operon", "Two-component-system", "Chemical"]:
        return True
return False
83
84
def validate(events, simulation=False, verbose=False, docId=None):
    """Filter `events` against a fixed set of per-event-type argument rules.

    Each pass walks every event, strips arguments that break a rule and marks
    events that end up invalid. Marked events are then dropped, together with
    every argument of a surviving event that pointed at a dropped event.
    Because that can invalidate further events, passes repeat until one pass
    removes nothing.

    Calls `isIDCore` and `getBISuperType`, which are defined elsewhere in this
    file.

    Args:
        events: list of hashable event objects. The code reads `.type`,
            `.id` and `.arguments` (a mutable list of mutable argument
            records). In a record, index 0 is compared against role names
            such as "Theme" or "Cause", index 1 is an object with `.type`
            and `.trigger`, index 2 is None or an object with `.type`;
            index 4 is cleared together with index 2 when the record is
            long enough.
        simulation: when true, no event is dropped from the list (arguments
            are still edited in place), `verbose` is forced on and exactly
            one pass runs.
        verbose: print a "VAL:" line to stdout for rules that fire.
        docId: prefix for the verbose output; converted with str().

    Returns:
        tuple: `(events, removeCounts)` -- the surviving events and a
        defaultdict(int) with one counter per removed event type plus
        "site-removed-from-<type>" counters.

    Raises:
        AssertionError: if an argument's index-2 object has a type other
            than "Entity", or if a "PartOf" event does not have exactly two
            arguments.
    """
    numRemoved = 0
    removeCounts = defaultdict(int)
    # Accumulated across passes but never read or returned.
    totalRemoved = 0
    if simulation:
        verbose = True
    docId = str(docId)

    # Always run one pass; repeat while the previous pass removed events.
    firstLoop = True
    while(numRemoved > 0 or firstLoop):
        firstLoop = False
        toRemove = set()
        for event in events:
            # Hard failure: the index-2 slot may only hold an "Entity".
            for arg in event.arguments[:]:
                if arg[2] != None and arg[2].type != "Entity":
                    print "arg[2] != Entity:", arg[2].type
                    if verbose: print "VAL:", docId + "." + str(event.id), "Warning, non-entity type arg[2]"
                    assert False, arg

            # Substring test: matches every type name containing "egulation",
            # whatever precedes it.
            if "egulation" in event.type:
                typeCounts = {"Cause":0, "Theme":0}
                # Iterate over a copy because arguments are removed in the loop.
                for arg in event.arguments[:]:
                    # Keep only Cause/Theme arguments whose target is a core
                    # entity type or has a trigger of its own.
                    if arg[0] not in typeCounts or not (isIDCore(arg[1].type) or arg[1].trigger != None):
                        event.arguments.remove(arg)
                        if verbose: print "VAL:", docId + "." + str(event.id), "Removed", event.type, "event argument of type", arg[0], arg
                    else:
                        typeCounts[arg[0]] += 1
                if typeCounts["Theme"] == 0:
                    toRemove.add(event)
                    if verbose: print "VAL:", docId + "." + str(event.id), "(P/N/R)egulation with no themes"
                if len(event.arguments) == 0:
                    toRemove.add(event)
                    if verbose: print "VAL:", docId + "." + str(event.id), "(P/N/R)egulation with no arguments"
            elif event.type != "Catalysis":
                # Every other type except "Catalysis" loses its Cause arguments.
                for arg in event.arguments[:]:
                    if arg[0] == "Cause":
                        event.arguments.remove(arg)
                        if verbose: print "VAL:", docId + "." + str(event.id), event.type, "with", arg[0], "arg of type", arg[1].type

            # Per-type restrictions on what a Theme argument may point at.
            if event.type in ["Gene_expression", "Transcription"]:
                for arg in event.arguments[:]:
                    if arg[0] == "Theme" and arg[1].type not in ["Protein", "Regulon-operon"]:
                        event.arguments.remove(arg)
                        if verbose: print "VAL:", docId + "." + str(event.id), event.type, "with", arg[0], "arg of type", arg[1].type
            if event.type in ["Protein_catabolism", "Phosphorylation"]:
                for arg in event.arguments[:]:
                    if arg[0] == "Theme" and arg[1].type not in ["Protein"]:
                        event.arguments.remove(arg)
                        if verbose: print "VAL:", docId + "." + str(event.id), event.type, "with", arg[0], "arg of type", arg[1].type
            if event.type in ["Localization", "Binding"]:
                for arg in event.arguments[:]:
                    if arg[0] == "Theme" and arg[1].type not in ["Protein", "Regulon-operon", "Two-component-system", "Chemical", "Organism"]:
                        event.arguments.remove(arg)
                        if verbose: print "VAL:", docId + "." + str(event.id), event.type, "with", arg[0], "arg of type", arg[1].type

            # These types need at least one Theme after the filtering above.
            if event.type in ["Gene_expression", "Transcription", "Protein_catabolism", "Phosphorylation", "Localization", "Binding"]:
                themeCount = 0
                for arg in event.arguments:
                    if arg[0] == "Theme":
                        themeCount += 1
                if themeCount == 0:
                    # A theme-less "Localization" that still has arguments is
                    # removed only if one of them is ToLoc/AtLoc; otherwise it
                    # is left for the later "Localization" rule.
                    if event.type == "Localization" and len(event.arguments) > 0:
                        for arg in event.arguments:
                            if arg[0] in ["ToLoc", "AtLoc"]:
                                if verbose: print "VAL:", docId + "." + str(event.id), event.type, "with no themes"
                                toRemove.add(event)
                                break
                    else:
                        toRemove.add(event)
                        if verbose: print "VAL:", docId + "." + str(event.id), event.type, "with no themes"

            # Only "Binding" and "Phosphorylation" keep the index-2 (site)
            # slot; it is blanked everywhere else and counted.
            if event.type not in ["Binding", "Phosphorylation"]:
                for arg in event.arguments:
                    if arg[2] != None:
                        if verbose: print "VAL:", docId + "." + str(event.id), event.type, "with", arg[0], "arg of type", arg[1].type, "with a site."
                        removeCounts["site-removed-from-" + event.type] += 1
                        arg[2] = None
                        if len(arg) > 4:
                            arg[4] = None

            # Only "Binding" may have more than one Theme.
            if event.type != "Binding":
                themeCount = 0
                for arg in event.arguments:
                    if arg[0] == "Theme":
                        themeCount += 1
                if themeCount > 1:
                    toRemove.add(event)
                    if verbose: print "VAL:", docId + "." + str(event.id), "Non-binding event", event.type, "with", themeCount, "themes"
            # "Process" keeps only Participant arguments with a core target.
            if event.type == "Process":
                for arg in event.arguments[:]:
                    if arg[0] != "Participant":
                        event.arguments.remove(arg)
                        if verbose: print "VAL:", docId + "." + str(event.id), "Non-participant argument", arg[0], "for", event.type
                    elif not isIDCore(arg[1].type):
                        event.arguments.remove(arg)
                        if verbose: print "VAL:", docId + "." + str(event.id), arg[0], "argument with target", arg[1].type
            # "PartOf" must be exactly (Host, HostPart), in that order.
            if event.type == "PartOf":
                assert len(event.arguments) == 2

                if event.arguments[0][1].type != "Host":
                    toRemove.add(event)
                    if verbose: print "VAL:", docId + "." + str(event.id), event.type, "with arg 1 of type", event.arguments[0][1].type
                if event.arguments[1][1].type != "HostPart":
                    toRemove.add(event)
                    if verbose: print "VAL:", docId + "." + str(event.id), event.type, "with arg 2 of type", event.arguments[1][1].type
            # A Bacterium-role argument must point at a "Bacterium".
            if event.type == "Localization":
                for arg in event.arguments:
                    if arg[0] == "Bacterium" and arg[1].type != "Bacterium":
                        if verbose: print "VAL:", docId + "." + str(event.id), event.type, "with", arg[0], "arg of type", arg[1].type
                        toRemove.add(event)

            # Type signatures for two-argument events: each listed event type
            # constrains the type (or super type) of its first and second
            # argument targets. These removals are silent even when verbose.
            if len(event.arguments) == 2:
                arg1Type = event.arguments[0][1].type
                arg1SuperType = getBISuperType(arg1Type)
                arg2Type = event.arguments[1][1].type
                arg2SuperType = getBISuperType(arg2Type)
                if event.type == "RegulonDependence":
                    if arg1Type != "Regulon": toRemove.add(event)
                    if arg2SuperType not in ["GeneEntity", "ProteinEntity"]: toRemove.add(event)
                elif event.type == "BindTo":
                    if arg1SuperType != "ProteinEntity": toRemove.add(event)
                    if arg2Type not in ["Site", "Promoter", "Gene", "GeneComplex"]: toRemove.add(event)
                elif event.type == "TranscriptionFrom":
                    if arg1Type not in ["Transcription", "Expression"]: toRemove.add(event)
                    if arg2Type not in ["Site", "Promoter"]: toRemove.add(event)
                elif event.type == "RegulonMember":
                    if arg1Type != "Regulon": toRemove.add(event)
                    if arg2SuperType not in ["GeneEntity", "ProteinEntity"]: toRemove.add(event)
                elif event.type == "SiteOf":
                    if arg1Type != "Site": toRemove.add(event)
                    if not (arg2Type in ["Site", "Promoter"] or arg2SuperType == "GeneEntity"): toRemove.add(event)
                elif event.type == "TranscriptionBy":
                    if arg1Type != "Transcription": toRemove.add(event)
                    if arg2SuperType != "ProteinEntity": toRemove.add(event)
                elif event.type == "PromoterOf":
                    if arg1Type != "Promoter": toRemove.add(event)
                    if arg2SuperType not in ["ProteinEntity", "GeneEntity"]: toRemove.add(event)
                elif event.type == "PromoterDependence":
                    if arg1Type != "Promoter": toRemove.add(event)
                    if arg2SuperType not in ["ProteinEntity", "GeneEntity"]: toRemove.add(event)
                elif event.type == "ActionTarget":
                    # Only the first argument is constrained for this type.
                    if arg1Type not in ["Action", "Expression", "Transcription"]: toRemove.add(event)
                elif event.type == "Interaction":
                    if arg1SuperType not in ["ProteinEntity", "GeneEntity"]: toRemove.add(event)
                    if arg2SuperType not in ["ProteinEntity", "GeneEntity"]: toRemove.add(event)

            # Same guard as the block above, so arg1Type/arg2Type are the
            # values computed there for this event.
            if len(event.arguments) == 2:
                # Outside "BindTo"/"SiteOf", a Target-role argument must not
                # point at a "Site".
                if event.type not in ["BindTo", "SiteOf"]:
                    if arg1Type == "Site" and event.arguments[0][0] == "Target":
                        if verbose: print "VAL:", docId + "." + str(event.id), "Removing illegal Target-argument from event", event.type
                        toRemove.add(event)
                    if arg2Type == "Site" and event.arguments[1][0] == "Target":
                        if verbose: print "VAL:", docId + "." + str(event.id), "Removing illegal Target-argument from event", event.type
                        toRemove.add(event)

            # Rules for the modification-style event types listed here (the
            # verbose messages below call them "EPI-event").
            if event.type in ["Dephosphorylation",
                              "Hydroxylation",
                              "Dehydroxylation",
                              "Ubiquitination",
                              "Deubiquitination",
                              "DNA_methylation",
                              "DNA_demethylation",
                              "Glycosylation",
                              "Deglycosylation",
                              "Acetylation",
                              "Deacetylation",
                              "Methylation",
                              "Demethylation",
                              "Catalysis"]:
                eventType = event.type

                # Filter arguments; iterate over a copy because of removals.
                for arg in event.arguments[:]:
                    # "Catalysis" never keeps the index-2 (site) slot.
                    if arg[2] != None and eventType == "Catalysis":
                        arg[2] = None
                    # Theme/Cause pointing at something that has no trigger
                    # and is neither "Protein" nor "Entity".
                    if arg[0] in ["Theme", "Cause"] and (arg[1].trigger == None and arg[1].type not in ["Protein", "Entity"]):
                        event.arguments.remove(arg)
                    # Cause is only allowed on "Catalysis" and only for a "Protein".
                    elif arg[0] == "Cause" and (arg[1].type != "Protein" or eventType != "Catalysis"):
                        event.arguments.remove(arg)
                    elif arg[0] == "Theme":
                        # A "Catalysis" theme may not be an "Entity"/"Protein";
                        # for every other listed type it must be a "Protein".
                        if eventType == "Catalysis":
                            if arg[1].type in ["Entity", "Protein"]:
                                event.arguments.remove(arg)
                        elif arg[1].type != "Protein":
                            event.arguments.remove(arg)
                    elif arg[0] == "Sidechain" and eventType not in ["Glycosylation", "Deglycosylation"]:
                        event.arguments.remove(arg)
                    elif arg[0] == "Contextgene" and (eventType not in ["Acetylation", "Deacetylation", "Methylation", "Demethylation"] or arg[1].type != "Protein"):
                        event.arguments.remove(arg)

                # Count what is left after filtering.
                typeCounts = {"Cause":0, "Theme":0}
                for arg in event.arguments:
                    if arg[0] in typeCounts:
                        typeCounts[arg[0]] += 1

                if typeCounts["Theme"] == 0:
                    toRemove.add(event)
                    if verbose: print "VAL:", docId + "." + str(event.id), "EPI-event with no themes"
                if len(event.arguments) == 0:
                    toRemove.add(event)
                    if verbose: print "VAL:", docId + "." + str(event.id), "EPI-event with no arguments"

        # Apply the removals collected during this pass.
        if not simulation:
            kept = []
            for event in events:
                if event not in toRemove:
                    # Drop arguments that pointed at an event being removed;
                    # the next pass re-checks the event without them.
                    for arg in event.arguments[:]:
                        if arg[1] in toRemove:
                            event.arguments.remove(arg)
                    kept.append(event)
                else:
                    removeCounts[event.type] += 1
            numRemoved = len(events) - len(kept)
            totalRemoved += numRemoved
            events = kept
        else:
            # Nothing is removed in simulation mode, so stop after one pass.
            numRemoved = 0
    return events, removeCounts
321
323
def removeUnusedTriggers(document):
    """Drop triggers that no event or relation of `document` refers to.

    A trigger is kept when at least one element of
    `document.events + document.relations` has it as its `.trigger`, or has
    an argument record whose index 1 or index 2 equals it.

    Args:
        document: object exposing `.triggers`, `.events` and `.relations`
            (all lists). `document.triggers` is replaced by a new list
            holding the kept triggers in their original order.
    """
    # Built once: neither list is modified while triggers are being checked.
    referrers = document.events + document.relations

    def isUsed(trigger):
        # True as soon as one event or relation refers to `trigger`.
        for event in referrers:
            if event.trigger == trigger:
                return True
            if any(arg[1] == trigger or arg[2] == trigger for arg in event.arguments):
                return True
        return False

    document.triggers = [trigger for trigger in document.triggers if isUsed(trigger)]
341
def allValidate(document, counts, task, verbose=False):
    """Run every clean-up step on `document` and record what was removed.

    Steps, in order: constraint validation (`validate`), duplicate removal
    (`removeDuplicates`), task-dependent argument and entity removal
    (`removeArguments`, `removeEntities`), and finally removal of triggers
    nothing refers to any more (`removeUnusedTriggers`).

    Args:
        document: object exposing `.id`, `.events` and `.triggers`; both
            lists are replaced by their filtered versions.
        counts: mapping of counters updated with `+=`, so missing keys must
            default to a number (e.g. a defaultdict(int)).
        task: passed through to `removeArguments` and `removeEntities`.
        verbose: passed through to `validate`.
    """
    # Step 1: constraint validation.
    eventsBefore = len(document.events)
    validated, removeCounts = validate(document.events, verbose=verbose, docId=document.id)
    document.events = validated
    for reason, amount in removeCounts.items():
        counts["invalid-" + reason] += amount
    counts["validation-removed"] += eventsBefore - len(document.events)

    # Step 2: duplicate removal.
    eventsBefore = len(document.events)
    document.events = removeDuplicates(document.events)
    counts["duplicates-removed"] += eventsBefore - len(document.events)

    # Step 3: task-dependent pruning of arguments and entity triggers.
    removeArguments(document, task, counts)
    removeEntities(document, task, counts)

    # Step 4: triggers left without any referrer.
    triggersBefore = len(document.triggers)
    removeUnusedTriggers(document)
    counts["unused-triggers-removed"] += triggersBefore - len(document.triggers)
357
def removeArguments(document, task, counts):
    """When `task` is 1, strip the listed secondary-role arguments from events.

    Arguments whose role (index 0) is "Site", "AtLoc", "ToLoc", "Sidechain"
    or "Contextgene" are removed from every event of `document`. For any
    other `task` value nothing is touched.

    Args:
        document: object exposing `.events`, each with a mutable
            `.arguments` list; the lists are edited in place.
        task: task number; only 1 triggers the removal.
        counts: mapping of counters; "t2-arguments-removed" is increased by
            the number of removed arguments. It is only touched when
            something was removed.
    """
    if task != 1:
        return
    droppedRoles = ("Site", "AtLoc", "ToLoc", "Sidechain", "Contextgene")
    for event in document.events:
        kept = [arg for arg in event.arguments if arg[0] not in droppedRoles]
        removed = len(event.arguments) - len(kept)
        if removed:
            # Slice assignment keeps the same list object, so other holders
            # of event.arguments see the change too.
            event.arguments[:] = kept
            counts["t2-arguments-removed"] += removed
366
def removeEntities(document, task, counts):
    """When `task` is 1, drop every trigger of type "Entity" from `document`.

    For any other `task` value nothing is touched.

    Args:
        document: object exposing `.triggers`; it is replaced by a new list
            holding the remaining triggers in their original order.
        task: task number; only 1 triggers the removal.
        counts: mapping of counters; "t2-entities-removed" is increased by
            the number of dropped triggers. It is only touched when
            something was dropped.
    """
    if task != 1:
        return
    kept = [trigger for trigger in document.triggers if trigger.type != "Entity"]
    removed = len(document.triggers) - len(kept)
    if removed:
        counts["t2-entities-removed"] += removed
    document.triggers = kept
379
if __name__=="__main__":
    # Command-line entry point: load a document set and write it back out
    # with validation switched on in the writer.
    import sys
    import STTools
    from optparse import OptionParser

    # Psyco is optional; carry on without it when it is not installed.
    try:
        import psyco
        psyco.full()
        sys.stderr.write("Found Psyco, using\n")
    except ImportError:
        sys.stderr.write("Psyco not installed\n")

    optparser = OptionParser(description="Validate BioNLP'11 event constraints")
    optparser.add_option("-i", "--input", default=None, dest="input", help="", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="")
    optparser.add_option("--debug", default=False, action="store_true", dest="debug", help="")
    optparser.add_option("--noScores", default=False, action="store_true", dest="noScores", help="")
    (options, args) = optparser.parse_args()

    # Without an input there is nothing to load, and the default output name
    # below could not be built (None + str raises TypeError). Report it as a
    # usage error instead.
    if options.input is None:
        optparser.error("no input given, use -i/--input")
    if options.output is None:
        options.output = options.input + "-validated.tar.gz"
    sys.stderr.write("Reading documents\n")
    documents = STTools.loadSet(options.input, readScores=(not options.noScores))
    sys.stderr.write("Writing documents\n")
    STTools.writeSet(documents, options.output, validate=True, writeScores=(not options.noScores), task=2, debug=options.debug)
405