1 parse__version__ = "$Revision: 1.3 $"
2
3 import sys,os
4 sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/..")
5 try:
6 import xml.etree.cElementTree as ET
7 except ImportError:
8 import cElementTree as ET
9 import Utils.ElementTreeUtils as ETUtils
10 import Utils.InteractionXML.IDUtils as IDUtils
11 import types
12 from collections import defaultdict
13 import Utils.FindHeads as FindHeads
14
def getText(element):
    """Return the concatenated text of ``element`` and all its descendants.

    The pieces are collected in document order: ``element.text``, then the
    text of every child (recursively), then ``element.tail``.  Note that the
    tail of ``element`` itself is part of the result.

    Args:
        element: An ElementTree element.

    Returns:
        The collected text as a single string ("" when there is none).
    """
    parts = []
    if element.text is not None:
        parts.append(element.text)
    for child in element:
        # The recursive result already ends with the child's own tail.
        parts.append(getText(child))
    if element.tail is not None:
        parts.append(element.tail)
    return "".join(parts)
24
def getClue(element):
    """Locate the first ``clueType`` element below ``element``.

    The subtree is walked in document order while the text seen so far is
    accumulated, so the position of the clue can be expressed as character
    offsets into the text of ``element``.

    Args:
        element: An ElementTree element (typically a ``clue`` element).

    Returns:
        ``[clueText, start, end]`` when a ``clueType`` descendant exists,
        where ``start`` and ``end`` are inclusive offsets counted from the
        beginning of ``element``'s own text.  When ``element`` itself is a
        ``clueType`` the result is ``[element.text, 0, 0]``.  When no
        ``clueType`` is found, the plain text of the subtree (including the
        tail of ``element``) is returned as a string instead.
    """
    if element.tag == "clueType":
        return [element.text, 0, 0]

    text = ""
    if element.text is not None:
        text += element.text
    for child in element:
        if child.tag == "clueType":
            clueText = child.text
            start = len(text)
            return [clueText, start, start + len(clueText) - 1]
        found = getClue(child)
        # Test for the list result rather than for a string type: text read
        # from XML may be either a byte string or a unicode string.
        if isinstance(found, list):
            # Offsets coming back from a nested element are relative to that
            # element, so shift them by the text that precedes it here
            # instead of overwriting them.
            found[1] += len(text)
            found[2] += len(text)
            return found
        text += found
    if element.tail is not None:
        text += element.tail
    return text
44
def loadEventXML(path, verbose=False):
    """Read an event annotation XML file and collect its triggers per sentence.

    Every ``sentence`` element contributes a key (its stripped text) with an
    empty list.  Every ``event`` element is keyed by its own stripped text;
    if its ``clue`` contains a ``clueType``, the tuple
    ``(start, end, eventType, clueText)`` is added to that key's list unless
    an equal tuple is already present.

    NOTE(review): the ``def`` line was reconstructed from the call sites
    (``loadEventXML(path, verbose=verbose)``); the default value of
    ``verbose`` is an assumption - confirm against the original file.

    Args:
        path: Passed to ``ETUtils.ETFromObj`` to obtain the XML tree.
        verbose: When true, report events whose clue has no ``clueType``.

    Returns:
        A dict mapping sentence text to a list of trigger tuples.

    Raises:
        AssertionError: If the clue offsets do not select the clue text
            within the event's text.
    """
    xml = ETUtils.ETFromObj(path)
    # ``getiterator`` no longer exists in recent ElementTree versions, while
    # very old ones lack ``iter``; use whichever is available.
    iterate = getattr(xml, "iter", None) or xml.getiterator

    sentDict = {}
    for sentence in iterate("sentence"):
        sentDict.setdefault(getText(sentence).strip(), [])

    for event in iterate("event"):
        sentenceText = getText(event).strip()
        events = sentDict.setdefault(sentenceText, [])

        clue = event.find("clue")
        clueTuple = getClue(clue)
        eventType = event.find("type").get("class")
        if eventType == "Protein_amino_acid_phosphorylation":
            eventType = "Phosphorylation"

        # getClue returns a list only when a clueType was found; anything
        # else is the plain clue text (byte string or unicode).
        if not isinstance(clueTuple, list):
            if verbose:
                print("Event %s clue with no clueType: %s" % (eventType, ETUtils.toStr(clue)))
            continue

        clueText, start, end = clueTuple
        assert sentenceText[start:end + 1] == clueText, (sentenceText, sentenceText[start:end + 1], clueTuple)
        trigger = (start, end, eventType, clueText)
        if trigger not in events:
            events.append(trigger)
    return sentDict
72
def keepEvent(eventType):
    """Return True if ``eventType`` is one of the nine retained event classes.

    Args:
        eventType: Event class name to test.

    Returns:
        ``True`` for the classes listed below, ``False`` for anything else.
    """
    return eventType in (
        "Gene_expression",
        "Transcription",
        "Protein_catabolism",
        "Localization",
        "Binding",
        "Phosphorylation",
        "Positive_regulation",
        "Negative_regulation",
        "Regulation",
    )
78
def removeDuplicates(input):
    """Remove added triggers that coincide with pre-existing ones.

    Within each ``sentence`` below ``input``, the ``entity`` elements with
    ``isName="False"`` are split into "new" triggers (``source`` equal to
    ``GENIA_event_annotation_0.9``) and "original" triggers (all others).
    A new trigger is removed from its sentence when some original trigger
    has the same ``headOffset`` attribute.  Per-type statistics are printed
    to standard output when done.

    Args:
        input: ElementTree element (or tree) whose sentences are processed
            in place.
    """
    print("Removing duplicate triggers")
    counts = defaultdict(int)
    # ``getiterator`` no longer exists in recent ElementTree versions, while
    # very old ones lack ``iter``; use whichever is available.
    iterate = getattr(input, "iter", None) or input.getiterator
    for sentence in iterate("sentence"):
        origTriggers = []
        newTriggers = []
        for entity in sentence.findall("entity"):
            if entity.get("isName") != "False":
                continue
            if entity.get("source") == "GENIA_event_annotation_0.9":
                newTriggers.append(entity)
            else:
                origTriggers.append(entity)

        for origTrig in origTriggers:
            counts["origTrig-" + origTrig.get("type")] += 1

        for newTrig in newTriggers:
            headOffset = newTrig.get("headOffset")
            # NOTE(review): two entities that both lack a headOffset
            # attribute compare equal here (None == None) - confirm that
            # this is intended.
            duplicateOf = None
            for origTrig in origTriggers:
                if origTrig.get("headOffset") == headOffset:
                    duplicateOf = origTrig
                    break
            # Compare with None explicitly: an Element without children is
            # falsy, so a truth test would be wrong here.
            if duplicateOf is None:
                counts["newTrig-" + newTrig.get("type")] += 1
            else:
                sentence.remove(newTrig)
                counts["removed-N/O-" + newTrig.get("type") + "/" + duplicateOf.get("type")] += 1

    print("Counts:")
    for k in sorted(counts):
        print("  %s %s" % (k, counts[k]))
116
def _loadDocumentEvents(pmid, sentence, eventDir, verbose):
    """Load the event dictionary for the document that contains ``sentence``.

    The event file name is derived from the sentence's ``origId`` (the part
    before the first ".") or, when there is no ``origId``, from the part of
    ``pmid`` after the first "-".  Documents whose ``pmid`` starts with
    "PMC" get an empty dictionary; no file is read for them.

    Returns:
        A ``(sentDict, isPMC)`` pair.
    """
    origId = sentence.get("origId")
    if origId is not None:
        assert pmid is None
        eventFile = origId.split(".")[0]
    else:
        assert pmid is not None
        if pmid.startswith("PMC"):
            return {}, True
        assert pmid.startswith("PMID")
        eventFile = pmid.split("-", 1)[-1]
    return loadEventXML(eventDir + "/" + eventFile + ".xml", verbose=verbose), False


def _addTriggers(sentence, events, counts):
    """Append a trigger ``entity`` to ``sentence`` for every kept event.

    ``events`` is sorted in place.  ``counts`` is updated with the number
    of added and filtered triggers.
    """
    # Detach the analyses element while the entities are appended and put
    # it back afterwards, so the new entities end up in front of it.
    sentenceAnalyses = sentence.find("sentenceanalyses")
    if sentenceAnalyses is not None:
        sentence.remove(sentenceAnalyses)

    entityIdCount = IDUtils.getNextFreeId(sentence.findall("entity"))
    events.sort()
    for event in events:
        if not keepEvent(event[2]):
            counts["filtered-triggers"] += 1
            continue
        trigger = ET.Element("entity")
        trigger.set("isName", "False")
        trigger.set("charOffset", str(event[0]) + "-" + str(event[1]))
        trigger.set("type", str(event[2]))
        trigger.set("text", str(event[3]))
        trigger.set("source", "GENIA_event_annotation_0.9")
        trigger.set("id", sentence.get("id") + ".e" + str(entityIdCount))
        entityIdCount += 1
        counts["added-triggers"] += 1
        sentence.append(trigger)

    if sentenceAnalyses is not None:
        sentence.append(sentenceAnalyses)


def run(input, output, eventDir, parse="split-mccc-preparsed", verbose=False):
    """Add triggers from event annotation files to a corpus.

    For each document the matching event file is loaded once, on the
    document's first sentence.  Sentences are matched to it by their
    ``text`` attribute, and a trigger entity is appended for every event
    accepted by ``keepEvent``.  Afterwards heads are computed with
    ``FindHeads.findHeads`` and triggers duplicating existing ones are
    removed with ``removeDuplicates``.

    Args:
        input: Corpus in interaction XML format; passed to
            ``ETUtils.ETFromObj``.
        output: Destination passed to ``ETUtils.write``, or ``None`` to
            skip writing.
        eventDir: Directory holding the ``<id>.xml`` event files.
        parse: Parse name passed to ``FindHeads.findHeads``.
        verbose: When true, print per-sentence progress and diagnostics.

    Returns:
        The modified corpus tree.
    """
    sys.stderr.write("Loading corpus %s\n" % (input,))
    corpusTree = ETUtils.ETFromObj(input)
    sys.stderr.write("Corpus file loaded\n")
    corpusRoot = corpusTree.getroot()

    counts = defaultdict(int)
    for document in corpusRoot.findall("document"):
        sentDict = None
        pmid = document.get("pmid")
        isPMC = False
        for sentence in document.findall("sentence"):
            counts["sentences"] += 1
            sentenceId = str(sentence.get("id")) + "/" + str(sentence.get("origId"))
            if verbose:
                print("Processing " + sentenceId)
            if sentDict is None:
                sentDict, isPMC = _loadDocumentEvents(pmid, sentence, eventDir, verbose)

            interactionXMLText = sentence.get("text")
            if interactionXMLText not in sentDict:
                counts["missing-sentences"] += 1
                if isPMC:
                    counts["missing-sentences-PMC"] += 1
                if verbose:
                    print("Missing sentence: %s %s" % (pmid, (sentenceId, sentDict, interactionXMLText)))
                continue
            _addTriggers(sentence, sentDict[interactionXMLText], counts)

    FindHeads.findHeads(corpusTree, parse, removeExisting=False)
    removeDuplicates(corpusRoot)
    print(counts)

    if output is not None:
        sys.stderr.write("Writing output to %s\n" % (output,))
        ETUtils.write(corpusRoot, output)
    return corpusTree
182
if __name__ == "__main__":
    # Command-line entry point: parse the options and hand them to run().
    import sys
    from optparse import OptionParser

    # Psyco is optional; its absence is only reported, never fatal.
    try:
        import psyco
        psyco.full()
        sys.stderr.write("Found Psyco, using\n")
    except ImportError:
        sys.stderr.write("Psyco not installed\n")

    optparser = OptionParser(usage="%prog [options]\n")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in interaction xml format", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file in interaction xml format.")
    optparser.add_option("-e", "--eventDir", default="/home/jari/data/GENIA_event_annotation_0.9/GENIAcorpus_event", dest="eventDir", help="Directory containing the GENIA event annotation XML files")
    optparser.add_option("-p", "--parse", default="split-mccc-preparsed", dest="parse", help="Parse XML element name")
    optparser.add_option("-v", "--verbose", default=False, action="store_true", dest="verbose", help="verbose mode")
    (options, args) = optparser.parse_args()
    # Report a missing input as a usage error; an assert would be skipped
    # entirely when Python runs with -O.
    if options.input is None:
        optparser.error("an input corpus must be given with -i/--input")

    run(input=options.input, output=options.output, eventDir=options.eventDir, parse=options.parse, verbose=options.verbose)
205