1  parse__version__ = "$Revision: 1.3 $" 
  2   
  3  import sys,os 
  4  sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/..") 
  5  try: 
  6      import xml.etree.cElementTree as ET 
  7  except ImportError: 
  8      import cElementTree as ET 
  9  import Utils.ElementTreeUtils as ETUtils 
 10  import Utils.InteractionXML.IDUtils as IDUtils 
 11  import types 
 12  from collections import defaultdict 
 13  import Utils.FindHeads as FindHeads 
 14   
 15 -def getText(element): 
  16      text = "" 
 17      if element.text != None: 
 18          text += element.text 
 19      for child in list(element): 
 20          text += getText(child) 
 21      if element.tail != None: 
 22          text += element.tail 
 23      return text 
  24   
 26      if element.tag == "clueType": 
 27          clueText = element.text 
 28          return [clueText, 0, 0] 
 29       
 30      text = "" 
 31      if element.text != None: 
 32          text += element.text 
 33      for child in list(element): 
 34          childText = getClue(child)  
 35          if type(childText) == types.StringType: 
 36              text += childText 
 37          else: 
 38              childText[1] = len(text) 
 39              childText[2] = len(text) + len(childText[0]) - 1 
 40              return childText 
 41      if element.tail != None: 
 42          text += element.tail 
 43      return text 
  44   
 46      xml = ETUtils.ETFromObj(path) 
 47      sentDict = {} 
 48      for sentence in xml.getiterator("sentence"): 
 49          sentenceText = getText(sentence).strip() 
 50          if not sentDict.has_key(sentenceText): 
 51              sentDict[sentenceText] = [] 
 52   
 53      for event in xml.getiterator("event"): 
 54          sentenceText = getText(event).strip() 
 55          if not sentDict.has_key(sentenceText): 
 56              sentDict[sentenceText] = [] 
 57          events = sentDict[sentenceText] 
 58           
 59          clue = event.find("clue") 
 60          clueTuple = getClue(clue) 
 61          eventType = event.find("type").get("class") 
 62          if eventType == "Protein_amino_acid_phosphorylation": 
 63              eventType = "Phosphorylation" 
 64          if type(clueTuple) == types.StringType: 
 65              if verbose: print "Event", eventType, "clue with no clueType:", ETUtils.toStr(clue) 
 66          else: 
 67              assert sentenceText[clueTuple[1]:clueTuple[2]+1] == clueTuple[0], (sentenceText, sentenceText[clueTuple[1]:clueTuple[2]+1], clueTuple) 
 68              event = (clueTuple[1], clueTuple[2], eventType, clueTuple[0]) 
 69              if event not in events: 
 70                  events.append(event) 
 71      return sentDict 
  72   
 74      if eventType in ["Gene_expression", "Transcription", "Protein_catabolism", "Localization", "Binding", "Phosphorylation", "Positive_regulation", "Negative_regulation", "Regulation"]: 
 75          return True 
 76      else: 
 77          return False 
  78   
 80      print "Removing duplicate triggers" 
 81      counts = {} 
 82      for sentence in input.getiterator("sentence"): 
 83          origTriggers = [] 
 84          newTriggers = [] 
 85          for entity in sentence.findall("entity"): 
 86              if entity.get("isName") == "False": 
 87                  if entity.get("source") == "GENIA_event_annotation_0.9": 
 88                      newTriggers.append(entity) 
 89                  else: 
 90                      origTriggers.append(entity) 
 91          for origTrig in origTriggers: 
 92              countType = "origTrig-" + origTrig.get("type") 
 93              if not counts.has_key(countType): 
 94                  counts[countType] = 0 
 95              counts[countType] += 1             
 96          for newTrig in newTriggers[:]: 
 97              removed = False 
 98              for origTrig in origTriggers: 
 99                  if newTrig.get("headOffset") == origTrig.get("headOffset"): 
100                      sentence.remove(newTrig) 
101                      newTriggers.remove(newTrig) 
102                      removed = True 
103                      countType = "removed-N/O-" + newTrig.get("type") + "/" + origTrig.get("type") 
104                      if not counts.has_key(countType): 
105                          counts[countType] = 0 
106                      counts[countType] += 1 
107                      break 
108              if not removed: 
109                  countType = "newTrig-" + newTrig.get("type") 
110                  if not counts.has_key(countType): 
111                      counts[countType] = 0 
112                  counts[countType] += 1 
113      print "Counts:" 
114      for k in sorted(counts.keys()): 
115          print " ", k, counts[k] 
 116   
def run(input, output, eventDir, parse="split-mccc-preparsed", verbose=False):
    """
    Add event triggers from event XML files to an interaction XML corpus.

    For every document, the matching event file is loaded once with
    loadEventXML(). The file name is taken from the first sentence's "origId"
    (the part before the first ".") or, if there is none, from the document's
    "pmid" (the part after the first "-"). Documents whose pmid starts with
    "PMC" have no event file and are treated as having no events.

    Each sentence whose "text" attribute is a key of the loaded dictionary
    gets one new "entity" element per kept event (see keepEvent()). The new
    entities are inserted before any existing "sentenceanalyses" element.
    Afterwards FindHeads.findHeads() is run on the corpus and
    removeDuplicates() drops the new triggers that collide with existing ones.

    Args:
        input: corpus to load, passed to ETUtils.ETFromObj.
        output: where to write the result with ETUtils.write; None skips
            writing.
        eventDir: directory holding the per-document event XML files.
        parse: parse name handed to FindHeads.findHeads.
        verbose: if true, print per-sentence progress and missing sentences.

    Returns:
        The corpus tree returned by ETUtils.ETFromObj, modified in place.

    Raises:
        AssertionError: if a document's sentence has both or neither of
            origId and pmid, or the pmid has an unexpected prefix.
    """
    sys.stderr.write("Loading corpus %s\n" % (input,))
    corpusTree = ETUtils.ETFromObj(input)
    sys.stderr.write("Corpus file loaded\n")
    corpusRoot = corpusTree.getroot()

    counts = defaultdict(int)
    for document in corpusRoot.findall("document"):
        sentDict = None  # loaded when the first sentence of the document is seen
        pmid = document.get("pmid")
        isPMC = False
        for sentence in document.findall("sentence"):
            counts["sentences"] += 1
            origId = sentence.get("origId")
            sentenceId = str(sentence.get("id")) + "/" + str(origId)
            if verbose:
                sys.stdout.write("Processing %s\n" % (sentenceId,))

            if sentDict is None:
                if origId is not None:
                    assert pmid is None
                    sentDict = loadEventXML(eventDir + "/" + origId.split(".")[0] + ".xml", verbose=verbose)
                else:
                    assert pmid is not None
                    if pmid.startswith("PMC"):
                        # No event file is looked up for these documents.
                        isPMC = True
                        sentDict = {}
                    else:
                        assert pmid.startswith("PMID")
                        sentDict = loadEventXML(eventDir + "/" + pmid.split("-", 1)[-1] + ".xml", verbose=verbose)

            interactionXMLText = sentence.get("text")
            if interactionXMLText not in sentDict:
                counts["missing-sentences"] += 1
                if isPMC:
                    counts["missing-sentences-PMC"] += 1
                if verbose:
                    sys.stdout.write("Missing sentence: %s %s\n" % (pmid, (sentenceId, sentDict, interactionXMLText)))
                continue

            # Detach the analyses element so that the new entities end up
            # before it; it is re-appended once the triggers are in place.
            sentenceAnalyses = sentence.find("sentenceanalyses")
            if sentenceAnalyses is not None:
                sentence.remove(sentenceAnalyses)

            entityIdCount = IDUtils.getNextFreeId(sentence.findall("entity"))
            events = sentDict[interactionXMLText]
            events.sort()
            # Each event is a (begin, end, type, text) tuple.
            for event in events:
                if not keepEvent(event[2]):
                    counts["filtered-triggers"] += 1
                    continue
                trigger = ET.Element("entity")
                trigger.set("isName", "False")
                trigger.set("charOffset", str(event[0]) + "-" + str(event[1]))
                trigger.set("type", str(event[2]))
                trigger.set("text", str(event[3]))
                trigger.set("source", "GENIA_event_annotation_0.9")
                trigger.set("id", sentence.get("id") + ".e" + str(entityIdCount))
                entityIdCount += 1
                counts["added-triggers"] += 1
                sentence.append(trigger)

            if sentenceAnalyses is not None:
                sentence.append(sentenceAnalyses)

    # removeDuplicates compares "headOffset" attributes; presumably findHeads
    # is what sets them on the new triggers -- TODO confirm.
    FindHeads.findHeads(corpusTree, parse, removeExisting=False)
    removeDuplicates(corpusRoot)
    sys.stdout.write("%s\n" % (counts,))

    if output is not None:
        sys.stderr.write("Writing output to %s\n" % (output,))
        ETUtils.write(corpusRoot, output)
    return corpusTree
 182       
if __name__=="__main__":
    # Command-line entry point: parse the options and hand them to run().
    import sys

    from optparse import OptionParser

    # Psyco is optional; its absence is reported but is not an error.
    try:
        import psyco
        psyco.full()
        sys.stderr.write("Found Psyco, using\n")
    except ImportError:
        sys.stderr.write("Psyco not installed\n")

    optparser = OptionParser(usage="%prog [options]\n")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in interaction xml format", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file in interaction xml format.")
    optparser.add_option("-e", "--eventDir", default="/home/jari/data/GENIA_event_annotation_0.9/GENIAcorpus_event", dest="eventDir", help="Directory containing the event annotation XML files.")
    optparser.add_option("-p", "--parse", default="split-mccc-preparsed", dest="parse", help="Parse XML element name")
    optparser.add_option("-v", "--verbose", default=False, action="store_true", dest="verbose", help="verbose mode")
    (options, args) = optparser.parse_args()
    # An assert would vanish under "python -O"; report the missing option
    # through the parser so the user also sees the usage text.
    if options.input is None:
        optparser.error("an input corpus is required (-i/--input)")

    run(input=options.input, output=options.output, eventDir=options.eventDir, parse=options.parse, verbose=options.verbose)
205