1  __version__ = "$Revision: 1.2 $" 
  2   
  3  import sys,os 
  4  import sys 
  5  try: 
  6      import xml.etree.cElementTree as ET 
  7  except ImportError: 
  8      import cElementTree as ET 
  9  import Utils.ElementTreeUtils as ETUtils 
 10   
 11 -def convert(input, output=None, outputRoot=None): 
  12      print >> sys.stderr, "##### Convert PMC to Interaction XML #####" 
 13       
 14      print >> sys.stderr, "Loading corpus", input 
 15      pmcTree = ETUtils.ETFromObj(input) 
 16      print >> sys.stderr, "Corpus file loaded" 
 17      pmcRoot = pmcTree.getroot() 
 18           
 19      includeElements = [ 
 20          "front", 
 21          "article-meta", 
 22          "title-group", 
 23          "article-title", 
 24          "abstract", 
 25          "body", 
 26          "sec", 
 27          "p", 
 28          "title"] 
 29      collapseElements = [ 
 30          "front", 
 31          "article-meta", 
 32          "title-group", 
 33          "p"] 
 34       
 35      if outputRoot == None: 
 36          outputRoot = ET.Element("corpus") 
 37          outputRoot.set("source", "PMC") 
 38       
 39      outputRoot.append(addElements(pmcRoot, includeElements, collapseElements)) 
 40       
 41      outputTree = ET.ElementTree(outputRoot) 
 42      if output != None: 
 43          print >> sys.stderr, "Writing output to", output 
 44          ETUtils.write(outputTree, output) 
 45      return outputTree 
  46       
 47 -def addElements(pmcElement, includeElements, collapseElements, outputParent=None, pmcPath="", secCount = [0], articleId=None): 
  48      stop = False 
 49      if pmcElement.tag == "article": 
 50          assert articleId == None 
 51          outputParent = ET.Element("document") 
 52          pmid = None 
 53          for idElement in pmcElement.getiterator("article-id"): 
 54              if idElement.get("pub-id-type") == "pmid": 
 55                  pmid = idElement.text 
 56                  break 
 57          articleId = "PMC" + ".d" + str(pmid) 
 58          outputParent.set("id", articleId) 
 59      elif pmcElement.tag in includeElements: 
 60          pmcElementText = getText(pmcElement) 
 61          if (pmcElementText != None and pmcElementText.strip() != "") or pmcElement.tag not in collapseElements: 
 62              section = ET.Element("section") 
 63              section.set("id", articleId + ".c" + str(secCount[0])) 
 64              secCount[0] += 1 
 65              section.set("type", pmcElement.tag) 
 66              pmcElementId = pmcElement.get("id") 
 67              if pmcElementId != None: 
 68                  section.set("secId", pmcElementId) 
 69              section.set("pmcPath", pmcPath) 
 70              if pmcElementText != None: 
 71                  section.set("text", pmcElementText) 
 72              outputParent.append(section) 
 73              outputParent = section 
 74      else: 
 75          stop = True 
 76       
 77      if not stop: 
 78          childCounts = {} 
 79          for pmcChild in list(pmcElement): 
 80              childTag = pmcChild.tag 
 81              if not childCounts.has_key(childTag): 
 82                  childCounts[childTag] = 0 
 83              else: 
 84                  childCounts[childTag] += 1 
 85              addElements(pmcChild, includeElements, collapseElements, outputParent, pmcPath + "/" + childTag + "-" + str(childCounts[childTag]), secCount, articleId) 
 86       
 87      return outputParent 
  88   
 89 -def getText(element): 
  90      text = element.text 
 91      if text == None or text == "": 
 92          return text 
 93      for child in list(element): 
 94          assert child.tag in ("xref", "italic", "bold", "fig", "ext-link"), child.tag 
 95          if child.text != None: 
 96              text += child.text 
 97          if child.tail != None: 
 98              text += child.tail 
 99      while text[-1] == "\n": 
100          text = text[:-1] 
101      return text 
 102   
if __name__=="__main__":
    # Command-line entry point: parse the options and run convert() on the
    # given input file.
    import sys

    from optparse import OptionParser

    # Psyco is optional: use it when it can be imported, otherwise continue
    # without it.
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(usage="%prog [options]\n")
    # The input is the PMC source that convert() turns into interaction xml.
    optparser.add_option("-i", "--input", default=None, dest="input", help="PMC article XML file to convert", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file in interaction xml format.")
    (options, args) = optparser.parse_args()

    # NOTE(review): assumes convert() cannot do anything useful with a None
    # input, so report it as a usage error here - confirm against
    # ETUtils.ETFromObj.
    if options.input is None:
        optparser.error("no input file given (use -i/--input)")

    convert(input=options.input, output=options.output)
121