Package TEES :: Package Utils :: Package InteractionXML :: Module ConvertPMC
[hide private]

Source Code for Module TEES.Utils.InteractionXML.ConvertPMC

  1  __version__ = "$Revision: 1.2 $" 
  2   
  3  import sys,os 
  4  import sys 
  5  try: 
  6      import xml.etree.cElementTree as ET 
  7  except ImportError: 
  8      import cElementTree as ET 
  9  import Utils.ElementTreeUtils as ETUtils 
 10   
def convert(input, output=None, outputRoot=None):
    """
    Convert a PMC article XML into an Interaction XML corpus tree.

    The PMC tree is loaded with ETUtils.ETFromObj, converted with addElements
    and the resulting element is appended to the corpus root.

    @param input: PMC source, passed unchanged to ETUtils.ETFromObj
        (presumably a file name or an already parsed tree -- verify against ETUtils)
    @param output: if not None, the resulting tree is written there with ETUtils.write
    @param outputRoot: existing root element to append the converted document to;
        when None a new "corpus" element with source="PMC" is created
    @return: an ElementTree wrapping the corpus root
    """
    print >> sys.stderr, "##### Convert PMC to Interaction XML #####"

    print >> sys.stderr, "Loading corpus", input
    pmcTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    pmcRoot = pmcTree.getroot()

    # PMC tags that are turned into "section" elements
    includeElements = [
        "front",
        "article-meta",
        "title-group",
        "article-title",
        "abstract",
        "body",
        "sec",
        "p",
        "title"]
    # Included tags that produce no section of their own when they carry no text
    collapseElements = [
        "front",
        "article-meta",
        "title-group",
        "p"]

    # Identity checks: None is a singleton, and "is" cannot be affected by
    # whatever comparison behaviour the passed-in object defines.
    if outputRoot is None:
        outputRoot = ET.Element("corpus")
        outputRoot.set("source", "PMC")

    outputRoot.append(addElements(pmcRoot, includeElements, collapseElements))

    outputTree = ET.ElementTree(outputRoot)
    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(outputTree, output)
    return outputTree
46
def addElements(pmcElement, includeElements, collapseElements, outputParent=None, pmcPath="", secCount=None, articleId=None):
    """
    Recursively convert a PMC element and its children into Interaction XML elements.

    An "article" element starts a new "document" element whose id is built from
    the article's PubMed id. An element whose tag is in includeElements becomes a
    "section" child of the current output parent, unless its tag is also in
    collapseElements and its text is None or blank; in that case no section is
    created and its children attach to the current output parent instead.
    Elements with any other tag end the descent for their whole subtree.

    @param pmcElement: the PMC element to convert
    @param includeElements: tags that are converted into "section" elements
    @param collapseElements: included tags that are skipped when they have no text
    @param outputParent: output element that new sections are appended to
    @param pmcPath: "/tag-index" path of pmcElement below the top-level element,
        where index is the 0-based position among same-tag siblings
    @param secCount: one-item list used as a running section counter shared by the
        whole recursion; None (the default) starts a new count from 0
    @param articleId: id of the enclosing document, None outside an article
    @return: the output element for pmcElement: the new document or section, or the
        unchanged outputParent when nothing was created
    """
    if secCount is None:
        # A literal [0] default would be created once at definition time and
        # shared by every top-level call, so numbering would carry over from
        # one conversion to the next.
        secCount = [0]

    # NOTE(review): the listing this block was recovered from had lost its
    # indentation; the final "else" is taken to belong to the if/elif chain
    # (unknown tags stop the descent) -- confirm against the repository copy.
    stop = False
    if pmcElement.tag == "article":
        assert articleId is None
        # The document is not attached to any incoming outputParent here; the
        # caller receives it through the return value.
        outputParent = ET.Element("document")
        pmid = None
        # Element.getiterator() no longer exists on Python 3.9+, while
        # Element.iter() is absent from ElementTree versions before 1.3.
        iterElements = getattr(pmcElement, "iter", None) or pmcElement.getiterator
        for idElement in iterElements("article-id"):
            if idElement.get("pub-id-type") == "pmid":
                pmid = idElement.text
                break
        articleId = "PMC" + ".d" + str(pmid)
        outputParent.set("id", articleId)
    elif pmcElement.tag in includeElements:
        pmcElementText = getText(pmcElement)
        hasText = pmcElementText is not None and pmcElementText.strip() != ""
        if hasText or pmcElement.tag not in collapseElements:
            section = ET.Element("section")
            section.set("id", articleId + ".c" + str(secCount[0]))
            secCount[0] += 1
            section.set("type", pmcElement.tag)
            pmcElementId = pmcElement.get("id")
            if pmcElementId is not None:
                section.set("secId", pmcElementId)
            section.set("pmcPath", pmcPath)
            if pmcElementText is not None:
                section.set("text", pmcElementText)
            outputParent.append(section)
            # Children of this element are nested under the new section
            outputParent = section
    else:
        stop = True

    if not stop:
        childCounts = {}
        for pmcChild in pmcElement:
            childTag = pmcChild.tag
            # First occurrence of a tag gets index 0, the next one 1, and so on
            childCounts[childTag] = childCounts.get(childTag, -1) + 1
            childPath = pmcPath + "/" + childTag + "-" + str(childCounts[childTag])
            addElements(pmcChild, includeElements, collapseElements, outputParent, childPath, secCount, articleId)

    return outputParent
88
def getText(element):
    """
    Return the text of a PMC element with its inline markup flattened.

    The element's own leading text is joined with the text and the tail of each
    direct child, and trailing newline characters are removed from the result.

    @param element: the ElementTree element to read
    @return: element.text unchanged (None or "") when the element has no leading
        text -- children are not inspected in that case; otherwise the flattened string
    @raise AssertionError: if a direct child is not one of the known inline tags
    """
    text = element.text
    # Elements without leading text are returned before the child-tag check
    # below is reached.
    if text is None or text == "":
        return text
    parts = [text]
    for child in element:
        assert child.tag in ("xref", "italic", "bold", "fig", "ext-link"), child.tag
        if child.text is not None:
            parts.append(child.text)
        if child.tail is not None:
            parts.append(child.tail)
    # rstrip() also handles text made up only of newlines, where repeatedly
    # testing text[-1] would raise IndexError once the string became empty.
    return "".join(parts).rstrip("\n")
# Command-line entry point: convert the PMC file given with -i and optionally
# write the resulting Interaction XML to the file given with -o.
if __name__=="__main__":
    import sys

    from optparse import OptionParser
    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(usage="%prog [options]\n")
    # The input is the PMC corpus that convert() turns into Interaction XML,
    # so the help text names the PMC format rather than the output format.
    optparser.add_option("-i", "--input", default=None, dest="input", help="Input corpus in PMC XML format", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file in interaction xml format.")
    (options, args) = optparser.parse_args()

    convert(input=options.input, output=options.output)