1 __version__ = "$Revision: 1.2 $"
2
3 import sys,os
4 import sys
5 try:
6 import xml.etree.cElementTree as ET
7 except ImportError:
8 import cElementTree as ET
9 import Utils.ElementTreeUtils as ETUtils
10
def convert(input, output=None, outputRoot=None):
    """Convert a PMC article XML into an Interaction XML corpus tree.

    Parameters:
        input: the PMC corpus, handed as-is to ``ETUtils.ETFromObj``
            (presumably a file name or an already parsed tree -- confirm
            against ETUtils).
        output: optional destination passed to ``ETUtils.write``; nothing is
            written when it is None.
        outputRoot: optional existing element to append the converted
            document to. When None, a new ``<corpus source="PMC">`` element
            is created.

    Returns:
        An ``ElementTree`` wrapping the (new or supplied) output root.
    """
    # sys.stderr.write() emits the same text as the old ``print >> sys.stderr``
    # statements but is valid on both Python 2 and Python 3.
    sys.stderr.write("##### Convert PMC to Interaction XML #####\n")

    sys.stderr.write("Loading corpus %s\n" % (input,))
    pmcTree = ETUtils.ETFromObj(input)
    sys.stderr.write("Corpus file loaded\n")
    pmcRoot = pmcTree.getroot()

    # Tags that become <section> elements in the output.
    includeElements = [
        "front",
        "article-meta",
        "title-group",
        "article-title",
        "abstract",
        "body",
        "sec",
        "p",
        "title"]
    # Included tags that are skipped (children still visited) when they
    # carry no text of their own.
    collapseElements = [
        "front",
        "article-meta",
        "title-group",
        "p"]

    if outputRoot is None:
        outputRoot = ET.Element("corpus")
        outputRoot.set("source", "PMC")

    outputRoot.append(addElements(pmcRoot, includeElements, collapseElements))

    outputTree = ET.ElementTree(outputRoot)
    if output is not None:
        sys.stderr.write("Writing output to %s\n" % (output,))
        ETUtils.write(outputTree, output)
    return outputTree
46
def addElements(pmcElement, includeElements, collapseElements, outputParent=None, pmcPath="", secCount=None, articleId=None):
    """Recursively copy selected PMC elements into document/section elements.

    An ``article`` element becomes a new ``<document>`` whose id is
    ``"PMC.d" + pmid`` (``"PMC.dNone"`` when no pmid article-id is found).
    An element whose tag is in ``includeElements`` becomes a ``<section>``
    appended to ``outputParent``, unless its tag is also in
    ``collapseElements`` and it has no non-blank text; in that case no
    section is made and its children attach to the current parent. Elements
    with any other tag are skipped together with their subtree.

    Parameters:
        pmcElement: the PMC element to process.
        includeElements: tags that are turned into sections.
        collapseElements: included tags dropped when they have no text.
        outputParent: element new sections are appended to; set by the
            recursion once an ``article`` has been seen.
        pmcPath: "/tag-index" path of ``pmcElement``, stored on sections.
        secCount: single-item list used as a running section counter shared
            through the recursion. Defaults to a fresh ``[0]`` per top-level
            call so ids do not depend on earlier, unrelated calls.
        articleId: id of the enclosing document, used as section id prefix.

    Returns:
        The element that children of ``pmcElement`` were attached to: the new
        document, the new section, or ``outputParent`` unchanged.

    Raises:
        AssertionError: if an ``article`` is nested inside another article.
    """
    if secCount is None:
        # A mutable default would be shared by every call in the process.
        secCount = [0]

    stop = False
    if pmcElement.tag == "article":
        assert articleId is None
        # NOTE(review): the new document is only handed back through the
        # return value; it is not appended to any incoming outputParent.
        outputParent = ET.Element("document")
        pmid = None
        # findall(".//tag") walks all descendants in document order and,
        # unlike getiterator(), still exists on current Python versions.
        for idElement in pmcElement.findall(".//article-id"):
            if idElement.get("pub-id-type") == "pmid":
                pmid = idElement.text
                break
        articleId = "PMC" + ".d" + str(pmid)
        outputParent.set("id", articleId)
    elif pmcElement.tag in includeElements:
        pmcElementText = getText(pmcElement)
        hasText = pmcElementText is not None and pmcElementText.strip() != ""
        if hasText or pmcElement.tag not in collapseElements:
            section = ET.Element("section")
            section.set("id", articleId + ".c" + str(secCount[0]))
            secCount[0] += 1
            section.set("type", pmcElement.tag)
            pmcElementId = pmcElement.get("id")
            if pmcElementId is not None:
                section.set("secId", pmcElementId)
            section.set("pmcPath", pmcPath)
            if pmcElementText is not None:
                section.set("text", pmcElementText)
            outputParent.append(section)
            outputParent = section
    else:
        # Not an article and not an included tag: ignore the whole subtree.
        stop = True

    if not stop:
        # Per-tag sibling index (0-based) used to build each child's path.
        childCounts = {}
        for pmcChild in list(pmcElement):
            childTag = pmcChild.tag
            childCounts[childTag] = childCounts.get(childTag, -1) + 1
            childPath = pmcPath + "/" + childTag + "-" + str(childCounts[childTag])
            addElements(pmcChild, includeElements, collapseElements, outputParent, childPath, secCount, articleId)

    return outputParent
88
def getText(element):
    """Return an element's text with its direct inline children flattened in.

    The result is ``element.text`` followed, for each direct child, by the
    child's own text and tail, with trailing newline characters removed.
    When ``element.text`` is None or empty it is returned unchanged and the
    children are not inspected.

    Raises:
        AssertionError: if a direct child is not one of the expected inline
            tags (xref, italic, bold, fig, ext-link).
    """
    text = element.text
    if text is None or text == "":
        return text
    parts = [text]
    for child in element:
        assert child.tag in ("xref", "italic", "bold", "fig", "ext-link"), child.tag
        if child.text is not None:
            parts.append(child.text)
        if child.tail is not None:
            parts.append(child.tail)
    # rstrip is safe when the text consists solely of newlines, where
    # indexing text[-1] in a loop would run off the emptied string.
    return "".join(parts).rstrip("\n")
102
# Command-line entry point: convert the file given with -i and, when -o is
# given, write the result there.
if __name__ == "__main__":
    import sys

    from optparse import OptionParser

    # Psyco is optional: use it when importable, otherwise carry on without.
    # sys.stderr.write() keeps the messages identical while being valid on
    # both Python 2 and Python 3 (unlike the ``print >>`` statement).
    try:
        import psyco
        psyco.full()
        sys.stderr.write("Found Psyco, using\n")
    except ImportError:
        sys.stderr.write("Psyco not installed\n")

    optparser = OptionParser(usage="%prog [options]\n")
    # The input is the PMC corpus; only the output is Interaction XML.
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in PMC XML format", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file in interaction xml format.")
    (options, args) = optparser.parse_args()

    convert(input=options.input, output=options.output)
121