1 """
2 Functions for easier use of cElementTree.
3
4 Program: cElementTree Utilities
5 Date: Oct. 16, 2007
6 Author: Jari Bjoerne
7
8 Description: Convenience functions for easier use of cElementTree.
9 """
10 __version__ = "$Revision: 1.20 $"
11
import codecs
import os
import sys
from gzip import GzipFile

try:
    import cElementTree as ElementTree
except ImportError:
    try:
        import xml.etree.cElementTree as ElementTree
    except ImportError:
        # xml.etree.cElementTree no longer exists in Python 3.9+; the plain
        # module uses the C accelerator automatically when it is available.
        import xml.etree.ElementTree as ElementTree
21
def removeAll(element):
    """Recursively detach every descendant of an element.

    Each child is emptied depth-first and then removed from its parent, so
    after the call neither ``element`` nor any of its former descendants has
    children. The tag, attributes, text and tail of ``element`` itself are
    left untouched.

    Keyword arguments:
    element -- (Element) root of the subtree to dismantle
    """
    # Iterate over a snapshot: removing from the live child list while
    # iterating over it would skip elements.
    for child in list(element):
        removeAll(child)
        element.remove(child)
27
def iterparse(file, elementName, callback, limit=-1):
    """Parse an XML file iteratively, handing matching elements to a callback.

    This function offers a simple way to use the cElementTree
    iterparse-function the way it is often used: every completed element
    whose tag equals ``elementName`` is passed to ``callback``, after which
    the root element is cleared so that already processed content does not
    accumulate in memory.

    Keyword arguments:
    file -- (file) file or file-like object to parse
    elementName -- (string) matching elements are passed to the callback
    callback -- (function) called with each fully parsed element of name
                elementName; the element is cleared right after the call
                returns, so the callback must copy anything it wants to keep
    limit -- (int) stop after reading "limit" elements. If -1, read
             until end of file. This is mostly useful when debugging
             programs that parse large files.
    """
    context = ElementTree.iterparse(file, events=("start", "end"))
    if limit == 0:
        return

    root = None
    for event, elem in context:
        if event == "start":
            # The very first "start" event belongs to the document root.
            if root is None:
                root = elem
        elif elem.tag == elementName:
            callback(elem)
            # Drop everything parsed so far to keep memory use flat.
            root.clear()
            if limit != -1:
                limit -= 1
                if limit == 0:
                    # Stop right away instead of parsing one more event.
                    return
58
def indent(elem, level=0):
    """Indent an XML tree in place so that it is readable when written out.

    indent-function as defined in the cElementTree documentation. Only
    whitespace-only (or missing) ``text`` and ``tail`` values are rewritten;
    text that contains anything else is left alone. Each nesting level is
    indented by one more space than its parent.

    Keyword arguments:
    elem -- (Element) root of the tree to indent
    level -- (int) starting level of indentation
    """
    pad = "\n" + level * " "
    if len(elem):
        if not elem.text or not elem.text.strip():
            # Whitespace before the first child: one level deeper.
            elem.text = pad + " "
        for child in elem:
            indent(child, level + 1)
        # The recursion gave every child a tail at child depth; the last
        # child's tail precedes this element's end tag, so it is pulled
        # back to this element's depth.
        last = elem[-1]
        if not last.tail or not last.tail.strip():
            last.tail = pad
    # The element at level 0 keeps its tail: nothing follows the root.
    if level and (not elem.tail or not elem.tail.strip()):
        elem.tail = pad
81
def ETFromObj(obj):
    """Return an ElementTree for a file name, an open stream or a tree.

    obj can be
    1) a string that ends with .xml, .svg, .nxml or .csml -> the file is
       parsed and the resulting ElementTree returned
    2) a string that ends with .xml.gz -> the file is unzipped, parsed, and
       the resulting ElementTree is returned
    3) an open input stream -> the input is parsed and the resulting
       ElementTree is returned
    4) an ElementTree or an Element -> obj is returned as-is, nothing is done

    Raises ValueError when obj is a string with any other suffix.
    """
    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    if isinstance(obj, (str, type(u""))):
        # Files are opened in binary mode so that the XML parser, not the
        # platform's default text encoding, decides how to decode them.
        if obj.endswith(".xml.gz"):
            stream = GzipFile(obj, "rb")
        elif obj.endswith((".xml", ".svg", ".nxml", ".csml")):
            stream = open(obj, "rb")
        else:
            raise ValueError("%s: File format not recognized (expected .xml or .xml.gz)" % obj)
        try:
            return ElementTree.parse(stream)
        finally:
            # The stream was opened here, so it is closed here as well.
            stream.close()
    if isinstance(obj, ElementTree.ElementTree) or ElementTree.iselement(obj):
        return obj
    # Anything else is handed to the parser unchanged; the caller owns it.
    return ElementTree.parse(obj)
104
class ETWriter(object):
    """Write an XML document piece by piece instead of as one big tree.

    Container elements are opened with begin() and closed with end();
    complete subtrees are written with write(). The start tag of an element
    opened with begin() is held back until something is written inside it,
    so that an element that turns out to have no streamed content can be
    serialized in one piece by end().
    """

    def __init__(self, out):
        """Open the output and write the XML declaration.

        Keyword arguments:
        out -- (string or stream) a file name (gzip-compressed if it ends
               with .gz) or an object with write() and close() methods that
               accepts native strings
        """
        if isinstance(out, (str, type(u""))):
            if out.endswith(".gz"):
                raw = GzipFile(out, "wb")
            else:
                raw = open(out, "wb")
            # Encode explicitly so that the bytes on disk always match the
            # UTF-8 declaration written below.
            self.out = codecs.getwriter("utf-8")(raw)
            self._ownsEncoder = True
        else:
            self.out = out
            self._ownsEncoder = False
        self.out.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        self.indentLevel = 0
        self.beginString = None  # start tag that has not been written yet
        self.tags = []  # tags of the currently open elements
        self.lastElement = None  # element of the pending start tag, if any

    def close(self):
        """Close every element that is still open, then close the output."""
        while self.tags:
            if self.lastElement is not None:
                # The innermost element was begun but never flushed; end()
                # serializes it completely.
                self.end(self.lastElement)
            else:
                self.indentLevel -= 1
                self.out.write(self.indentLevel * " " + "</" + self.tags.pop() + ">\n")
        self.out.close()
        self.out = None

    def begin(self, element):
        """Open a container element; its start tag is written lazily.

        Only the tag and the attributes of ``element`` are used here.
        """
        # Function-scope import: saxutils is needed by this method only.
        from xml.sax.saxutils import escape

        # A start tag that is still pending belongs to the parent of this
        # element and has to reach the output before it is replaced.
        self._flush()
        self.tags.append(element.tag)
        parts = [self.indentLevel * " ", "<", element.tag]
        for key in sorted(element.attrib):
            value = escape(element.get(key), {'"': "&quot;"})
            parts.append(" " + key + "=\"" + value + "\"")
        parts.append(">")
        self.beginString = "".join(parts)
        self.indentLevel += 1
        self.lastElement = element

    def _flush(self):
        """Write the pending start tag, if there is one."""
        if self.beginString is not None:
            self.out.write(self.beginString)
            self.out.write("\n" + self.indentLevel * " ")
            self.beginString = None

    def end(self, element):
        """Close the innermost open element and return its tag.

        If nothing was written since the matching begin(), the start tag is
        discarded and ``element`` is serialized as a whole instead.
        """
        self.indentLevel -= 1
        if element is self.lastElement:
            self.beginString = None
            self.write(element)
        else:
            self.out.write(self.indentLevel * " " + "</" + element.tag + ">\n")
        self.lastElement = None
        return self.tags.pop()

    def write(self, element):
        """Indent ``element`` in place and write it as a complete subtree."""
        self._flush()
        indent(element, self.indentLevel)
        xml = ElementTree.tostring(element, "utf-8")
        # tostring() returns a byte string. It must become text for the
        # encoding writer created in __init__ and, on Python 3, for any
        # text stream; a Python 2 caller stream keeps receiving bytes.
        if self._ownsEncoder or not isinstance(xml, str):
            xml = xml.decode("utf-8")
        self.out.write(xml)
        self.lastElement = None
158
def ETIteratorFromObj(obj, events=None):
    """Iterate over the elements of a file, a stream or an in-memory tree.

    obj can be
    1) a file name -> the file (unzipped first if the name ends with .gz)
       is parsed incrementally and the (event, element) pairs produced by
       the parser are yielded
    2) an ElementTree or an Element -> every element of the tree, starting
       with the root, is yielded as a ("memory", element) pair
    3) anything else, e.g. an open input stream -> it is parsed
       incrementally like a file

    Keyword arguments:
    obj -- see above
    events -- passed on to ElementTree.iterparse; not used for in-memory
              trees
    """
    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    if isinstance(obj, (str, type(u""))):
        if obj.endswith(".gz"):
            stream = GzipFile(obj, "rb")
        else:
            stream = open(obj, "rb")
        try:
            for rv in ElementTree.iterparse(stream, events):
                yield rv
        finally:
            # Also runs when the consumer abandons the generator early.
            stream.close()
    elif isinstance(obj, ElementTree.ElementTree) or ElementTree.iselement(obj):
        if ElementTree.iselement(obj):
            root = obj
        else:
            root = obj.getroot()
        # getiterator() is gone from current ElementTree versions and
        # iter() is missing from very old ones; use whichever exists.
        walk = getattr(root, "iter", None) or root.getiterator
        for element in walk():
            yield ("memory", element)
    else:
        for rv in ElementTree.iterparse(obj, events):
            yield rv
188
def write(rootElement, filename):
    """Write an XML tree to a file as indented UTF-8.

    The tree is indented in place before it is written, missing parent
    directories of ``filename`` are created, and the finished file is
    post-processed with encodeNewlines().

    Keyword arguments:
    rootElement -- (Element or ElementTree) the tree to write
    filename -- (string) output path; the file is gzip-compressed if the
                name ends with .gz
    """
    if isinstance(rootElement, ElementTree.ElementTree):
        rootElement = rootElement.getroot()
    indent(rootElement)

    dirname = os.path.dirname(filename)
    if dirname != "" and not os.path.exists(dirname):
        os.makedirs(dirname)

    # Binary mode: the serializer produces UTF-8 bytes itself, so the
    # declaration is written as bytes too.
    if filename.endswith(".gz"):
        out = GzipFile(filename, "wb")
    else:
        out = open(filename, "wb")
    try:
        out.write(b'<?xml version="1.0" encoding="UTF-8"?>\n')
        ElementTree.ElementTree(rootElement).write(out, "utf-8")
    finally:
        # Close the file even if serialization fails.
        out.close()

    encodeNewlines(filename)
206
def encodeNewlines(filename):
    """Replace line breaks inside an XML file's content by character references.

    A line break that directly follows a ">" is taken to be layout between
    tags and is kept. Every other "\\n" or "\\r" is rewritten as "&#10;" or
    "&#13;": an XML parser turns a literal line break inside an attribute
    value into a space, whereas a character reference survives parsing.
    The file is rewritten in place through a temporary copy.

    Keyword arguments:
    filename -- (string) UTF-8 encoded XML file; treated as gzip-compressed
                if the name ends with .gz
    """
    import re
    import shutil
    import tempfile

    references = {"\n": "&#10;", "\r": "&#13;"}
    # Group 1 captures a ">" directly in front of the line break, if any.
    lineBreak = re.compile(r"(>?)(\r\n|\r|\n)")

    def encode(match):
        if match.group(1):
            return match.group(0)  # layout line break after a tag: keep it
        return "".join(references[char] for char in match.group(2))

    tempdir = tempfile.mkdtemp()
    try:
        tempfilepath = os.path.join(tempdir, os.path.basename(filename))
        if filename.endswith(".gz"):
            inFile = codecs.getreader("utf-8")(GzipFile(filename, "rb"))
        else:
            inFile = codecs.open(filename, "r", "utf-8")
        try:
            if filename.endswith(".gz"):
                out = codecs.getwriter("utf-8")(GzipFile(tempfilepath, "wb"))
            else:
                out = codecs.open(tempfilepath, "w", "utf-8")
            try:
                for content in inFile:
                    out.write(lineBreak.sub(encode, content))
            finally:
                out.close()
        finally:
            inFile.close()
        shutil.copy2(tempfilepath, filename)
    finally:
        # Remove the temporary copy even if rewriting failed.
        shutil.rmtree(tempdir)
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
def makePath(element, tagList):
    """Make sure a chain of nested child elements exists below an element.

    For each tag in turn, the first direct child with that tag is looked up
    in the element reached so far and created if it is missing; the search
    then continues inside that child.

    Keyword arguments:
    element -- (Element) element below which the path starts
    tagList -- (list of strings) tags of the path, outermost first

    Returns the list of elements along the path, one per tag, in the same
    order as tagList.
    """
    result = []
    currElem = element
    for tag in tagList:
        for subElem in currElem:
            if subElem.tag == tag:
                break
        else:
            # No direct child carries this tag yet, so create one.
            subElem = ElementTree.SubElement(currElem, tag)
        result.append(subElem)
        currElem = subElem
    return result
277
def toStr(element, recursive=True, removePreTag=True):
    """Return a plain string rendering of an element.

    Attributes are listed in sorted order. Text, tails and attribute values
    are inserted as they are, without XML escaping, so the result is meant
    for display rather than for parsing.

    Keyword arguments:
    element -- (Element) the element to render
    recursive -- (bool) also render the child elements; if False only the
                 element's own tag, attributes, text and tail are rendered
    removePreTag -- (bool) strip a "{namespace}" prefix from the tags
    """
    tag = element.tag
    if removePreTag:
        tag = tag.split("}")[-1]
    parts = ["<" + tag]
    for key in sorted(element.attrib):
        parts.append(" " + key + "=\"" + element.get(key) + "\"")

    text = element.text
    children = list(element) if recursive else []
    if text is not None or children:
        parts.append(">")
        if text is not None:
            parts.append(text)
        for child in children:
            # Children are rendered with the same options as their parent.
            parts.append(toStr(child, recursive, removePreTag))
        parts.append("</" + tag + ">")
    else:
        parts.append("/>")

    if element.tail is not None:
        parts.append(element.tail)
    return "".join(parts)
304
def getElementByAttrib(parent, tag, attDict):
    """Find the first element with a given tag and given attribute values.

    The search covers ``parent`` itself and all of its descendants, in
    document order.

    Keyword arguments:
    parent -- (Element) root of the subtree to search
    tag -- (string) tag the element must have
    attDict -- (dict) attribute name -> value pairs that must all match;
               an empty dict matches any element with the right tag

    Returns the matching element, or None if there is none.
    """
    # getiterator() is gone from current ElementTree versions and iter()
    # is missing from very old ones; use whichever exists.
    walk = getattr(parent, "iter", None) or parent.getiterator
    for element in walk():
        if element.tag != tag:
            continue
        if all(element.get(key) == value for key, value in attDict.items()):
            return element
    return None
315
def setDefaultElement(parent, name):
    """Return the child element called ``name``, creating it if necessary.

    Keyword arguments:
    parent -- (Element) element whose child is wanted
    name -- (string) tag of the child; it is passed to parent.find() for
            the lookup and used as the tag of a newly created element

    Returns the first existing match, or a new empty element appended to
    ``parent``.
    """
    element = parent.find(name)
    # Identity test on purpose: an element without children is falsy, so a
    # plain truth test would mistake an existing empty child for a miss.
    if element is None:
        element = ElementTree.Element(name)
        parent.append(element)
    return element
322
if __name__ == "__main__":
    # Manual round-trip check: expects a file "delme.xml" in the current
    # directory, writes it out compressed, reads that back and writes it
    # again.
    r = ElementTree.parse("delme.xml").getroot()
    write(r, "delme1.xml.gz")
    r2 = ETFromObj("delme1.xml.gz").getroot()
    write(r2, "delme2.xml.gz")
328