"""
Functions for easier use of cElementTree.

  Program:    cElementTree Utilities
  Date:       Oct. 16, 2007
  Author:     Jari Bjoerne

  Description: Convenience functions for easier use of cElementTree.
"""
 10  __version__ = "$Revision: 1.20 $" 
 11   
 12  import sys, os 
 13  import codecs 
 14   
 15  try: 
 16      import cElementTree as ElementTree 
 17  except ImportError: 
 18      import xml.etree.cElementTree as ElementTree 
 19   
 20  from gzip import GzipFile 
 21   
 23      for child in list(element): 
 24          removeAll(child) 
 25      for child in list(element): 
 26          element.remove(child) 
  27   
 28 -def iterparse(file, elementName, callback, limit = -1): 
  29      """ Parse iteratively xml-files 
 30       
 31      This function offers a simple way to use the cElementTree 
 32      iterparse-function the way it is often used. 
 33       
 34      Keyword arguments: 
 35      file -- (file) file or file-like object to parse  
 36      elementName -- (string) matching elements are passed to the callback 
 37      callback -- (function) called when parser has parsed an element 
 38                  of name elementName 
 39      limit -- (int) stop after reading "limit" elements. If -1, read 
 40               until end of file. This is mostly useful when debugging 
 41               programs that parse large files. 
 42      """ 
 43      context = ElementTree.iterparse(file, events=("start", "end")) 
 44      root = None 
 45   
 46      for event, elem in context: 
 47          if limit == 0: 
 48              return 
 49   
 50          if event == "start" and root is None: 
 51              root = elem      
 52          if event == "end" and elem.tag == elementName:  
 53               
 54              callback(elem) 
 55              root.clear() 
 56              if limit != -1: 
 57                  limit -= 1 
  58   
 60      """ indent-function as defined in cElementTree-documentation 
 61       
 62      This function will become part of cElementTree in some future 
 63      release. Until then, it can be used from here. This function 
 64      indents the xml-tree, so that it is more readable when written 
 65      out.  
 66       
 67      Keyword arguments: 
 68      elem -- (Element) root of the tree to indent  
 69      level -- (int) starting level of indentation 
 70      """ 
 71      i = "\n" + level*"  " 
 72      if len(elem): 
 73          if not elem.text or not elem.text.strip(): 
 74              elem.text = i + "  " 
 75          for e in elem: 
 76              indent(e, level+1) 
 77          if not e.tail or not e.tail.strip(): 
 78              e.tail = i 
 79      if level and (not elem.tail or not elem.tail.strip()): 
 80          elem.tail = i 
  81   
 83      """obj can be 
 84      1) a string that ends with .xml -> the file is parsed and the resulting ElementTree returned 
 85      2) a string that ends with .xml.gz -> the file is unzipped, parsed, and the resulting ElementTree is returned 
 86      3) an open input stream -> the input is parsed and the resulting ElementTree is returned 
 87      4) an ElementTree or an Element -> obj is returned as-is, nothing is done""" 
 88      if isinstance(obj,str) or isinstance(obj,unicode): 
 89          if obj.endswith(".xml.gz"): 
 90              fStream=GzipFile(obj,"rt") 
 91               
 92          elif obj.endswith(".xml") or obj.endswith(".svg") or obj.endswith(".nxml") or obj.endswith(".csml"): 
 93              fStream=open(obj,"rt") 
 94               
 95          else: 
 96              raise ValueError("%s: File format not recognized (expected .xml or .xml.gz)"%obj) 
 97          return ElementTree.parse(fStream) 
 98      elif isinstance(obj,ElementTree.ElementTree) or ElementTree.iselement(obj): 
 99          return obj 
100      else: 
101           
102           
103          return ElementTree.parse(obj) 
 104   
107          if isinstance(out,str): 
108              if out.endswith(".gz"): 
109                  self.out = GzipFile(out,"wt") 
110              else: 
111                  self.out = open(out,"wt") 
112          else: 
113              self.out = obj 
114          print >> self.out, '<?xml version="1.0" encoding="UTF-8"?>' 
115          self.indentLevel = 0 
116          self.beginString = None 
117          self.tags = [] 
118          self.lastElement = None 
 119       
121          while len(self.tags) > 0: 
122              self.end() 
123          self.out.close() 
124          self.out = None 
 125       
126       
127 -    def begin(self, element): 
 128          self.tags.append(element.tag) 
129          self.beginString = self.indentLevel * "  " + "<" + element.tag 
130          for key in sorted(element.attrib.keys()): 
131              self.beginString += " " + key + "=\"" + element.get(key) + "\"" 
132          self.beginString += ">" 
133          self.indentLevel += 1 
134          self.lastElement = element 
 135       
137          if self.beginString != None: 
138              self.out.write(self.beginString) 
139              self.out.write("\n" + self.indentLevel * "  ") 
140          self.beginString = None 
 141       
142       
143 -    def end(self, element): 
 144          self.indentLevel -= 1 
145          if element == self.lastElement: 
146              self.beginString = None 
147              self.write(element) 
148          else: 
149              self.out.write(self.indentLevel * "  " + "</" + element.tag + ">\n") 
150          self.lastElement = None 
151          return self.tags.pop() 
 152       
153 -    def write(self, element): 
 154          self._flush() 
155          indent(element, self.indentLevel) 
156          self.out.write(ElementTree.tostring(element, "utf-8")) 
157          self.lastElement = None 
  158   
160      """obj can be 
161      1) a string that ends with .xml -> the file is parsed and the resulting ElementTree returned 
162      2) a string that ends with .xml.gz -> the file is unzipped, parsed, and the resulting ElementTree is returned 
163      3) an open input stream -> the input is parsed and the resulting ElementTree is returned 
164      4) an ElementTree or an Element -> obj is returned as-is, nothing is done""" 
165      if isinstance(obj,str) or isinstance(obj,unicode): 
166          if obj.endswith(".gz"): 
167              fStream=GzipFile(obj,"rt") 
168               
169          else: 
170              fStream=open(obj,"rt") 
171               
172          for rv in ElementTree.iterparse(fStream, events): 
173              yield rv 
174      elif isinstance(obj,ElementTree.ElementTree) or ElementTree.iselement(obj): 
175          if ElementTree.iselement(obj): 
176              root = obj 
177          else: 
178              root = obj.getroot() 
179           
180           
181          for element in root.getiterator(): 
182              yield ("memory", element) 
183      else: 
184           
185           
186          for rv in ElementTree.iterparse(obj, events): 
187              yield rv 
 188   
def write(rootElement, filename):
    """Write an xml-tree to a file, indented, as UTF-8.

    The tree is indented in place, missing parent directories of
    filename are created, an XML declaration is written followed by the
    serialized tree, and finally encodeNewlines() is run on the file.

    Keyword arguments:
    rootElement -- (Element or ElementTree) the tree to write; for an
                   ElementTree its root element is used
    filename -- (string) output path; if it ends with .gz the output is
                gzip-compressed
    """
    if isinstance(rootElement, ElementTree.ElementTree):
        rootElement = rootElement.getroot()
    indent(rootElement)

    directory = os.path.dirname(filename)
    if directory != "" and not os.path.exists(directory):
        os.makedirs(directory)

    # Binary mode: the serializer emits encoded bytes, and GzipFile does
    # not accept text modes on Python 3.
    if filename.endswith(".gz"):
        out = GzipFile(filename, "wb")
    else:
        out = open(filename, "wb")
    try:
        out.write('<?xml version="1.0" encoding="UTF-8"?>\n'.encode("utf-8"))
        ElementTree.ElementTree(rootElement).write(out, "utf-8")
    finally:
        # Close even if serialization fails, so the handle is not leaked.
        out.close()

    encodeNewlines(filename)
 206   
def encodeNewlines(filename):
    """Replace line breaks inside the content of an xml-file with
    character references, rewriting the file in place.

    Line breaks that directly follow a ">" (i.e. those between tags) are
    kept; every other "\\n" becomes "&#10;" and every other "\\r" becomes
    "&#13;". The file is processed through a copy in a temporary
    directory, which is removed afterwards.

    Keyword arguments:
    filename -- (string) file to rewrite; if it ends with .gz it is read
                and written gzip-compressed
    """
    # NOTE(review): the two replacement strings below were reconstructed as
    # "&#10;" / "&#13;"; the listing this was recovered from had rendered
    # them as raw line breaks. Confirm against the original source.
    import tempfile, shutil

    tempdir = tempfile.mkdtemp()
    try:
        tempfilepath = os.path.join(tempdir, os.path.basename(filename))
        if filename.endswith(".gz"):
            inFile = codecs.getreader("utf-8")(GzipFile(filename, "rb"))
        else:
            # codecs.open always works on a binary file underneath.
            inFile = codecs.open(filename, "rb", "utf-8")
        try:
            if filename.endswith(".gz"):
                out = codecs.getwriter("utf-8")(GzipFile(tempfilepath, "wb"))
            else:
                out = codecs.open(tempfilepath, "wb", "utf-8")
            try:
                for content in inFile:
                    # Shield the line breaks that follow a tag, encode the
                    # remaining ones, then restore the shielded ones.
                    content = content.replace(">\n", "TEMP_PROTECT_N")
                    content = content.replace(">\r", "TEMP_PROTECT_R")
                    content = content.replace("\n", "&#10;")
                    content = content.replace("\r", "&#13;")
                    content = content.replace("TEMP_PROTECT_N", ">\n")
                    content = content.replace("TEMP_PROTECT_R", ">\r")
                    out.write(content)
            finally:
                out.close()
        finally:
            inFile.close()
        shutil.copy2(tempfilepath, filename)
    finally:
        # Always remove the scratch directory, even when rewriting failed.
        shutil.rmtree(tempdir)
 235       
236   
237   
238   
239   
240   
241   
242   
243   
244   
245   
246   
247   
248   
249   
250   
251   
252   
253   
254   
255   
256   
257   
258   
259   
260   
262       
263       
264       
265       
266      result=[] 
267      currElem=element 
268      for tag in tagList: 
269          for subElem in currElem: 
270              if subElem.tag==tag: 
271                  break 
272          else: 
273              subElem=ElementTree.SubElement(currElem,tag) 
274          result.append(subElem) 
275          currElem=subElem 
276      return result 
 277   
def toStr(element, recursive=True, removePreTag=True):
    """Serialize an element and its subtree into a string.

    Attributes are written in sorted key order. An element with neither
    text nor children is written in the short form "<tag/>". The
    element's tail, if any, is appended after its closing tag. Text and
    attribute values are copied verbatim, without XML escaping.

    Keyword arguments:
    element -- (Element) the element to serialize
    recursive -- (bool) accepted and passed on to the children, but not
                 otherwise used: children are always serialized
    removePreTag -- (bool) if True, everything up to the last "}" is cut
                    from tag names (i.e. the "{namespace}" prefix)
    """
    tag = element.tag
    if removePreTag:
        tag = tag.split("}")[-1]
    parts = ["<" + tag]
    for key in sorted(element.attrib.keys()):
        parts.append(" " + key + "=\"" + element.get(key) + "\"")

    text = element.text
    children = list(element)
    if text is not None or len(children) > 0:
        # There is content: close the start tag, emit content, end tag.
        parts.append(">")
        if text is not None:
            parts.append(text)
        for child in children:
            # Pass the options on so the subtree is rendered consistently.
            parts.append(toStr(child, recursive, removePreTag))
        parts.append("</" + tag + ">")
    else:
        parts.append("/>")

    if element.tail is not None:
        parts.append(element.tail)

    return "".join(parts)
 304   
306      for element in parent.getiterator(): 
307          if element.tag == tag: 
308              found = True 
309              for k, v in attDict.iteritems(): 
310                  if element.get(k) != v: 
311                      found = False 
312              if found: 
313                  return element 
314      return None 
 315   
317      element = parent.find(name) 
318      if element == None: 
319          element = ElementTree.Element(name) 
320          parent.append(element) 
321      return element 
 322   
if __name__ == "__main__":
    # Manual round-trip check: parse a plain file, write it gzipped, read
    # the gzipped file back and write it once more.
    r = ElementTree.parse("delme.xml").getroot()
    write(r, "delme1.xml.gz")
    r2 = ETFromObj("delme1.xml.gz").getroot()
    write(r2, "delme2.xml.gz")
328