Package TEES :: Package Utils :: Module ElementTreeUtils
[hide private]

Source Code for Module TEES.Utils.ElementTreeUtils

  1  """ 
  2  Functions for easier use of cElementTree. 
  3   
  4    Program:    cElementTree Utilities 
  5    Date:       Oct. 16, 2007 
  6    Author:     Jari Bjoerne 
  7   
  8    Description: Convenience functions for easier use of cElementTree. 
  9  """ 
 10  __version__ = "$Revision: 1.20 $" 
 11   
 12  import sys, os 
 13  import codecs 
 14   
 15  try: 
 16      import cElementTree as ElementTree 
 17  except ImportError: 
 18      import xml.etree.cElementTree as ElementTree 
 19   
 20  from gzip import GzipFile 
 21   
def removeAll(element):
    """
    Detach every descendant of an element, leaving the element itself empty.

    The subtree of each child is emptied before the child is removed from
    its parent, so no detached element keeps children of its own. The
    text, tail and attributes of the elements are not modified.

    Keyword arguments:
    element -- (Element) the element whose subtree is dismantled
    """
    # Iterate over a snapshot: removing from the element while iterating
    # over it directly would skip children.
    for node in list(element):
        removeAll(node)
        element.remove(node)
27
def iterparse(file, elementName, callback, limit = -1):
    """
    Stream through an xml-file and hand matching elements to a callback.

    A thin wrapper around the cElementTree iterparse-function for its most
    common use: every completely parsed element called elementName is given
    to the callback, after which the root element is cleared so that memory
    use stays bounded on large files.

    Keyword arguments:
    file -- (file) file or file-like object to parse
    elementName -- (string) matching elements are passed to the callback
    callback -- (function) called with each fully parsed element of name
                elementName
    limit -- (int) stop after "limit" matching elements. If -1, read until
             end of file. Mostly useful when debugging programs that parse
             large files.
    """
    remaining = limit
    top = None # the document root, i.e. the first element that is started

    for kind, node in ElementTree.iterparse(file, events=("start", "end")):
        if remaining == 0:
            return

        if kind == "start":
            if top is None:
                top = node
            continue

        # only "end" events get here
        if node.tag != elementName:
            continue
        callback(node)
        top.clear() # drop everything parsed so far
        if remaining != -1:
            remaining -= 1
58
def indent(elem, level=0):
    """
    Indent an xml-tree in place so that it is readable when written out.

    Follows the indent-function given in the cElementTree-documentation.
    Only text and tail values that are empty or whitespace-only are
    replaced; real text content is left alone.

    Keyword arguments:
    elem -- (Element) root of the tree to indent
    level -- (int) starting level of indentation
    """
    def isBlank(value):
        return not value or not value.strip()

    pad = "\n" + level*" "
    children = list(elem)
    if children:
        if isBlank(elem.text):
            elem.text = pad + " " # first child starts one level deeper
        for child in children:
            indent(child, level+1)
        # the closing tag of elem follows the last child, so that child's
        # tail goes back to the indentation of elem itself
        lastChild = children[-1]
        if isBlank(lastChild.tail):
            lastChild.tail = pad
    if level and isBlank(elem.tail):
        elem.tail = pad
81
def ETFromObj(obj):
    """
    Return an ElementTree (or Element) for several kinds of input.

    obj can be
    1) a string that ends with .xml, .svg, .nxml or .csml -> the file is
       parsed and the resulting ElementTree returned
    2) a string that ends with .xml.gz -> the file is unzipped, parsed, and
       the resulting ElementTree is returned
    3) an open input stream -> the input is parsed and the resulting
       ElementTree is returned (the stream is left open)
    4) an ElementTree or an Element -> obj is returned as-is, nothing is done

    Keyword arguments:
    obj -- (string, stream, ElementTree or Element) the xml source

    Raises ValueError if obj is a filename with an unrecognized extension.
    """
    # (str, unicode) on Python 2, (str, str) on Python 3
    if isinstance(obj, (str, type(u""))):
        # Files are opened in binary mode so that the xml-parser itself
        # decodes the data according to the encoding the document declares.
        if obj.endswith(".xml.gz"):
            fStream = GzipFile(obj, "rb")
        elif obj.endswith((".xml", ".svg", ".nxml", ".csml")):
            fStream = open(obj, "rb")
        else:
            raise ValueError("%s: File format not recognized (expected .xml or .xml.gz)"%obj)
        # The stream was opened here, so it is also closed here, even if
        # parsing fails.
        try:
            return ElementTree.parse(fStream)
        finally:
            fStream.close()
    if isinstance(obj, ElementTree.ElementTree) or ElementTree.iselement(obj):
        return obj
    # not a string, not a tree, not an element, should be a stream
    # let's parse it
    return ElementTree.parse(obj)
104
class ETWriter():
    """
    Incremental writer for large xml-documents.

    Container elements are opened with begin() and closed with end(), and
    complete elements are written in between with write(), so that the
    whole tree never has to be held in memory. The xml-declaration is
    written when the writer is created.
    """
    def __init__(self, out):
        """
        Keyword arguments:
        out -- (string or stream) name of the output file (gzipped if the
               name ends with .gz) or an open, writable text stream
        """
        # (str, unicode) on Python 2, (str, str) on Python 3
        if isinstance(out, (str, type(u""))):
            if out.endswith(".gz"):
                # gzip.open accepts text mode on both Python 2 and 3,
                # GzipFile rejects it on Python 3
                import gzip
                self.out = gzip.open(out, "wt")
            else:
                self.out = open(out, "wt")
        else:
            self.out = out
        self.out.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        self.indentLevel = 0
        self.beginString = None # start tag that has not been written yet
        self.tags = [] # names of the currently open elements
        self.lastElement = None # element of the pending start tag

    def close(self):
        """Close all elements that are still open and then the output."""
        while len(self.tags) > 0:
            self.end()
        self.out.close()
        self.out = None

    # open element
    def begin(self, element):
        """
        Open a container element. Only the tag and the attributes of the
        element are used; its start tag is held back until something is
        written inside it.

        Keyword arguments:
        element -- (Element) the element to open
        """
        from xml.sax.saxutils import escape
        # A start tag that is still pending belongs to the parent of this
        # element and must be written before it is replaced.
        self._flush()
        self.tags.append(element.tag)
        self.beginString = self.indentLevel * " " + "<" + element.tag
        for key in sorted(element.attrib.keys()):
            # escape the value so that &, <, > and " cannot break the tag
            value = escape(element.get(key), {'"': "&quot;"})
            self.beginString += " " + key + "=\"" + value + "\""
        self.beginString += ">"
        self.indentLevel += 1
        self.lastElement = element

    def _flush(self):
        """Write the pending start tag, if there is one."""
        if self.beginString is not None:
            self.out.write(self.beginString)
            self.out.write("\n" + self.indentLevel * " ")
        self.beginString = None

    # close element
    def end(self, element=None):
        """
        Close the innermost open element and return its tag name.

        If nothing has been written since the element was opened, the
        pending start tag is dropped and the element is written as a whole
        instead.

        Keyword arguments:
        element -- (Element) the element to close. If None, the most
                   recently opened element is closed.
        """
        self.indentLevel -= 1
        pending = self.lastElement
        if pending is not None and (element is None or element == pending):
            self.beginString = None
            self.write(pending)
        else:
            self._flush() # never lose a start tag that is still pending
            if element is None:
                closingTag = self.tags[-1]
            else:
                closingTag = element.tag
            self.out.write(self.indentLevel * " " + "</" + closingTag + ">\n")
        self.lastElement = None
        return self.tags.pop()

    def write(self, element):
        """
        Write a complete element, with all of its content, at the current
        position. The element is indented in place before it is written.

        Keyword arguments:
        element -- (Element) the element to write
        """
        self._flush()
        indent(element, self.indentLevel)
        xml = ElementTree.tostring(element, "utf-8")
        if not isinstance(xml, str):
            # Python 3 returns bytes, the output is a text stream
            xml = xml.decode("utf-8")
        self.out.write(xml)
        self.lastElement = None
158
def ETIteratorFromObj(obj, events=None, parser=None):
    """
    Iterate over the elements of several kinds of xml input.

    This is a generator of (event, element) pairs. obj can be
    1) a string -> the named file is parsed iteratively. If the name ends
       with .gz the file is unzipped while it is read.
    2) an open input stream -> the input is parsed iteratively (the stream
       is left open)
    3) an ElementTree or an Element -> the tree is already in memory, so
       every element is yielded in document order with "memory" as the
       event

    Keyword arguments:
    obj -- (string, stream, ElementTree or Element) the xml source
    events -- (sequence) events passed on to the cElementTree
              iterparse-function. Not used for in-memory trees.
    parser -- not used, accepted for interface compatibility
    """
    # (str, unicode) on Python 2, (str, str) on Python 3
    if isinstance(obj, (str, type(u""))):
        # Binary mode lets the xml-parser decode the data according to the
        # encoding the document declares.
        if obj.endswith(".gz"):
            fStream = GzipFile(obj, "rb")
        else:
            fStream = open(obj, "rb")
        # The stream was opened here, so it is closed here as well, also
        # when the caller abandons the iteration early.
        try:
            for rv in ElementTree.iterparse(fStream, events):
                yield rv
        finally:
            fStream.close()
    elif isinstance(obj, ElementTree.ElementTree) or ElementTree.iselement(obj):
        root = obj if ElementTree.iselement(obj) else obj.getroot()
        # getiterator() is gone from Python 3.9, iter() is missing from
        # old ElementTree versions
        walk = getattr(root, "iter", None)
        if walk is None:
            walk = root.getiterator
        for element in walk():
            yield ("memory", element)
    else:
        # not a string, not a tree, not an element, should be a stream
        # let's parse it
        for rv in ElementTree.iterparse(obj, events):
            yield rv
188
def write(rootElement, filename):
    """
    Write an xml-tree into a file as indented, UTF-8 encoded xml.

    The tree is indented in place before it is written. Missing
    directories on the path to the file are created. After writing, the
    file is post-processed with encodeNewlines.

    Keyword arguments:
    rootElement -- (Element or ElementTree) the tree to write
    filename -- (string) name of the output file, gzipped if the name ends
                with .gz
    """
    if isinstance(rootElement, ElementTree.ElementTree):
        rootElement = rootElement.getroot()
    indent(rootElement)
    # Create intermediate paths if needed
    directory = os.path.dirname(filename)
    if directory != "" and not os.path.exists(directory):
        os.makedirs(directory)
    # Open the output file. ElementTree produces encoded bytes, so the file
    # is opened in binary mode; this also keeps the newlines untranslated
    # for encodeNewlines.
    if filename.endswith(".gz"):
        out = GzipFile(filename, "wb")
    else:
        out = open(filename, "wb")
    try:
        out.write(b'<?xml version="1.0" encoding="UTF-8"?>\n')
        ElementTree.ElementTree(rootElement).write(out, "utf-8")
    finally:
        out.close()
    # Fix newlines inside attributes
    encodeNewlines(filename)
206
def encodeNewlines(filename):
    """
    Replace the line breaks inside attribute values with &#10; in a file.

    The file is rewritten in place through a temporary copy. A line break
    that directly follows a ">" is taken to separate two elements and is
    kept; every other line break is replaced with the character reference
    &#10;. A CR LF pair counts as one line break.

    Keyword arguments:
    filename -- (string) name of a UTF-8 encoded xml-file, gzipped if the
                name ends with .gz
    """
    import re, shutil, tempfile
    # Line breaks that are NOT directly preceded by ">". The second
    # alternative excludes the LF of a CR LF pair, which is always handled
    # (replaced or kept) together with its CR by the first alternative.
    lineBreak = re.compile(r"(?<!>)\r\n?|(?<![>\r])\n")

    tempdir = tempfile.mkdtemp()
    try:
        tempfilepath = os.path.join(tempdir, os.path.basename(filename))
        if filename.endswith(".gz"):
            inFile = codecs.getreader("utf-8")(GzipFile(filename, "rb"))
        else:
            inFile = codecs.open(filename, "rb", "utf-8")
        try:
            if filename.endswith(".gz"):
                out = codecs.getwriter("utf-8")(GzipFile(tempfilepath, "wb"))
            else:
                out = codecs.open(tempfilepath, "wb", "utf-8")
            try:
                # line by line, so that large files need not fit in memory
                for content in inFile:
                    out.write(lineBreak.sub("&#10;", content))
            finally:
                out.close()
        finally:
            inFile.close()
        shutil.copy2(tempfilepath, filename)
    finally:
        # the temporary directory is removed even if rewriting fails
        shutil.rmtree(tempdir)
235 236 # if filename.endswith(".gz"): 237 # #out=GzipFile(filename,"wt") 238 # out = codecs.getwriter("utf-8")(GzipFile(filename,"wt")) 239 # else: 240 # #out=open(filename,"wt") 241 # out=codecs.open(filename, "wt", "utf-8") 242 # out.write(content) 243 # out.close() 244 245 #def writeUTF8(rootElement,out): 246 # indent(rootElement) 247 # if isinstance(out,str): 248 # if out.endswith(".gz"): 249 # f=GzipFile(out,"wt") 250 # else: 251 # f=open(out,"wt") 252 # print >> f, '<?xml version="1.0" encoding="UTF-8"?>' 253 # ElementTree.ElementTree(rootElement).write(f,"utf-8") 254 # f.close() 255 # encodeNewlines(out) 256 # else: 257 # print >> out, '<?xml version="1.0" encoding="UTF-8"?>' 258 # ElementTree.ElementTree(rootElement).write(out,"utf-8") 259 260
def makePath(element,tagList):
    """
    Follow a path of tag names down from an element, creating the
    elements that are missing.

    On each level the first child with the wanted tag is used. If there is
    no such child, a new empty one is appended.

    Keyword arguments:
    element -- (Element) the element the path starts from
    tagList -- (list) tag names, one for each level of the path

    Returns a list of the elements on the path, one for each tag name.
    """
    path = []
    node = element
    for tag in tagList:
        match = next((child for child in node if child.tag == tag), None)
        if match is None:
            match = ElementTree.SubElement(node, tag)
        path.append(match)
        node = match
    return path
277
def toStr(element, recursive=True, removePreTag=True):
    """
    Return an element and its content as a string.

    Attributes are written in sorted order. Neither the text nor the
    attribute values are xml-escaped. The tail of the element, if any, is
    appended after the element.

    Keyword arguments:
    element -- (Element) the element to convert
    recursive -- not used, child elements are always included
    removePreTag -- (bool) if True, the part of each tag name up to the
                    last "}" (the namespace) is left out. Applies to the
                    child elements as well.
    """
    tag = element.tag
    if removePreTag:
        tag = tag.split("}")[-1]
    parts = ["<" + tag]
    for key in sorted(element.attrib.keys()):
        parts.append(" " + key + "=\"" + element.get(key) + "\"")
    # get content
    text = element.text
    children = list(element) # getchildren() is gone from Python 3.9
    if text is not None or len(children) > 0:
        parts.append(">") # content follows, close the start tag
        if text is not None:
            parts.append(text)
        for child in children:
            parts.append(toStr(child, recursive, removePreTag))
        parts.append("</" + tag + ">")
    else:
        parts.append("/>")

    if element.tail is not None:
        parts.append(element.tail)

    return "".join(parts)
304
def getElementByAttrib(parent, tag, attDict):
    """
    Find the first element with a given tag and given attribute values.

    The search covers the parent itself and all of its descendants, in
    document order.

    Keyword arguments:
    parent -- (Element) root of the subtree to search
    tag -- (string) tag name of the wanted element
    attDict -- (dict) attribute names and the values they must have. An
               empty dictionary matches any element with the right tag.

    Returns the first matching element, or None if there is none.
    """
    # getiterator() is gone from Python 3.9, iter() is missing from old
    # ElementTree versions
    walk = getattr(parent, "iter", None)
    if walk is None:
        walk = parent.getiterator
    for element in walk():
        if element.tag != tag:
            continue
        if all(element.get(k) == v for k, v in attDict.items()):
            return element
    return None
315
def setDefaultElement(parent, name):
    """
    Return the first element found with name under parent, creating it if
    there is none.

    Keyword arguments:
    parent -- (Element) the element to look in
    name -- (string) name of the wanted element, used both for finding it
            and as the tag of a new one

    Returns the existing element or the new, empty one appended to parent.
    """
    existing = parent.find(name)
    if existing is not None:
        return existing
    created = ElementTree.Element(name)
    parent.append(created)
    return created
if __name__=="__main__":
    # Manual round-trip check: read delme.xml, write it out gzipped, read
    # the gzipped copy back and write that out once more.
    sourceRoot = ElementTree.parse("delme.xml").getroot()
    write(sourceRoot, "delme1.xml.gz")
    reloadedRoot = ETFromObj("delme1.xml.gz").getroot()
    write(reloadedRoot, "delme2.xml.gz")