Package TEES :: Package Utils :: Package InteractionXML :: Module Catenate
[hide private]

Source Code for Module TEES.Utils.InteractionXML.Catenate

  1  import sys, os 
  2  thisPath = os.path.dirname(os.path.abspath(__file__)) 
  3  sys.path.append(os.path.abspath(os.path.join(thisPath,"../.."))) 
  4  import gzip, codecs 
  5  try: 
  6      import xml.etree.cElementTree as ET 
  7  except ImportError: 
  8      import cElementTree as ET 
  9  import Utils.ElementTreeUtils as ETUtils 
 10  import RecalculateIds 
 11   
def catenate(inputs, output, fast):
    """
    Catenate several interaction XML corpora into a single output file.

    @param inputs: list of paths of the corpora to catenate
    @param output: path of the file to write; its parent directory is
                   created if it does not exist yet
    @param fast: if True, use the line-based catenateFiles, otherwise the
                 element-based catenateElements
    @return: the output path, unchanged
    """
    # os.path.dirname returns "" for a bare file name, and os.makedirs("")
    # raises an error, so only create a directory when there is one to create.
    outputDir = os.path.dirname(output)
    if outputDir and not os.path.exists(outputDir):
        os.makedirs(outputDir)
    if fast:
        catenateFiles(inputs, output)
    else:
        catenateElements(inputs, output)
    return output
20
def _openByExtension(path, mode):
    """Open path with gzip if its name ends with ".gz", otherwise as a plain file."""
    if path.endswith(".gz"):
        return gzip.open(path, mode)
    return open(path, mode)


def catenateFiles(inputs, output):
    """
    Catenate interaction XML files line by line, without parsing them.

    Everything up to and including the line containing "<corpus" is copied
    only from the first input, and the line containing "</corpus" and
    everything after it only from the last input. The lines in between are
    copied from every input. The opening and closing corpus tags must
    therefore be on separate lines, and ids are not recalculated.

    @param inputs: list of at least two input paths (".gz" files are read
                   through gzip)
    @param output: output path (written through gzip if it ends with ".gz")
    @raise AssertionError: if fewer than two inputs are given, or if the
                           corpus tags of an input are not found in the
                           expected order
    """
    sys.stderr.write("##### Catenate interaction XML as files #####\n")
    assert len(inputs) > 1
    sys.stderr.write("Writing catenated XML to %s\n" % output)
    lastIndex = len(inputs) - 1
    outFile = _openByExtension(output, "wb")
    try:
        outWriter = codecs.getwriter("utf-8")(outFile)
        for i, inputPath in enumerate(inputs):
            sys.stderr.write("Catenating %s\n" % inputPath)
            f = _openByExtension(inputPath, "rb")
            # try/finally instead of "with": it closes the handle on every
            # exit path, including a failed state assertion below.
            try:
                state = "BEGIN"
                for line in codecs.getreader("utf-8")(f):
                    if "<corpus" in line:
                        assert state == "BEGIN"
                        state = "MIDDLE"
                        if i > 0:
                            continue  # corpus opening tag only from the first file
                    elif "</corpus" in line:
                        assert state == "MIDDLE"
                        state = "END"
                    if state == "BEGIN" and i > 0:
                        continue  # header lines only from the first file
                    if state == "END" and i < lastIndex:
                        continue  # closing tag and trailer only from the last file
                    outWriter.write(line)
            finally:
                f.close()
    finally:
        outFile.close()
53
def catenateElements(inputs, output):
    """
    Catenate interaction XML corpora as parsed element trees.

    Each input is passed through RecalculateIds.recalculateIds, the
    "document" elements of the later inputs are appended to the root of the
    first one, and the ids of all entity, interaction, sentence and document
    elements of the merged corpus are checked to be unique.

    @param inputs: non-empty list of corpora to catenate
    @param output: path to write the merged corpus to, or None to skip writing
    @return: the element tree of the first input, now holding all documents
    @raise AssertionError: if inputs is empty or a duplicate id is found
    """
    sys.stderr.write("##### Catenate interaction XML as elements #####\n")
    assert len(inputs) > 0, "No inputs to catenate"

    # The last argument of recalculateIds is presumably the index of the first
    # document, so each corpus is numbered after the ones before it.
    # NOTE(review): confirm against RecalculateIds.recalculateIds.
    c1 = RecalculateIds.recalculateIds(inputs[0], None, False, 0)
    c1Root = c1.getroot()
    numDocs = len(c1Root.findall("document"))
    sys.stderr.write("Documents in input 1: %d\n" % numDocs)

    for index, inputCorpus in enumerate(inputs[1:]):
        c2 = RecalculateIds.recalculateIds(inputCorpus, None, False, numDocs)
        documents = c2.getroot().findall("document")
        sys.stderr.write("Documents in input %d: %d\n" % (index + 2, len(documents)))
        sys.stderr.write("Appending documents\n")
        for document in documents:
            c1Root.append(document)
        numDocs += len(documents)

    sys.stderr.write("Validating ids\n")
    # A single set is shared by all element types, so an id must be unique
    # across types as well as within a type.
    ids = set()
    for tag in ("entity", "interaction", "sentence", "document"):
        for element in c1Root.getiterator(tag):
            elementId = element.get("id")
            assert elementId not in ids, "Duplicate id: " + str(elementId)
            ids.add(elementId)

    if output is not None:
        sys.stderr.write("Writing output to %s\n" % output)
        ETUtils.write(c1Root, output)
    return c1
if __name__=="__main__":
    # Command line entry point: parse the options, check that both the inputs
    # and the output are given, and catenate the corpora.
    import sys

    from optparse import OptionParser
    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        sys.stderr.write("Found Psyco, using\n")
    except ImportError:
        sys.stderr.write("Psyco not installed\n")

    optparser = OptionParser(usage="%prog [options]\n")
    optparser.add_option("-i", "--inputs", default=None, dest="inputs", help="A comma-separated list of corpora in interaction xml format", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file in interaction xml format.")
    optparser.add_option("-f", "--fast", default=False, action="store_true", dest="fast", help="Fast, but unsafe catenation")
    (options, args) = optparser.parse_args()

    if options.inputs is None:
        sys.stderr.write("Error, input files not defined.\n")
        optparser.print_help()
        sys.exit(1)
    # The inputs are given as one comma-separated string
    options.inputs = options.inputs.split(",")
    if options.output is None:
        sys.stderr.write("Error, output file not defined.\n")
        optparser.print_help()
        sys.exit(1)

    catenate(options.inputs, options.output, options.fast)