Package TEES :: Package Utils :: Package InteractionXML :: Module DivideSets
[hide private]

Source Code for Module TEES.Utils.InteractionXML.DivideSets

  1  import sys, os 
  2  thisPath = os.path.dirname(os.path.abspath(__file__)) 
  3  sys.path.append(os.path.abspath(os.path.join(thisPath,"../.."))) 
  4  from Utils.ProgressCounter import ProgressCounter 
  5  try: 
  6      import xml.etree.cElementTree as ET 
  7  except ImportError: 
  8      import cElementTree as ET 
  9  import Utils.ElementTreeUtils as ETUtils 
 10   
def _newCorpusElement(corpusRoot):
    """Return an empty "corpus" element that copies the attributes of corpusRoot."""
    corpus = ET.Element("corpus")
    for key, value in corpusRoot.attrib.items():
        corpus.set(key, value)
    return corpus

def processCorpus(input, outDir, stem, tail, mergedSets=None, saveCombined=False, verbose=False):
    """
    Divide a corpus into one output file per value of the documents' "set" attribute.

    Each "document" child of the corpus root is appended to a new "corpus"
    element chosen by its "set" attribute. Documents without that attribute
    are counted under "No set" and are not written to any per-set file.
    Every resulting corpus is written to outDir/<stem>-<set><tail>.

    @param input: the corpus, passed as-is to ETUtils.ETFromObj
    @param outDir: output directory for the per-set files, created if missing
    @param stem: first part of every output file name
    @param tail: last part of every output file name
    @param mergedSets: optional list of collections of set names. For each
        collection an additional corpus is built from the documents of the
        named sets; it is named by joining the sorted names with "-and-".
        Names of sets that do not occur in the corpus are skipped with a
        warning, as is a merged set whose name is already in use.
    @param saveCombined: if True, the loaded corpus is also written to
        stem + tail (note: this path is not prefixed with outDir)
    @param verbose: if True, warn about each document that has no set
    """
    # A None default avoids sharing one mutable list between calls
    if mergedSets is None:
        mergedSets = []

    newCorpora = {}
    print >> sys.stderr, "Loading corpus file", input
    corpusRoot = ETUtils.ETFromObj(input).getroot()

    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {}
    for document in documents:
        counter.update()
        docSet = document.get("set")
        if docSet is None:
            if verbose:
                print >> sys.stderr, "Warning, no set defined for document", document.get("id")
            if "No set" not in countsByType:
                countsByType["No set"] = 0
            countsByType["No set"] += 1
            continue
        if docSet not in newCorpora:
            newCorpora[docSet] = _newCorpusElement(corpusRoot)
            countsByType[docSet] = 0
        newCorpora[docSet].append(document)
        countsByType[docSet] += 1

    # Make merged sets
    for mergedSet in mergedSets:
        if not mergedSet:
            # Nothing to merge; would only produce an empty, oddly named file
            continue
        tag = "-and-".join(sorted(mergedSet))
        if tag in newCorpora:
            # Appending into an existing corpus would duplicate its documents
            print >> sys.stderr, "Warning, set", tag, "already exists, merged set skipped"
            continue
        newCorpora[tag] = _newCorpusElement(corpusRoot)
        countsByType[tag] = 0
        for componentSet in mergedSet:
            if componentSet not in newCorpora:
                print >> sys.stderr, "Warning, set", componentSet, "not found in corpus, skipped for merged set", tag
                continue
            for element in newCorpora[componentSet].findall("document"):
                newCorpora[tag].append(element)
                countsByType[tag] += 1

    print >> sys.stderr, "Documents per set"
    for k in sorted(countsByType):
        print >> sys.stderr, "  " + str(k) + ":", countsByType[k]

    if not os.path.exists(outDir):
        os.makedirs(outDir)

    print >> sys.stderr, "Writing output files to directory", outDir
    if saveCombined:
        print >> sys.stderr, "Saving combined input to", stem + tail
        ETUtils.write(corpusRoot, stem + tail)
    else:
        print >> sys.stderr, "Combined input not saved"
    for docSet in sorted(newCorpora):
        outFilename = os.path.join(outDir, stem + "-" + docSet + tail)
        print >> sys.stderr, "Writing set", docSet, "to", outFilename
        ETUtils.write(newCorpora[docSet], outFilename)

if __name__=="__main__":
    import sys
    print >> sys.stderr, "##### Split elements with merged types #####"

    from optparse import OptionParser
    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(usage="%prog [options]\nDivide an interaction xml corpus into one file per document set.")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in interaction xml format", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output directory for the per-set files.")
    optparser.add_option("-s", "--stem", default=None, dest="stem", help="Output file stem.")
    optparser.add_option("-t", "--tail", default=None, dest="tail", help="Output file tail.")
    # Repeatable: every -m defines one merged set
    optparser.add_option("-m", "--merged", default=None, dest="merged", action="append", help="Comma-separated names of sets to combine into an additional merged set. Can be given multiple times.", metavar="SETS")
    (options, args) = optparser.parse_args()

    # All four options are required; report the first missing one and exit
    requiredOptions = [(options.input, "input file"),
                       (options.output, "output directory"),
                       (options.stem, "output stem"),
                       (options.tail, "output tail")]
    for value, description in requiredOptions:
        if value is None:
            print >> sys.stderr, "Error, " + description + " not defined."
            optparser.print_help()
            sys.exit(1)

    # Turn each "a,b" argument into a list of set names, dropping empty entries
    mergedSets = []
    for group in options.merged or []:
        names = [name.strip() for name in group.split(",") if name.strip()]
        if names:
            mergedSets.append(names)

    processCorpus(options.input, options.output, options.stem, options.tail, mergedSets)