Package TEES :: Package Utils :: Module Release
[hide private]

Source Code for Module TEES.Utils.Release

  1  import sys,os 
  2  import types 
  3  import shutil 
  4  import filecmp 
  5  import subprocess 
  6  mainTEESDir = os.path.abspath(os.path.join(__file__, "../..")) 
  7  print mainTEESDir 
  8  sys.path.append(mainTEESDir) 
  9   
def _readProgramDescription(path):
    """
    Return the short description of the program stored at path, or None when
    the file never constructs its option parser.

    A file is recognised as a program when one of its lines contains the
    literal marker tested below. When that line holds exactly one quoted
    string, the description is the text after the first literal backslash-n
    (if any), cut at the first full stop and stripped. If several lines match,
    the last quoted description wins.
    """
    # "with" guarantees the handle is closed even if reading fails.
    with open(path, "rt") as f:
        lines = f.readlines()
    isExecutable = False
    description = ""
    for line in lines:
        if "optparser = OptionParser(" in line:
            # Only zero or one quoted string can be parsed reliably.
            assert line.count("\"") in [0, 2], line
            if line.count("\"") == 2:
                description = line.split("\"")[1]
                description = description.split("\\n", 1)[-1]
                description = description.split(".")[0]
                description = description.strip()
            isExecutable = True
    if isExecutable:
        return description
    return None


def listExecutables(filter=["Core", "FeatureBuilders", "InteractionXML", "GeniaEventsToSharedTask"]):
    """
    Print Markdown tables listing the executable programs under mainTEESDir.

    Every ".py" file below mainTEESDir is inspected, except files whose path
    (the part below mainTEESDir) contains one of the substrings in filter.
    Programs are grouped into three categories:

    - "Main Programs": files directly in mainTEESDir
    - "Tool Wrappers": sub-directory contains "Tools" or the file name
      contains "Preprocessor"
    - "Other Programs": everything else

    @param filter: substrings; a file is skipped when its path below
                   mainTEESDir contains any of them. The list is only read,
                   never modified.
    """
    tableTitleLines = "| Program | Location | Description |\n"
    tableTitleLines += "|:-----------|:-----------|:-----------|"
    mainTableTitleLines = "| Program | Description |\n"
    mainTableTitleLines += "|:-----------|:-----------|"
    categories = ["Main Programs", "Tool Wrappers", "Other Programs"]
    programs = {}
    for category in categories:
        programs[category] = []
    for triple in os.walk(mainTEESDir):
        for filename in sorted(triple[2]):
            filePath = os.path.join(triple[0], filename)
            # Match the rules only against the part of the path below the
            # root (leading separator kept). Matching the absolute path would
            # hide every program when the checkout location itself happens to
            # contain one of the filter words.
            pathBelowRoot = filePath[len(mainTEESDir):]
            skip = False
            for filterRule in filter:
                if filterRule in pathBelowRoot:
                    skip = True
                    break
            if skip or not filename.endswith(".py"):
                continue
            description = _readProgramDescription(filePath)
            if description is None:
                continue
            subDir = triple[0][len(mainTEESDir)+1:].strip()
            if subDir == "":
                category = "Main Programs"
            elif "Tools" in subDir or "Preprocessor" in filename:
                category = "Tool Wrappers"
            else:
                category = "Other Programs"
            programs[category].append([subDir, filename, description])

    # Output is written with sys.stdout.write so the function produces the
    # same text under Python 2 and Python 3.
    for category in categories:
        sys.stdout.write("## " + category + "\n")
        if category == "Main Programs":
            sys.stdout.write(mainTableTitleLines + "\n")
        else:
            sys.stdout.write(tableTitleLines + "\n")
        for program in sorted(programs[category]):
            if program[0] == "":
                # Programs in the root directory have no "Location" column.
                cells = ["|", program[1], "|", program[2], "|"]
            else:
                cells = ["|", program[1], "|", program[0], "|", program[2], "|"]
            sys.stdout.write(" ".join(cells) + "\n")
        sys.stdout.write("\n")
65
def extractModels(input, output, tasks, classificationOutput=None):
    """
    Collect trained models, training logs and (optionally) classification
    archives from a directory of training runs into a fresh output directory.

    Each sub-directory of input whose name, up to the first ".", is listed in
    tasks is processed. With TASK being that name prefix:

    - "model-devel" / "model-test" are copied to output/TASK-devel and
      output/TASK-test
    - "log.txt" is copied to output/TASK-train-log.txt
    - if classificationOutput is given, the file
      "classification-SUFFIX/SUFFIX-events.tar.gz" is copied to
      classificationOutput/TASK-SUFFIX-events.tar.gz

    Any existing output directory is deleted first. Two sub-directories that
    share the same TASK prefix map to the same destinations; the second model
    copy then fails because shutil.copytree requires a new destination.

    @param input: directory containing one sub-directory per training run
    @param output: directory to (re)create for the extracted models
    @param tasks: collection of task names to extract
    @param classificationOutput: optional directory for the event archives
    """
    assert input is not None
    assert output is not None
    assert input != output
    if os.path.exists(output):
        shutil.rmtree(output)
    if not os.path.exists(output):
        os.makedirs(output)
    for subDir in os.listdir(input):
        subDirAbs = os.path.join(input, subDir)
        taskName = subDir.split(".")[0]
        if not os.path.isdir(subDirAbs) or taskName not in tasks:
            continue
        # The training log does not depend on the model suffix, so copy it
        # once per task directory instead of once per suffix.
        logPath = os.path.join(subDirAbs, "log.txt")
        if os.path.exists(logPath):
            sys.stderr.write("Copying training log for " + subDir + "\n")
            shutil.copy2(logPath, os.path.join(output, taskName + "-train-log.txt"))
        for suffix in ["devel", "test"]:
            modelDir = os.path.join(subDirAbs, "model-" + suffix)
            if os.path.exists(modelDir):
                dst = os.path.join(output, taskName + "-" + suffix)
                sys.stderr.write("Copying model " + modelDir + " to " + dst + "\n")
                shutil.copytree(modelDir, dst)
            if classificationOutput is None:
                continue
            classificationDir = os.path.join(subDirAbs, "classification-" + suffix)
            if not os.path.exists(classificationDir):
                continue
            src = os.path.join(classificationDir, suffix + "-events.tar.gz")
            dst = os.path.join(classificationOutput, taskName + "-" + suffix + "-events.tar.gz")
            # The candidate archive path is echoed on stdout, as before.
            sys.stdout.write(src + "\n")
            if os.path.exists(src):
                sys.stderr.write("Copying classification " + src + " to " + dst + "\n")
                dstDir = os.path.dirname(dst)
                # dstDir is empty when classificationOutput is ""; there is
                # nothing to create in that case.
                if dstDir and not os.path.exists(dstDir):
                    os.makedirs(dstDir)
                shutil.copy2(src, dst)
96
def linkDuplicates(input, output):
    """
    Copy the input directory to output and replace duplicate files in the
    copy with relative symbolic links.

    Any existing output directory is removed first. Files of at most 1000
    bytes are never used as the original of a duplicate group. Files are
    processed in sorted path order, so the first path of each group of
    identical files is kept as the regular file and every later one becomes a
    symbolic link pointing directly at it.

    The list of detected duplicates is printed on stdout, progress messages
    on stderr.

    @param input: directory to copy (left untouched)
    @param output: directory to create; must be on a file system that
                   supports symbolic links
    """
    if os.path.exists(output):
        sys.stderr.write("Removing output directory\n")
        shutil.rmtree(output)
    sys.stderr.write("Copying input directory\n")
    shutil.copytree(input, output)
    sys.stderr.write("Listing files\n")
    files = []
    for triple in os.walk(output):
        for filename in triple[2]:
            filePath = os.path.join(triple[0], filename)
            if os.path.isfile(filePath):
                files.append(filePath)
    # os.walk order depends on the file system; sorting makes the choice of
    # which copy stays a regular file reproducible.
    files.sort()
    sys.stderr.write("Detecting duplicates\n")
    duplicates = {}
    matched = set()  # files already recorded as a duplicate of an earlier file
    for i in range(len(files) - 1):
        candidate = files[i]
        if candidate in matched:
            # Already scheduled to become a link; using it as an original
            # too would produce link-to-link chains.
            continue
        if os.path.getsize(candidate) > 1000:
            sys.stderr.write("Processing " + candidate + "\n")
            for other in files[i + 1:]:
                if other in matched:
                    continue
                if filecmp.cmp(candidate, other, shallow=False):
                    duplicates.setdefault(candidate, []).append(other)
                    matched.add(other)
        else:
            sys.stderr.write("Skipping small file " + candidate + "\n")
    sys.stderr.write("Duplicates found:\n")
    for key in sorted(duplicates.keys()):
        sys.stdout.write(key + " " + str(sorted(duplicates[key])) + "\n")
    sys.stderr.write("Replacing duplicates with links\n")
    for original in sorted(duplicates.keys()):
        for duplicate in duplicates[original]:
            # A relative link is resolved from the directory that contains
            # it, so the target must be expressed relative to that directory.
            # (os.path.commonprefix compares characters, not path components,
            # and is unrelated to where the link lives.)
            relPath = os.path.relpath(original, os.path.dirname(duplicate))
            sys.stderr.write("Linking: " + duplicate + " -> " + relPath + "\n")
            os.remove(duplicate)
            # os.symlink avoids building a shell command from file names.
            os.symlink(relPath, duplicate)
133
def buildModels(output, tasks, connection, dummy=False):
    """
    Build the release models.

    This function should be run on the cluster, so the connection argument is the
    same for both the batch system and the train-program it runs.

    One train.py command is submitted through batch() for every task. Tasks
    "GE" and "GE09" are trained as "GE.2" and "GE09.2"; the job tag passed to
    batch() is always the plain task name.

    @param output: passed to batch() as its output argument
    @param tasks: iterable of task names
    @param connection: connection string; inserted into the train.py command
                       line and also passed to batch(), so it must be a string
    @param dummy: passed through to batch()
    """
    # Project module, imported lazily so that the other actions of this
    # script do not depend on it.
    from batch import batch
    trainScript = os.path.join(mainTEESDir, "train.py")
    for task in tasks:
        taskName = task
        if task in ["GE", "GE09"]:
            taskName += ".2"
        # "%o" and "%j" are left in the command verbatim -- presumably
        # placeholders filled in by batch(); verify against the batch module.
        command = "python " + trainScript + " -t " + taskName + " -o %o/%j -c " + connection + " --clearAll"
        batch(command, input=None, connection=connection, jobTag=task, output=output, debug=True, dummy=dummy)

if __name__=="__main__":
    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        sys.stderr.write("Found Psyco, using\n")
    except ImportError:
        sys.stderr.write("Psyco not installed\n")

    from optparse import OptionParser
    # NOTE: listExecutables() scans source files for the next line and reads
    # the description from it, so keep it on one line with one quoted string.
    optparser = OptionParser(description="Make TEES release files.")
    actions = ["CONVERT_CORPORA", "BUILD_MODELS", "EXTRACT_MODELS", "PACKAGE_MODELS", "BUILD_APIDOC", "LIST_EXECUTABLES"]
    optparser.add_option("-i", "--input", default=None, dest="input", help="Input directory (EXTRACT_MODELS, PACKAGE_MODELS)")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output directory (BUILD_MODELS, EXTRACT_MODELS, PACKAGE_MODELS)")
    optparser.add_option("-a", "--action", default=None, dest="action", help="One of: " + ", ".join(actions))
    optparser.add_option("-t", "--tasks", default="GE,EPI,ID,BB,BI,BI-FULL,GE09,CO,REL,REN,DDI,DDI-FULL", dest="tasks", help="Comma-separated list of tasks (BUILD_MODELS, EXTRACT_MODELS)")
    optparser.add_option("-c", "--connection", default=None, dest="connection", help="Connection passed to the batch system and to train.py (BUILD_MODELS)")
    optparser.add_option("-d", "--dummy", action="store_true", default=False, dest="dummy", help="Passed to the batch system as its dummy flag (BUILD_MODELS)")
    optparser.add_option("--classificationOutput", default=None, dest="classificationOutput", help="Directory for copied classification archives (EXTRACT_MODELS)")
    (options, args) = optparser.parse_args()
    # optparser.error() prints the usage text and exits with status 2; unlike
    # an assert it is not stripped when Python runs with -O.
    if options.action not in actions:
        optparser.error("--action must be one of: " + ", ".join(actions))
    options.tasks = options.tasks.split(",")

    if options.action == "LIST_EXECUTABLES":
        listExecutables()
    elif options.action == "BUILD_MODELS":
        if options.connection is None:
            optparser.error("BUILD_MODELS requires --connection")
        buildModels(options.output, options.tasks, options.connection, options.dummy)
    elif options.action == "EXTRACT_MODELS":
        if options.input is None or options.output is None:
            optparser.error("EXTRACT_MODELS requires --input and --output")
        extractModels(options.input, options.output, options.tasks, options.classificationOutput)
    elif options.action == "PACKAGE_MODELS":
        if options.input is None or options.output is None:
            optparser.error("PACKAGE_MODELS requires --input and --output")
        linkDuplicates(options.input, options.output)
    else:
        # CONVERT_CORPORA and BUILD_APIDOC are accepted but have no
        # implementation in this script; say so instead of exiting silently.
        sys.stderr.write("Action " + options.action + " is not implemented in this script\n")
149 150 if __name__=="__main__": 151 # Import Psyco if available 152 try: 153 import psyco 154 psyco.full() 155 print >> sys.stderr, "Found Psyco, using" 156 except ImportError: 157 print >> sys.stderr, "Psyco not installed" 158 159 from optparse import OptionParser 160 optparser = OptionParser(description="Make TEES release files.") 161 optparser.add_option("-i", "--input", default=None, dest="input", help="") 162 optparser.add_option("-o", "--output", default=None, dest="output", help="") 163 optparser.add_option("-a", "--action", default=None, dest="action", help="") 164 optparser.add_option("-t", "--tasks", default="GE,EPI,ID,BB,BI,BI-FULL,GE09,CO,REL,REN,DDI,DDI-FULL", dest="tasks", help="") 165 optparser.add_option("-c", "--connection", default=None, dest="connection", help="") 166 optparser.add_option("-d", "--dummy", action="store_true", default=False, dest="dummy", help="") 167 optparser.add_option("--classificationOutput", default=None, dest="classificationOutput", help="") 168 (options, args) = optparser.parse_args() 169 assert options.action in ["CONVERT_CORPORA", "BUILD_MODELS", "EXTRACT_MODELS", "PACKAGE_MODELS", "BUILD_APIDOC", "LIST_EXECUTABLES"] 170 options.tasks = options.tasks.split(",") 171 172 if options.action == "LIST_EXECUTABLES": 173 listExecutables() 174 elif options.action == "BUILD_MODELS": 175 buildModels(options.output, options.tasks, options.connection, options.dummy) 176 elif options.action == "EXTRACT_MODELS": 177 extractModels(options.input, options.output, options.tasks, options.classificationOutput) 178 elif options.action == "PACKAGE_MODELS": 179 linkDuplicates(options.input, options.output) 180