1 import sys, os
2 import types
3
4 import subprocess
5 from ClusterConnection import ClusterConnection
6
# Batch script template for SLURM jobs; __init__ stores it as self.jobTemplate.
# Tokens such as %job, %stderr, %stdout, %memory, %wallTime, %cores,
# %stderrDir, %stdoutDir and %commands are placeholders. The code that fills
# them in is not visible here -- presumably ClusterConnection; TODO confirm.
SLURMJobTemplate = """#!/bin/bash -l
##execution shell environment

## name of your job
#SBATCH -J %job
## system error message output file
#SBATCH -e %stderr
## system message output file
#SBATCH -o %stdout
## a per-process (soft) memory limit
## limit is specified in MB
## example: 1 GB is 1000
#SBATCH --mem-per-cpu=%memory
## how long a job takes, wallclock time hh:mm:ss
#SBATCH -t %wallTime
## number of processes
#SBATCH -n %cores

mkdir -p %stderrDir
mkdir -p %stdoutDir

%commands"""
29
31 """
Connection for clusters managed by the Simple Linux Utility for Resource Management (SLURM, https://computing.llnl.gov/linux/slurm/).
33 """
def __init__(self, account=None, workdir=None, settings=None, wallTime=None, memory=None, cores=None, modules=None):
    """
    Set up a connection that uses the SLURM commands sbatch and squeue.

    All arguments are forwarded to ClusterConnection.__init__; the only
    values resolved here are the fallbacks for the two limits below.

    @param wallTime: wallclock limit ("hh:mm:ss" according to the job
        template's comments); "48:00:00" is used when None
    @param memory: per-cpu memory limit (MB according to the job
        template's comments); 4000 is used when None
    """
    # "is None" rather than "== None": only a genuinely missing argument
    # may trigger the fallback, whatever __eq__ the passed object defines.
    if wallTime is None:
        wallTime = "48:00:00"
    if memory is None:
        memory = 4000

    ClusterConnection.__init__(self, account=account, workdir=workdir, settings=settings, memory=memory, cores=cores, modules=modules, wallTime=wallTime)
    # SLURM-specific command names and batch script template
    self.submitCommand = "sbatch"
    self.jobListCommand = "squeue"
    self.jobTemplate = SLURMJobTemplate
45
def submit(self, script=None, jobDir=None, jobName=None, stdout=None, stderr=None):
    """
    Submit a job through ClusterConnection.submit and record its SLURM id.

    The output of the submission is echoed to sys.stderr. Its stdout has to
    start with "Submitted batch job"; the last whitespace-separated token is
    converted to an int and passed to self._writeJobFile under the key
    "SLURMID" with append=True.

    @return: the return value of self._writeJobFile
    @raise AssertionError: if the submission output is missing or does not
        start with "Submitted batch job" (the output is the exception's
        argument)
    """
    pstdout, pstderr = ClusterConnection.submit(self, script, jobDir, jobName, stdout, stderr)
    if pstderr is not None:
        print >> sys.stderr, pstderr
    print >> sys.stderr, pstdout
    # Raised explicitly instead of through "assert" so the check is not
    # stripped under "python -O"; the exception type stays the same for
    # existing callers.
    if pstdout is None or not pstdout.startswith("Submitted batch job"):
        raise AssertionError(pstdout)
    jobId = int(pstdout.split()[-1])
    return self._writeJobFile(jobDir, jobName, {"SLURMID":jobId}, append=True)
54
56 jobAttr = self._readJobFile(job)
57
58 if jobAttr == None:
59 return None
60 if "SLURMID" not in jobAttr:
61 return "FAILED"
62 for line in self.run("sacct -u " + self.getUserName() + " -j " + jobAttr["SLURMID"]):
63 line = line.strip()
64 splits = line.split()
65
66
67
68 if splits[0] == jobAttr["SLURMID"]:
69 if self.debug:
70 print >> sys.stderr, "sacct:", line
71 jobStatus = splits[5]
72 if jobStatus in ["RUNNING", "COMPLETING"]:
73 return "RUNNING"
74 elif jobStatus == "COMPLETED":
75 if "retcode" not in jobAttr:
76 return "RUNNING"
77 elif jobAttr["retcode"] == "0":
78 return "FINISHED"
79 else:
80 return "FAILED"
81 elif jobStatus in ["FAILED", "CANCELLED", "NODE_FAIL", "PREEMPTED", "TIMEOUT"]:
82 return "FAILED"
83 elif jobStatus in ["PENDING", "RESIZING", "SUSPENDED"]:
84 return "QUEUED"
85 else:
86 assert False, jobStatus
87 return "QUEUED"
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135