import sys
import os
from subprocess import Popen, PIPE, check_output
import time
import uuid
from fabric.api import env, run, cd, get, hide, settings, remote_tunnel, show
from fabric.tasks import execute
from fabric.decorators import with_settings
from datetime import timedelta
from os.path import join as pj

from jinja2 import Environment, FileSystemLoader


JOB_SCHEDULERS = ('SGE', 'SLURM', 'LSF',
                  'PBS', 'TORQUE', 'MAUI', 'LOADLEVELER')

scheduler = None
job_db = None


def get_data(filename):
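    """Return the full path to *filename* in the MyCluster share directory.

    Looks for the share directory next to the package first and falls
    back to sys.prefix/share/MyCluster.
    """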
    packagedir = os.path.dirname(__file__)
    dirname = pj(packagedir, '..', 'share', 'MyCluster')
    fullname = os.path.join(dirname, filename)
    # Need to check that the file exists, as the share location
    # may also be sys.prefix/share
    if not os.path.isfile(fullname):
        dirname = pj(sys.prefix, 'share', 'MyCluster')
        fullname = os.path.join(dirname, filename)

    return fullname


def load_template(template_name):
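    """Load a Jinja2 template from the package's templates directory."""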
    # Use a local name so we do not shadow the fabric ``env`` imported above
    jinja_env = Environment(loader=FileSystemLoader(
        os.path.join(os.path.dirname(__file__), 'templates')))
    return jinja_env.get_template(template_name)


def detect_scheduling_sys():
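    """Detect which job scheduler is available on this machine.

    Returns the matching mycluster scheduler module (slurm, pbs, sge or
    lsf), or None if no supported scheduler is found.
    """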
    # Test for SLURM
    if os.getenv('SLURMHOME') is not None:
        return my_import('mycluster.slurm')

    try:
        line = check_output(['scontrol', 'ping'])
        if line.split('(')[0] == 'Slurmctld':
            return my_import('mycluster.slurm')
    except Exception:
        pass

    # Test for PBS
    try:
        line = check_output(['pbsnodes', '-a'])
        return my_import('mycluster.pbs')
    except Exception:
        pass

    # Test for SGE
    if os.getenv('SGE_CLUSTER_NAME') is not None:
        return my_import('mycluster.sge')

    # Test for LSF
    try:
        line = check_output('lsid')
        if line.split(' ')[0] == 'Platform':
            return my_import('mycluster.lsf')
    except Exception:
        pass

    return None


def queues():
    if scheduler is not None:
        return scheduler.queues()
    else:
        return []


def remote_sites():
    if job_db is not None:
        return job_db.remote_site_db
    else:
        return []


@with_settings(warn_only=True)
def remote_cmd():
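    """Run ``mycluster -p`` on the remote host and return its output.

    The output is written to a uniquely named file under ~/.mycluster on
    the remote side and then fetched back with fabric's get().
    """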
    output_file = '~/.mycluster/' + str(uuid.uuid4())
    with hide('output', 'running', 'warnings'), settings(warn_only=True):
        run('mycluster -p >' + output_file, pty=False)
        import StringIO  # Python 2 module; this code targets fabric 1.x
        contents = StringIO.StringIO()
        get(output_file, contents)
        # 'contents' now behaves like a file object holding the command output
        return contents.getvalue()


def remote_job_list(site):
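    """Execute remote_cmd on *site* (a user@host string) via fabric."""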
    env.use_ssh_config = True
    return execute(remote_cmd, hosts=[site])


def print_timedelta(td):
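    """Format a timedelta as a zero-padded D:HH:MM:SS style string."""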
    if td.days > 0:
        if td.days > 1:
            out = str(td).replace(" days, ", ":")
        else:
            out = str(td).replace(" day, ", ":")
    else:
        out = "0:" + str(td)
    outAr = out.split(':')
    outAr = ["%02d" % (int(float(x))) for x in outAr]
    out = ":".join(outAr)
    return out


def get_timedelta(date_str):
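    """Return a timedelta parsed from a string in [DD-[hh:]]mm:ss format.

    For example, '1-02:03:04' parses as 1 day, 2 hours, 3 minutes and
    4 seconds.
    """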
    days = 0
    hours = 0
    minutes = 0
    seconds = 0

    if date_str.count('-') == 1:
        days = int(date_str.split('-')[0])
        date_str = date_str.partition('-')[2]
    if date_str.count(':') == 2:
        hours = int(date_str.split(':')[0])
        date_str = date_str.partition(':')[2]

    try:
        minutes = int(date_str.split(':')[0])
        seconds = int(date_str.split(':')[1])
    except (IndexError, ValueError):
        pass

    return timedelta(days=days,
                     hours=hours,
                     minutes=minutes,
                     seconds=seconds)


def get_stats_time(stats):
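    """Extract formatted cpu time, wallclock time and their ratio from a
    job stats dict.

    Missing values are returned as '-' placeholders with a None ratio.
    """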
    wallclock = stats.get('wallclock', '-')
    wallclock_delta = None
    cputime_delta = None
    if wallclock != '-':
        try:
            wallclock_delta = wallclock
            wallclock = print_timedelta(wallclock_delta)
        except Exception:
            pass
    cputime = stats.get('cpu', '-')
    if cputime != '-':
        try:
            cputime_delta = cputime
            cputime = print_timedelta(cputime_delta)
        except Exception:
            pass

    time_ratio = None
    if cputime_delta and wallclock_delta:
        time_ratio = (float(cputime_delta.total_seconds()) /
                      wallclock_delta.total_seconds())

    return cputime, wallclock, time_ratio


def printjobs(num_lines):
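    """Print a table of all jobs in the local database, followed by any
    jobs found on registered remote sites (num_lines is currently unused).
    """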
    print('User name: {0} {1}'.format(job_db.user_db['user'].first_name,
                                      job_db.user_db['user'].last_name))
    jobs = job_list()
    print('     | {0:^10} | {1:^10} | {2:^10} | {3:^12} | {4:^12} | '
          '{5:^5} | {6:^20} | {7:50}'.format('Job ID',
                                             'Status',
                                             'NTasks',
                                             'CPU Time',
                                             'Wallclock',
                                             'Util %',
                                             'Job Name',
                                             'Job Dir'))
    for i, j in enumerate(jobs):
        job_id = jobs[j].job_id
        status = jobs[j].status
        # queue = jobs[j].queue
        # site_name = job_db.queue_db[queue].site_name
        # scheduler_type = job_db.site_db[site_name].scheduler_type
        cputime, wallclock, time_ratio = get_stats_time(jobs[j].stats)
        efficiency = '-'
        if time_ratio:
            try:
                efficiency = (time_ratio / (int(jobs[j].num_tasks) *
                                            int(jobs[j].threads_per_task)) * 100.0)
                efficiency = '{:.1f}'.format(efficiency)
            except Exception:
                pass

        if status == 'completed':
            print('{0:4} | {1:^10} | {2:^10} | {3:^10} | {4:^12} | {5:^12} | '
                  '{6:^5} | {7:^20} | {8:50}'.format(i + 1,
                                                     job_id,
                                                     status,
                                                     str(jobs[j].num_tasks) +
                                                     ' (' +
                                                     str(jobs[j].threads_per_task) +
                                                     ')',
                                                     cputime,
                                                     wallclock,
                                                     efficiency,
                                                     jobs[j].job_name,
                                                     jobs[j].job_dir))
        elif status == 'running':
            stats = scheduler.running_stats(job_id)
            cputime, wallclock, time_ratio = get_stats_time(stats)
            efficiency = '-'
            if time_ratio:
                try:
                    efficiency = (time_ratio / (int(jobs[j].num_tasks) *
                                                int(jobs[j].threads_per_task)) * 100.0)
                    efficiency = '{:.1f}'.format(efficiency)
                except Exception:
                    pass
            print('{0:4} | {1:^10} | {2:^10} | {3:^10} | {4:^12} | {5:^12} | '
                  '{6:^5} | {7:^20} | {8:50}'.format(i + 1,
                                                     job_id,
                                                     status,
                                                     str(jobs[j].num_tasks) +
                                                     ' (' +
                                                     str(jobs[j].threads_per_task) +
                                                     ')',
                                                     cputime,
                                                     wallclock,
                                                     efficiency,
                                                     jobs[j].job_name,
                                                     jobs[j].job_dir))
        else:
            print('{0:4} | {1:^10} | {2:^10} | {3:^10} | {4:^12} | {5:^12} | '
                  '{6:^5} | {7:^20} | {8:50}'.format(i + 1,
                                                     job_id,
                                                     status,
                                                     str(jobs[j].num_tasks) +
                                                     ' (' +
                                                     str(jobs[j].threads_per_task) +
                                                     ')',
                                                     '-',
                                                     '-',
                                                     efficiency,
                                                     jobs[j].job_name,
                                                     jobs[j].job_dir))

    remotes = remote_sites()
    for j in remotes:
        print('Remote Site: ' + remotes[j].name)
        remote_list = remote_job_list(remotes[j].user + '@' + remotes[j].name)
        for r in remote_list:
            print(remote_list[r])


def print_queue_info():
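    """Print per-queue configuration and availability for the local
    scheduler.
    """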
    print('{0:25} | {1:^15} | {2:^15} | {3:^15} | '
          '{4:^15} | {5:^15}'.format('Queue Name', 'Node Max Task',
                                     'Node Max Thread', 'Node Max Memory',
                                     'Max Task', 'Available Task'))
    for q in queues():
        try:
            nc = scheduler.node_config(q)
            tpn = scheduler.tasks_per_node(q)
            avail = scheduler.available_tasks(q)
        except Exception:
            # Skip queues the scheduler cannot report on; printing None
            # values below would raise a TypeError
            continue
        print('{0:25} | {1:^15} | {2:^15} | '
              '{3:^15} | {4:^15} | {5:^15}'.format(q, tpn,
                                                   nc['max thread'],
                                                   nc['max memory'],
                                                   avail['max tasks'],
                                                   avail['available']))


def create_submit(queue_id, script_name=None, **kwargs):
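    """Create a job submission script for *queue_id* from the scheduler's
    template.

    If script_name is given, the script is also written to that file,
    unless the file already exists.
    """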
    if job_db is not None:
        if 'user_email' not in kwargs:
            email = job_db.user_db['user'].email
            if email != 'unknown':
                kwargs['user_email'] = email

    if scheduler is not None:
        script = scheduler.create_submit(queue_id, **kwargs)

        if script_name is not None:
            if not os.path.isfile(script_name):
                with open(script_name, 'w') as f:
                    f.write(script)
            else:
                print('Warning file: {0} already exists. '
                      'Please choose a different name'.format(script_name))
        return script
    else:
        print('Warning: job scheduler not detected')
        return None


def submit(script_name, immediate, depends=None):
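    """Submit *script_name* to the detected scheduler and record the job
    in the local database.

    Returns the scheduler job id, -1 if the script file does not exist,
    or None when no scheduler is available.
    """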
    if scheduler is None:
        return None

    job_id = -1
    if os.path.isfile(script_name):
        job_id = scheduler.submit(script_name, immediate, depends)
        if job_id is not None:
            print('Job submitted with ID {0}'.format(job_id))
        if job_db is not None and job_id is not None:
            from persist import Job
            job = Job(job_id, time.time())
            with open(script_name, 'r') as f:
                for line in f:
                    if line.split('=')[0] == 'export NUM_TASKS':
                        job.num_tasks = line.split('=')[1].strip()
                    if line.split('=')[0] == 'export TASKS_PER_NODE':
                        job.tasks_per_node = line.split('=')[1].strip()
                    if line.split('=')[0] == 'export THREADS_PER_TASK':
                        job.threads_per_task = line.split('=')[1].strip()
                    if line.split('=')[0] == 'export NUM_NODES':
                        job.num_nodes = line.split('=')[1].strip()
                    if line.split('=')[0] == 'export MYCLUSTER_QUEUE':
                        job.queue = line.split('=')[1].strip()
                    if line.split('=')[0] == 'export MYCLUSTER_JOB_NAME':
                        job.job_name = line.split('=')[1].strip()

            job.script_name = script_name
            job.job_dir = os.path.dirname(os.path.abspath(script_name))
            job_db.add_job(job)
            job_db.add_queue(job.queue, scheduler.name())
    else:
        print('Error file: {0} does not exist.'.format(script_name))

    return job_id


def delete(job_id):
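    """Delete *job_id* via the scheduler, provided the job belongs to the
    current site and scheduler type.
    """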
    # Add check
    job = job_db.get(job_id)
    site_name = job.queue.site_name
    scheduler_type = job_db.site_db[site_name].scheduler_type

    if (scheduler.name() == site_name and
            scheduler.scheduler_type() == scheduler_type):
        scheduler.delete(job_id)
    else:
        print('JobID: ' + str(job_id) + ' not found at current site')


def add_remote(remote_site):
    if job_db is not None:
        job_db.add_remote(remote_site)


def export(job_id):
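    """Export a job's details (currently a no-op placeholder)."""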
    pass


def job_list():
    if job_db is not None:
        return job_db.job_db
    return []


def get_job(job_id):
    if job_db is not None:
        return job_db.get(job_id)
    return None


def my_import(name):
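    """Import a dotted module path and return the leaf module.

    Equivalent to importlib.import_module; __import__ alone returns the
    top-level package.
    """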
    mod = __import__(name)
    components = name.split('.')
    for comp in components[1:]:
        mod = getattr(mod, comp)
    return mod


def get_directory():
    from os.path import expanduser
    home = expanduser("~")
    directory = home + '/.mycluster/'
    return directory


def create_directory():
    directory = get_directory()
    if not os.path.exists(directory):
        os.makedirs(directory)
        return True
    else:
        return False


def create_db():
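    """Initialise the local job database, returning it (or None on failure)."""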
    global job_db
    try:
        from persist import JobDB
        job_db = JobDB()
    except Exception as e:
        print('Database failed to initialise. Error Message: ' + str(e))

    return job_db


def update_db():
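    """Refresh the status and stats of non-completed jobs from the
    scheduler.
    """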
    try:
        if scheduler is not None:
            status_dict = scheduler.status()
            jobs = job_list()
            for j in jobs:
                if jobs[j].status != 'completed':
                    job_id = jobs[j].job_id
                    if job_id in status_dict:
                        state = status_dict[job_id]
                        if state == 'r':
                            jobs[j].update_status('running')
                    else:
                        jobs[j].update_status('completed')
                        jobs[j].update_stats(scheduler.job_stats(job_id))
    except Exception as e:
        print('Database failed to update. Error Message: ' + str(e))


def sysscribe_update(job_id):
    if job_db is not None:
        from sysscribe import system
        job_db.get(job_id).update_sysscribe(system.system_dict())


def email_update(email):
    if job_db is not None:
        job_db.user_db['user'].update_email(email)


def firstname_update(name):
    if job_db is not None:
        job_db.user_db['user'].firstname(name)


def lastname_update(name):
    if job_db is not None:
        job_db.user_db['user'].lastname(name)


def get_user():
    if job_db is not None:
        return (job_db.user_db['user'].first_name + ' ' +
                job_db.user_db['user'].last_name)
    else:
        return 'unknown'


def get_email():
    if job_db is not None:
        return job_db.user_db['user'].email
    else:
        return 'unknown'


def get_site():
    return 'unknown'


def appname_update(job_id, appname):
    if job_db is not None:
        job_db.get(job_id).appname(appname)


def appdata_update(job_id, appdata):
    if job_db is not None:
        job_db.get(job_id).appdata(appdata)


def init():
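    """Detect the scheduler, set up ~/.mycluster and the job database,
    then print a summary of the configuration.
    """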
    global scheduler
    scheduler = detect_scheduling_sys()
    created = create_directory()
    if create_db() is not None:
        update_db()

    print('MyCluster Initialisation Info')
    print('-----------------------------')
    print('Local database in: ' + get_directory())
    print('User: ' + get_user())
    print('Email: ' + get_email())
    if not scheduler:
        print('Local job scheduler: None')
    else:
        print('Local job scheduler: ' + scheduler.scheduler_type())
    print('Site name: ' + get_site())
    print('')