1
2
3
4
5
6
7
8
9
10
11
12
13
14 from __future__ import absolute_import
15 from __future__ import print_function
16 import subprocess
17 import logging
18 import os
19 import time
20 import re
21 import glob
22 import inspect
23 import sys
24 import six
25 from six.moves import range
26 from six.moves import input
27
28 logger = logging.getLogger('madgraph.cluster')
29
30 try:
31 from madgraph import MadGraph5Error
32 import madgraph.various.misc as misc
33 except Exception as error:
34 if __debug__:
35 print(str(error))
36 from internal import MadGraph5Error
37 import internal.misc as misc
38
39 pjoin = os.path.join
43
46
47
48 multiple_try = misc.multiple_try
49 pjoin = os.path.join
53
54 def deco_interupt(f):
55 def deco_f_interupt(self, *args, **opt):
56 try:
57 return f(self, *args, **opt)
58 except error:
59 try:
60 self.remove(*args, **opt)
61 except Exception:
62 pass
63 raise error
64 return deco_f_interupt
65 return deco_interupt
66
79 return deco_f_store
80 return deco_store
81
83 """ This function checks whether compression of input files are necessary
84 given the running options given. """
85
86 if options['run_mode'] != 1 and options['cluster_temp_path'] is None:
87 return False
88 else:
89 return True
90
92 """Basic Class for all cluster type submission"""
93 name = 'mother class'
94 identifier_length = 14
95
97 """Init the cluster"""
98
99 self.submitted = 0
100 self.submitted_ids = []
101 self.finish = 0
102 self.submitted_dirs = []
103 self.submitted_exes = []
104 self.submitted_args = []
105
106 if 'cluster_queue' in opts:
107 self.cluster_queue = opts['cluster_queue']
108 else:
109 self.cluster_queue = 'madgraph'
110 if 'cluster_temp_path' in opts:
111 self.temp_dir = opts['cluster_temp_path']
112 else:
113 self.temp_dir = None
114 self.options = {'cluster_status_update': (600, 30)}
115 for key,value in opts.items():
116 self.options[key] = value
117 self.nb_retry = opts['cluster_nb_retry'] if 'cluster_nb_retry' in opts else 0
118 self.cluster_retry_wait = float(opts['cluster_retry_wait']) if 'cluster_retry_wait' in opts else 300
119 self.options = dict(opts)
120 self.retry_args = {}
121
122 self.packet = {}
123 self.id_to_packet = {}
124
def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
           log=None, required_output=[], nb_submit=0):
    """How to make one submission. Return status id on the cluster.

    Abstract hook: every concrete cluster back-end must override this.
    """
    # Raise NotImplementedError, not the NotImplemented singleton: the
    # singleton is not callable, so `raise NotImplemented(...)` would die
    # with a confusing TypeError instead of the intended message.
    raise NotImplementedError('No implementation of how to submit a job to cluster \'%s\'' % self.name)
129
130
@store_input()
def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
            log=None, input_files=[], output_files=[], required_output=[],
            nb_submit=0):
    """How to make one submission. Return status id on the cluster.
    NO SHARE DISK: input/output files are staged to/from a node-local
    temporary directory (self.temp_dir) through a generated wrapper script.

    input/output files are given relative to cwd."""

    # Resolve working directory and program path once (the original did
    # this twice, before and after the early-return checks).
    if cwd is None:
        cwd = os.getcwd()
    if not os.path.exists(prog):
        prog = os.path.join(cwd, prog)

    if not required_output and output_files:
        # by default every declared output file is mandatory
        required_output = output_files

    # No sandbox directory configured, or nothing to stage in/out:
    # fall back to a plain shared-disk submission.  (This merges the two
    # redundant early-return checks of the original implementation.)
    if not hasattr(self, 'temp_dir') or not self.temp_dir or \
            (not input_files and not output_files):
        return self.submit(prog, argument, cwd, stdout, stderr, log,
                           required_output=required_output, nb_submit=nb_submit)

    temp_file_name = "sub." + os.path.basename(prog) + '.'.join(argument)

    # Wrapper script: copy inputs to the node-local dir, run, copy outputs back.
    text = """#!/bin/bash
MYTMP=%(tmpdir)s/run$%(job_id)s
MYPWD=%(cwd)s
mkdir -p $MYTMP
cd $MYPWD
input_files=( %(input_files)s )
for i in ${input_files[@]}
do
cp -R -L $i $MYTMP
done
cd $MYTMP
echo '%(arguments)s' > arguments
chmod +x ./%(script)s
%(program)s ./%(script)s %(arguments)s
exit=$?
output_files=( %(output_files)s )
for i in ${output_files[@]}
do
cp -r $MYTMP/$i $MYPWD
done
# if [ "$exit" -eq "0" ]
# then
rm -rf $MYTMP
# fi
"""

    dico = {'tmpdir': self.temp_dir, 'script': os.path.basename(prog),
            'cwd': cwd, 'job_id': self.job_id,
            'input_files': ' '.join(input_files + [prog]),
            'output_files': ' '.join(output_files),
            'arguments': ' '.join([str(a) for a in argument]),
            # .py scripts are assumed self-executing (shebang); others run under bash
            'program': ' ' if '.py' in prog else 'bash'}

    new_prog = pjoin(cwd, temp_file_name)
    # with-statement: the original leaked the file handle
    with open(new_prog, 'w') as fsock:
        fsock.write(text % dico)
    misc.Popen(['chmod', '+x', new_prog], cwd=cwd)

    return self.submit(new_prog, argument, cwd, stdout, stderr, log,
                       required_output=required_output, nb_submit=nb_submit)
203
204
def cluster_submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                   log=None, input_files=[], output_files=[], required_output=[],
                   nb_submit=0, packet_member=None):
    """Cluster-independent wrapper around submit2 that also registers the
    job inside a Packet (a group of jobs sharing a completion callback).
    Should not be overwritten (except for DAG-type submission)."""

    job_id = self.submit2(prog, argument, cwd, stdout, stderr, log, input_files,
                          output_files, required_output, nb_submit)

    if not packet_member:
        return job_id

    if isinstance(packet_member, Packet):
        # a Packet object: register it (first time) and attach this job
        self.id_to_packet[job_id] = packet_member
        packet_member.put(job_id)
        if packet_member.tag not in self.packet:
            self.packet[packet_member.tag] = packet_member
    elif packet_member in self.packet:
        # a tag was passed: look up the already-registered packet
        registered = self.packet[packet_member]
        registered.put(job_id)
        self.id_to_packet[job_id] = registered
    return job_id
229
def control(self, me_dir=None):
    """Check the status of job associated to directory me_dir.
    Return (idle, run, finish, fail) counts."""

    if not self.submitted_ids:
        # NotImplementedError (the NotImplemented singleton is not callable)
        raise NotImplementedError('No implementation of how to control the job status to cluster \'%s\'' % self.name)
    idle, run, fail = 0, 0, 0
    # iterate over a copy: finished ids are removed from the list below
    for pid in self.submitted_ids[:]:
        # FIX: query the job id `pid` (the original passed the builtin `id`)
        status = self.control_one_job(pid)
        if status == 'I':
            idle += 1
        elif status == 'R':
            run += 1
        elif status == 'F':
            self.finish += 1
            self.submitted_ids.remove(pid)
        else:
            fail += 1

    return idle, run, self.finish, fail
248
250 """ control the status of a single job with it's cluster id """
251 raise NotImplemented('No implementation of how to control the job status to cluster \'%s\'' % self.name)
252
254 """get a unique run_name for all the jobs helps to identify the runs
255 in the controller for some cluster."""
256
257 if second_path:
258 path = os.path.realpath(pjoin(path, second_path))
259 elif not os.path.exists(path):
260 return path
261
262 if 'SubProcesses' in path:
263 target = path.rsplit('/SubProcesses',1)[0]
264 elif 'MCatNLO' in path:
265 target = path.rsplit('/MCatNLO',1)[0]
266 elif 'PY8_parallelization' in path:
267 target = path.rsplit('/PY8_parallelization',1)[0]
268 elif second_path:
269 target=path
270 logger.warning("cluster.get_job_identifier runs unexpectedly. This should be fine but report this message if you have problem.")
271 else:
272 target = path
273
274 if target.endswith('/'):
275 target = target[:-1]
276
277 target = misc.digest(target.encode())[-self.identifier_length:]
278 if not target[0].isalpha():
279 target = 'a' + target[1:]
280
281 return target
282
283
@check_interupt()
def wait(self, me_dir, fct, minimal_job=0, update_first=None):
    """Wait that all job are finish.
    if minimal_job set, then return if idle + run is lower than that number

    Polls self.control(me_dir) in a loop and reports each poll through
    fct(idle, run, finish).  Alternates between a short and a long polling
    period, taken from self.options['cluster_status_update'] = (long, short),
    depending on how the queue evolves."""

    # mode 1 -> poll every `shorttime` seconds; mode 0 -> every `longtime`
    mode = 1
    nb_iter = 0      # polls since the total job count last changed
    nb_short = 0     # consecutive short-mode polls (used to back off)
    change_at = 5    # iterations before considering a switch to slow polling

    if update_first:
        # report the initial state before entering the polling loop
        idle, run, finish, fail = self.control(me_dir)
        update_first(idle, run, finish)

    longtime, shorttime = self.options['cluster_status_update']

    nb_job = 0

    # NOTE(review): htcaas2 back-end requires a meta-submission step;
    # presumably metasubmit returns the identifier used by control() -- confirm
    if self.options['cluster_type'] == 'htcaas2':
        me_dir = self.metasubmit(self)

    while 1:
        old_mode = mode
        nb_iter += 1
        idle, run, finish, fail = self.control(me_dir)
        if nb_job:
            if idle + run + finish + fail != nb_job:
                # total changed: jobs were added/removed, restart the counter
                nb_job = idle + run + finish + fail
                nb_iter = 1
        else:
            nb_job = idle + run + finish + fail
        if fail:
            raise ClusterManagmentError('Some Jobs are in a Hold/... state. Please try to investigate or contact the IT team')
        if idle + run == 0:
            # everything done: one last report, then leave the loop
            logger.info('All jobs finished')
            fct(idle, run, finish)
            break
        if idle + run < minimal_job:
            # enough slots free: give control back to the caller
            return
        fct(idle, run, finish)

        # Decide the polling mode for the next iteration.
        if nb_iter < change_at:
            mode = 1
        elif idle < run:
            if old_mode == 0:
                if nb_short:
                    mode = 0
                elif idle:
                    if nb_iter > change_at + int(longtime)//shorttime:
                        mode = 0
                    else:
                        mode = 1
                        nb_short = 0
                else:
                    mode = 1
                    nb_short = 0
            elif old_mode == 1:
                nb_short += 1
                if nb_short > 3* max(change_at, int(longtime)//shorttime):
                    mode = 0
        else:
            mode = 0

        if old_mode > mode:
            # switching from fast to slow polling: tell the user
            logger.info('''Start to wait %ss between checking status.
Note that you can change this time in the configuration file.
Press ctrl-C to force the update.''' % self.options['cluster_status_update'][0])

        if mode == 0:
            try:
                time.sleep(self.options['cluster_status_update'][0])
            except KeyboardInterrupt:
                # ctrl-C forces an immediate status update and a reset to fast mode
                logger.info('start to update the status')
                nb_iter = min(0, change_at -2)
                nb_short = 0
        else:
            time.sleep(self.options['cluster_status_update'][1])

    # all jobs done: reset the bookkeeping for the next batch of submissions
    self.submitted = 0
    self.submitted_ids = []
    self.id_to_packet = {}
372
374 """Check the termination of the jobs with job_id and relaunch it if needed."""
375
376
377 if job_id not in self.retry_args:
378 if job_id in self.id_to_packet:
379 nb_in_packet = self.id_to_packet[job_id].remove_one()
380 if nb_in_packet == 0:
381
382 packet = self.id_to_packet[job_id]
383
384 packet.queue.join()
385
386 packet.fct(*packet.args)
387 del self.id_to_packet[job_id]
388 return 'resubmit'
389 else:
390 return True
391
392 args = self.retry_args[job_id]
393 if 'time_check' in args:
394 time_check = args['time_check']
395 else:
396 time_check = 0
397
398 for path in args['required_output']:
399 if args['cwd']:
400 path = pjoin(args['cwd'], path)
401
402 if not (os.path.exists(path) and os.stat(path).st_size != 0) :
403 break
404 else:
405
406 if time_check > 0:
407 logger.info('Job %s Finally found the missing output.' % (job_id))
408 del self.retry_args[job_id]
409 self.submitted_ids.remove(job_id)
410
411 if job_id in self.id_to_packet:
412 nb_in_packet = self.id_to_packet[job_id].remove_one()
413 if nb_in_packet == 0:
414
415 packet = self.id_to_packet[job_id]
416
417 packet.queue.join()
418
419 packet.fct(*packet.args)
420 del self.id_to_packet[job_id]
421 return 'resubmit'
422
423 return 'done'
424
425 if time_check == 0:
426 logger.debug('''Job %s: missing output:%s''' % (job_id,path))
427 args['time_check'] = time.time()
428 return 'wait'
429 elif self.cluster_retry_wait > time.time() - time_check:
430 return 'wait'
431
432
433 if self.nb_retry < 0:
434 logger.critical('''Fail to run correctly job %s.
435 with option: %s
436 file missing: %s''' % (job_id, args, path))
437 input('press enter to continue.')
438 elif self.nb_retry == 0:
439 logger.critical('''Fail to run correctly job %s.
440 with option: %s
441 file missing: %s.
442 Stopping all runs.''' % (job_id, args, path))
443 self.remove()
444 elif args['nb_submit'] >= self.nb_retry:
445 logger.critical('''Fail to run correctly job %s.
446 with option: %s
447 file missing: %s
448 Fails %s times
449 No resubmition. ''' % (job_id, args, path, args['nb_submit']))
450 self.remove()
451 else:
452 args['nb_submit'] += 1
453 logger.warning('resubmit job (for the %s times)' % args['nb_submit'])
454 del self.retry_args[job_id]
455 self.submitted_ids.remove(job_id)
456 if 'time_check' in args:
457 del args['time_check']
458 if job_id in self.id_to_packet:
459 self.id_to_packet[job_id].remove_one()
460 args['packet_member'] = self.id_to_packet[job_id]
461 del self.id_to_packet[job_id]
462 self.cluster_submit(**args)
463 else:
464 self.submit2(**args)
465 return 'resubmit'
466 return 'done'
467
@check_interupt()
def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None,
                    stderr=None, log=None, required_output=[], nb_submit=0,
                    input_files=[], output_files=[]):
    """launch one job on the cluster and wait for it to finish.

    stderr == -2 (with stdout set) means "append stderr to stdout" once
    the job has finished."""

    special_output = False
    if stderr == -2 and stdout:
        # redirect stderr to a sibling file; merged into stdout at the end
        special_output = True
        stderr = stdout + '.err'

    id = self.submit2(prog, argument, cwd, stdout, stderr, log,
                      required_output=required_output, input_files=input_files,
                      output_files=output_files)

    if self.options['cluster_type'] == 'htcaas2':
        if self.submitted == self.submitted_ids[-1]:
            id = self.metasubmit(self)

    # snapshot the call arguments so check_termination can resubmit this job
    frame = inspect.currentframe()
    args, _, _, values = inspect.getargvalues(frame)
    args = dict([(i, values[i]) for i in args if i != 'self'])
    self.retry_args[id] = args

    nb_wait = 0
    while 1:
        nb_wait += 1
        status = self.control_one_job(id)
        if not status in ['R', 'I']:
            status = self.check_termination(id)
            if status in ['wait']:
                time.sleep(30)
                continue
            elif status in ['resubmit']:
                id = self.submitted_ids[0]
                time.sleep(30)
                continue
            # extra security pause before declaring the job done
            time.sleep(30)
            break
        time.sleep(self.options['cluster_status_update'][1])

    # FIX: `run`/`idle` were incremented below without ever being defined,
    # which raised NameError whenever check_termination returned
    # 'wait'/'resubmit' here.  Initialise them so the counters are harmless.
    idle, run = 0, 0
    if required_output:
        status = self.check_termination(id)
        if status == 'wait':
            run += 1
        elif status == 'resubmit':
            idle += 1

    if special_output:
        # combine stdout and stderr; poll a few times to let a slow shared
        # filesystem make the files visible
        for i in range(5):
            if os.path.exists(stdout):
                if not os.path.exists(stderr):
                    time.sleep(5)
                if os.path.exists(stderr):
                    err_text = open(stderr).read()
                    if not err_text:
                        return
                    logger.warning(err_text)
                    text = open(stdout).read()
                    open(stdout, 'w').write(text + err_text)
                else:
                    return
            time.sleep(10)
536
def remove(self, *args, **opts):
    """Cancel submitted jobs.

    Fallback for back-ends with no removal command: it only warns and
    leaves the jobs running; subclasses (condor, pbs, ...) override it."""
    logger.warning("""This cluster didn't support job removal,
the jobs are still running on the cluster.""")
541
542 @store_input()
546
548 """routine which allow to modify the run_card/mg5cmd object to change the
549 default behavior of the runs.
550 This is called at the time of the compilation of the run_card.
551 Note that this function can be called multiple times by run.
552 """
553
554 return
555
557 """ an object for handling packet of job, it is designed to be thread safe
558 """
559
def __init__(self, name, fct, args, opts={}):
    """Store the callback information and create the thread-safe queue
    tracking the jobs that belong to this packet.

    name: tag identifying the packet
    fct/args/opts: callback executed once the last job of the packet ends
    """
    # local imports: keep module import cheap when packets are unused
    import six.moves.queue
    import threading
    self.queue = six.moves.queue.Queue()  # one entry per job in the packet
    self.tag = name
    self.fct = fct
    self.args = args
    # FIX: copy the mapping — storing the mutable default `{}` directly
    # would share (and leak) state between every Packet created without opts
    self.opts = dict(opts)
    self.done = threading.Event()
569
570 - def put(self, *args, **opts):
572
573 append = put
574
579
581 """class for dealing with the submission in multiple node"""
582
583 job_id = "$"
584
586 """Init the cluster """
587
588
589 super(MultiCore, self).__init__(self, *args, **opt)
590
591 import six.moves.queue
592 import threading
593 import six.moves._thread
594 self.queue = six.moves.queue.Queue()
595 self.done = six.moves.queue.Queue()
596 self.submitted = six.moves.queue.Queue()
597 self.stoprequest = threading.Event()
598 self.demons = []
599 self.nb_done =0
600 if 'nb_core' in opt:
601 self.nb_core = opt['nb_core']
602 elif isinstance(args[0],int):
603 self.nb_core = args[0]
604 else:
605 self.nb_core = 1
606 self.update_fct = None
607
608 self.lock = threading.Event()
609 self.pids = six.moves.queue.Queue()
610 self.done_pid = []
611 self.done_pid_queue = six.moves.queue.Queue()
612 self.fail_msg = None
613
614
615 for _ in range(self.nb_core):
616 self.start_demon()
617
618
620 import threading
621 t = threading.Thread(target=self.worker)
622 t.daemon = True
623 t.start()
624 self.demons.append(t)
625
626
628 import six.moves.queue
629 import six.moves._thread
630 while not self.stoprequest.isSet():
631 try:
632 args = self.queue.get()
633 tag, exe, arg, opt = args
634 try:
635
636 if isinstance(exe,str):
637 if os.path.exists(exe) and not exe.startswith('/'):
638 exe = './' + exe
639 if isinstance(opt['stdout'],str):
640 opt['stdout'] = open(opt['stdout'],'w')
641 if opt['stderr'] == None:
642 opt['stderr'] = subprocess.STDOUT
643 if arg:
644 proc = misc.Popen([exe] + arg, **opt)
645 else:
646 proc = misc.Popen(exe, **opt)
647 pid = proc.pid
648 self.pids.put(pid)
649 proc.wait()
650 if proc.returncode not in [0, 143, -15] and not self.stoprequest.isSet():
651 fail_msg = 'program %s launch ends with non zero status: %s. Stop all computation' % \
652 (' '.join([exe]+arg), proc.returncode)
653 logger.warning(fail_msg)
654 self.stoprequest.set()
655 self.remove(fail_msg)
656
657
658
659
660 else:
661 pid = tag
662 self.pids.put(pid)
663
664
665 returncode = exe(*arg, **opt)
666 if returncode != 0:
667 logger.warning("fct %s does not return 0. Stopping the code in a clean way. The error was:\n%s", exe, returncode)
668 self.stoprequest.set()
669 self.remove("fct %s does not return 0:\n %s" % (exe, returncode))
670 except Exception as error:
671 self.fail_msg = sys.exc_info()
672 logger.warning(str(error))
673 self.stoprequest.set()
674 self.remove(error)
675
676 if __debug__:
677 six.reraise(self.fail_msg[0], self.fail_msg[1], self.fail_msg[2])
678
679 self.queue.task_done()
680 self.done.put(tag)
681 self.done_pid_queue.put(pid)
682
683 try:
684 self.lock.set()
685 except six.moves._thread.error:
686 continue
687 except six.moves.queue.Empty:
688 continue
689
690
691
692
def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
           log=None, required_output=[], nb_submit=0):
    """submit a job on multicore machine"""

    tag = (prog, tuple(argument), cwd, nb_submit)
    # External executables carry their I/O options to the worker;
    # python callables are invoked directly and take none.
    if isinstance(prog, str):
        worker_opts = {'cwd': cwd, 'stdout': stdout, 'stderr': stderr}
    else:
        worker_opts = {}
    self.queue.put((tag, prog, argument, worker_opts))
    self.submitted.put(1)
    return tag
712
def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None,
                    stderr=None, log=None, **opts):
    """launch one job and wait for it (blocking call)"""
    if isinstance(stdout, str):
        stdout = open(stdout, 'w')
    if isinstance(stderr, str):
        # FIX: was `stdout = open(stderr, 'w')`, which clobbered the stdout
        # handle and left stderr pointing at the file *name*
        stderr = open(stderr, 'w')
    return misc.call([prog] + argument, stdout=stdout, stderr=stderr, cwd=cwd)
721
def remove(self, error=None):
    """Ensure that all threads are stopped and their child processes killed."""

    # signal every worker thread to stop, remembering the first failure
    self.stoprequest.set()
    if error and not self.fail_msg:
        self.fail_msg = error

    # move the already-finished pids out of the queue so we do not kill them
    while not self.done_pid_queue.empty():
        self.done_pid.append(self.done_pid_queue.get())

    # terminate every still-running process and its children
    while not self.pids.empty():
        pid = self.pids.get()
        self.pids.task_done()
        if isinstance(pid, tuple) or pid in self.done_pid:
            # tuples tag python-function jobs: nothing to kill
            continue
        os.system('CPIDS=$(pgrep -P %(pid)s); kill -15 $CPIDS > /dev/null 2>&1' \
                  % {'pid': pid})
        os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid': pid})
746
747
def wait(self, me_dir, update_status, update_first=None):
    """Waiting that all the jobs are done. This function also control that
    the submission by packet are handle correctly (i.e. submit the function)"""

    import six.moves.queue
    import threading

    try:
        last_status = (0, 0, 0)
        sleep_time = 1
        use_lock = True   # start by waiting on the worker-released lock
        first = True
        while True:
            force_one_more_loop = False

            # Drain the "done" queue; when the last job of a packet ends,
            # submit the packet's callback as one more job (hence the
            # force_one_more_loop flag below).
            while self.done.qsize():
                try:
                    tag = self.done.get(True, 1)
                except six.moves.queue.Empty:
                    pass
                else:
                    if self.id_to_packet and tuple(tag) in self.id_to_packet:
                        packet = self.id_to_packet[tuple(tag)]
                        remaining = packet.remove_one()
                        if remaining == 0:
                            # make sure every job of the packet is accounted for
                            packet.queue.join()
                            self.submit(packet.fct, packet.args)
                            force_one_more_loop = True
                    self.nb_done += 1
                    self.done.task_done()

            # Snapshot of the counters; thread-safe individually but only
            # approximate as a set.
            Idle = self.queue.qsize()
            Done = self.nb_done + self.done.qsize()
            Running = max(0, self.submitted.qsize() - Idle - Done)

            if Idle + Running <= 0 and not force_one_more_loop:
                update_status(Idle, Running, Done)
                # fully ensure that everything is indeed finished
                self.queue.join()
                break

            if (Idle, Running, Done) != last_status:
                if first and update_first:
                    update_first(Idle, Running, Done)
                    first = False
                else:
                    update_status(Idle, Running, Done)
                last_status = (Idle, Running, Done)

            # move finished pids aside so remove() will not re-kill them
            while not self.done_pid_queue.empty():
                pid = self.done_pid_queue.get()
                self.done_pid.append(pid)
                self.done_pid_queue.task_done()

            # How to wait for the next iteration:
            if use_lock:
                # wait (at most 300s) for a worker to release the lock
                use_lock = self.lock.wait(300)
                self.lock.clear()
                if not use_lock and Idle > 0:
                    use_lock = True
            else:
                # fall back to plain polling with a growing sleep time so
                # the loop can never dead-lock at the end
                time.sleep(sleep_time)
                sleep_time = min(sleep_time + 2, 180)
                if update_first:
                    update_first(Idle, Running, Done)

        if self.stoprequest.isSet():
            # a worker recorded a failure: forward it in its original form
            if isinstance(self.fail_msg, Exception):
                raise self.fail_msg
            elif isinstance(self.fail_msg, str):
                raise Exception(self.fail_msg)
            else:
                misc.sprint(self.fail_msg)
                six.reraise(self.fail_msg[0], self.fail_msg[1], self.fail_msg[2])

        # reset the multicore state for the next wave of submissions
        try:
            self.lock.clear()
        except Exception:
            pass
        self.done = six.moves.queue.Queue()
        self.done_pid = []
        self.done_pid_queue = six.moves.queue.Queue()
        self.nb_done = 0
        self.submitted = six.moves.queue.Queue()
        self.pids = six.moves.queue.Queue()
        self.stoprequest.clear()
        self.id_to_packet = {}

    except KeyboardInterrupt:
        # forward an already-recorded failure, otherwise re-raise the interrupt
        if isinstance(self.fail_msg, Exception):
            raise self.fail_msg
        elif isinstance(self.fail_msg, str):
            raise Exception(self.fail_msg)
        elif self.fail_msg:
            six.reraise(self.fail_msg[0], self.fail_msg[1], self.fail_msg[2])
        raise
857
859 """Basic class for dealing with cluster submission"""
860
861 name = 'condor'
862 job_id = 'CONDOR_ID'
863
864
865
@multiple_try()
def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
           required_output=[], nb_submit=0):
    """Submit a job prog to a Condor cluster"""

    # condor submit-description file, filled via %-formatting below
    text = """Executable = %(prog)s
output = %(stdout)s
error = %(stderr)s
log = %(log)s
%(argument)s
environment = CONDOR_ID=$(Cluster).$(Process)
Universe = vanilla
notification = Error
Initialdir = %(cwd)s
%(requirement)s
getenv=True
queue 1
"""

    if self.cluster_queue not in ['None', None]:
        requirement = 'Requirements = %s=?=True' % self.cluster_queue
    else:
        requirement = ''

    if cwd is None:
        cwd = os.getcwd()
    if stdout is None:
        stdout = '/dev/null'
    if stderr is None:
        stderr = '/dev/null'
    if log is None:
        log = '/dev/null'
    if not os.path.exists(prog):
        prog = os.path.join(cwd, prog)
    if argument:
        argument = 'Arguments = %s' % ' '.join(argument)
    else:
        argument = ''

    dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout,
            'stderr': stderr, 'log': log, 'argument': argument,
            'requirement': requirement}

    # pipe the submit file into condor_submit and read the job id back
    a = misc.Popen(['condor_submit'], stdout=subprocess.PIPE,
                   stdin=subprocess.PIPE)
    output, _ = a.communicate((text % dico).encode())

    # answer looks like: "1 job(s) submitted to cluster 2045702."
    # FIX: raw string — "\d" in a normal string is an invalid escape
    pat = re.compile(r"submitted to cluster (\d*)", re.MULTILINE)
    output = output.decode(errors='ignore')
    try:
        id = pat.search(output).groups()[0]
    except Exception:
        # narrowed from a bare except: still converts any parse failure
        raise ClusterManagmentError('fail to submit to the cluster: \n%s' \
                                    % output)
    self.submitted += 1
    self.submitted_ids.append(id)
    return id
928
@store_input()
@multiple_try()
def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
            log=None, input_files=[], output_files=[], required_output=[],
            nb_submit=0):
    """Submit the job on the cluster NO SHARE DISK
    input/output file should be give relative to cwd
    """

    if not required_output and output_files:
        # by default every declared output file is mandatory
        required_output = output_files

    if (input_files == [] == output_files):
        # nothing to transfer: fall back to the shared-disk submission
        return self.submit(prog, argument, cwd, stdout, stderr, log,
                           required_output=required_output, nb_submit=nb_submit)

    # submit-description file using condor's own file-transfer mechanism
    text = """Executable = %(prog)s
output = %(stdout)s
error = %(stderr)s
log = %(log)s
%(argument)s
should_transfer_files = YES
when_to_transfer_output = ON_EXIT
transfer_input_files = %(input_files)s
%(output_files)s
Universe = vanilla
notification = Error
Initialdir = %(cwd)s
%(requirement)s
getenv=True
queue 1
"""

    if self.cluster_queue not in ['None', None]:
        requirement = 'Requirements = %s=?=True' % self.cluster_queue
    else:
        requirement = ''

    if cwd is None:
        cwd = os.getcwd()
    if stdout is None:
        stdout = '/dev/null'
    if stderr is None:
        stderr = '/dev/null'
    if log is None:
        log = '/dev/null'
    if not os.path.exists(prog):
        prog = os.path.join(cwd, prog)
    if argument:
        argument = 'Arguments = %s' % ' '.join([str(a) for a in argument])
    else:
        argument = ''

    if input_files:
        input_files = ','.join(input_files)
    else:
        input_files = ''
    if output_files:
        output_files = 'transfer_output_files = %s' % ','.join(output_files)
    else:
        output_files = ''

    dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout,
            'stderr': stderr, 'log': log, 'argument': argument,
            'requirement': requirement, 'input_files': input_files,
            'output_files': output_files}

    # pipe the submit file into condor_submit and read the job id back
    a = subprocess.Popen(['condor_submit'], stdout=subprocess.PIPE,
                         stdin=subprocess.PIPE)
    output, _ = a.communicate((text % dico).encode())

    # answer looks like: "1 job(s) submitted to cluster 2045702."
    output = output.decode(errors='ignore')
    # FIX: raw string — "\d" in a normal string is an invalid escape
    pat = re.compile(r"submitted to cluster (\d*)", re.MULTILINE)
    try:
        id = pat.search(output).groups()[0]
    except Exception:
        # narrowed from a bare except: still converts any parse failure
        raise ClusterManagmentError('fail to submit to the cluster: \n%s' \
                                    % output)
    self.submitted += 1
    self.submitted_ids.append(id)
    return id
1016
1017
1018
1019
1020
1021 @multiple_try(nb_try=10, sleep=10)
1023 """ control the status of a single job with it's cluster id """
1024 cmd = 'condor_q '+str(id)+" -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'"
1025 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
1026 stderr=subprocess.PIPE)
1027
1028 error = status.stderr.read().decode(errors='ignore')
1029 if status.returncode or error:
1030 raise ClusterManagmentError('condor_q returns error: %s' % error)
1031
1032 return status.stdout.readline().decode(errors='ignore').strip()
1033
1034 jobstatus = {'0':'U', '1':'I','2':'R','3':'X','4':'C','5':'H','6':'E'}
1035 @check_interupt()
1036 @multiple_try(nb_try=10, sleep=10)
1038 """ control the status of a single job with it's cluster id """
1039
1040 if not self.submitted_ids:
1041 return 0, 0, 0, 0
1042
1043 packet = 15000
1044 idle, run, fail = 0, 0, 0
1045 ongoing = []
1046 for i in range(1+(len(self.submitted_ids)-1)//packet):
1047 start = i * packet
1048 stop = (i+1) * packet
1049 cmd = "condor_q " + ' '.join(self.submitted_ids[start:stop]) + \
1050 " -format \"%d \" ClusterId " + \
1051 " -format \"%d\\n\" JobStatus "
1052
1053 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
1054 stderr=subprocess.PIPE)
1055 error = status.stderr.read().decode(errors='ignore')
1056 if status.returncode or error:
1057 raise ClusterManagmentError('condor_q returns error: %s' % error)
1058
1059 for line in status.stdout:
1060 id, status = line.decode(errors='ignore').strip().split()
1061 status = self.jobstatus[status]
1062 ongoing.append(id)
1063 if status in ['I','U']:
1064 idle += 1
1065 elif status == 'R':
1066 run += 1
1067 elif status != 'C':
1068 fail += 1
1069
1070 for id in list(self.submitted_ids):
1071 if id not in ongoing:
1072 status = self.check_termination(id)
1073 if status == 'wait':
1074 run += 1
1075 elif status == 'resubmit':
1076 idle += 1
1077
1078 return idle, run, self.submitted - (idle+run+fail), fail
1079
@multiple_try()
def remove(self, *args, **opts):
    """Clean the jobs on the cluster"""

    if not self.submitted_ids:
        return
    # condor_rm accepts the full list of job ids in one call
    command = "condor_rm %s" % ' '.join(self.submitted_ids)
    misc.Popen([command], shell=True, stdout=open(os.devnull, 'w'))
    self.submitted_ids = []
1090
1092 """Basic class for dealing with cluster submission"""
1093
1094 name = 'pbs'
1095 job_id = 'PBS_JOBID'
1096 idle_tag = ['Q']
1097 running_tag = ['T','E','R']
1098 complete_tag = ['C']
1099
1100 maximum_submited_jobs = 2500
1101
@multiple_try()
def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
           required_output=[], nb_submit=0):
    """Submit a job prog to a PBS cluster"""

    me_dir = self.get_jobs_identifier(cwd, prog)

    # throttle: block until the queue drops below the submission cap
    if len(self.submitted_ids) > self.maximum_submited_jobs:
        fct = lambda idle, run, finish: logger.info('Waiting for free slot: %s %s %s' % (idle, run, finish))
        self.wait(me_dir, fct, self.maximum_submited_jobs)

    # `text` is the shell command piped to qsub's stdin
    text = ""
    if cwd is None:
        cwd = os.getcwd()
    else:
        text = " cd %s;" % cwd
    if stdout is None:
        stdout = '/dev/null'
    if stderr is None:
        stderr = '/dev/null'
    elif stderr == -2:
        # -2 is the convention for "redirect stderr to stdout"
        stderr = stdout
    if log is None:
        log = '/dev/null'

    if not os.path.isabs(prog):
        text += "./%s" % prog
    else:
        text += prog

    if argument:
        text += ' ' + ' '.join(argument)

    command = ['qsub', '-o', stdout,
               '-N', me_dir,
               '-e', stderr,
               '-V']

    if self.cluster_queue and self.cluster_queue != 'None':
        command.extend(['-q', self.cluster_queue])

    a = misc.Popen(command, stdout=subprocess.PIPE,
                   stderr=subprocess.STDOUT,
                   stdin=subprocess.PIPE, cwd=cwd)

    # qsub prints "<jobid>.<server>": keep the numeric job id only
    output = a.communicate(text.encode())[0].decode(errors='ignore')
    id = output.split('.')[0]
    if not id.isdigit() or a.returncode != 0:
        raise ClusterManagmentError('fail to submit to the cluster: \n%s' \
                                    % output)

    self.submitted += 1
    self.submitted_ids.append(id)
    return id
1157
1158 @multiple_try()
1160 """ control the status of a single job with it's cluster id """
1161 cmd = 'qstat '+str(id)
1162 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
1163 stderr=subprocess.STDOUT)
1164
1165 for line in status.stdout:
1166 line = line.decode(errors='ignore').strip()
1167 if 'cannot connect to server' in line or 'cannot read reply' in line:
1168 raise ClusterManagmentError('server disconnected')
1169 if 'Unknown' in line:
1170 return 'F'
1171 elif line.startswith(str(id)):
1172 jobstatus = line.split()[4]
1173 else:
1174 jobstatus=""
1175
1176 if status.returncode != 0 and status.returncode is not None:
1177 raise ClusterManagmentError('server fails in someway (errorcode %s)' % status.returncode)
1178 if jobstatus in self.idle_tag:
1179 return 'I'
1180 elif jobstatus in self.running_tag:
1181 return 'R'
1182 return 'F'
1183
1184
1185 @multiple_try()
1187 """ control the status of a single job with it's cluster id """
1188 cmd = "qstat"
1189 status = misc.Popen([cmd], stdout=subprocess.PIPE)
1190
1191 me_dir = self.get_jobs_identifier(me_dir)
1192
1193 ongoing = []
1194
1195 idle, run, fail = 0, 0, 0
1196 for line in status.stdout:
1197 line = line.decode(errors='ignore')
1198 if 'cannot connect to server' in line or 'cannot read reply' in line:
1199 raise ClusterManagmentError('server disconnected')
1200 if me_dir in line:
1201 ongoing.append(line.split()[0].split('.')[0])
1202 status2 = line.split()[4]
1203 if status2 in self.idle_tag:
1204 idle += 1
1205 elif status2 in self.running_tag:
1206 run += 1
1207 elif status2 in self.complete_tag:
1208 if not self.check_termination(line.split()[0].split('.')[0]):
1209 idle += 1
1210 else:
1211 fail += 1
1212
1213 if status.returncode != 0 and status.returncode is not None:
1214 raise ClusterManagmentError('server fails in someway (errorcode %s)' % status.returncode)
1215
1216 for id in list(self.submitted_ids):
1217 if id not in ongoing:
1218 status2 = self.check_termination(id)
1219 if status2 == 'wait':
1220 run += 1
1221 elif status2 == 'resubmit':
1222 idle += 1
1223
1224 return idle, run, self.submitted - (idle+run+fail), fail
1225
1226 @multiple_try()
1227 - def remove(self, *args, **opts):
1228 """Clean the jobs on the cluster"""
1229
1230 if not self.submitted_ids:
1231 return
1232 cmd = "qdel %s" % ' '.join(self.submitted_ids)
1233 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1234 self.submitted_ids = []
1235
1238 """Basic class for dealing with cluster submission"""
1239
1240
1241 name = 'sge'
1242 job_id = 'JOB_ID'
1243 idle_tag = ['qw', 'hqw','hRqw','w']
1244 running_tag = ['r','t','Rr','Rt']
1245 identifier_length = 10
1246
1248 """replace string for path issues"""
1249 location = os.path.realpath(location)
1250 homePath = os.getenv("HOME")
1251 if homePath:
1252 location = location.replace(homePath,'$HOME')
1253 return location
1254
1255 @multiple_try()
1256 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1257 required_output=[], nb_submit=0):
1258 """Submit a job prog to an SGE cluster"""
1259
1260 me_dir = self.get_jobs_identifier(cwd, prog)
1261
1262
1263 if cwd is None:
1264
1265 cwd = self.def_get_path(os.getcwd())
1266 cwd1 = self.def_get_path(cwd)
1267 text = " cd %s;" % cwd1
1268 if stdout is None:
1269 stdout = '/dev/null'
1270 else:
1271 stdout = self.def_get_path(stdout)
1272 if stderr is None:
1273 stderr = '/dev/null'
1274 elif stderr == -2:
1275 stderr = stdout
1276 else:
1277 stderr = self.def_get_path(stderr)
1278
1279 if log is None:
1280 log = '/dev/null'
1281 else:
1282 log = self.def_get_path(log)
1283
1284 text += prog
1285 if argument:
1286 text += ' ' + ' '.join(argument)
1287
1288
1289
1290
1291 homePath = os.getenv("HOME")
1292 if homePath:
1293 text = text.replace(homePath,'$HOME')
1294
1295 logger.debug("!=== input %s" % text)
1296 logger.debug("!=== output %s" % stdout)
1297 logger.debug("!=== error %s" % stderr)
1298 logger.debug("!=== logs %s" % log)
1299
1300 command = ['qsub','-o', stdout,
1301 '-N', me_dir,
1302 '-e', stderr,
1303 '-V']
1304
1305 if self.cluster_queue and self.cluster_queue != 'None':
1306 command.extend(['-q', self.cluster_queue])
1307
1308 a = misc.Popen(command, stdout=subprocess.PIPE,
1309 stderr=subprocess.STDOUT,
1310 stdin=subprocess.PIPE, cwd=cwd)
1311
1312 output = a.communicate(text.encode())[0].decode(errors='ignore')
1313 id = output.split(' ')[2]
1314 if not id.isdigit():
1315 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \
1316 % output)
1317 self.submitted += 1
1318 self.submitted_ids.append(id)
1319 logger.debug(output)
1320
1321 return id
1322
1323 @multiple_try()
1325 """ control the status of a single job with it's cluster id """
1326
1327 cmd = 'qstat '
1328 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1329 for line in status.stdout:
1330 line = line.decode(errors='ignore')
1331
1332
1333
1334
1335
1336
1337 if str(id) in line:
1338 status = line.split()[4]
1339
1340 if status in self.idle_tag:
1341 return 'I'
1342 elif status in self.running_tag:
1343 return 'R'
1344 return 'F'
1345
1346 @multiple_try()
1348 """ control the status of a single job with it's cluster id """
1349 cmd = "qstat "
1350 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1351
1352 me_dir = self.get_jobs_identifier(me_dir)
1353
1354 finished = list(self.submitted_ids)
1355
1356 idle, run, fail = 0, 0, 0
1357 for line in status.stdout:
1358 line = line.decode(errors='ignore')
1359 if me_dir in line:
1360 id,_,_,_,status = line.split()[:5]
1361 if status in self.idle_tag:
1362 idle += 1
1363 finished.remove(id)
1364 elif status in self.running_tag:
1365 run += 1
1366 finished.remove(id)
1367 else:
1368 logger.debug(line)
1369 fail += 1
1370 finished.remove(id)
1371
1372 for id in finished:
1373 self.check_termination(id)
1374
1375 return idle, run, self.submitted - (idle+run+fail), fail
1376
1377
1378
1379 @multiple_try()
1380 - def remove(self, *args, **opts):
1381 """Clean the jobs on the cluster"""
1382
1383 if not self.submitted_ids:
1384 return
1385 cmd = "qdel %s" % ' '.join(self.submitted_ids)
1386 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1387 self.submitted_ids = []
1388
1391 """Basic class for dealing with cluster submission"""
1392
1393 name = 'lsf'
1394 job_id = 'LSB_JOBID'
1395
1396 @multiple_try()
1397 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1398 required_output=[], nb_submit=0):
1399 """Submit the job prog to an LSF cluster"""
1400
1401
1402 me_dir = self.get_jobs_identifier(cwd, prog)
1403
1404 text = ""
1405 command = ['bsub', '-C0', '-J', me_dir]
1406 if cwd is None:
1407 cwd = os.getcwd()
1408 else:
1409 text = " cd %s;" % cwd
1410 if stdout and isinstance(stdout, str):
1411 command.extend(['-o', stdout])
1412 if stderr and isinstance(stdout, str):
1413 command.extend(['-e', stderr])
1414 elif stderr == -2:
1415 pass
1416 if log is None:
1417 log = '/dev/null'
1418
1419 text += prog
1420 if argument:
1421 text += ' ' + ' '.join(argument)
1422
1423 if self.cluster_queue and self.cluster_queue != 'None':
1424 command.extend(['-q', self.cluster_queue])
1425
1426 a = misc.Popen(command, stdout=subprocess.PIPE,
1427 stderr=subprocess.STDOUT,
1428 stdin=subprocess.PIPE, cwd=cwd)
1429
1430 output = a.communicate(text.encode())[0].decode(errors='ignore')
1431
1432 try:
1433 id = output.split('>',1)[0].split('<')[1]
1434 except:
1435 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \
1436 % output)
1437 if not id.isdigit():
1438 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \
1439 % output)
1440 self.submitted += 1
1441 self.submitted_ids.append(id)
1442 return id
1443
1444
1445 @multiple_try()
1447 """ control the status of a single job with it's cluster id """
1448
1449 cmd = 'bjobs '+str(id)
1450 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1451
1452 for line in status.stdout:
1453 line = line.decode(errors='ignore').strip().upper()
1454 if 'JOBID' in line:
1455 continue
1456 elif str(id) not in line:
1457 continue
1458 status = line.split()[2]
1459 if status == 'RUN':
1460 return 'R'
1461 elif status == 'PEND':
1462 return 'I'
1463 elif status == 'DONE':
1464 return 'F'
1465 else:
1466 return 'H'
1467 return 'F'
1468
1469 @multiple_try()
1471 """ control the status of a single job with it's cluster id """
1472
1473 if not self.submitted_ids:
1474 return 0, 0, 0, 0
1475
1476 cmd = "bjobs " + ' '.join(self.submitted_ids)
1477 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1478
1479 jobstatus = {}
1480 for line in status.stdout:
1481 line = line.decode(errors='ignore').strip()
1482 if 'JOBID' in line:
1483 continue
1484 splitline = line.split()
1485 id = splitline[0]
1486 if id not in self.submitted_ids:
1487 continue
1488 jobstatus[id] = splitline[2]
1489
1490 idle, run, fail = 0, 0, 0
1491 for id in self.submitted_ids[:]:
1492 if id in jobstatus:
1493 status = jobstatus[id]
1494 else:
1495 status = 'MISSING'
1496 if status == 'RUN':
1497 run += 1
1498 elif status == 'PEND':
1499 idle += 1
1500 else:
1501 status = self.check_termination(id)
1502 if status == 'wait':
1503 run += 1
1504 elif status == 'resubmit':
1505 idle += 1
1506
1507 return idle, run, self.submitted - (idle+run+fail), fail
1508
1509 @multiple_try()
1510 - def remove(self, *args,**opts):
1511 """Clean the jobs on the cluster"""
1512
1513 if not self.submitted_ids:
1514 return
1515 cmd = "bkill %s" % ' '.join(self.submitted_ids)
1516 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1517 self.submitted_ids = []
1518
1520 """Class for dealing with cluster submission on a GE cluster"""
1521
1522 name = 'ge'
1523 job_id = 'JOB_ID'
1524 idle_tag = ['qw']
1525 running_tag = ['r']
1526
1527 @multiple_try()
1528 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1529 required_output=[], nb_submit=0):
1530 """Submit a job prog to a GE cluster"""
1531
1532 text = ""
1533 if cwd is None:
1534 cwd = os.getcwd()
1535 else:
1536 text = " cd %s; bash " % cwd
1537 if stdout is None:
1538 stdout = os.path.join(cwd, "log.%s" % prog.split('/')[-1])
1539 if stderr is None:
1540 stderr = os.path.join(cwd, "err.%s" % prog.split('/')[-1])
1541 elif stderr == -2:
1542 stderr = stdout
1543 if log is None:
1544 log = '/dev/null'
1545
1546 text += prog
1547 if argument:
1548 text += ' ' + ' '.join(argument)
1549 text += '\n'
1550 tmp_submit = os.path.join(cwd, 'tmp_submit')
1551 open(tmp_submit,'w').write(text)
1552
1553 a = misc.Popen(['qsub','-o', stdout,
1554 '-e', stderr,
1555 tmp_submit],
1556 stdout=subprocess.PIPE,
1557 stderr=subprocess.STDOUT,
1558 stdin=subprocess.PIPE, cwd=cwd)
1559
1560 output = a.communicate()[0].decode(errors='ignore')
1561
1562 pat = re.compile("Your job (\d*) \(",re.MULTILINE)
1563 try:
1564 id = pat.search(output).groups()[0]
1565 except:
1566 raise ClusterManagmentError('fail to submit to the cluster: \n%s' \
1567 % output)
1568 self.submitted += 1
1569 self.submitted_ids.append(id)
1570 return id
1571
1572 @multiple_try()
1574 """ control the status of a single job with it's cluster id """
1575 cmd = 'qstat | grep '+str(id)
1576 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1577 if not status:
1578 return 'F'
1579
1580 pat = re.compile("^(\d+)\s+[\d\.]+\s+[\w\d\.]+\s+[\w\d\.]+\s+(\w+)\s")
1581 stat = ''
1582 for line in status.stdout.read().decode(errors='ignore').split('\n'):
1583 if not line:
1584 continue
1585 line = line.strip()
1586 try:
1587 groups = pat.search(line).groups()
1588 except:
1589 raise ClusterManagmentError('bad syntax for stat: \n\"%s\"' % line)
1590 if groups[0] != id: continue
1591 stat = groups[1]
1592 if not stat:
1593 return 'F'
1594 if stat in self.idle_tag:
1595 return 'I'
1596 if stat in self.running_tag:
1597 return 'R'
1598
1599 @multiple_try()
1601 """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)"""
1602 if not self.submitted_ids:
1603 return 0, 0, 0, 0
1604 idle, run, fail = 0, 0, 0
1605 ongoing = []
1606 for statusflag in ['p', 'r', 'sh']:
1607 cmd = 'qstat -s %s' % statusflag
1608 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1609
1610 pat = re.compile("^(\d+)")
1611 for line in status.stdout.read().decode(errors='ignore').split('\n'):
1612 line = line.strip()
1613 try:
1614 id = pat.search(line).groups()[0]
1615 except Exception:
1616 pass
1617 else:
1618 if id not in self.submitted_ids:
1619 continue
1620 ongoing.append(id)
1621 if statusflag == 'p':
1622 idle += 1
1623 if statusflag == 'r':
1624 run += 1
1625 if statusflag == 'sh':
1626 fail += 1
1627 for id in list(self.submitted_ids):
1628 if id not in ongoing:
1629 self.check_termination(id)
1630
1631
1632 return idle, run, self.submitted - idle - run - fail, fail
1633
1634 @multiple_try()
1635 - def remove(self, *args, **opts):
1636 """Clean the jobs on the cluster"""
1637
1638 if not self.submitted_ids:
1639 return
1640 cmd = "qdel %s" % ' '.join(self.submitted_ids)
1641 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1642 self.submitted_ids = []
1643
1645 """start a computation and not wait for it to finish.
1646 this fonction returns a lock which is locked as long as the job is
1647 running."""
1648
1649 mc = MultiCore(1)
1650 mc.submit(exe, argument, cwd, stdout, **opt)
1651 mc.need_waiting = True
1652 return mc.lock
1653
1656 """Basic class for dealing with cluster submission"""
1657
1658 name = 'slurm'
1659 job_id = 'SLURM_JOBID'
1660 idle_tag = ['Q','PD','S','CF']
1661 running_tag = ['R', 'CG']
1662 complete_tag = ['C']
1663 identifier_length = 8
1664
1665 @multiple_try()
1666 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1667 required_output=[], nb_submit=0):
1668 """Submit a job prog to a SLURM cluster"""
1669
1670 me_dir = self.get_jobs_identifier(cwd, prog)
1671
1672
1673 if cwd is None:
1674 cwd = os.getcwd()
1675 if stdout is None:
1676 stdout = '/dev/null'
1677 if stderr is None:
1678 stderr = '/dev/null'
1679 elif stderr == -2:
1680 stderr = stdout
1681 if log is None:
1682 log = '/dev/null'
1683
1684 command = ['sbatch', '-o', stdout,
1685 '-J', me_dir,
1686 '-e', stderr, prog] + argument
1687
1688 if self.cluster_queue and self.cluster_queue != 'None':
1689 command.insert(1, '-p')
1690 command.insert(2, self.cluster_queue)
1691
1692 a = misc.Popen(command, stdout=subprocess.PIPE,
1693 stderr=subprocess.STDOUT,
1694 stdin=subprocess.PIPE, cwd=cwd)
1695
1696 output = a.communicate()
1697 output_arr = output[0].decode(errors='ignore').split(' ')
1698 id = output_arr[3].rstrip()
1699
1700 if not id.isdigit():
1701 id = re.findall('Submitted batch job ([\d\.]+)', ' '.join(output_arr))
1702
1703 if not id or len(id)>1:
1704 raise ClusterManagmentError( 'fail to submit to the cluster: \n%s' \
1705 % ('stdout: %s\nstderr %s' %(output[0],output[1])))
1706 id = id[0]
1707
1708
1709 self.submitted += 1
1710 self.submitted_ids.append(id)
1711 return id
1712
1713 @multiple_try()
1715 """ control the status of a single job with it's cluster id """
1716 cmd = 'squeue j'+str(id)
1717 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
1718 stderr=open(os.devnull,'w'))
1719
1720 for line in status.stdout:
1721 line = line.decode(errors='ignore').strip()
1722 if 'Invalid' in line:
1723 return 'F'
1724 elif line.startswith(str(id)):
1725 status = line.split()[4]
1726 if status in self.idle_tag:
1727 return 'I'
1728 elif status in self.running_tag:
1729 return 'R'
1730 return 'F'
1731
1732 @multiple_try()
1734 """ control the status of a single job with it's cluster id """
1735 cmd = "squeue"
1736 pstatus = misc.Popen([cmd], stdout=subprocess.PIPE)
1737
1738 me_dir = self.get_jobs_identifier(me_dir)
1739
1740 idle, run, fail = 0, 0, 0
1741 ongoing=[]
1742 for line in pstatus.stdout:
1743 line = line.decode(errors='ignore')
1744 if me_dir in line:
1745 id, _, _,_ , status,_ = line.split(None,5)
1746 ongoing.append(id)
1747 if status in self.idle_tag:
1748 idle += 1
1749 elif status in self.running_tag:
1750 run += 1
1751 elif status in self.complete_tag:
1752 status = self.check_termination(id)
1753 if status == 'wait':
1754 run += 1
1755 elif status == 'resubmit':
1756 idle += 1
1757 else:
1758 fail += 1
1759
1760
1761 for id in list(self.submitted_ids):
1762 if id not in ongoing:
1763 status = self.check_termination(id)
1764 if status == 'wait':
1765 run += 1
1766 elif status == 'resubmit':
1767 idle += 1
1768
1769
1770 return idle, run, self.submitted - (idle+run+fail), fail
1771
1772 @multiple_try()
1773 - def remove(self, *args, **opts):
1774 """Clean the jobs on the cluster"""
1775
1776 if not self.submitted_ids:
1777 return
1778 cmd = "scancel %s" % ' '.join(self.submitted_ids)
1779 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1780 self.submitted_ids = []
1781
1783 """Class for dealing with cluster submission on a HTCaaS cluster using GPFS """
1784
1785 name= 'htcaas'
1786 job_id = 'HTCAAS_JOBID'
1787 idle_tag = ['waiting']
1788 running_tag = ['preparing','running']
1789 complete_tag = ['done']
1790
1791 @store_input()
1792 @multiple_try()
1793 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
1794 log=None, input_files=[], output_files=[], required_output=[],
1795 nb_submit=0):
1796 """Submit the HTCaaS job on the cluster with NO SHARE DISK
1797 input/output file should be given as relative to CWd
1798 """
1799
1800 cur_usr = os.getenv('USER')
1801
1802 if cwd is None:
1803 cwd = os.getcwd()
1804
1805 cwd_cp = cwd.rsplit("/",2)
1806
1807 if not stdout is None:
1808 print("stdout: %s" % stdout)
1809
1810 if not os.path.exists(prog):
1811 prog = os.path.join(cwd, prog)
1812
1813 if not required_output and output_files:
1814 required_output = output_files
1815
1816 logger.debug(prog)
1817 if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog :
1818 cwd_arg = cwd+"/arguments"
1819 temp = ' '.join([str(a) for a in argument])
1820 arg_cmd="echo '"+temp+"' > " + cwd_arg
1821 command = ['htcaas-mgjob-submit','-d',cwd,'-e',os.path.basename(prog)]
1822 if argument :
1823 command.extend(['-a ', '='.join([str(a) for a in argument])])
1824 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1825 id = a.stdout.read().strip()
1826
1827 else:
1828 cwd_arg = cwd+"/arguments"
1829 temp = ' '.join([str(a) for a in argument])
1830 temp_file_name = "sub." + os.path.basename(prog)
1831 text = """#!/bin/bash
1832 MYPWD=%(cwd)s
1833 cd $MYPWD
1834 input_files=(%(input_files)s )
1835 for i in ${input_files[@]}
1836 do
1837 chmod -f +x $i
1838 done
1839 /bin/bash %(prog)s %(arguments)s > %(stdout)s
1840 """
1841 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog,
1842 'arguments': ' '.join([str(a) for a in argument]),
1843 'program': ' ' if '.py' in prog else 'bash'}
1844
1845
1846 new_prog = pjoin(cwd, temp_file_name)
1847 open(new_prog, 'w').write(text % dico)
1848 misc.Popen(['chmod','+x',new_prog],cwd=cwd)
1849 command = ['htcaas-mgjob-submit','-d',cwd,'-e',temp_file_name]
1850 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1851 id = a.stdout.read().strip()
1852 logger.debug(id)
1853
1854 nb_try=0
1855 nb_limit=5
1856 if not id.isdigit() :
1857 print("[ID is not digit]:" + id)
1858
1859 while not id.isdigit() :
1860 nb_try+=1
1861 print("[fail_retry]:"+ nb_try)
1862 a=misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1863 id = a.stdout.read().strip()
1864 if nb_try > nb_limit :
1865 raise ClusterManagementError('fail to submit to the HTCaaS cluster: \n %s' % id)
1866 break
1867
1868 self.submitted += 1
1869 self.submitted_ids.append(id)
1870
1871 return id
1872
1873 @multiple_try(nb_try=10, sleep=5)
1875 """ control the status of a single job with it's cluster id """
1876
1877 if id == 0 :
1878 status_out ='C'
1879 else :
1880 cmd = 'htcaas-job-status -m '+str(id)+ " -s | grep Status "
1881 status = misc.Popen([cmd], shell=True,stdout=subprocess.PIPE,
1882 stderr=subprocess.PIPE)
1883 error = status.stderr.read().decode(errors='ignore')
1884 if status.returncode or error:
1885 raise ClusterManagmentError('htcaas-job-submit returns error: %s' % error)
1886 status_out= status.stdout.read().decode(errors='ignore').strip()
1887 status_out= status_out.split(":",1)[1]
1888 if status_out == 'waiting':
1889 status_out='I'
1890 elif status_out == 'preparing' or status_out == 'running':
1891 status_out = 'R'
1892 elif status_out != 'done':
1893 status_out = 'F'
1894 elif status_out == 'done':
1895 status_out = 'C'
1896
1897 return status_out
1898
1899 @multiple_try()
1901 """ control the status of a single job with it's cluster id """
1902 if not self.submitted_ids:
1903 logger.debug("self.submitted_ids not exists")
1904 return 0, 0, 0, 0
1905
1906 ongoing = []
1907 idle, run, fail = 0, 0, 0
1908
1909 start = self.submitted_ids[0]
1910 end = self.submitted_ids[-1]
1911
1912 cmd = "htcaas-job-status -c "+str(start)+"-"+str(end)
1913 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1914
1915 for line in status.stdout:
1916
1917 status2 = line.decode(errors='ignore').split()[-1]
1918 if status2 != 'null' or line.split()[0].strip() != '0':
1919 ongoing.append(line.split()[0].strip())
1920 logger.debug("["+line.split()[0].strip()+"]"+status2)
1921 if status2 != 'null' or line.split()[0].strip() != '0':
1922 idle += 1
1923 elif status2 in self.idle_tag:
1924 idle += 1
1925 elif status2 in self.running_tag:
1926 run += 1
1927 elif status2 in self.complete_tag:
1928 if not self.check_termination(line.split()[0]):
1929 idle +=1
1930 else:
1931 fail += 1
1932
1933 return idle, run, self.submitted - (idle+run+fail), fail
1934
1935 @multiple_try()
1936 - def remove(self, *args, **opts):
1937 """Clean the jobson the cluster"""
1938
1939 if not self.submitted_ids:
1940 return
1941 for i in range(len(self.submitted_ids)):
1942 cmd = "htcaas-job-cancel -m %s" % self.submitted_ids[i]
1943 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1944
1946 """Class for dealing with cluster submission on a HTCaaS cluster without GPFS """
1947
1948 name= 'htcaas2'
1949 job_id = 'HTCAAS2_JOBID'
1950 idle_tag = ['waiting']
1951 running_tag = ['preparing','running']
1952 complete_tag = ['done']
1953
1954 @store_input()
1955 @multiple_try()
1956 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
1957 log=None, input_files=[], output_files=[], required_output=[],
1958 nb_submit=0):
1959
1960 """Submit the HTCaaS job on the cluster with NO SHARE DISK
1961 input/output file should be given as relative to CWD
1962 """
1963 if cwd is None:
1964 cwd = os.getcwd()
1965
1966 if not os.path.exists(prog):
1967 prog = os.path.join(cwd, prog)
1968
1969 if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog :
1970 if cwd or prog :
1971 self.submitted_dirs.append(cwd)
1972 self.submitted_exes.append(prog)
1973 else:
1974 logger.debug("cwd and prog not exist->"+cwd+" / "+ os.path.basename(prog))
1975
1976 if argument :
1977 self.submitted_args.append('='.join([str(a) for a in argument]))
1978
1979 if cwd or prog :
1980 self.submitted += 1
1981 id = self.submitted
1982 self.submitted_ids.append(id)
1983 else:
1984 logger.debug("cwd and prog are not exist! ")
1985 id = 0
1986
1987 else:
1988 temp_file_name = "sub."+ os.path.basename(prog)
1989 text = """#!/bin/bash
1990 MYPWD=%(cwd)s
1991 cd $MYPWD
1992 input_files=(%(input_files)s )
1993 for i in ${input_files[@]}
1994 do
1995 chmod -f +x $i
1996 done
1997 /bin/bash %(prog)s %(arguments)s > %(stdout)s
1998 """
1999 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog,
2000 'arguments': ' '.join([str(a) for a in argument]),
2001 'program': ' ' if '.py' in prog else 'bash'}
2002
2003 new_prog = pjoin(cwd, temp_file_name)
2004 open(new_prog, 'w').write(text % dico)
2005 misc.Popen(['chmod','+x',new_prog],cwd=cwd)
2006 command = ['htcaas-mgjob-submit','-d',cwd,'-e',new_prog]
2007 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
2008 id = a.stdout.read().strip()
2009 logger.debug("[mode2]-["+str(id)+"]")
2010 if cwd and prog :
2011 self.submitted += 1
2012 self.submitted_ids.append(id)
2013 else:
2014 logger.debug("cwd and prog are not exist! ")
2015 id = 0
2016
2017 return id
2018
2019 @multiple_try()
2063
2064
2065 @multiple_try(nb_try=10, sleep=5)
2067 """ control the status of a single job with it's cluster id """
2068
2069 if self.submitted == self.submitted_ids[-1] :
2070 id = self.metasubmit(self)
2071 tempid = self.submitted_ids[-1]
2072 self.submitted_ids.remove(self.submitted_ids[-1])
2073 self.submitted_ids.append(id)
2074 logger.debug(str(id)+" // "+str(self.submitted_ids[-1]))
2075
2076 if id == 0 :
2077 status_out ='C'
2078 else:
2079 cmd = 'htcaas-job-status -m '+ str(id) + " -s | grep Status "
2080 status = misc.Popen([cmd],shell=True,stdout=subprocess.PIPE,
2081 stderr=subprocess.PIPE)
2082 error = status.stderr.read().decode(errors='ignore')
2083 if status.returncode or error:
2084 raise ClusterManagmentError('htcaas-job-status returns error: %s' % error)
2085 status_out= status.stdout.read().decode(errors='ignore').strip()
2086 status_out= status_out.split(":",1)[1]
2087 logger.debug("[["+str(id)+"]]"+status_out)
2088 if status_out == 'waiting':
2089 status_out='I'
2090 elif status_out == 'preparing' or status_out == 'running':
2091 status_out = 'R'
2092 elif status_out != 'done':
2093 status_out = 'F'
2094 elif status_out == 'done':
2095 status_out = 'C'
2096 self.submitted -= 1
2097
2098 return status_out
2099
2100 @multiple_try()
2102 """ control the status of a single job with it's cluster id """
2103 if not self.submitted_ids:
2104 logger.debug("self.submitted_ids not exists")
2105 return 0, 0, 0, 0
2106
2107 if "//" in me_dir :
2108 if int(me_dir.split("//")[0]) < int(me_dir.split("//")[1]) :
2109 start = me_dir.split("//")[0]
2110 end = me_dir.split("//")[1]
2111 else :
2112 start = me_dir.split("//")[1]
2113 end = me_dir.split("//")[0]
2114 elif "/" in me_dir :
2115 start = 0
2116 end = 0
2117 elif me_dir.isdigit():
2118 start = me_dir
2119 end = me_dir
2120 elif not me_dir.isdigit():
2121 me_dir = self.submitted_ids[0]
2122 logger.debug("Meta_ID is not digit(control), self.submitted_ids[0]: "+str(me_dir) )
2123
2124 ongoing = []
2125 idle, run, fail, done = 0, 0, 0, 0
2126
2127 cmd = "htcaas-job-status -c "+str(start)+"-"+str(end) +" -ac"
2128 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
2129
2130 for line in status.stdout:
2131 line = line.decode(errors='ignore')
2132 status2 = line.split()[-1]
2133 if status2 != 'null' or line.split()[0].strip() != '0':
2134 ongoing.append(str(line.split()[0].strip())+"-"+str(line.split()[1].strip()))
2135 logger.debug("["+line.split()[0].strip()+"-"+line.split()[1].strip()+"]"+status2)
2136
2137 if status2 == 'null' or line.split()[0].strip() == '0':
2138 idle += 1
2139 elif status2 in self.idle_tag:
2140 idle += 1
2141 elif status2 in self.running_tag:
2142 run += 1
2143 elif status2 in self.complete_tag:
2144 done += 1
2145 self.submitted -= 1
2146 if not self.check_termination(line.split()[1]):
2147 idle +=1
2148 else:
2149 fail += 1
2150
2151 return idle, run, self.submitted - (idle+run+fail), fail
2152
2153 @multiple_try()
2154 - def remove(self, *args, **opts):
2155 """Clean the jobson the cluster"""
2156
2157 if not self.submitted_ids:
2158 return
2159 id = self.submitted_ids[0]
2160 if id:
2161 cmd = "htcaas-job-cancel -m %s" % str(id)
2162 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
2163
# Map the value of the 'cluster_type' run option to the cluster class
# implementing it (all classes are defined in this module).
from_name = {'condor':CondorCluster, 'pbs': PBSCluster, 'sge': SGECluster,
             'lsf': LSFCluster, 'ge':GECluster, 'slurm': SLURMCluster,
             'htcaas':HTCaaSCluster, 'htcaas2':HTCaaS2Cluster}

# Module-level single-core MultiCore instance.
# NOTE(review): presumably used to run jobs locally one at a time (see
# asyncrone_launch, which builds a MultiCore(1) the same way) — confirm.
onecore=MultiCore(1)
2169
2170