import subprocess
import logging
import os
import time
import re
import glob
import inspect
import sys

logger = logging.getLogger('madgraph.cluster')

try:
    from madgraph import MadGraph5Error
    import madgraph.various.misc as misc
except Exception, error:
    if __debug__:
        print str(error)
    from internal import MadGraph5Error
    import internal.misc as misc

pjoin = os.path.join


class ClusterManagmentError(MadGraph5Error):
    pass


class NotImplemented(MadGraph5Error):
    pass


multiple_try = misc.multiple_try

def check_interupt(error=KeyboardInterrupt):
    """Decorator which removes the submitted jobs if the wrapped call is
    interrupted (by default by a KeyboardInterrupt)."""
    def deco_interupt(f):
        def deco_f_interupt(self, *args, **opt):
            try:
                return f(self, *args, **opt)
            except error:
                try:
                    self.remove(*args, **opt)
                except Exception:
                    pass
                raise error
        return deco_f_interupt
    return deco_interupt

61
74 return deco_f_store
75 return deco_store
76
78 """ This function checks whether compression of input files are necessary
79 given the running options given. """
80
81 if options['run_mode'] != 1 and options['cluster_temp_path'] is None:
82 return False
83 else:
84 return True
85
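# Illustration (hypothetical option values): with {'run_mode': 1, 'cluster_temp_path': '/scratch'}
# need_transfer returns True and submit2 wraps the job in a script that stages the
# input/output files; with {'run_mode': 2, 'cluster_temp_path': None} it returns False.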
87 """Basic Class for all cluster type submission"""
88 name = 'mother class'
89 identifier_length = 14
90
92 """Init the cluster"""
93
94 self.submitted = 0
95 self.submitted_ids = []
96 self.finish = 0
97 self.submitted_dirs = []
98 self.submitted_exes = []
99 self.submitted_args = []
100
101 if 'cluster_queue' in opts:
102 self.cluster_queue = opts['cluster_queue']
103 else:
104 self.cluster_queue = 'madgraph'
105 if 'cluster_temp_path' in opts:
106 self.temp_dir = opts['cluster_temp_path']
107 else:
108 self.temp_dir = None
109 self.options = {'cluster_status_update': (600, 30)}
110 for key,value in opts.items():
111 self.options[key] = value
112 self.nb_retry = opts['cluster_nb_retry'] if 'cluster_nb_retry' in opts else 0
113 self.cluster_retry_wait = float(opts['cluster_retry_wait']) if 'cluster_retry_wait' in opts else 300
114 self.options = dict(opts)
115 self.retry_args = {}
116
117 self.packet = {}
118 self.id_to_packet = {}
119
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
               log=None, required_output=[], nb_submit=0):
        """How to make one submission. Return status id on the cluster."""
        raise NotImplemented, 'No implementation of how to submit a job to cluster \'%s\'' % self.name

    @store_input()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                log=None, input_files=[], output_files=[], required_output=[],
                nb_submit=0):
        """How to make one submission. Return status id on the cluster.
        NO SHARED DISK"""

        if cwd is None:
            cwd = os.getcwd()
        if not os.path.exists(prog):
            prog = os.path.join(cwd, prog)

        if not required_output and output_files:
            required_output = output_files

        # without a scratch directory, or without any file to transfer,
        # fall back on the plain submit
        if not hasattr(self, 'temp_dir') or not self.temp_dir or \
           (input_files == [] == output_files):
            return self.submit(prog, argument, cwd, stdout, stderr, log,
                               required_output=required_output, nb_submit=nb_submit)

        temp_file_name = "sub." + os.path.basename(prog) + '.'.join(argument)

        text = """#!/bin/bash
MYTMP=%(tmpdir)s/run$%(job_id)s
MYPWD=%(cwd)s
mkdir -p $MYTMP
cd $MYPWD
input_files=( %(input_files)s )
for i in ${input_files[@]}
do
    cp -R -L $i $MYTMP
done
cd $MYTMP
echo '%(arguments)s' > arguments
chmod +x ./%(script)s
%(program)s ./%(script)s %(arguments)s
exit=$?
output_files=( %(output_files)s )
for i in ${output_files[@]}
do
    cp -r $MYTMP/$i $MYPWD
done
# if [ "$exit" -eq "0" ]
# then
rm -rf $MYTMP
# fi
"""

        dico = {'tmpdir': self.temp_dir, 'script': os.path.basename(prog),
                'cwd': cwd, 'job_id': self.job_id,
                'input_files': ' '.join(input_files + [prog]),
                'output_files': ' '.join(output_files),
                'arguments': ' '.join([str(a) for a in argument]),
                'program': ' ' if '.py' in prog else 'bash'}

        # write the wrapper script and make it executable
        new_prog = pjoin(cwd, temp_file_name)
        open(new_prog, 'w').write(text % dico)
        misc.Popen(['chmod', '+x', new_prog], cwd=cwd)

        return self.submit(new_prog, argument, cwd, stdout, stderr, log,
                           required_output=required_output, nb_submit=nb_submit)

    def cluster_submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                       log=None, input_files=[], output_files=[], required_output=[],
                       nb_submit=0, packet_member=None):
        """Wrap the cluster submission in a cluster-independent way.
        Should not be overwritten (except for DAG-type submission)."""

        id = self.submit2(prog, argument, cwd, stdout, stderr, log, input_files,
                          output_files, required_output, nb_submit)

        if not packet_member:
            return id
        else:
            if isinstance(packet_member, Packet):
                self.id_to_packet[id] = packet_member
                packet_member.put(id)
                if packet_member.tag not in self.packet:
                    self.packet[packet_member.tag] = packet_member
            else:
                # packet_member is a tag pointing to an already registered Packet
                if packet_member in self.packet:
                    packet = self.packet[packet_member]
                    packet.put(id)
                    self.id_to_packet[id] = packet
            return id
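    # Sketch (hypothetical names): grouping jobs in a Packet so that a function,
    # e.g. combine_results, runs exactly once after the last job of the group ends:
    #
    #     packet = Packet('channel_G1', combine_results, (me_dir,))
    #     for job in jobs:
    #         cluster.cluster_submit(job.prog, job.args, cwd=job.dir,
    #                                packet_member=packet)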
223
225 """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)"""
226 if not self.submitted_ids:
227 raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name
228 idle, run, fail = 0, 0, 0
229 for pid in self.submitted_ids[:]:
230 status = self.control_one_job(id)
231 if status == 'I':
232 idle += 1
233 elif status == 'R':
234 run += 1
235 elif status == 'F':
236 self.finish +=1
237 self.submitted_ids.remove(pid)
238 else:
239 fail += 1
240
241 return idle, run, self.finish, fail
242
244 """ control the status of a single job with it's cluster id """
245 raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name
246
248 """get a unique run_name for all the jobs helps to identify the runs
249 in the controller for some cluster."""
250
251 if second_path:
252 path = os.path.realpath(pjoin(path, second_path))
253 elif not os.path.exists(path):
254 return path
255
256 if 'SubProcesses' in path:
257 target = path.rsplit('/SubProcesses',1)[0]
258 elif 'MCatNLO' in path:
259 target = path.rsplit('/MCatNLO',1)[0]
260 elif 'PY8_parallelization' in path:
261 target = path.rsplit('/PY8_parallelization',1)[0]
262 elif second_path:
263 target=path
264 logger.warning("cluster.get_job_identifier runs unexpectedly. This should be fine but report this message if you have problem.")
265 else:
266 target = path
267
268 if target.endswith('/'):
269 target = target[:-1]
270
271 target = misc.digest(target)[-self.identifier_length:]
272 if not target[0].isalpha():
273 target = 'a' + target[1:]
274
275 return target
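    # For example, for a path like .../PROC_sm_0/SubProcesses/P1_qq_ll (path details
    # hypothetical), everything from /SubProcesses on is dropped, the remaining path
    # is hashed with misc.digest, and the last 14 characters (prefixed with a letter
    # if needed) become the job name passed to the scheduler (qsub/sbatch -N/-J).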
276
277
    @check_interupt()
    def wait(self, me_dir, fct, minimal_job=0, update_first=None):
        """Wait for all jobs to finish.
        If minimal_job is set, return as soon as idle + run drops below that number."""

        mode = 1       # 1 = short waiting time between checks, 0 = long waiting time
        nb_iter = 0
        nb_short = 0
        change_at = 5  # number of iterations before switching to the long waiting time

        if update_first:
            idle, run, finish, fail = self.control(me_dir)
            update_first(idle, run, finish)

        longtime, shorttime = self.options['cluster_status_update']

        nb_job = 0

        if self.options['cluster_type'] == 'htcaas2':
            me_dir = self.metasubmit(self)

        while 1:
            old_mode = mode
            nb_iter += 1
            idle, run, finish, fail = self.control(me_dir)
            if nb_job:
                if idle + run + finish + fail != nb_job:
                    nb_job = idle + run + finish + fail
                    nb_iter = 1  # something happened, reset the fast-checking counter
            else:
                nb_job = idle + run + finish + fail
            if fail:
                raise ClusterManagmentError('Some jobs are in a Hold/... state. Please try to investigate or contact the IT team')
            if idle + run == 0:
                logger.info('All jobs finished')
                fct(idle, run, finish)
                break
            if idle + run < minimal_job:
                return
            fct(idle, run, finish)

            # decide whether to poll with the short or the long waiting time
            if nb_iter < change_at:
                mode = 1
            elif idle < run:
                if old_mode == 0:
                    if nb_short:
                        mode = 0
                    elif idle:
                        if nb_iter > change_at + int(longtime) // shorttime:
                            mode = 0
                        else:
                            mode = 1
                            nb_short = 0
                    else:
                        mode = 1
                        nb_short = 0
                elif old_mode == 1:
                    nb_short += 1
                    if nb_short > 3 * max(change_at, int(longtime) // shorttime):
                        mode = 0
            else:
                mode = 0

            if old_mode > mode:
                logger.info('''Start to wait %ss between checking status.
Note that you can change this time in the configuration file.
Press ctrl-C to force the update.''' % self.options['cluster_status_update'][0])

            if mode == 0:
                try:
                    time.sleep(self.options['cluster_status_update'][0])
                except KeyboardInterrupt:
                    logger.info('start to update the status')
                    nb_iter = min(0, change_at - 2)
                    nb_short = 0
            else:
                time.sleep(self.options['cluster_status_update'][1])

        self.submitted = 0
        self.submitted_ids = []
365
367 """Check the termination of the jobs with job_id and relaunch it if needed."""
368
369
370 if job_id not in self.retry_args:
371 if job_id in self.id_to_packet:
372 nb_in_packet = self.id_to_packet[job_id].remove_one()
373 if nb_in_packet == 0:
374
375 packet = self.id_to_packet[job_id]
376
377 packet.queue.join()
378
379 packet.fct(*packet.args)
380 del self.id_to_packet[job_id]
381 return 'resubmit'
382 else:
383 return True
384
385 args = self.retry_args[job_id]
386 if 'time_check' in args:
387 time_check = args['time_check']
388 else:
389 time_check = 0
390
391 for path in args['required_output']:
392 if args['cwd']:
393 path = pjoin(args['cwd'], path)
394
395 if not (os.path.exists(path) and os.stat(path).st_size != 0) :
396 break
397 else:
398
399 if time_check > 0:
400 logger.info('Job %s Finally found the missing output.' % (job_id))
401 del self.retry_args[job_id]
402 self.submitted_ids.remove(job_id)
403
404 if job_id in self.id_to_packet:
405 nb_in_packet = self.id_to_packet[job_id].remove_one()
406 if nb_in_packet == 0:
407
408 packet = self.id_to_packet[job_id]
409
410 packet.queue.join()
411
412 packet.fct(*packet.args)
413 del self.id_to_packet[job_id]
414 return 'resubmit'
415
416 return 'done'
417
418 if time_check == 0:
419 logger.debug('''Job %s: missing output:%s''' % (job_id,path))
420 args['time_check'] = time.time()
421 return 'wait'
422 elif self.cluster_retry_wait > time.time() - time_check:
423 return 'wait'
424
425
426 if self.nb_retry < 0:
427 logger.critical('''Fail to run correctly job %s.
428 with option: %s
429 file missing: %s''' % (job_id, args, path))
430 raw_input('press enter to continue.')
431 elif self.nb_retry == 0:
432 logger.critical('''Fail to run correctly job %s.
433 with option: %s
434 file missing: %s.
435 Stopping all runs.''' % (job_id, args, path))
436 self.remove()
437 elif args['nb_submit'] >= self.nb_retry:
438 logger.critical('''Fail to run correctly job %s.
439 with option: %s
440 file missing: %s
441 Fails %s times
442 No resubmition. ''' % (job_id, args, path, args['nb_submit']))
443 self.remove()
444 else:
445 args['nb_submit'] += 1
446 logger.warning('resubmit job (for the %s times)' % args['nb_submit'])
447 del self.retry_args[job_id]
448 self.submitted_ids.remove(job_id)
449 if 'time_check' in args:
450 del args['time_check']
451 if job_id in self.id_to_packet:
452 self.id_to_packet[job_id].remove_one()
453 args['packet_member'] = self.id_to_packet[job_id]
454 del self.id_to_packet[job_id]
455 self.cluster_submit(**args)
456 else:
457 self.submit2(**args)
458 return 'resubmit'
459 return 'done'
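    # check_termination returns 'done' when all required output is present,
    # 'wait' while the job is still within cluster_retry_wait of the first
    # missing-output check, 'resubmit' when the job (or its packet's function)
    # has been relaunched, and True for ids that were never stored in retry_args.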
460
    @check_interupt()
    def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None,
                        stderr=None, log=None, required_output=[], nb_submit=0,
                        input_files=[], output_files=[]):
        """Launch one job on the cluster and wait for it."""

        special_output = False  # tag for concatenating the stderr file to the stdout one
        if stderr == -2 and stdout:
            special_output = True
            stderr = stdout + '.err'

        id = self.submit2(prog, argument, cwd, stdout, stderr, log,
                          required_output=required_output, input_files=input_files,
                          output_files=output_files)

        if self.options['cluster_type'] == 'htcaas2':
            if self.submitted == self.submitted_ids[-1]:
                id = self.metasubmit(self)

        # store the current arguments so that check_termination can resubmit the job
        frame = inspect.currentframe()
        args, _, _, values = inspect.getargvalues(frame)
        args = dict([(i, values[i]) for i in args if i != 'self'])
        self.retry_args[id] = args

        nb_wait = 0
        while 1:
            nb_wait += 1
            status = self.control_one_job(id)
            if not status in ['R', 'I']:
                status = self.check_termination(id)
                if status in ['wait']:
                    time.sleep(30)
                    continue
                elif status in ['resubmit']:
                    id = self.submitted_ids[0]
                    time.sleep(30)
                    continue
                # the job is done; leave a security delay so that the files are
                # really written to disk before reading them back
                time.sleep(30)
                break
            time.sleep(self.options['cluster_status_update'][1])

        if required_output:
            self.check_termination(id)

        if special_output:
            # combine the stdout and the stderr files, giving the stderr file a
            # few seconds to appear (distributed file systems can be slow)
            for i in range(5):
                if os.path.exists(stdout):
                    if not os.path.exists(stderr):
                        time.sleep(5)
                    if os.path.exists(stderr):
                        err_text = open(stderr).read()
                        if not err_text:
                            return
                        logger.warning(err_text)
                        text = open(stdout).read()
                        open(stdout, 'w').write(text + err_text)
                    else:
                        return
                time.sleep(10)
529
    def remove(self, *args, **opts):
        """ """
        logger.warning("""This cluster does not support job removal;
        the jobs are still running on the cluster.""")

    def modify_interface(self, run_interface):
        """Routine which allows to modify the run_card/mg5cmd object to change the
        default behavior of the runs.
        This is called at the time of the compilation of the run_card.
        Note that this function can be called multiple times per run.
        """
        return
548
550 """ an object for handling packet of job, it is designed to be thread safe
551 """
552
553 - def __init__(self, name, fct, args, opts={}):
554 import Queue
555 import threading
556 self.queue = Queue.Queue()
557 self.tag = name
558 self.fct = fct
559 self.args = args
560 self.opts = opts
561 self.done = threading.Event()
562
563 - def put(self, *args, **opts):
565
566 append = put
567
572
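# A Packet is filled with the cluster ids of its jobs via put()/append(); each
# finished job calls remove_one(), and once the count reaches zero the stored
# callback fct(*args) is executed exactly once (see Cluster.check_termination
# and MultiCore.wait).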
574 """class for dealing with the submission in multiple node"""
575
576 job_id = "$"
577
579 """Init the cluster """
580
581
582 super(MultiCore, self).__init__(self, *args, **opt)
583
584 import Queue
585 import threading
586 import thread
587 self.queue = Queue.Queue()
588 self.done = Queue.Queue()
589 self.submitted = Queue.Queue()
590 self.stoprequest = threading.Event()
591 self.demons = []
592 self.nb_done =0
593 if 'nb_core' in opt:
594 self.nb_core = opt['nb_core']
595 elif isinstance(args[0],int):
596 self.nb_core = args[0]
597 else:
598 self.nb_core = 1
599 self.update_fct = None
600
601 self.lock = threading.Event()
602 self.pids = Queue.Queue()
603 self.done_pid = []
604 self.done_pid_queue = Queue.Queue()
605 self.fail_msg = None
606
607
608 for _ in range(self.nb_core):
609 self.start_demon()
610
611
613 import threading
614 t = threading.Thread(target=self.worker)
615 t.daemon = True
616 t.start()
617 self.demons.append(t)
618
619
    def worker(self):
        """Worker thread: take jobs from the queue and run them until a stop is
        requested."""
        import Queue
        import thread
        while not self.stoprequest.isSet():
            try:
                args = self.queue.get()
                tag, exe, arg, opt = args
                try:
                    # the job is either an executable (string) or a python function
                    if isinstance(exe, str):
                        if os.path.exists(exe) and not exe.startswith('/'):
                            exe = './' + exe
                        if isinstance(opt['stdout'], str):
                            opt['stdout'] = open(opt['stdout'], 'w')
                        if opt['stderr'] is None:
                            opt['stderr'] = subprocess.STDOUT
                        proc = misc.Popen([exe] + arg, **opt)
                        pid = proc.pid
                        self.pids.put(pid)
                        proc.wait()
                        if proc.returncode not in [0, 143, -15] and not self.stoprequest.isSet():
                            fail_msg = 'program %s launch ends with non zero status: %s. Stop all computation' % \
                                       (' '.join([exe] + arg), proc.returncode)
                            logger.warning(fail_msg)
                            self.stoprequest.set()
                            self.remove(fail_msg)
                    else:
                        # python function
                        pid = tag
                        self.pids.put(pid)
                        returncode = exe(*arg, **opt)
                        if returncode != 0:
                            logger.warning("fct %s does not return 0. Stopping the code in a clean way. The error was:\n%s", exe, returncode)
                            self.stoprequest.set()
                            self.remove("fct %s does not return 0:\n %s" % (exe, returncode))
                except Exception, error:
                    self.fail_msg = sys.exc_info()
                    logger.warning(str(error))
                    self.stoprequest.set()
                    self.remove(error)

                    if __debug__:
                        raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2]

                self.queue.task_done()
                self.done.put(tag)
                self.done_pid_queue.put(pid)

                # release the lock so that wait() updates the status
                try:
                    self.lock.set()
                except thread.error:
                    continue
            except Queue.Empty:
                continue
679
680
681
682
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
               log=None, required_output=[], nb_submit=0):
        """Submit a job on a multicore machine"""

        tag = (prog, tuple(argument), cwd, nb_submit)
        if isinstance(prog, str):
            opt = {'cwd': cwd,
                   'stdout': stdout,
                   'stderr': stderr}
            self.queue.put((tag, prog, argument, opt))
            self.submitted.put(1)
            return tag
        else:
            # python function
            self.queue.put((tag, prog, argument, {}))
            self.submitted.put(1)
            return tag
701
    def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None,
                        stderr=None, log=None, **opts):
        """Launch one job and wait for it"""
        if isinstance(stdout, str):
            stdout = open(stdout, 'w')
        if isinstance(stderr, str):
            stderr = open(stderr, 'w')
        return misc.call([prog] + argument, stdout=stdout, stderr=stderr, cwd=cwd)
710
    def remove(self, error=None):
        """Ensure that all threads are killed."""

        # do not start any new job
        self.stoprequest.set()
        if error and not self.fail_msg:
            self.fail_msg = error

        # collect the pids of the jobs that are already finished
        while not self.done_pid_queue.empty():
            pid = self.done_pid_queue.get()
            self.done_pid.append(pid)

        # kill the remaining jobs: first their children (pgrep -P), then the
        # process itself
        while not self.pids.empty():
            pid = self.pids.get()
            self.pids.task_done()
            if isinstance(pid, tuple):
                continue
            if pid in self.done_pid:
                continue
            out = os.system('CPIDS=$(pgrep -P %(pid)s); kill -15 $CPIDS > /dev/null 2>&1'
                            % {'pid': pid})
            out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid': pid})

736
    def wait(self, me_dir, update_status, update_first=None):
        """Wait until all jobs are done. This function also ensures that packet
        submissions are handled correctly (i.e. the packet function is submitted
        once all its members are done)."""
740
741 import Queue
742 import threading
743
744 try:
745 last_status = (0, 0, 0)
746 sleep_time = 1
747 use_lock = True
748 first = True
749 while True:
750 force_one_more_loop = False
751
752
753
754 while self.done.qsize():
755 try:
756 tag = self.done.get(True, 1)
757 except Queue.Empty:
758 pass
759 else:
760 if self.id_to_packet and tuple(tag) in self.id_to_packet:
761 packet = self.id_to_packet[tuple(tag)]
762 remaining = packet.remove_one()
763 if remaining == 0:
764
765 packet.queue.join()
766 self.submit(packet.fct, packet.args)
767 force_one_more_loop = True
768 self.nb_done += 1
769 self.done.task_done()
770
771
772
773 Idle = self.queue.qsize()
774 Done = self.nb_done + self.done.qsize()
775 Running = max(0, self.submitted.qsize() - Idle - Done)
776
777 if Idle + Running <= 0 and not force_one_more_loop:
778 update_status(Idle, Running, Done)
779
780
781 self.queue.join()
782 break
783
784 if (Idle, Running, Done) != last_status:
785 if first and update_first:
786 update_first(Idle, Running, Done)
787 first = False
788 else:
789 update_status(Idle, Running, Done)
790 last_status = (Idle, Running, Done)
791
792
793 while not self.done_pid_queue.empty():
794 pid = self.done_pid_queue.get()
795 self.done_pid.append(pid)
796 self.done_pid_queue.task_done()
797
798
799
800 if use_lock:
801
802 use_lock = self.lock.wait(300)
803 self.lock.clear()
804 if not use_lock and Idle > 0:
805 use_lock = True
806 else:
807
808
809 time.sleep(sleep_time)
810 sleep_time = min(sleep_time + 2, 180)
811 if update_first:
812 update_first(Idle, Running, Done)
813
814 if self.stoprequest.isSet():
815 if isinstance(self.fail_msg, Exception):
816 raise self.fail_msg
817 elif isinstance(self.fail_msg, str):
818 raise Exception, self.fail_msg
819 else:
820 misc.sprint(self.fail_msg)
821 raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2]
822
823 try:
824 self.lock.clear()
825 except Exception:
826 pass
827 self.done = Queue.Queue()
828 self.done_pid = []
829 self.done_pid_queue = Queue.Queue()
830 self.nb_done = 0
831 self.submitted = Queue.Queue()
832 self.pids = Queue.Queue()
833 self.stoprequest.clear()
834
835 except KeyboardInterrupt:
836
837 if isinstance(self.fail_msg, Exception):
838 raise self.fail_msg
839 elif isinstance(self.fail_msg, str):
840 raise Exception, self.fail_msg
841 elif self.fail_msg:
842 raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2]
843
844 raise
845
847 """Basic class for dealing with cluster submission"""
848
849 name = 'condor'
850 job_id = 'CONDOR_ID'
851
852
853
854 @multiple_try()
855 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
856 required_output=[], nb_submit=0):
857 """Submit a job prog to a Condor cluster"""
858
        text = """Executable = %(prog)s
output = %(stdout)s
error = %(stderr)s
log = %(log)s
%(argument)s
environment = CONDOR_ID=$(Cluster).$(Process)
Universe = vanilla
notification = Error
Initialdir = %(cwd)s
%(requirement)s
getenv=True
queue 1
"""
872
873 if self.cluster_queue not in ['None', None]:
874 requirement = 'Requirements = %s=?=True' % self.cluster_queue
875 else:
876 requirement = ''
877
878 if cwd is None:
879 cwd = os.getcwd()
880 if stdout is None:
881 stdout = '/dev/null'
882 if stderr is None:
883 stderr = '/dev/null'
884 if log is None:
885 log = '/dev/null'
886 if not os.path.exists(prog):
887 prog = os.path.join(cwd, prog)
888 if argument:
889 argument = 'Arguments = %s' % ' '.join(argument)
890 else:
891 argument = ''
892
893
894 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout,
895 'stderr': stderr,'log': log,'argument': argument,
896 'requirement': requirement}
897
898
899 a = misc.Popen(['condor_submit'], stdout=subprocess.PIPE,
900 stdin=subprocess.PIPE)
901 output, _ = a.communicate(text % dico)
902
903
904
905
906 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE)
907 try:
908 id = pat.search(output).groups()[0]
909 except:
910 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
911 % output
912 self.submitted += 1
913 self.submitted_ids.append(id)
914 return id
915
    @store_input()
    @multiple_try()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                log=None, input_files=[], output_files=[], required_output=[],
                nb_submit=0):
        """Submit the job on the cluster with NO SHARED DISK.
        Input/output files should be given relative to cwd.
        """
924
925 if not required_output and output_files:
926 required_output = output_files
927
928 if (input_files == [] == output_files):
929 return self.submit(prog, argument, cwd, stdout, stderr, log,
930 required_output=required_output, nb_submit=nb_submit)
931
932 text = """Executable = %(prog)s
933 output = %(stdout)s
934 error = %(stderr)s
935 log = %(log)s
936 %(argument)s
937 should_transfer_files = YES
938 when_to_transfer_output = ON_EXIT
939 transfer_input_files = %(input_files)s
940 %(output_files)s
941 Universe = vanilla
942 notification = Error
943 Initialdir = %(cwd)s
944 %(requirement)s
945 getenv=True
946 queue 1
947 """
948
949 if self.cluster_queue not in ['None', None]:
950 requirement = 'Requirements = %s=?=True' % self.cluster_queue
951 else:
952 requirement = ''
953
954 if cwd is None:
955 cwd = os.getcwd()
956 if stdout is None:
957 stdout = '/dev/null'
958 if stderr is None:
959 stderr = '/dev/null'
960 if log is None:
961 log = '/dev/null'
962 if not os.path.exists(prog):
963 prog = os.path.join(cwd, prog)
964 if argument:
965 argument = 'Arguments = %s' % ' '.join([str(a) for a in argument])
966 else:
967 argument = ''
968
969 if input_files:
970 input_files = ','.join(input_files)
971 else:
972 input_files = ''
973 if output_files:
974 output_files = 'transfer_output_files = %s' % ','.join(output_files)
975 else:
976 output_files = ''
977
978
979
980 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout,
981 'stderr': stderr,'log': log,'argument': argument,
982 'requirement': requirement, 'input_files':input_files,
983 'output_files':output_files}
984
985
986 a = subprocess.Popen(['condor_submit'], stdout=subprocess.PIPE,
987 stdin=subprocess.PIPE)
988 output, _ = a.communicate(text % dico)
989
990
991
992
993 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE)
994 try:
995 id = pat.search(output).groups()[0]
996 except:
997 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
998 % output
999 self.submitted += 1
1000 self.submitted_ids.append(id)
1001 return id
1002
1003
1004
1005
1006
    @multiple_try(nb_try=10, sleep=10)
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
1010 cmd = 'condor_q '+str(id)+" -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'"
1011 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
1012 stderr=subprocess.PIPE)
1013
1014 error = status.stderr.read()
1015 if status.returncode or error:
1016 raise ClusterManagmentError, 'condor_q returns error: %s' % error
1017
1018 return status.stdout.readline().strip()
1019
    jobstatus = {'0': 'U', '1': 'I', '2': 'R', '3': 'X', '4': 'C', '5': 'H', '6': 'E'}

    @check_interupt()
    @multiple_try(nb_try=10, sleep=10)
    def control(self, me_dir=None):
        """Control the status of all submitted jobs.
        Return (idle, run, finish, fail)."""
1025
1026 if not self.submitted_ids:
1027 return 0, 0, 0, 0
1028
1029 packet = 15000
1030 idle, run, fail = 0, 0, 0
1031 ongoing = []
1032 for i in range(1+(len(self.submitted_ids)-1)//packet):
1033 start = i * packet
1034 stop = (i+1) * packet
1035 cmd = "condor_q " + ' '.join(self.submitted_ids[start:stop]) + \
1036 " -format \"%d \" ClusterId " + \
1037 " -format \"%d\\n\" JobStatus "
1038
1039 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
1040 stderr=subprocess.PIPE)
1041 error = status.stderr.read()
1042 if status.returncode or error:
1043 raise ClusterManagmentError, 'condor_q returns error: %s' % error
1044
1045 for line in status.stdout:
1046 id, status = line.strip().split()
1047 status = self.jobstatus[status]
1048 ongoing.append(id)
1049 if status in ['I','U']:
1050 idle += 1
1051 elif status == 'R':
1052 run += 1
1053 elif status != 'C':
1054 fail += 1
1055
1056 for id in list(self.submitted_ids):
1057 if id not in ongoing:
1058 status = self.check_termination(id)
1059 if status == 'wait':
1060 run += 1
1061 elif status == 'resubmit':
1062 idle += 1
1063
1064 return idle, run, self.submitted - (idle+run+fail), fail
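    # Note: the third element of the returned tuple counts finished jobs as
    # "everything submitted that is neither idle, running nor failed"; the same
    # convention is used by the other scheduler backends below.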
1065
1066 @multiple_try()
1067 - def remove(self, *args, **opts):
1068 """Clean the jobson the cluster"""
1069
1070 if not self.submitted_ids:
1071 return
1072 cmd = "condor_rm %s" % ' '.join(self.submitted_ids)
1073
1074 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1075 self.submitted_ids = []
1076
1078 """Basic class for dealing with cluster submission"""
1079
1080 name = 'pbs'
1081 job_id = 'PBS_JOBID'
1082 idle_tag = ['Q']
1083 running_tag = ['T','E','R']
1084 complete_tag = ['C']
1085
1086 maximum_submited_jobs = 2500
1087
1088 @multiple_try()
1089 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1090 required_output=[], nb_submit=0):
1091 """Submit a job prog to a PBS cluster"""
1092
1093 me_dir = self.get_jobs_identifier(cwd, prog)
1094
1095 if len(self.submitted_ids) > self.maximum_submited_jobs:
1096 fct = lambda idle, run, finish: logger.info('Waiting for free slot: %s %s %s' % (idle, run, finish))
1097 self.wait(me_dir, fct, self.maximum_submited_jobs)
1098
1099
1100 text = ""
1101 if cwd is None:
1102 cwd = os.getcwd()
1103 else:
1104 text = " cd %s;" % cwd
1105 if stdout is None:
1106 stdout = '/dev/null'
1107 if stderr is None:
1108 stderr = '/dev/null'
1109 elif stderr == -2:
1110 stderr = stdout
1111 if log is None:
1112 log = '/dev/null'
1113
1114 if not os.path.isabs(prog):
1115 text += "./%s" % prog
1116 else:
1117 text+= prog
1118
1119 if argument:
1120 text += ' ' + ' '.join(argument)
1121
1122 command = ['qsub','-o', stdout,
1123 '-N', me_dir,
1124 '-e', stderr,
1125 '-V']
1126
1127 if self.cluster_queue and self.cluster_queue != 'None':
1128 command.extend(['-q', self.cluster_queue])
1129
1130 a = misc.Popen(command, stdout=subprocess.PIPE,
1131 stderr=subprocess.STDOUT,
1132 stdin=subprocess.PIPE, cwd=cwd)
1133
1134 output = a.communicate(text)[0]
1135 id = output.split('.')[0]
1136 if not id.isdigit() or a.returncode !=0:
1137 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1138 % output
1139
1140 self.submitted += 1
1141 self.submitted_ids.append(id)
1142 return id
1143
    @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
1147 cmd = 'qstat '+str(id)
1148 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
1149 stderr=subprocess.STDOUT)
1150
1151 for line in status.stdout:
1152 line = line.strip()
1153 if 'cannot connect to server' in line or 'cannot read reply' in line:
1154 raise ClusterManagmentError, 'server disconnected'
1155 if 'Unknown' in line:
1156 return 'F'
1157 elif line.startswith(str(id)):
1158 jobstatus = line.split()[4]
1159 else:
1160 jobstatus=""
1161
1162 if status.returncode != 0 and status.returncode is not None:
1163 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode
1164 if jobstatus in self.idle_tag:
1165 return 'I'
1166 elif jobstatus in self.running_tag:
1167 return 'R'
1168 return 'F'
1169
1170
    @multiple_try()
    def control(self, me_dir):
        """Control the status of all submitted jobs for me_dir.
        Return (idle, run, finish, fail)."""
1174 cmd = "qstat"
1175 status = misc.Popen([cmd], stdout=subprocess.PIPE)
1176
1177 me_dir = self.get_jobs_identifier(me_dir)
1178
1179 ongoing = []
1180
1181 idle, run, fail = 0, 0, 0
1182 for line in status.stdout:
1183 if 'cannot connect to server' in line or 'cannot read reply' in line:
1184 raise ClusterManagmentError, 'server disconnected'
1185 if me_dir in line:
1186 ongoing.append(line.split()[0].split('.')[0])
1187 status2 = line.split()[4]
1188 if status2 in self.idle_tag:
1189 idle += 1
1190 elif status2 in self.running_tag:
1191 run += 1
1192 elif status2 in self.complete_tag:
1193 if not self.check_termination(line.split()[0].split('.')[0]):
1194 idle += 1
1195 else:
1196 fail += 1
1197
1198 if status.returncode != 0 and status.returncode is not None:
1199 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode
1200
1201 for id in list(self.submitted_ids):
1202 if id not in ongoing:
1203 status2 = self.check_termination(id)
1204 if status2 == 'wait':
1205 run += 1
1206 elif status2 == 'resubmit':
1207 idle += 1
1208
1209 return idle, run, self.submitted - (idle+run+fail), fail
1210
1211 @multiple_try()
1212 - def remove(self, *args, **opts):
1213 """Clean the jobs on the cluster"""
1214
1215 if not self.submitted_ids:
1216 return
1217 cmd = "qdel %s" % ' '.join(self.submitted_ids)
1218 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1219 self.submitted_ids = []
1220
1223 """Basic class for dealing with cluster submission"""
1224
1225
1226 name = 'sge'
1227 job_id = 'JOB_ID'
1228 idle_tag = ['qw', 'hqw','hRqw','w']
1229 running_tag = ['r','t','Rr','Rt']
1230 identifier_length = 10
1231
1233 """replace string for path issues"""
1234 location = os.path.realpath(location)
1235 homePath = os.getenv("HOME")
1236 if homePath:
1237 location = location.replace(homePath,'$HOME')
1238 return location
1239
1240 @multiple_try()
1241 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1242 required_output=[], nb_submit=0):
1243 """Submit a job prog to an SGE cluster"""
1244
1245 me_dir = self.get_jobs_identifier(cwd, prog)
1246
1247
1248 if cwd is None:
1249
1250 cwd = self.def_get_path(os.getcwd())
1251 cwd1 = self.def_get_path(cwd)
1252 text = " cd %s;" % cwd1
1253 if stdout is None:
1254 stdout = '/dev/null'
1255 else:
1256 stdout = self.def_get_path(stdout)
1257 if stderr is None:
1258 stderr = '/dev/null'
1259 elif stderr == -2:
1260 stderr = stdout
1261 else:
1262 stderr = self.def_get_path(stderr)
1263
1264 if log is None:
1265 log = '/dev/null'
1266 else:
1267 log = self.def_get_path(log)
1268
1269 text += prog
1270 if argument:
1271 text += ' ' + ' '.join(argument)
1272
1273
1274
1275
1276 homePath = os.getenv("HOME")
1277 if homePath:
1278 text = text.replace(homePath,'$HOME')
1279
1280 logger.debug("!=== input %s" % text)
1281 logger.debug("!=== output %s" % stdout)
1282 logger.debug("!=== error %s" % stderr)
1283 logger.debug("!=== logs %s" % log)
1284
1285 command = ['qsub','-o', stdout,
1286 '-N', me_dir,
1287 '-e', stderr,
1288 '-V']
1289
1290 if self.cluster_queue and self.cluster_queue != 'None':
1291 command.extend(['-q', self.cluster_queue])
1292
1293 a = misc.Popen(command, stdout=subprocess.PIPE,
1294 stderr=subprocess.STDOUT,
1295 stdin=subprocess.PIPE, cwd=cwd)
1296
1297 output = a.communicate(text)[0]
1298 id = output.split(' ')[2]
1299 if not id.isdigit():
1300 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1301 % output
1302 self.submitted += 1
1303 self.submitted_ids.append(id)
1304 logger.debug(output)
1305
1306 return id
1307
    @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
1311
1312 cmd = 'qstat '
1313 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1314 for line in status.stdout:
1315
1316
1317
1318
1319
1320
1321 if str(id) in line:
1322 status = line.split()[4]
1323
1324 if status in self.idle_tag:
1325 return 'I'
1326 elif status in self.running_tag:
1327 return 'R'
1328 return 'F'
1329
    @multiple_try()
    def control(self, me_dir):
        """Control the status of all submitted jobs for me_dir.
        Return (idle, run, finish, fail)."""
1333 cmd = "qstat "
1334 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1335
1336 me_dir = self.get_jobs_identifier(me_dir)
1337
1338 finished = list(self.submitted_ids)
1339
1340 idle, run, fail = 0, 0, 0
1341 for line in status.stdout:
1342 if me_dir in line:
1343 id,_,_,_,status = line.split()[:5]
1344 if status in self.idle_tag:
1345 idle += 1
1346 finished.remove(id)
1347 elif status in self.running_tag:
1348 run += 1
1349 finished.remove(id)
1350 else:
1351 logger.debug(line)
1352 fail += 1
1353 finished.remove(id)
1354
1355 for id in finished:
1356 self.check_termination(id)
1357
1358 return idle, run, self.submitted - (idle+run+fail), fail
1359
1360
1361
1362 @multiple_try()
1363 - def remove(self, *args, **opts):
1364 """Clean the jobs on the cluster"""
1365
1366 if not self.submitted_ids:
1367 return
1368 cmd = "qdel %s" % ' '.join(self.submitted_ids)
1369 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1370 self.submitted_ids = []
1371
1374 """Basic class for dealing with cluster submission"""
1375
1376 name = 'lsf'
1377 job_id = 'LSB_JOBID'
1378
1379 @multiple_try()
1380 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1381 required_output=[], nb_submit=0):
1382 """Submit the job prog to an LSF cluster"""
1383
1384
1385 me_dir = self.get_jobs_identifier(cwd, prog)
1386
1387 text = ""
1388 command = ['bsub', '-C0', '-J', me_dir]
1389 if cwd is None:
1390 cwd = os.getcwd()
1391 else:
1392 text = " cd %s;" % cwd
        if stdout and isinstance(stdout, str):
            command.extend(['-o', stdout])
        if stderr and isinstance(stderr, str):
            command.extend(['-e', stderr])
        elif stderr == -2:
            # -2 means that stderr should go to the same file as stdout
            pass
1399 if log is None:
1400 log = '/dev/null'
1401
1402 text += prog
1403 if argument:
1404 text += ' ' + ' '.join(argument)
1405
1406 if self.cluster_queue and self.cluster_queue != 'None':
1407 command.extend(['-q', self.cluster_queue])
1408
1409 a = misc.Popen(command, stdout=subprocess.PIPE,
1410 stderr=subprocess.STDOUT,
1411 stdin=subprocess.PIPE, cwd=cwd)
1412
1413 output = a.communicate(text)[0]
1414
1415 try:
1416 id = output.split('>',1)[0].split('<')[1]
1417 except:
1418 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1419 % output
1420 if not id.isdigit():
1421 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1422 % output
1423 self.submitted += 1
1424 self.submitted_ids.append(id)
1425 return id
1426
1427
    @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
1431
1432 cmd = 'bjobs '+str(id)
1433 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1434
1435 for line in status.stdout:
1436 line = line.strip().upper()
1437 if 'JOBID' in line:
1438 continue
1439 elif str(id) not in line:
1440 continue
1441 status = line.split()[2]
1442 if status == 'RUN':
1443 return 'R'
1444 elif status == 'PEND':
1445 return 'I'
1446 elif status == 'DONE':
1447 return 'F'
1448 else:
1449 return 'H'
1450 return 'F'
1451
    @multiple_try()
    def control(self, me_dir=None):
        """Control the status of all submitted jobs.
        Return (idle, run, finish, fail)."""
1455
1456 if not self.submitted_ids:
1457 return 0, 0, 0, 0
1458
1459 cmd = "bjobs " + ' '.join(self.submitted_ids)
1460 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1461
1462 jobstatus = {}
1463 for line in status.stdout:
1464 line = line.strip()
1465 if 'JOBID' in line:
1466 continue
1467 splitline = line.split()
1468 id = splitline[0]
1469 if id not in self.submitted_ids:
1470 continue
1471 jobstatus[id] = splitline[2]
1472
1473 idle, run, fail = 0, 0, 0
1474 for id in self.submitted_ids[:]:
1475 if id in jobstatus:
1476 status = jobstatus[id]
1477 else:
1478 status = 'MISSING'
1479 if status == 'RUN':
1480 run += 1
1481 elif status == 'PEND':
1482 idle += 1
1483 else:
1484 status = self.check_termination(id)
1485 if status == 'wait':
1486 run += 1
1487 elif status == 'resubmit':
1488 idle += 1
1489
1490 return idle, run, self.submitted - (idle+run+fail), fail
1491
1492 @multiple_try()
1493 - def remove(self, *args,**opts):
1494 """Clean the jobs on the cluster"""
1495
1496 if not self.submitted_ids:
1497 return
1498 cmd = "bkill %s" % ' '.join(self.submitted_ids)
1499 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1500 self.submitted_ids = []
1501
1503 """Class for dealing with cluster submission on a GE cluster"""
1504
1505 name = 'ge'
1506 job_id = 'JOB_ID'
1507 idle_tag = ['qw']
1508 running_tag = ['r']
1509
1510 @multiple_try()
1511 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1512 required_output=[], nb_submit=0):
1513 """Submit a job prog to a GE cluster"""
1514
1515 text = ""
1516 if cwd is None:
1517 cwd = os.getcwd()
1518 else:
1519 text = " cd %s; bash " % cwd
1520 if stdout is None:
1521 stdout = os.path.join(cwd, "log.%s" % prog.split('/')[-1])
1522 if stderr is None:
1523 stderr = os.path.join(cwd, "err.%s" % prog.split('/')[-1])
1524 elif stderr == -2:
1525 stderr = stdout
1526 if log is None:
1527 log = '/dev/null'
1528
1529 text += prog
1530 if argument:
1531 text += ' ' + ' '.join(argument)
1532 text += '\n'
1533 tmp_submit = os.path.join(cwd, 'tmp_submit')
1534 open(tmp_submit,'w').write(text)
1535
1536 a = misc.Popen(['qsub','-o', stdout,
1537 '-e', stderr,
1538 tmp_submit],
1539 stdout=subprocess.PIPE,
1540 stderr=subprocess.STDOUT,
1541 stdin=subprocess.PIPE, cwd=cwd)
1542
1543 output = a.communicate()[0]
1544
1545 pat = re.compile("Your job (\d*) \(",re.MULTILINE)
1546 try:
1547 id = pat.search(output).groups()[0]
1548 except:
1549 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1550 % output
1551 self.submitted += 1
1552 self.submitted_ids.append(id)
1553 return id
1554
    @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
1558 cmd = 'qstat | grep '+str(id)
1559 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1560 if not status:
1561 return 'F'
1562
1563 pat = re.compile("^(\d+)\s+[\d\.]+\s+[\w\d\.]+\s+[\w\d\.]+\s+(\w+)\s")
1564 stat = ''
1565 for line in status.stdout.read().split('\n'):
1566 if not line:
1567 continue
1568 line = line.strip()
1569 try:
1570 groups = pat.search(line).groups()
1571 except:
1572 raise ClusterManagmentError, 'bad syntax for stat: \n\"%s\"' % line
1573 if groups[0] != id: continue
1574 stat = groups[1]
1575 if not stat:
1576 return 'F'
1577 if stat in self.idle_tag:
1578 return 'I'
1579 if stat in self.running_tag:
1580 return 'R'
1581
    @multiple_try()
    def control(self, me_dir=None):
        """Check the status of the jobs associated to directory me_dir.
        Return (idle, run, finish, fail)."""
1585 if not self.submitted_ids:
1586 return 0, 0, 0, 0
1587 idle, run, fail = 0, 0, 0
1588 ongoing = []
1589 for statusflag in ['p', 'r', 'sh']:
1590 cmd = 'qstat -s %s' % statusflag
1591 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1592
1593 pat = re.compile("^(\d+)")
1594 for line in status.stdout.read().split('\n'):
1595 line = line.strip()
1596 try:
1597 id = pat.search(line).groups()[0]
1598 except Exception:
1599 pass
1600 else:
1601 if id not in self.submitted_ids:
1602 continue
1603 ongoing.append(id)
1604 if statusflag == 'p':
1605 idle += 1
1606 if statusflag == 'r':
1607 run += 1
1608 if statusflag == 'sh':
1609 fail += 1
1610 for id in list(self.submitted_ids):
1611 if id not in ongoing:
1612 self.check_termination(id)
1613
1614
1615 return idle, run, self.submitted - idle - run - fail, fail
1616
1617 @multiple_try()
1618 - def remove(self, *args, **opts):
1619 """Clean the jobs on the cluster"""
1620
1621 if not self.submitted_ids:
1622 return
1623 cmd = "qdel %s" % ' '.join(self.submitted_ids)
1624 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1625 self.submitted_ids = []
1626
1628 """start a computation and not wait for it to finish.
1629 this fonction returns a lock which is locked as long as the job is
1630 running."""
1631
1632 mc = MultiCore(1)
1633 mc.submit(exe, argument, cwd, stdout, **opt)
1634 mc.need_waiting = True
1635 return mc.lock
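# Usage sketch (hypothetical executable name): the returned event is set by the
# single daemon worker thread once the job has finished, so callers can wait on it:
#
#     lock = asyncrone_launch('./generate_events', argument=['0', 'run_01'])
#     lock.wait()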
1636
1639 """Basic class for dealing with cluster submission"""
1640
1641 name = 'slurm'
1642 job_id = 'SLURM_JOBID'
1643 idle_tag = ['Q','PD','S','CF']
1644 running_tag = ['R', 'CG']
1645 complete_tag = ['C']
1646 identifier_length = 8
1647
1648 @multiple_try()
1649 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1650 required_output=[], nb_submit=0):
1651 """Submit a job prog to a SLURM cluster"""
1652
1653 me_dir = self.get_jobs_identifier(cwd, prog)
1654
1655
1656 if cwd is None:
1657 cwd = os.getcwd()
1658 if stdout is None:
1659 stdout = '/dev/null'
1660 if stderr is None:
1661 stderr = '/dev/null'
1662 elif stderr == -2:
1663 stderr = stdout
1664 if log is None:
1665 log = '/dev/null'
1666
1667 command = ['sbatch', '-o', stdout,
1668 '-J', me_dir,
1669 '-e', stderr, prog] + argument
1670
1671 if self.cluster_queue and self.cluster_queue != 'None':
1672 command.insert(1, '-p')
1673 command.insert(2, self.cluster_queue)
1674
1675 a = misc.Popen(command, stdout=subprocess.PIPE,
1676 stderr=subprocess.STDOUT,
1677 stdin=subprocess.PIPE, cwd=cwd)
1678
1679 output = a.communicate()
1680 output_arr = output[0].split(' ')
1681 id = output_arr[3].rstrip()
1682
1683 if not id.isdigit():
1684 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1685 % (output[0] + '\n' + output[1])
1686
1687 self.submitted += 1
1688 self.submitted_ids.append(id)
1689 return id
1690
    @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
        cmd = 'squeue -j ' + str(id)
1695 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
1696 stderr=open(os.devnull,'w'))
1697
1698 for line in status.stdout:
1699 line = line.strip()
1700 if 'Invalid' in line:
1701 return 'F'
1702 elif line.startswith(str(id)):
1703 status = line.split()[4]
1704 if status in self.idle_tag:
1705 return 'I'
1706 elif status in self.running_tag:
1707 return 'R'
1708 return 'F'
1709
    @multiple_try()
    def control(self, me_dir):
        """Control the status of all submitted jobs for me_dir.
        Return (idle, run, finish, fail)."""
1713 cmd = "squeue"
1714 pstatus = misc.Popen([cmd], stdout=subprocess.PIPE)
1715
1716 me_dir = self.get_jobs_identifier(me_dir)
1717
1718 idle, run, fail = 0, 0, 0
1719 ongoing=[]
1720 for line in pstatus.stdout:
1721 if me_dir in line:
1722 id, _, _,_ , status,_ = line.split(None,5)
1723 ongoing.append(id)
1724 if status in self.idle_tag:
1725 idle += 1
1726 elif status in self.running_tag:
1727 run += 1
1728 elif status in self.complete_tag:
1729 status = self.check_termination(id)
1730 if status == 'wait':
1731 run += 1
1732 elif status == 'resubmit':
1733 idle += 1
1734 else:
1735 fail += 1
1736
1737
1738 for id in list(self.submitted_ids):
1739 if id not in ongoing:
1740 status = self.check_termination(id)
1741 if status == 'wait':
1742 run += 1
1743 elif status == 'resubmit':
1744 idle += 1
1745
1746
1747 return idle, run, self.submitted - (idle+run+fail), fail
1748
1749 @multiple_try()
1750 - def remove(self, *args, **opts):
1751 """Clean the jobs on the cluster"""
1752
1753 if not self.submitted_ids:
1754 return
1755 cmd = "scancel %s" % ' '.join(self.submitted_ids)
1756 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1757 self.submitted_ids = []
1758
1760 """Class for dealing with cluster submission on a HTCaaS cluster using GPFS """
1761
1762 name= 'htcaas'
1763 job_id = 'HTCAAS_JOBID'
1764 idle_tag = ['waiting']
1765 running_tag = ['preparing','running']
1766 complete_tag = ['done']
1767
1768 @store_input()
1769 @multiple_try()
1770 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
1771 log=None, input_files=[], output_files=[], required_output=[],
1772 nb_submit=0):
1773 """Submit the HTCaaS job on the cluster with NO SHARE DISK
1774 input/output file should be given as relative to CWd
1775 """
1776
1777 cur_usr = os.getenv('USER')
1778
1779 if cwd is None:
1780 cwd = os.getcwd()
1781
1782 cwd_cp = cwd.rsplit("/",2)
1783
1784 if not stdout is None:
1785 print "stdout: %s" % stdout
1786
1787 if not os.path.exists(prog):
1788 prog = os.path.join(cwd, prog)
1789
1790 if not required_output and output_files:
1791 required_output = output_files
1792
1793 logger.debug(prog)
1794 if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog :
1795 cwd_arg = cwd+"/arguments"
1796 temp = ' '.join([str(a) for a in argument])
1797 arg_cmd="echo '"+temp+"' > " + cwd_arg
1798 command = ['htcaas-mgjob-submit','-d',cwd,'-e',os.path.basename(prog)]
1799 if argument :
1800 command.extend(['-a ', '='.join([str(a) for a in argument])])
1801 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1802 id = a.stdout.read().strip()
1803
1804 else:
1805 cwd_arg = cwd+"/arguments"
1806 temp = ' '.join([str(a) for a in argument])
1807 temp_file_name = "sub." + os.path.basename(prog)
1808 text = """#!/bin/bash
1809 MYPWD=%(cwd)s
1810 cd $MYPWD
1811 input_files=(%(input_files)s )
1812 for i in ${input_files[@]}
1813 do
1814 chmod -f +x $i
1815 done
1816 /bin/bash %(prog)s %(arguments)s > %(stdout)s
1817 """
1818 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog,
1819 'arguments': ' '.join([str(a) for a in argument]),
1820 'program': ' ' if '.py' in prog else 'bash'}
1821
1822
1823 new_prog = pjoin(cwd, temp_file_name)
1824 open(new_prog, 'w').write(text % dico)
1825 misc.Popen(['chmod','+x',new_prog],cwd=cwd)
1826 command = ['htcaas-mgjob-submit','-d',cwd,'-e',temp_file_name]
1827 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1828 id = a.stdout.read().strip()
1829 logger.debug(id)
1830
        nb_try = 0
        nb_limit = 5
        if not id.isdigit():
            print "[ID is not digit]:" + id

        # retry the submission a few times before giving up
        while not id.isdigit():
            nb_try += 1
            print "[fail_retry]:" + str(nb_try)
            a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
            id = a.stdout.read().strip()
            if nb_try > nb_limit:
                raise ClusterManagmentError, 'fail to submit to the HTCaaS cluster: \n %s' % id
1844
1845 self.submitted += 1
1846 self.submitted_ids.append(id)
1847
1848 return id
1849
    @multiple_try(nb_try=10, sleep=5)
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
1853
1854 if id == 0 :
1855 status_out ='C'
1856 else :
1857 cmd = 'htcaas-job-status -m '+str(id)+ " -s | grep Status "
1858 status = misc.Popen([cmd], shell=True,stdout=subprocess.PIPE,
1859 stderr=subprocess.PIPE)
1860 error = status.stderr.read()
1861 if status.returncode or error:
1862 raise ClusterManagmentError, 'htcaas-job-submit returns error: %s' % error
1863 status_out= status.stdout.read().strip()
1864 status_out= status_out.split(":",1)[1]
1865 if status_out == 'waiting':
1866 status_out='I'
1867 elif status_out == 'preparing' or status_out == 'running':
1868 status_out = 'R'
1869 elif status_out != 'done':
1870 status_out = 'F'
1871 elif status_out == 'done':
1872 status_out = 'C'
1873
1874 return status_out
1875
    @multiple_try()
    def control(self, me_dir=None):
        """Control the status of all submitted jobs.
        Return (idle, run, finish, fail)."""
1879 if not self.submitted_ids:
1880 logger.debug("self.submitted_ids not exists")
1881 return 0, 0, 0, 0
1882
1883 ongoing = []
1884 idle, run, fail = 0, 0, 0
1885
1886 start = self.submitted_ids[0]
1887 end = self.submitted_ids[-1]
1888
1889 cmd = "htcaas-job-status -c "+str(start)+"-"+str(end)
1890 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1891
        for line in status.stdout:
            status2 = line.split()[-1]
            if status2 != 'null' or line.split()[0].strip() != '0':
                ongoing.append(line.split()[0].strip())
                logger.debug("[" + line.split()[0].strip() + "]" + status2)
            if status2 == 'null' or line.split()[0].strip() == '0':
                idle += 1
            elif status2 in self.idle_tag:
                idle += 1
            elif status2 in self.running_tag:
                run += 1
            elif status2 in self.complete_tag:
                if not self.check_termination(line.split()[0]):
                    idle += 1
            else:
                fail += 1
1909
1910 return idle, run, self.submitted - (idle+run+fail), fail
1911
1912 @multiple_try()
1913 - def remove(self, *args, **opts):
1914 """Clean the jobson the cluster"""
1915
1916 if not self.submitted_ids:
1917 return
1918 for i in range(len(self.submitted_ids)):
1919 cmd = "htcaas-job-cancel -m %s" % self.submitted_ids[i]
1920 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1921
1923 """Class for dealing with cluster submission on a HTCaaS cluster without GPFS """
1924
1925 name= 'htcaas2'
1926 job_id = 'HTCAAS2_JOBID'
1927 idle_tag = ['waiting']
1928 running_tag = ['preparing','running']
1929 complete_tag = ['done']
1930
1931 @store_input()
1932 @multiple_try()
1933 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
1934 log=None, input_files=[], output_files=[], required_output=[],
1935 nb_submit=0):
1936
1937 """Submit the HTCaaS job on the cluster with NO SHARE DISK
1938 input/output file should be given as relative to CWD
1939 """
1940 if cwd is None:
1941 cwd = os.getcwd()
1942
1943 if not os.path.exists(prog):
1944 prog = os.path.join(cwd, prog)
1945
1946 if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog :
1947 if cwd or prog :
1948 self.submitted_dirs.append(cwd)
1949 self.submitted_exes.append(prog)
1950 else:
1951 logger.debug("cwd and prog not exist->"+cwd+" / "+ os.path.basename(prog))
1952
1953 if argument :
1954 self.submitted_args.append('='.join([str(a) for a in argument]))
1955
1956 if cwd or prog :
1957 self.submitted += 1
1958 id = self.submitted
1959 self.submitted_ids.append(id)
1960 else:
1961 logger.debug("cwd and prog are not exist! ")
1962 id = 0
1963
1964 else:
1965 temp_file_name = "sub."+ os.path.basename(prog)
1966 text = """#!/bin/bash
1967 MYPWD=%(cwd)s
1968 cd $MYPWD
1969 input_files=(%(input_files)s )
1970 for i in ${input_files[@]}
1971 do
1972 chmod -f +x $i
1973 done
1974 /bin/bash %(prog)s %(arguments)s > %(stdout)s
1975 """
1976 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog,
1977 'arguments': ' '.join([str(a) for a in argument]),
1978 'program': ' ' if '.py' in prog else 'bash'}
1979
1980 new_prog = pjoin(cwd, temp_file_name)
1981 open(new_prog, 'w').write(text % dico)
1982 misc.Popen(['chmod','+x',new_prog],cwd=cwd)
1983 command = ['htcaas-mgjob-submit','-d',cwd,'-e',new_prog]
1984 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1985 id = a.stdout.read().strip()
1986 logger.debug("[mode2]-["+str(id)+"]")
1987 if cwd and prog :
1988 self.submitted += 1
1989 self.submitted_ids.append(id)
1990 else:
1991 logger.debug("cwd and prog are not exist! ")
1992 id = 0
1993
1994 return id
1995
    @multiple_try(nb_try=10, sleep=5)
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
2045
2046 if self.submitted == self.submitted_ids[-1] :
2047 id = self.metasubmit(self)
2048 tempid = self.submitted_ids[-1]
2049 self.submitted_ids.remove(self.submitted_ids[-1])
2050 self.submitted_ids.append(id)
2051 logger.debug(str(id)+" // "+str(self.submitted_ids[-1]))
2052
2053 if id == 0 :
2054 status_out ='C'
2055 else:
2056 cmd = 'htcaas-job-status -m '+ str(id) + " -s | grep Status "
2057 status = misc.Popen([cmd],shell=True,stdout=subprocess.PIPE,
2058 stderr=subprocess.PIPE)
2059 error = status.stderr.read()
2060 if status.returncode or error:
2061 raise ClusterManagmentError, 'htcaas-job-status returns error: %s' % error
2062 status_out= status.stdout.read().strip()
2063 status_out= status_out.split(":",1)[1]
2064 logger.debug("[["+str(id)+"]]"+status_out)
2065 if status_out == 'waiting':
2066 status_out='I'
2067 elif status_out == 'preparing' or status_out == 'running':
2068 status_out = 'R'
2069 elif status_out != 'done':
2070 status_out = 'F'
2071 elif status_out == 'done':
2072 status_out = 'C'
2073 self.submitted -= 1
2074
2075 return status_out
2076
    @multiple_try()
    def control(self, me_dir):
        """Control the status of all submitted jobs.
        Return (idle, run, finish, fail)."""
2080 if not self.submitted_ids:
2081 logger.debug("self.submitted_ids not exists")
2082 return 0, 0, 0, 0
2083
2084 if "//" in me_dir :
2085 if int(me_dir.split("//")[0]) < int(me_dir.split("//")[1]) :
2086 start = me_dir.split("//")[0]
2087 end = me_dir.split("//")[1]
2088 else :
2089 start = me_dir.split("//")[1]
2090 end = me_dir.split("//")[0]
2091 elif "/" in me_dir :
2092 start = 0
2093 end = 0
2094 elif me_dir.isdigit():
2095 start = me_dir
2096 end = me_dir
2097 elif not me_dir.isdigit():
2098 me_dir = self.submitted_ids[0]
2099 logger.debug("Meta_ID is not digit(control), self.submitted_ids[0]: "+str(me_dir) )
2100
2101 ongoing = []
2102 idle, run, fail, done = 0, 0, 0, 0
2103
2104 cmd = "htcaas-job-status -c "+str(start)+"-"+str(end) +" -ac"
2105 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
2106
        for line in status.stdout:
            status2 = line.split()[-1]
            if status2 != 'null' or line.split()[0].strip() != '0':
                ongoing.append(str(line.split()[0].strip()) + "-" + str(line.split()[1].strip()))
                logger.debug("[" + line.split()[0].strip() + "-" + line.split()[1].strip() + "]" + status2)

            if status2 == 'null' or line.split()[0].strip() == '0':
                idle += 1
            elif status2 in self.idle_tag:
                idle += 1
            elif status2 in self.running_tag:
                run += 1
            elif status2 in self.complete_tag:
                done += 1
                self.submitted -= 1
                if not self.check_termination(line.split()[1]):
                    idle += 1
            else:
                fail += 1
2126
2127 return idle, run, self.submitted - (idle+run+fail), fail
2128
2129 @multiple_try()
2130 - def remove(self, *args, **opts):
2131 """Clean the jobson the cluster"""
2132
2133 if not self.submitted_ids:
2134 return
        id = self.submitted_ids[0]
        if id != 0:
            cmd = "htcaas-job-cancel -m %s" % str(id)
            status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))
2139
from_name = {'condor': CondorCluster, 'pbs': PBSCluster, 'sge': SGECluster,
             'lsf': LSFCluster, 'ge': GECluster, 'slurm': SLURMCluster,
             'htcaas': HTCaaSCluster, 'htcaas2': HTCaaS2Cluster}
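# from_name maps the 'cluster_type' configuration value to the corresponding
# scheduler class; for instance from_name['slurm'](cluster_queue='main') builds a
# SLURMCluster instance (the queue name here is illustrative).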
2143
onecore = MultiCore(1)
2145
2146