14 import subprocess
15 import logging
16 import os
17 import time
18 import re
19 import glob
20 import inspect
21 import sys
22
23 logger = logging.getLogger('madgraph.cluster')
24
25 try:
26 from madgraph import MadGraph5Error
27 import madgraph.various.misc as misc
28 except Exception, error:
29 if __debug__:
30 print str(error)
31 from internal import MadGraph5Error
32 import internal.misc as misc
33
34 pjoin = os.path.join
38
41
42
43 multiple_try = misc.multiple_try
44 pjoin = os.path.join
47 def check_interupt(error=KeyboardInterrupt):
48
49 def deco_interupt(f):
50 def deco_f_interupt(self, *args, **opt):
51 try:
52 return f(self, *args, **opt)
53 except error:
54 try:
55 self.remove(*args, **opt)
56 except Exception:
57 pass
58 raise error
59 return deco_f_interupt
60 return deco_interupt
61
74 return deco_f_store
75 return deco_store
76
77 def need_transfer(options):
78 """ This function checks whether transfer/compression of the input files is
79 necessary, given the run options provided. """
80
81 if options['run_mode'] != 1 and options['cluster_temp_path'] is None:
82 return False
83 else:
84 return True
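# Example (illustrative): input files only need to be shipped when running on a
# cluster (run_mode 1) or when a scratch directory is configured.
#
#   need_transfer({'run_mode': 1, 'cluster_temp_path': '/scratch'})  # -> True
#   need_transfer({'run_mode': 2, 'cluster_temp_path': None})        # -> False (multicore, shared disk)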
85
86 class Cluster(object):
87 """Basic Class for all cluster type submission"""
88 name = 'mother class'
89 identifier_length = 14
90
91 def __init__(self, *args, **opts):
92 """Init the cluster"""
93
94 self.submitted = 0
95 self.submitted_ids = []
96 self.finish = 0
97 if 'cluster_queue' in opts:
98 self.cluster_queue = opts['cluster_queue']
99 else:
100 self.cluster_queue = 'madgraph'
101 if 'cluster_temp_path' in opts:
102 self.temp_dir = opts['cluster_temp_path']
103 else:
104 self.temp_dir = None
105 self.options = {'cluster_status_update': (600, 30)}
106 for key,value in opts.items():
107 self.options[key] = value
108 self.nb_retry = opts['cluster_nb_retry'] if 'cluster_nb_retry' in opts else 0
109 self.cluster_retry_wait = opts['cluster_retry_wait'] if 'cluster_retry_wait' in opts else 300
110 self.options = dict(opts)
111 self.retry_args = {}
112
113 self.packet = {}
114 self.id_to_packet = {}
115
116 def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
117 log=None, required_output=[], nb_submit=0):
118 """How to make one submission. Return status id on the cluster."""
119 raise NotImplementedError, 'No implementation of how to submit a job to cluster \'%s\'' % self.name
120
121
122 @store_input()
123 def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
124 log=None, input_files=[], output_files=[], required_output=[],
125 nb_submit=0):
126 """How to make one submission. Return status id on the cluster.
127 NO SHARED DISK"""
128
129 if cwd is None:
130 cwd = os.getcwd()
131 if not os.path.exists(prog):
132 prog = os.path.join(cwd, prog)
133
134 if not required_output and output_files:
135 required_output = output_files
136
137 if not hasattr(self, 'temp_dir') or not self.temp_dir or \
138 (input_files == [] == output_files):
139 return self.submit(prog, argument, cwd, stdout, stderr, log,
140 required_output=required_output, nb_submit=nb_submit)
141
142 if not input_files and not output_files:
143
144 return self.submit(prog, argument, cwd, stdout, stderr, log,
145 required_output=required_output, nb_submit=nb_submit)
146
147 if cwd is None:
148 cwd = os.getcwd()
149 if not os.path.exists(prog):
150 prog = os.path.join(cwd, prog)
151 temp_file_name = "sub." + os.path.basename(prog) + '.'.join(argument)
152
153 text = """#!/bin/bash
154 MYTMP=%(tmpdir)s/run$%(job_id)s
155 MYPWD=%(cwd)s
156 mkdir -p $MYTMP
157 cd $MYPWD
158 input_files=( %(input_files)s )
159 for i in ${input_files[@]}
160 do
161 cp -R -L $i $MYTMP
162 done
163 cd $MYTMP
164 echo '%(arguments)s' > arguments
165 chmod +x ./%(script)s
166 %(program)s ./%(script)s %(arguments)s
167 exit=$?
168 output_files=( %(output_files)s )
169 for i in ${output_files[@]}
170 do
171 cp -r $MYTMP/$i $MYPWD
172 done
173 # if [ "$exit" -eq "0" ]
174 # then
175 rm -rf $MYTMP
176 # fi
177 """
178
179 dico = {'tmpdir' : self.temp_dir, 'script': os.path.basename(prog),
180 'cwd': cwd, 'job_id': self.job_id,
181 'input_files': ' '.join(input_files + [prog]),
182 'output_files': ' '.join(output_files),
183 'arguments': ' '.join([str(a) for a in argument]),
184 'program': ' ' if '.py' in prog else 'bash'}
185
186
187 new_prog = pjoin(cwd, temp_file_name)
188 open(new_prog, 'w').write(text % dico)
189 misc.Popen(['chmod','+x',new_prog],cwd=cwd)
190
191 return self.submit(new_prog, argument, cwd, stdout, stderr, log,
192 required_output=required_output, nb_submit=nb_submit)
193
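# Example (illustrative sketch, hypothetical file names): with cluster_temp_path
# set, submit2 wraps the job in the bash script above so that inputs are copied
# to the worker node and outputs are copied back afterwards.
#
#   cluster.submit2('ajob1', argument=['0'], cwd='SubProcesses/P0_gg_ttx',
#                   input_files=['madevent', 'input_app.txt'],
#                   output_files=['results.dat'])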
194
195 def cluster_submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
196 log=None, input_files=[], output_files=[], required_output=[],
197 nb_submit=0, packet_member=None):
198 """This function wraps the cluster submission in a cluster-independent way.
199 It should not be overwritten (except for DAG-type submission)."""
200
201 id = self.submit2(prog, argument, cwd, stdout, stderr, log, input_files,
202 output_files, required_output, nb_submit)
203
204
205 if not packet_member:
206 return id
207 else:
208 if isinstance(packet_member, Packet):
209 self.id_to_packet[id] = packet_member
210 packet_member.put(id)
211 if packet_member.tag not in self.packet:
212 self.packet[packet_member.tag] = packet_member
213 else:
214 if packet_member in self.packet:
215 packet = self.packet[packet_member]
216 packet.put(id)
217 self.id_to_packet[id] = packet
218 return id
219
220 def control(self, me_dir):
221 """Check the status of the jobs associated to directory me_dir. return (idle, run, finish, fail)"""
222 if not self.submitted_ids:
223 raise NotImplementedError, 'No implementation of how to control the job status to cluster \'%s\'' % self.name
224 idle, run, fail = 0, 0, 0
225 for pid in self.submitted_ids[:]:
226 status = self.control_one_job(pid)
227 if status == 'I':
228 idle += 1
229 elif status == 'R':
230 run += 1
231 elif status == 'F':
232 self.finish +=1
233 self.submitted_ids.remove(pid)
234 else:
235 fail += 1
236
237 return idle, run, self.finish, fail
238
240 """ control the status of a single job with its cluster id """
241 raise NotImplementedError, 'No implementation of how to control the job status to cluster \'%s\'' % self.name
242
243 def get_jobs_identifier(self, path, second_path=None):
244 """Return a unique job-name identifier (derived from the path) that helps to
245 identify the runs in the controller of some clusters."""
246
247 if second_path:
248 path = os.path.realpath(pjoin(path, second_path))
249 elif not os.path.exists(path):
250 return path
251
252 if 'SubProcesses' in path:
253 target = path.rsplit('/SubProcesses',1)[0]
254 elif 'MCatNLO' in path:
255 target = path.rsplit('/MCatNLO',1)[0]
256 elif second_path:
257 target=path
258 logger.warning("cluster.get_job_identifier runs unexpectedly. This should be fine but report this message if you have problem.")
259 else:
260 target = path
261
262 if target.endswith('/'):
263 target = target[:-1]
264
265 target = misc.digest(target)[-self.identifier_length:]
266 if not target[0].isalpha():
267 target = 'a' + target[1:]
268
269 return target
270
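# Example (illustrative): the identifier is the tail of a digest of the process
# directory (the part above SubProcesses/MCatNLO), truncated to identifier_length
# and forced to start with a letter so it is always a valid -N/-J job name.
#
#   ident = cluster.get_jobs_identifier('/home/user/PROC_sm_0/SubProcesses/P0_gg_ttx')
#   # len(ident) == cluster.identifier_length ; ident[0].isalpha() is True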
271
272 @check_interupt()
273 def wait(self, me_dir, fct, minimal_job=0, update_first=None):
274 """Wait for all jobs to finish.
275 If minimal_job is set, return as soon as idle + run drops below that number."""
276
277
278 mode = 1
279 nb_iter = 0
280 nb_short = 0
281 change_at = 5
282
283 if update_first:
284 idle, run, finish, fail = self.control(me_dir)
285 update_first(idle, run, finish)
286
287
288 longtime, shorttime = self.options['cluster_status_update']
289
290 nb_job = 0
291 while 1:
292 old_mode = mode
293 nb_iter += 1
294 idle, run, finish, fail = self.control(me_dir)
295 if nb_job:
296 if idle + run + finish + fail != nb_job:
297 nb_job = idle + run + finish + fail
298 nb_iter = 1
299 else:
300 nb_job = idle + run + finish + fail
301 if fail:
302 raise ClusterManagmentError('Some Jobs are in a Hold/... state. Please try to investigate or contact the IT team')
303 if idle + run == 0:
304
305 logger.info('All jobs finished')
306 fct(idle, run, finish)
307 break
308 if idle + run < minimal_job:
309 return
310 fct(idle, run, finish)
311
312 if nb_iter < change_at:
313 mode = 1
314 elif idle < run:
315 if old_mode == 0:
316 if nb_short:
317 mode = 0
318
319 elif idle:
320 if nb_iter > change_at + int(longtime)//shorttime:
321 mode = 0
322 else:
323 mode = 1
324 nb_short =0
325 else:
326 mode = 1
327 nb_short = 0
328 elif old_mode == 1:
329 nb_short +=1
330 if nb_short > 3* max(change_at, int(longtime)//shorttime):
331 mode = 0
332 else:
333 mode = 0
334
335
336 if old_mode > mode:
337 logger.info('''Start to wait %ss between checking status.
338 Note that you can change this time in the configuration file.
339 Press ctrl-C to force the update.''' % self.options['cluster_status_update'][0])
340
341
342 if mode == 0:
343 try:
344 time.sleep(self.options['cluster_status_update'][0])
345 except KeyboardInterrupt:
346 logger.info('start to update the status')
347 nb_iter = min(0, change_at -2)
348 nb_short = 0
349 else:
350 time.sleep(self.options['cluster_status_update'][1])
351
352
353 self.submitted = 0
354 self.submitted_ids = []
355
356 def check_termination(self, job_id):
357 """Check the termination of the job with job_id and relaunch it if needed."""
358
359
360 if job_id not in self.retry_args:
361 return True
362
363 args = self.retry_args[job_id]
364 if 'time_check' in args:
365 time_check = args['time_check']
366 else:
367 time_check = 0
368
369 for path in args['required_output']:
370 if args['cwd']:
371 path = pjoin(args['cwd'], path)
372
373 if not (os.path.exists(path) and os.stat(path).st_size != 0) :
374 break
375 else:
376
377 if time_check > 0:
378 logger.info('Job %s Finally found the missing output.' % (job_id))
379 del self.retry_args[job_id]
380 self.submitted_ids.remove(job_id)
381
382 if job_id in self.id_to_packet:
383 nb_in_packet = self.id_to_packet[job_id].remove_one()
384 if nb_in_packet == 0:
385
386 packet = self.id_to_packet[job_id]
387
388 packet.queue.join()
389
390 packet.fct(*packet.args)
391 del self.id_to_packet[job_id]
392 return 'resubmit'
393
394 return 'done'
395
396 if time_check == 0:
397 logger.debug('''Job %s: missing output:%s''' % (job_id,path))
398 args['time_check'] = time.time()
399 return 'wait'
400 elif self.cluster_retry_wait > time.time() - time_check:
401 return 'wait'
402
403
404 if self.nb_retry < 0:
405 logger.critical('''Failed to run job %s correctly.
406 with option: %s
407 file missing: %s''' % (job_id, args, path))
408 raw_input('press enter to continue.')
409 elif self.nb_retry == 0:
410 logger.critical('''Failed to run job %s correctly.
411 with option: %s
412 file missing: %s.
413 Stopping all runs.''' % (job_id, args, path))
414 self.remove()
415 elif args['nb_submit'] >= self.nb_retry:
416 logger.critical('''Failed to run job %s correctly.
417 with option: %s
418 file missing: %s
419 Failed %s times
420 No resubmission. ''' % (job_id, args, path, args['nb_submit']))
421 self.remove()
422 else:
423 args['nb_submit'] += 1
424 logger.warning('resubmit job (for the %s times)' % args['nb_submit'])
425 del self.retry_args[job_id]
426 self.submitted_ids.remove(job_id)
427 if 'time_check' in args:
428 del args['time_check']
429 if job_id in self.id_to_packet:
430 self.id_to_packet[job_id].remove_one()
431 args['packet_member'] = self.id_to_packet[job_id]
432 del self.id_to_packet[job_id]
433 self.cluster_submit(**args)
434 else:
435 self.submit2(**args)
436 return 'resubmit'
437 return 'done'
438
439 @check_interupt()
440 def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None,
441 stderr=None, log=None, required_output=[], nb_submit=0,
442 input_files=[], output_files=[]):
443 """launch one job on the cluster and wait for it"""
444
445 special_output = False
446 if stderr == -2 and stdout:
447
448 special_output = True
449 stderr = stdout + '.err'
450
451 id = self.submit2(prog, argument, cwd, stdout, stderr, log,
452 required_output=required_output, input_files=input_files,
453 output_files=output_files)
454
455 frame = inspect.currentframe()
456 args, _, _, values = inspect.getargvalues(frame)
457 args = dict([(i, values[i]) for i in args if i != 'self'])
458 self.retry_args[id] = args
459
460 nb_wait=0
461 while 1:
462 nb_wait+=1
463 status = self.control_one_job(id)
464 if not status in ['R','I']:
465 status = self.check_termination(id)
466 if status in ['wait']:
467 time.sleep(30)
468 continue
469 elif status in ['resubmit']:
470 id = self.submitted_ids[0]
471 time.sleep(30)
472 continue
473
474 time.sleep(30)
475 break
476 time.sleep(self.options['cluster_status_update'][1])
477
478 if required_output:
479 status = self.check_termination(id)
480 if status == 'wait':
481 run += 1
482 elif status == 'resubmit':
483 idle += 1
484
485
486 if special_output:
487
488
489 for i in range(5):
490 if os.path.exists(stdout):
491 if not os.path.exists(stderr):
492 time.sleep(5)
493 if os.path.exists(stderr):
494 err_text = open(stderr).read()
495 if not err_text:
496 return
497 logger.warning(err_text)
498 text = open(stdout).read()
499 open(stdout,'w').write(text + err_text)
500 else:
501 return
502 time.sleep(10)
503
504 def remove(self, *args, **opts):
505 """Default implementation: this cluster type does not support job removal."""
506 logger.warning("""This cluster does not support job removal,
507 the jobs are still running on the cluster.""")
508
509 class Packet(object):
510 """An object for handling a packet of jobs; it is designed to be thread safe.
511 """
512
513 def __init__(self, name, fct, args, opts={}):
514 import Queue
515 import threading
516 self.queue = Queue.Queue()
517 self.tag = name
518 self.fct = fct
519 self.args = args
520 self.opts = opts
521 self.done = threading.Event()
522
523 def put(self, *args, **opts):
524 self.queue.put(*args, **opts)
525
526 append = put
527
528 def remove_one(self):
529 self.queue.get(True)
530 self.queue.task_done()
531 return self.queue.qsize()
532
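# Example (illustrative, hypothetical names): group related jobs into a Packet so
# that a callback runs once every member of the group has finished.
#
#   def combine_results(run_name):
#       logger.info('all channels of %s are done' % run_name)
#
#   pack = Packet('run_01', combine_results, ('run_01',))
#   for channel in ['G1', 'G2', 'G3']:
#       cluster.cluster_submit('ajob_%s' % channel, cwd='SubProcesses/P0_gg_ttx',
#                              packet_member=pack)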
533 class MultiCore(Cluster):
534 """Class for dealing with submission on a multicore machine."""
535
536 job_id = "$"
537
537
538 def __init__(self, *args, **opt):
539 """Init the cluster"""
540
541
542 super(MultiCore, self).__init__(*args, **opt)
543
544 import Queue
545 import threading
546 import thread
547 self.queue = Queue.Queue()
548 self.done = Queue.Queue()
549 self.submitted = Queue.Queue()
550 self.stoprequest = threading.Event()
551 self.demons = []
552 self.nb_done =0
553 if 'nb_core' in opt:
554 self.nb_core = opt['nb_core']
555 elif isinstance(args[0],int):
556 self.nb_core = args[0]
557 else:
558 self.nb_core = 1
559 self.update_fct = None
560
561 self.lock = threading.Event()
562 self.pids = Queue.Queue()
563 self.done_pid = []
564 self.done_pid_queue = Queue.Queue()
565 self.fail_msg = None
566
567
568 for _ in range(self.nb_core):
569 self.start_demon()
570
571
572 def start_demon(self):
573 import threading
574 t = threading.Thread(target=self.worker)
575 t.daemon = True
576 t.start()
577 self.demons.append(t)
578
579
580 def worker(self):
581 import Queue
582 import thread
583 while not self.stoprequest.isSet():
584 try:
585 args = self.queue.get()
586 tag, exe, arg, opt = args
587 try:
588
589 if isinstance(exe,str):
590 if os.path.exists(exe) and not exe.startswith('/'):
591 exe = './' + exe
592 if opt['stderr'] == None:
593 opt['stderr'] = subprocess.STDOUT
594 proc = misc.Popen([exe] + arg, **opt)
595 pid = proc.pid
596 self.pids.put(pid)
597 proc.wait()
598 if proc.returncode not in [0, 143, -15] and not self.stoprequest.isSet():
599 fail_msg = 'program %s launch ends with non zero status: %s. Stop all computation' % \
600 (' '.join([exe]+arg), proc.returncode)
601 logger.warning(fail_msg)
602 self.stoprequest.set()
603 self.remove(fail_msg)
604
605
606
607
608 else:
609 pid = tag
610 self.pids.put(pid)
611
612
613 returncode = exe(*arg, **opt)
614 if returncode != 0:
615 logger.warning("fct %s does not return 0. Starts to stop the code in a clean way.", exe)
616 self.stoprequest.set()
617 self.remove("fct %s does not return 0:\n %s" % (exe, returncode))
618 except Exception,error:
619 self.fail_msg = sys.exc_info()
620 logger.warning(str(error))
621 self.stoprequest.set()
622 self.remove(error)
623
624 if __debug__:
625 raise self.fail_msg[0], self.fail_msg[1],self.fail_msg[2]
626
627 self.queue.task_done()
628 self.done.put(tag)
629 self.done_pid_queue.put(pid)
630
631 try:
632 self.lock.set()
633 except thread.error:
634 continue
635 except Queue.Empty:
636 continue
637
638
639
640
641 def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
642 log=None, required_output=[], nb_submit=0):
643 """submit a job on multicore machine"""
644
645 tag = (prog, tuple(argument), cwd, nb_submit)
646 if isinstance(prog, str):
647
648
649 opt = {'cwd': cwd,
650 'stdout':stdout,
651 'stderr': stderr}
652 self.queue.put((tag, prog, argument, opt))
653 self.submitted.put(1)
654 return tag
655 else:
656
657 self.queue.put((tag, prog, argument, {}))
658 self.submitted.put(1)
659 return tag
660
661 def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None,
662 stderr=None, log=None, **opts):
663 """launch one job and wait for it"""
664 if isinstance(stdout, str):
665 stdout = open(stdout, 'w')
666 if isinstance(stderr, str):
667 stderr = open(stderr, 'w')
668 return misc.call([prog] + argument, stdout=stdout, stderr=stderr, cwd=cwd)
669
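# Example (illustrative, hypothetical script/cards): run two jobs on 2 cores and
# wait for both, reporting the progress through the update callback.
#
#   mc = MultiCore(2)
#   for card in ['card1.dat', 'card2.dat']:
#       mc.submit('./run_channel.sh', argument=[card], cwd='SubProcesses')
#   mc.wait('.', lambda idle, run, done:
#               logger.info('%s idle, %s running, %s done' % (idle, run, done)))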
670 def remove(self, error=None):
671 """Ensure that all threads and child processes are killed."""
672
673
674 self.stoprequest.set()
675 if error and not self.fail_msg:
676 self.fail_msg = error
677
678
679 while not self.done_pid_queue.empty():
680 pid = self.done_pid_queue.get()
681 self.done_pid.append(pid)
682
683
684 while not self.pids.empty():
685 pid = self.pids.get()
686 self.pids.task_done()
687 if isinstance(pid, tuple):
688 continue
689 if pid in self.done_pid:
690 continue
691 out = os.system('CPIDS=$(pgrep -P %(pid)s); kill -15 $CPIDS > /dev/null 2>&1' \
692 % {'pid':pid} )
693 out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid':pid} )
694
695
696 def wait(self, me_dir, update_status, update_first=None):
697 """Wait until all the jobs are done. This function also ensures that
698 submissions by packet are handled correctly (i.e. the packet callback is
699 submitted once every member of the packet has finished)."""
699
700 import Queue
701 import threading
702
703 try:
704 last_status = (0, 0, 0)
705 sleep_time = 1
706 use_lock = True
707 first = True
708 while True:
709 force_one_more_loop = False
710
711
712
713 while self.done.qsize():
714 try:
715 tag = self.done.get(True, 1)
716 except Queue.Empty:
717 pass
718 else:
719 if self.id_to_packet and tuple(tag) in self.id_to_packet:
720 packet = self.id_to_packet[tuple(tag)]
721 remaining = packet.remove_one()
722 if remaining == 0:
723
724 packet.queue.join()
725 self.submit(packet.fct, packet.args)
726 force_one_more_loop = True
727 self.nb_done += 1
728 self.done.task_done()
729
730
731
732 Idle = self.queue.qsize()
733 Done = self.nb_done + self.done.qsize()
734 Running = max(0, self.submitted.qsize() - Idle - Done)
735
736 if Idle + Running <= 0 and not force_one_more_loop:
737 update_status(Idle, Running, Done)
738
739
740 self.queue.join()
741 break
742
743 if (Idle, Running, Done) != last_status:
744 if first and update_first:
745 update_first(Idle, Running, Done)
746 first = False
747 else:
748 update_status(Idle, Running, Done)
749 last_status = (Idle, Running, Done)
750
751
752 while not self.done_pid_queue.empty():
753 pid = self.done_pid_queue.get()
754 self.done_pid.append(pid)
755 self.done_pid_queue.task_done()
756
757
758
759 if use_lock:
760
761 use_lock = self.lock.wait(300)
762 self.lock.clear()
763 if not use_lock and Idle > 0:
764 use_lock = True
765 else:
766
767
768 time.sleep(sleep_time)
769 sleep_time = min(sleep_time + 2, 180)
770 if update_first:
771 update_first(Idle, Running, Done)
772
773 if self.stoprequest.isSet():
774 if isinstance(self.fail_msg, Exception):
775 raise self.fail_msg
776 elif isinstance(self.fail_msg, str):
777 raise Exception, self.fail_msg
778 else:
779 raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2]
780
781 try:
782 self.lock.clear()
783 except Exception:
784 pass
785 self.done = Queue.Queue()
786 self.done_pid = []
787 self.done_pid_queue = Queue.Queue()
788 self.nb_done = 0
789 self.submitted = Queue.Queue()
790 self.pids = Queue.Queue()
791 self.stoprequest.clear()
792
793 except KeyboardInterrupt:
794
795 if isinstance(self.fail_msg, Exception):
796 raise self.fail_msg
797 elif isinstance(self.fail_msg, str):
798 raise Exception, self.fail_msg
799 elif self.fail_msg:
800 raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2]
801
802 raise
803
804 class CondorCluster(Cluster):
805 """Basic class for dealing with Condor cluster submission"""
806
807 name = 'condor'
808 job_id = 'CONDOR_ID'
809
810
811
812 @multiple_try()
813 def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
814 required_output=[], nb_submit=0):
815 """Submit a job prog to a Condor cluster"""
816
817 text = """Executable = %(prog)s
818 output = %(stdout)s
819 error = %(stderr)s
820 log = %(log)s
821 %(argument)s
822 environment = CONDOR_ID=$(Cluster).$(Process)
823 Universe = vanilla
824 notification = Error
825 Initialdir = %(cwd)s
826 %(requirement)s
827 getenv=True
828 queue 1
829 """
830
831 if self.cluster_queue not in ['None', None]:
832 requirement = 'Requirements = %s=?=True' % self.cluster_queue
833 else:
834 requirement = ''
835
836 if cwd is None:
837 cwd = os.getcwd()
838 if stdout is None:
839 stdout = '/dev/null'
840 if stderr is None:
841 stderr = '/dev/null'
842 if log is None:
843 log = '/dev/null'
844 if not os.path.exists(prog):
845 prog = os.path.join(cwd, prog)
846 if argument:
847 argument = 'Arguments = %s' % ' '.join(argument)
848 else:
849 argument = ''
850
851
852 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout,
853 'stderr': stderr,'log': log,'argument': argument,
854 'requirement': requirement}
855
856
857 a = misc.Popen(['condor_submit'], stdout=subprocess.PIPE,
858 stdin=subprocess.PIPE)
859 output, _ = a.communicate(text % dico)
860
861
862
863
864 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE)
865 try:
866 id = pat.search(output).groups()[0]
867 except:
868 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
869 % output
870 self.submitted += 1
871 self.submitted_ids.append(id)
872 return id
873
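# Example (illustrative, hypothetical paths): for a job 'ajob1 0' submitted from
# P0_gg_ttx with no queue requirement, the text piped to condor_submit is
#
#   Executable = /path/PROC/SubProcesses/P0_gg_ttx/ajob1
#   output = /dev/null
#   error = /dev/null
#   log = /dev/null
#   Arguments = 0
#   environment = CONDOR_ID=$(Cluster).$(Process)
#   Universe = vanilla
#   notification = Error
#   Initialdir = /path/PROC/SubProcesses/P0_gg_ttx
#   getenv=True
#   queue 1
#
# and the job id is parsed from the "submitted to cluster <id>" line.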
874 @store_input()
875 @multiple_try()
876 def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
877 log=None, input_files=[], output_files=[], required_output=[],
878 nb_submit=0):
879 """Submit the job on the cluster with NO SHARED DISK;
880 input/output files should be given relative to cwd.
881 """
882
883 if not required_output and output_files:
884 required_output = output_files
885
886 if (input_files == [] == output_files):
887 return self.submit(prog, argument, cwd, stdout, stderr, log,
888 required_output=required_output, nb_submit=nb_submit)
889
890 text = """Executable = %(prog)s
891 output = %(stdout)s
892 error = %(stderr)s
893 log = %(log)s
894 %(argument)s
895 should_transfer_files = YES
896 when_to_transfer_output = ON_EXIT
897 transfer_input_files = %(input_files)s
898 %(output_files)s
899 Universe = vanilla
900 notification = Error
901 Initialdir = %(cwd)s
902 %(requirement)s
903 getenv=True
904 queue 1
905 """
906
907 if self.cluster_queue not in ['None', None]:
908 requirement = 'Requirements = %s=?=True' % self.cluster_queue
909 else:
910 requirement = ''
911
912 if cwd is None:
913 cwd = os.getcwd()
914 if stdout is None:
915 stdout = '/dev/null'
916 if stderr is None:
917 stderr = '/dev/null'
918 if log is None:
919 log = '/dev/null'
920 if not os.path.exists(prog):
921 prog = os.path.join(cwd, prog)
922 if argument:
923 argument = 'Arguments = %s' % ' '.join([str(a) for a in argument])
924 else:
925 argument = ''
926
927 if input_files:
928 input_files = ','.join(input_files)
929 else:
930 input_files = ''
931 if output_files:
932 output_files = 'transfer_output_files = %s' % ','.join(output_files)
933 else:
934 output_files = ''
935
936
937
938 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout,
939 'stderr': stderr,'log': log,'argument': argument,
940 'requirement': requirement, 'input_files':input_files,
941 'output_files':output_files}
942
943
944 a = subprocess.Popen(['condor_submit'], stdout=subprocess.PIPE,
945 stdin=subprocess.PIPE)
946 output, _ = a.communicate(text % dico)
947
948
949
950
951 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE)
952 try:
953 id = pat.search(output).groups()[0]
954 except:
955 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
956 % output
957 self.submitted += 1
958 self.submitted_ids.append(id)
959 return id
960
961
962
963
964
965 @multiple_try(nb_try=10, sleep=10)
966 def control_one_job(self, id):
967 """ control the status of a single job with its cluster id """
968 cmd = 'condor_q '+str(id)+" -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'"
969 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
970 stderr=subprocess.PIPE)
971
972 error = status.stderr.read()
973 if status.returncode or error:
974 raise ClusterManagmentError, 'condor_q returns error: %s' % error
975
976 return status.stdout.readline().strip()
977
978 @check_interupt()
979 @multiple_try(nb_try=10, sleep=10)
980 def control(self, me_dir):
981 """ control the status of all the submitted jobs. Return (idle, run, finish, fail) """
982
983 if not self.submitted_ids:
984 return 0, 0, 0, 0
985
986 packet = 15000
987 idle, run, fail = 0, 0, 0
988 ongoing = []
989 for i in range(1+(len(self.submitted_ids)-1)//packet):
990 start = i * packet
991 stop = (i+1) * packet
992 cmd = "condor_q " + ' '.join(self.submitted_ids[start:stop]) + \
993 " -format \'%-2s\ ' \'ClusterId\' " + \
994 " -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'"
995
996 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
997 stderr=subprocess.PIPE)
998 error = status.stderr.read()
999 if status.returncode or error:
1000 raise ClusterManagmentError, 'condor_q returns error: %s' % error
1001
1002 for line in status.stdout:
1003 id, status = line.strip().split()
1004 ongoing.append(int(id))
1005 if status in ['I','U']:
1006 idle += 1
1007 elif status == 'R':
1008 run += 1
1009 elif status != 'C':
1010 fail += 1
1011
1012 for id in list(self.submitted_ids):
1013 if int(id) not in ongoing:
1014 status = self.check_termination(id)
1015 if status == 'wait':
1016 run += 1
1017 elif status == 'resubmit':
1018 idle += 1
1019
1020 return idle, run, self.submitted - (idle+run+fail), fail
1021
1022 @multiple_try()
1023 def remove(self, *args, **opts):
1024 """Clean the jobs on the cluster"""
1025
1026 if not self.submitted_ids:
1027 return
1028 cmd = "condor_rm %s" % ' '.join(self.submitted_ids)
1029
1030 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1031 self.submitted_ids = []
1032
1033 class PBSCluster(Cluster):
1034 """Basic class for dealing with PBS cluster submission"""
1035
1036 name = 'pbs'
1037 job_id = 'PBS_JOBID'
1038 idle_tag = ['Q']
1039 running_tag = ['T','E','R']
1040 complete_tag = ['C']
1041
1042 maximum_submited_jobs = 2500
1043
1044 @multiple_try()
1045 def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1046 required_output=[], nb_submit=0):
1047 """Submit a job prog to a PBS cluster"""
1048
1049 me_dir = self.get_jobs_identifier(cwd, prog)
1050
1051 if len(self.submitted_ids) > self.maximum_submited_jobs:
1052 fct = lambda idle, run, finish: logger.info('Waiting for free slot: %s %s %s' % (idle, run, finish))
1053 self.wait(me_dir, fct, self.maximum_submited_jobs)
1054
1055
1056 text = ""
1057 if cwd is None:
1058 cwd = os.getcwd()
1059 else:
1060 text = " cd %s;" % cwd
1061 if stdout is None:
1062 stdout = '/dev/null'
1063 if stderr is None:
1064 stderr = '/dev/null'
1065 elif stderr == -2:
1066 stderr = stdout
1067 if log is None:
1068 log = '/dev/null'
1069
1070 if not os.path.isabs(prog):
1071 text += "./%s" % prog
1072 else:
1073 text+= prog
1074
1075 if argument:
1076 text += ' ' + ' '.join(argument)
1077
1078 command = ['qsub','-o', stdout,
1079 '-N', me_dir,
1080 '-e', stderr,
1081 '-V']
1082
1083 if self.cluster_queue and self.cluster_queue != 'None':
1084 command.extend(['-q', self.cluster_queue])
1085
1086 a = misc.Popen(command, stdout=subprocess.PIPE,
1087 stderr=subprocess.STDOUT,
1088 stdin=subprocess.PIPE, cwd=cwd)
1089
1090 output = a.communicate(text)[0]
1091 id = output.split('.')[0]
1092 if not id.isdigit() or a.returncode !=0:
1093 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1094 % output
1095
1096 self.submitted += 1
1097 self.submitted_ids.append(id)
1098 return id
1099
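# Example (illustrative, hypothetical paths): the wrapper text is piped to qsub,
# so the submission is equivalent to
#
#   echo " cd /path/P0_gg_ttx;./ajob1 0" | \
#       qsub -o /dev/null -N <identifier> -e /dev/null -V -q madgraph
#
# (the -q option is added only when a cluster_queue is configured); the job id is
# the leading number of qsub's "<id>.<server>" reply.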
1100 @multiple_try()
1101 def control_one_job(self, id):
1102 """ control the status of a single job with its cluster id """
1103 cmd = 'qstat '+str(id)
1104 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
1105 stderr=subprocess.STDOUT)
1106
1107 for line in status.stdout:
1108 line = line.strip()
1109 if 'cannot connect to server' in line or 'cannot read reply' in line:
1110 raise ClusterManagmentError, 'server disconnected'
1111 if 'Unknown' in line:
1112 return 'F'
1113 elif line.startswith(str(id)):
1114 jobstatus = line.split()[4]
1115 else:
1116 jobstatus=""
1117
1118 if status.returncode != 0 and status.returncode is not None:
1119 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode
1120 if jobstatus in self.idle_tag:
1121 return 'I'
1122 elif jobstatus in self.running_tag:
1123 return 'R'
1124 return 'F'
1125
1126
1127 @multiple_try()
1128 def control(self, me_dir):
1129 """ control the status of all the jobs associated to me_dir. Return (idle, run, finish, fail) """
1130 cmd = "qstat"
1131 status = misc.Popen([cmd], stdout=subprocess.PIPE)
1132
1133 me_dir = self.get_jobs_identifier(me_dir)
1134
1135 ongoing = []
1136
1137 idle, run, fail = 0, 0, 0
1138 for line in status.stdout:
1139 if 'cannot connect to server' in line or 'cannot read reply' in line:
1140 raise ClusterManagmentError, 'server disconnected'
1141 if me_dir in line:
1142 ongoing.append(line.split()[0].split('.')[0])
1143 status2 = line.split()[4]
1144 if status2 in self.idle_tag:
1145 idle += 1
1146 elif status2 in self.running_tag:
1147 run += 1
1148 elif status2 in self.complete_tag:
1149 if not self.check_termination(line.split()[0].split('.')[0]):
1150 idle += 1
1151 else:
1152 fail += 1
1153
1154 if status.returncode != 0 and status.returncode is not None:
1155 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode
1156
1157 for id in list(self.submitted_ids):
1158 if id not in ongoing:
1159 status2 = self.check_termination(id)
1160 if status2 == 'wait':
1161 run += 1
1162 elif status2 == 'resubmit':
1163 idle += 1
1164
1165 return idle, run, self.submitted - (idle+run+fail), fail
1166
1167 @multiple_try()
1168 def remove(self, *args, **opts):
1169 """Clean the jobs on the cluster"""
1170
1171 if not self.submitted_ids:
1172 return
1173 cmd = "qdel %s" % ' '.join(self.submitted_ids)
1174 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1175 self.submitted_ids = []
1176
1178 class SGECluster(Cluster):
1179 """Basic class for dealing with SGE cluster submission"""
1180
1181
1182 name = 'sge'
1183 job_id = 'JOB_ID'
1184 idle_tag = ['qw', 'hqw','hRqw','w']
1185 running_tag = ['r','t','Rr','Rt']
1186 identifier_length = 10
1187
1187
1188 def def_get_path(self, location):
1189 """replace string for path issues (substitute $HOME for the home path)"""
1190 location = os.path.realpath(location)
1191 homePath = os.getenv("HOME")
1192 if homePath:
1193 location = location.replace(homePath,'$HOME')
1194 return location
1195
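# Example (illustrative): paths below the user's home directory are rewritten in
# terms of $HOME so the submission text stays valid if home is mounted elsewhere
# on the worker nodes.
#
#   cluster.def_get_path('/home/user/MG5/PROC_sm_0')   # -> '$HOME/MG5/PROC_sm_0'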
1196 @multiple_try()
1197 def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1198 required_output=[], nb_submit=0):
1199 """Submit a job prog to an SGE cluster"""
1200
1201 me_dir = self.get_jobs_identifier(cwd, prog)
1202
1203
1204 if cwd is None:
1205
1206 cwd = self.def_get_path(os.getcwd())
1207 cwd1 = self.def_get_path(cwd)
1208 text = " cd %s;" % cwd1
1209 if stdout is None:
1210 stdout = '/dev/null'
1211 else:
1212 stdout = self.def_get_path(stdout)
1213 if stderr is None:
1214 stderr = '/dev/null'
1215 elif stderr == -2:
1216 stderr = stdout
1217 else:
1218 stderr = self.def_get_path(stderr)
1219
1220 if log is None:
1221 log = '/dev/null'
1222 else:
1223 log = self.def_get_path(log)
1224
1225 text += prog
1226 if argument:
1227 text += ' ' + ' '.join(argument)
1228
1229
1230
1231
1232 homePath = os.getenv("HOME")
1233 if homePath:
1234 text = text.replace(homePath,'$HOME')
1235
1236 logger.debug("!=== input %s" % text)
1237 logger.debug("!=== output %s" % stdout)
1238 logger.debug("!=== error %s" % stderr)
1239 logger.debug("!=== logs %s" % log)
1240
1241 command = ['qsub','-o', stdout,
1242 '-N', me_dir,
1243 '-e', stderr,
1244 '-V']
1245
1246 if self.cluster_queue and self.cluster_queue != 'None':
1247 command.extend(['-q', self.cluster_queue])
1248
1249 a = misc.Popen(command, stdout=subprocess.PIPE,
1250 stderr=subprocess.STDOUT,
1251 stdin=subprocess.PIPE, cwd=cwd)
1252
1253 output = a.communicate(text)[0]
1254 id = output.split(' ')[2]
1255 if not id.isdigit():
1256 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1257 % output
1258 self.submitted += 1
1259 self.submitted_ids.append(id)
1260 logger.debug(output)
1261
1262 return id
1263
1264 @multiple_try()
1265 def control_one_job(self, id):
1266 """ control the status of a single job with its cluster id """
1267
1268 cmd = 'qstat '
1269 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1270 for line in status.stdout:
1271
1272
1273
1274
1275
1276
1277 if str(id) in line:
1278 status = line.split()[4]
1279
1280 if status in self.idle_tag:
1281 return 'I'
1282 elif status in self.running_tag:
1283 return 'R'
1284 return 'F'
1285
1286 @multiple_try()
1287 def control(self, me_dir):
1288 """ control the status of all the jobs associated to me_dir. Return (idle, run, finish, fail) """
1289 cmd = "qstat "
1290 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1291
1292 me_dir = self.get_jobs_identifier(me_dir)
1293
1294 finished = list(self.submitted_ids)
1295
1296 idle, run, fail = 0, 0, 0
1297 for line in status.stdout:
1298 if me_dir in line:
1299 id,_,_,_,status = line.split()[:5]
1300 if status in self.idle_tag:
1301 idle += 1
1302 finished.remove(id)
1303 elif status in self.running_tag:
1304 run += 1
1305 finished.remove(id)
1306 else:
1307 logger.debug(line)
1308 fail += 1
1309 finished.remove(id)
1310
1311 for id in finished:
1312 self.check_termination(id)
1313
1314 return idle, run, self.submitted - (idle+run+fail), fail
1315
1316
1317
1318 @multiple_try()
1319 def remove(self, *args, **opts):
1320 """Clean the jobs on the cluster"""
1321
1322 if not self.submitted_ids:
1323 return
1324 cmd = "qdel %s" % ' '.join(self.submitted_ids)
1325 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1326 self.submitted_ids = []
1327
1329 class LSFCluster(Cluster):
1330 """Basic class for dealing with LSF cluster submission"""
1331
1332 name = 'lsf'
1333 job_id = 'LSB_JOBID'
1334
1335 @multiple_try()
1336 def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1337 required_output=[], nb_submit=0):
1338 """Submit the job prog to an LSF cluster"""
1339
1340
1341 me_dir = self.get_jobs_identifier(cwd, prog)
1342
1343 text = ""
1344 command = ['bsub', '-C0', '-J', me_dir]
1345 if cwd is None:
1346 cwd = os.getcwd()
1347 else:
1348 text = " cd %s;" % cwd
1349 if stdout and isinstance(stdout, str):
1350 command.extend(['-o', stdout])
1351 if stderr and isinstance(stderr, str):
1352 command.extend(['-e', stderr])
1353 elif stderr == -2:
1354 pass
1355 if log is None:
1356 log = '/dev/null'
1357
1358 text += prog
1359 if argument:
1360 text += ' ' + ' '.join(argument)
1361
1362 if self.cluster_queue and self.cluster_queue != 'None':
1363 command.extend(['-q', self.cluster_queue])
1364
1365 a = misc.Popen(command, stdout=subprocess.PIPE,
1366 stderr=subprocess.STDOUT,
1367 stdin=subprocess.PIPE, cwd=cwd)
1368
1369 output = a.communicate(text)[0]
1370
1371 try:
1372 id = output.split('>',1)[0].split('<')[1]
1373 except:
1374 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1375 % output
1376 if not id.isdigit():
1377 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1378 % output
1379 self.submitted += 1
1380 self.submitted_ids.append(id)
1381 return id
1382
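# Example (illustrative, hypothetical paths): the text " cd <cwd>;<prog> <args>"
# is piped to
#
#   bsub -C0 -J <identifier> [-o <stdout>] [-e <stderr>] [-q <queue>]
#
# and the job id is parsed from LSF's "Job <id> is submitted to queue ..." reply.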
1383
1384 @multiple_try()
1385 def control_one_job(self, id):
1386 """ control the status of a single job with its cluster id """
1387
1388 cmd = 'bjobs '+str(id)
1389 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1390
1391 for line in status.stdout:
1392 line = line.strip().upper()
1393 if 'JOBID' in line:
1394 continue
1395 elif str(id) not in line:
1396 continue
1397 status = line.split()[2]
1398 if status == 'RUN':
1399 return 'R'
1400 elif status == 'PEND':
1401 return 'I'
1402 elif status == 'DONE':
1403 return 'F'
1404 else:
1405 return 'H'
1406 return 'F'
1407
1408 @multiple_try()
1409 def control(self, me_dir):
1410 """ control the status of all the submitted jobs. Return (idle, run, finish, fail) """
1411
1412 if not self.submitted_ids:
1413 return 0, 0, 0, 0
1414
1415 cmd = "bjobs " + ' '.join(self.submitted_ids)
1416 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1417
1418 jobstatus = {}
1419 for line in status.stdout:
1420 line = line.strip()
1421 if 'JOBID' in line:
1422 continue
1423 splitline = line.split()
1424 id = splitline[0]
1425 if id not in self.submitted_ids:
1426 continue
1427 jobstatus[id] = splitline[2]
1428
1429 idle, run, fail = 0, 0, 0
1430 for id in self.submitted_ids[:]:
1431 if id in jobstatus:
1432 status = jobstatus[id]
1433 else:
1434 status = 'MISSING'
1435 if status == 'RUN':
1436 run += 1
1437 elif status == 'PEND':
1438 idle += 1
1439 else:
1440 status = self.check_termination(id)
1441 if status == 'wait':
1442 run += 1
1443 elif status == 'resubmit':
1444 idle += 1
1445
1446 return idle, run, self.submitted - (idle+run+fail), fail
1447
1448 @multiple_try()
1449 def remove(self, *args,**opts):
1450 """Clean the jobs on the cluster"""
1451
1452 if not self.submitted_ids:
1453 return
1454 cmd = "bkill %s" % ' '.join(self.submitted_ids)
1455 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1456 self.submitted_ids = []
1457
1458 class GECluster(Cluster):
1459 """Class for dealing with cluster submission on a GE cluster"""
1460
1461 name = 'ge'
1462 job_id = 'JOB_ID'
1463 idle_tag = ['qw']
1464 running_tag = ['r']
1465
1466 @multiple_try()
1467 def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1468 required_output=[], nb_submit=0):
1469 """Submit a job prog to a GE cluster"""
1470
1471 text = ""
1472 if cwd is None:
1473 cwd = os.getcwd()
1474 else:
1475 text = " cd %s; bash " % cwd
1476 if stdout is None:
1477 stdout = os.path.join(cwd, "log.%s" % prog.split('/')[-1])
1478 if stderr is None:
1479 stderr = os.path.join(cwd, "err.%s" % prog.split('/')[-1])
1480 elif stderr == -2:
1481 stderr = stdout
1482 if log is None:
1483 log = '/dev/null'
1484
1485 text += prog
1486 if argument:
1487 text += ' ' + ' '.join(argument)
1488 text += '\n'
1489 tmp_submit = os.path.join(cwd, 'tmp_submit')
1490 open(tmp_submit,'w').write(text)
1491
1492 a = misc.Popen(['qsub','-o', stdout,
1493 '-e', stderr,
1494 tmp_submit],
1495 stdout=subprocess.PIPE,
1496 stderr=subprocess.STDOUT,
1497 stdin=subprocess.PIPE, cwd=cwd)
1498
1499 output = a.communicate()[0]
1500
1501 pat = re.compile("Your job (\d*) \(",re.MULTILINE)
1502 try:
1503 id = pat.search(output).groups()[0]
1504 except:
1505 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1506 % output
1507 self.submitted += 1
1508 self.submitted_ids.append(id)
1509 return id
1510
1511 @multiple_try()
1512 def control_one_job(self, id):
1513 """ control the status of a single job with its cluster id """
1514 cmd = 'qstat | grep '+str(id)
1515 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1516 if not status:
1517 return 'F'
1518
1519 pat = re.compile("^(\d+)\s+[\d\.]+\s+[\w\d\.]+\s+[\w\d\.]+\s+(\w+)\s")
1520 stat = ''
1521 for line in status.stdout.read().split('\n'):
1522 if not line:
1523 continue
1524 line = line.strip()
1525 try:
1526 groups = pat.search(line).groups()
1527 except:
1528 raise ClusterManagmentError, 'bad syntax for stat: \n\"%s\"' % line
1529 if groups[0] != id: continue
1530 stat = groups[1]
1531 if not stat:
1532 return 'F'
1533 if stat in self.idle_tag:
1534 return 'I'
1535 if stat in self.running_tag:
1536 return 'R'
1537
1538 @multiple_try()
1539 def control(self, me_dir):
1540 """Check the status of the jobs associated to directory me_dir. return (idle, run, finish, fail)"""
1541 if not self.submitted_ids:
1542 return 0, 0, 0, 0
1543 idle, run, fail = 0, 0, 0
1544 ongoing = []
1545 for statusflag in ['p', 'r', 'sh']:
1546 cmd = 'qstat -s %s' % statusflag
1547 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1548
1549 pat = re.compile("^(\d+)")
1550 for line in status.stdout.read().split('\n'):
1551 line = line.strip()
1552 try:
1553 id = pat.search(line).groups()[0]
1554 except Exception:
1555 pass
1556 else:
1557 if id not in self.submitted_ids:
1558 continue
1559 ongoing.append(id)
1560 if statusflag == 'p':
1561 idle += 1
1562 if statusflag == 'r':
1563 run += 1
1564 if statusflag == 'sh':
1565 fail += 1
1566 for id in list(self.submitted_ids):
1567 if id not in ongoing:
1568 self.check_termination(id)
1569
1570
1571 return idle, run, self.submitted - idle - run - fail, fail
1572
1573 @multiple_try()
1574 def remove(self, *args, **opts):
1575 """Clean the jobs on the cluster"""
1576
1577 if not self.submitted_ids:
1578 return
1579 cmd = "qdel %s" % ' '.join(self.submitted_ids)
1580 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1581 self.submitted_ids = []
1582
1583 def asyncrone_launch(exe, cwd=None, stdout=None, argument=[], **opt):
1584 """Start a computation and do not wait for it to finish.
1585 This function returns a lock which is locked as long as the job is
1586 running."""
1587
1588 mc = MultiCore(1)
1589 mc.submit(exe, argument, cwd, stdout, **opt)
1590 mc.need_waiting = True
1591 return mc.lock
1592
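# Example (illustrative, hypothetical script name): fire-and-forget launch; the
# returned lock is a threading.Event that the worker sets when the job finishes.
#
#   lock = asyncrone_launch('./generate_events.sh', cwd='bin')
#   # ... do other work ...
#   lock.wait()   # blocks until the job has completed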
1594 class SLURMCluster(Cluster):
1595 """Basic class for dealing with SLURM cluster submission"""
1596
1597 name = 'slurm'
1598 job_id = 'SLURM_JOBID'
1599 idle_tag = ['Q','PD','S','CF']
1600 running_tag = ['R', 'CG']
1601 complete_tag = ['C']
1602 identifier_length = 8
1603
1604 @multiple_try()
1605 def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1606 required_output=[], nb_submit=0):
1607 """Submit a job prog to a SLURM cluster"""
1608
1609 me_dir = self.get_jobs_identifier(cwd, prog)
1610
1611
1612 if cwd is None:
1613 cwd = os.getcwd()
1614 if stdout is None:
1615 stdout = '/dev/null'
1616 if stderr is None:
1617 stderr = '/dev/null'
1618 elif stderr == -2:
1619 stderr = stdout
1620 if log is None:
1621 log = '/dev/null'
1622
1623 command = ['sbatch', '-o', stdout,
1624 '-J', me_dir,
1625 '-e', stderr, prog] + argument
1626
1627 if self.cluster_queue and self.cluster_queue != 'None':
1628 command.insert(1, '-p')
1629 command.insert(2, self.cluster_queue)
1630
1631 a = misc.Popen(command, stdout=subprocess.PIPE,
1632 stderr=subprocess.STDOUT,
1633 stdin=subprocess.PIPE, cwd=cwd)
1634
1635 output = a.communicate()
1636 output_arr = output[0].split(' ')
1637 id = output_arr[3].rstrip()
1638
1639 if not id.isdigit():
1640 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1641 % output
1642 self.submitted += 1
1643 self.submitted_ids.append(id)
1644 return id
1645
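# Example (illustrative, hypothetical names): the effective submission is
#
#   sbatch -p madgraph -o /dev/null -J <identifier> -e /dev/null ajob1 0
#
# (the -p option is added only when a cluster_queue is configured); the job id is
# taken from the fourth word of "Submitted batch job <id>".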
1646 @multiple_try()
1647 def control_one_job(self, id):
1648 """ control the status of a single job with its cluster id """
1649 cmd = 'squeue -j '+str(id)
1650 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
1651 stderr=open(os.devnull,'w'))
1652
1653 for line in status.stdout:
1654 line = line.strip()
1655 if 'Invalid' in line:
1656 return 'F'
1657 elif line.startswith(str(id)):
1658 status = line.split()[4]
1659 if status in self.idle_tag:
1660 return 'I'
1661 elif status in self.running_tag:
1662 return 'R'
1663 return 'F'
1664
1665 @multiple_try()
1666 def control(self, me_dir):
1667 """ control the status of all the jobs associated to me_dir. Return (idle, run, finish, fail) """
1668 cmd = "squeue"
1669 status = misc.Popen([cmd], stdout=subprocess.PIPE)
1670
1671 me_dir = self.get_jobs_identifier(me_dir)
1672
1673 idle, run, fail = 0, 0, 0
1674 ongoing=[]
1675 for line in status.stdout:
1676 if me_dir in line:
1677 id, _, _,_ , status,_ = line.split(None,5)
1678 ongoing.append(id)
1679 if status in self.idle_tag:
1680 idle += 1
1681 elif status in self.running_tag:
1682 run += 1
1683 elif status in self.complete_tag:
1684 status = self.check_termination(id)
1685 if status == 'wait':
1686 run += 1
1687 elif status == 'resubmit':
1688 idle += 1
1689 else:
1690 fail += 1
1691
1692
1693 for id in list(self.submitted_ids):
1694 if id not in ongoing:
1695 status = self.check_termination(id)
1696 if status == 'wait':
1697 run += 1
1698 elif status == 'resubmit':
1699 idle += 1
1700
1701
1702 return idle, run, self.submitted - (idle+run+fail), fail
1703
1704 @multiple_try()
1705 def remove(self, *args, **opts):
1706 """Clean the jobs on the cluster"""
1707
1708 if not self.submitted_ids:
1709 return
1710 cmd = "scancel %s" % ' '.join(self.submitted_ids)
1711 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1712 self.submitted_ids = []
1713
1714 class HTCaaSCluster(Cluster):
1715 """Class for dealing with cluster submission on a HTCaaS cluster using GPFS """
1716
1717 name= 'htcaas'
1718 job_id = 'HTCAAS_JOBID'
1719
1720 @store_input()
1721 @multiple_try()
1722 def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
1723 log=None, input_files=[], output_files=[], required_output=[],
1724 nb_submit=0):
1725 """Submit the HTCaaS job on the cluster with NO SHARED DISK;
1726 input/output files should be given relative to cwd.
1727 """
1728
1729 if 'ajob' in prog:
1730 prog_num = prog.rsplit("ajob",1)[1]
1731 else:
1732 prog_num = '0'
1733
1734 cur_usr = os.getenv('USER')
1735
1736 if cwd is None:
1737 cwd = os.getcwd()
1738
1739 cwd_cp = cwd.rsplit("/",2)
1740
1741
1742 if not stdout is None:
1743 print "stdout: %s" % stdout
1744
1745 if not os.path.exists(prog):
1746 prog = os.path.join(cwd, prog)
1747
1748 if not required_output and output_files:
1749 required_output = output_files
1750
1751
1752 if 'combine' not in prog and 'pythia' not in prog :
1753 cwd_arg = cwd+"/arguments"
1754 temp = ' '.join([str(a) for a in argument])
1755 arg_cmd="echo '"+temp+"' > " + cwd_arg
1756
1757
1758 command = ['htcaas-mgjob-submit','-d',cwd,'-e',os.path.basename(prog)]
1759 if argument :
1760 command.extend(['-a ', '='.join([str(a) for a in argument])])
1761 print command
1762 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1763 id = a.stdout.read().strip()
1764
1765 else:
1766 cwd_arg = cwd+"/arguments"
1767 temp = ' '.join([str(a) for a in argument])
1768
1769
1770
1771
1772 temp_file_name = "sub." + os.path.basename(prog)
1773 text = """#!/bin/bash
1774 MYPWD=%(cwd)s
1775 cd $MYPWD
1776 input_files=(%(input_files)s )
1777 for i in ${input_files[@]}
1778 do
1779 chmod -f +x $i
1780 done
1781 /bin/bash %(prog)s %(arguments)s > %(stdout)s
1782 """
1783 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog,
1784 'arguments': ' '.join([str(a) for a in argument]),
1785 'program': ' ' if '.py' in prog else 'bash'}
1786
1787
1788 new_prog = pjoin(cwd, temp_file_name)
1789 open(new_prog, 'w').write(text % dico)
1790 misc.Popen(['chmod','+x',new_prog],cwd=cwd)
1791 command = ['htcaas-mgjob-submit','-d',cwd,'-e',temp_file_name]
1792 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1793 id = a.stdout.read().strip()
1794
1795 nb_try=0
1796 nb_limit=5
1797 if not id.isdigit() :
1798 print "[ID is not digit]:" + id
1799
1800 while not id.isdigit() :
1801 nb_try+=1
1802 print "[fail_retry]:"+ nb_try
1803 a=misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1804 id = a.stdout.read().strip()
1805 if nb_try > nb_limit :
1806 raise ClusterManagmentError, 'fail to submit to the HTCaaS cluster: \n %s' % id
1807 break
1808
1809 self.submitted += 1
1810 self.submitted_ids.append(id)
1811
1812 return id
1813
1814 @multiple_try(nb_try=10, sleep=10)
1815 def control_one_job(self, id):
1816 """ control the status of a single job with its cluster id """
1817
1818 if id == 0 :
1819 status_out ='C'
1820 else :
1821 cmd = 'htcaas-job-status -m '+str(id)+ " -s | grep Status "
1822 status = misc.Popen([cmd], shell=True,stdout=subprocess.PIPE,
1823 stderr=subprocess.PIPE)
1824 error = status.stderr.read()
1825 if status.returncode or error:
1826 raise ClusterManagmentError, 'htcaas-job-submit returns error: %s' % error
1827 status_out= status.stdout.read().strip()
1828 status_out= status_out.split(":",1)[1]
1829 if status_out == 'waiting':
1830 status_out='I'
1831 elif status_out == 'preparing' or status_out == 'running':
1832 status_out = 'R'
1833 elif status_out != 'done':
1834 status_out = 'F'
1835 elif status_out == 'done':
1836 status_out = 'C'
1837
1838 return status_out
1839
1840 @multiple_try(nb_try=15, sleep=1)
1841 def control(self, me_dir):
1842 """ control the status of all the submitted jobs. Return (idle, run, finish, fail) """
1843
1844 if not self.submitted_ids:
1845 return 0, 0, 0, 0
1846
1847 ongoing = []
1848 idle, run, fail = 0, 0, 0
1849
1850 if id == 0 :
1851 return 0 , 0, 0, 0
1852 else :
1853 for i in range(len(self.submitted_ids)):
1854 ongoing.append(int(self.submitted_ids[i]))
1855 cmd = "htcaas-job-status -m " + self.submitted_ids[i] + " -s | grep Status "
1856 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1857 status_out= status.stdout.read().strip()
1858 status_out= status_out.split(":",1)[1]
1859 if status_out == 'waiting':
1860 idle += 1
1861 elif status_out == 'preparing':
1862 run += 1
1863 elif status_out == 'running':
1864 run += 1
1865 elif status_out != 'done':
1866 fail += 1
1867
1868 if status_out != 'done':
1869 print "["+ self.submitted_ids[i] + "] " + status_out
1870 '''
1871 for i in range(len(self.submitted_ids)):
1872 if int(self.submitted_ids[i]) not in ongoing:
1873 status = self.check_termination(int(self.submitted_ids[i]))
1874 if status = 'waiting':
1875 idle += 1
1876 elif status == 'resubmit':
1877 idle += 1
1878 elif status == 'failed':
1879 fail += 1
1880 '''
1881
1882 return idle, run, self.submitted - (idle+run+fail), fail
1883
1884 @multiple_try()
1885 def remove(self, *args, **opts):
1886 """Clean the jobs on the cluster"""
1887
1888 if not self.submitted_ids:
1889 return
1890 for i in range(len(self.submitted_ids)):
1891 cmd = "htcaas-job-cancel -m %s" % ' '.join(self.submitted_ids[i])
1892 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1893 self.submitted_ids = []
1894
1896 class HTCaaS2Cluster(Cluster):
1897 """Class for dealing with cluster submission on a HTCaaS cluster"""
1898
1899 name= 'htcaas2'
1900 job_id = 'HTCAAS2_JOBID'
1901
1902 @store_input()
1903 @multiple_try()
1904 def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
1905 log=None, input_files=[], output_files=[], required_output=[],
1906 nb_submit=0):
1907 """Submit the job on the cluster with NO SHARED DISK;
1908 input/output files should be given relative to cwd.
1909 """
1910
1911 if 'ajob' in prog:
1912 prog_num = prog.rsplit("ajob",1)[1]
1913 elif 'run_combine' in prog:
1914 prog_num = '0'
1915 else:
1916 prog_num = prog
1917
1918 cur_usr = os.getenv('USER')
1919
1920 import uuid
1921 dir = str(uuid.uuid4().hex)
1922
1923 prog_dir = '_run%s'% prog_num
1924 prog_dir = dir+prog_dir
1925
1926 if cwd is None:
1927 cwd = os.getcwd()
1928
1929 cwd_cp = cwd.rsplit("/",2)
1930
1931 if stdout is None:
1932 stdout='/dev/null'
1933
1934 if not os.path.exists(prog):
1935 prog = os.path.join(cwd, prog)
1936
1937 if not required_output and output_files:
1938 required_output = output_files
1939
1940 if '/' in argument :
1941 temp_file_name = "sub." + os.path.basename(prog)
1942 else :
1943 temp_file_name = "sub." + os.path.basename(prog) + '.'.join(argument)
1944
1945
1946 if 'combine' in prog or 'pythia' in prog :
1947 text = """#!/bin/bash
1948 MYPWD=%(cwd)s
1949 cd $MYPWD
1950 script=%(script)s
1951 input_files=(%(input_files)s )
1952 if [ $# -ge 1 ]; then
1953 arg1=$1
1954 else
1955 arg1=''
1956 fi
1957 args=' %(arguments)s'
1958 for i in ${input_files[@]}; do
1959 if [[ "$i" == *$script* ]]; then
1960 script=$i
1961 fi
1962 chmod -f +x $i
1963 done
1964 /bin/bash ${script} ${args} > %(stdout)s
1965 """
1966
1967 elif 'shower' in prog :
1968 text = """#!/bin/bash
1969 MYPWD=%(cwd)s
1970 cd $MYPWD
1971 args=' %(arguments)s'
1972 input_files=( %(input_files)s )
1973 for i in ${input_files[@]}
1974 do
1975 chmod -f +x $i
1976 done
1977 /bin/bash %(script)s ${args} > $MYPWD/done
1978 """
1979
1980 else :
1981 text = """#!/bin/bash
1982 MYPWD=%(cwd)s
1983 #mkdir -p $MYTMP
1984 cd $MYPWD
1985 input_files=( %(input_files)s )
1986 for i in ${input_files[@]}
1987 do
1988 if [[ $i != */*/* ]]; then
1989 i=$PWD"/"$i
1990 fi
1991 echo $i
1992 if [ -d $i ]; then
1993 htcaas-file-put -l $i -r /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/ -i %(cur_usr)s
1994 else
1995 htcaas-file-put -f $i -r /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/ -i %(cur_usr)s
1996 fi
1997 done
1998 """
1999
2000 dico = {'cur_usr' : cur_usr, 'script': os.path.basename(prog),
2001 'cwd': cwd, 'job_id': self.job_id, 'prog_dir': prog_dir,
2002 'input_files': ' '.join(input_files + [prog]),
2003 'output_files': ' '.join(output_files), 'stdout': stdout,
2004 'arguments': ' '.join([str(a) for a in argument]),
2005 'program': ' ' if '.py' in prog else 'bash'}
2006
2007
2008 new_prog = pjoin(cwd, temp_file_name)
2009 open(new_prog, 'w').write(text % dico)
2010 misc.Popen(['chmod','+x',new_prog],cwd=cwd)
2011
2012
2013 cmd1='/bin/bash '+ cwd+'/'+temp_file_name
2014 status1 = misc.Popen([cmd1], shell=True, stdout=subprocess.PIPE,
2015 stderr=subprocess.PIPE)
2016
2017
2018
2019 if not 'combine' in prog and not 'shower' in prog and not 'pythia' in prog:
2020
2021 cmd3 = """htcaas-mgjob-submit -d /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/ -e %(script)s %(arguments)s"""
2022 dico3 = {'cur_usr' : cur_usr, 'script': os.path.basename(prog),
2023 'arguments': ' ' if not argument else "-a " + '='.join([str(a) for a in argument]) ,
2024 'prog_dir': prog_dir }
2025 status3 = misc.Popen([cmd3 % dico3], shell=True, stdout=subprocess.PIPE,
2026 stderr=subprocess.PIPE)
2027 id = status3.stdout.read().strip()
2028
2029 nb_try=0
2030 nb_limit=5
2031 while not id.isdigit() :
2032 nb_try+=1
2033 a=misc.Popen( [cmd3 % dico3], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
2034 id = a.stdout.read().strip()
2035 if nb_try > nb_limit :
2036 raise ClusterManagmentError, 'Fail to submit to the HTCaaS cluster: \n %s' % id
2037 break
2038
2039 temp_file_name2 = "sub." +id
2040 text2 = """#!/bin/bash
2041 MYPWD=%(cwd)s
2042 output_files=( %(output_files)s )
2043 result=done
2044 if [ ! -e ${MYPWD}/done.%(job_id)s ]; then
2045 for i in ${output_files[@]}
2046 do
2047 htcaas-file-get -l ${MYPWD}/$i -r /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/$i -i %(cur_usr)s
2048 chmod -Rf 777 ${MYPWD}/$i
2049 done
2050 for i in ${output_files[@]}; do
2051 if [[ -e ${MYPWD}/$i ]]; then
2052 result=done
2053 else
2054 result=running
2055 echo $result
2056 exit 0
2057 fi
2058 done
2059 echo $result
2060 touch ${MYPWD}/done.%(job_id)s
2061 else
2062 for i in ${output_files[@]}; do
2063 if [ -e ${MYPWD}/$i ]; then
2064 result=done
2065 else
2066 rm -f ${MYPWD}/done.%(job_id)s
2067 result=running
2068 echo $result
2069 exit 0
2070 fi
2071 done
2072 echo $result
2073
2074 fi
2075
2076 """
2077 dico2 = {'cur_usr' : cur_usr, 'script': os.path.basename(prog),
2078 'cwd': cwd, 'job_id': self.job_id, 'prog_dir': prog_dir,
2079 'output_files': ' '.join(output_files), 'job_id': id,
2080 'program': ' ' if '.py' in prog else 'bash'}
2081
2082 homePath = os.getenv("HOME")
2083 outPath = homePath +"/MG5"
2084
2085 new_prog2 = pjoin(outPath, temp_file_name2)
2086 open(new_prog2, 'w').write(text2 % dico2)
2087 misc.Popen(['chmod','+x',new_prog2],cwd=cwd)
2088
2089
2090 self.submitted += 1
2091 self.submitted_ids.append(id)
2092
2093 elif 'combine' in prog or 'shower' in prog or 'pythia' in prog:
2094 if '/dev/null' in stdout :
2095 stdout=''
2096
2097 temp_file_shower = "sub.out"
2098 text_shower = """#!/bin/bash
2099 MYPWD=%(cwd)s
2100 result=done
2101 output_files=(%(output_files)s)
2102 for i in ${output_files[@]}; do
2103 if [ -e $MYPWD/$i -o -e $i ]; then
2104 result=done
2105 else
2106 result=running
2107 echo $result
2108 exit 0
2109 fi
2110 done
2111 echo $result
2112 """
2113 dico_shower = { 'cwd': cwd, 'output_files': ' '.join([stdout]+output_files),
2114 'program': ' ' if '.py' in prog else 'bash'}
2115 homePath = os.getenv("HOME")
2116 outPath = homePath +"/MG5"
2117 new_prog_shower = pjoin(outPath, temp_file_shower)
2118 open(new_prog_shower, 'w').write(text_shower % dico_shower)
2119 misc.Popen(['chmod','+x',new_prog_shower],cwd=cwd)
2120
2121 id='-1'
2122 self.submitted += 1
2123 self.submitted_ids.append(id)
2124
2125 else :
2126 id='-2'
2127 self.submitted += 1
2128 self.submitted_ids.append(id)
2129
2130 return id
2131
2132 @multiple_try(nb_try=10, sleep=10)
2133 def control_one_job(self, id):
2134 """ control the status of a single job with its cluster id """
2135
2136 homePath = os.getenv("HOME")
2137 outPath = homePath +"/MG5"
2138
2139
2140 if id == '0' or id=='-2' :
2141 status_out ='done'
2142 elif id == '-1' :
2143 cmd='/bin/bash ' +outPath+'/sub.out'
2144 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE)
2145 status_out=status.stdout.read().strip()
2146 print "["+id+"]" + status_out
2147 if status_out == 'waiting':
2148 status_out='wait'
2149 elif status_out == 'preparing' or status_out == 'running':
2150 status_out = 'R'
2151 elif status_out != 'done':
2152 status_out = 'F'
2153 elif status_out == 'done':
2154 status_out = 'C'
2155
2156 print "["+id+"]" + status_out
2157 else :
2158 cmd = 'htcaas-job-status -m '+str(id)+ " -s | grep Status "
2159 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
2160 stderr=subprocess.PIPE)
2161 error = status.stderr.read()
2162 if status.returncode or error:
2163 raise ClusterManagmentError, 'htcaas-job-submit returns error: %s' % error
2164 status_out= status.stdout.read().strip()
2165 status_out= status_out.split(":",1)[1]
2166 print "["+id+"]" + status_out
2167 if status_out == 'waiting':
2168 status_out='wait'
2169 elif status_out == 'preparing' or status_out == 'running':
2170 status_out = 'R'
2171 elif status_out == 'failed' :
2172 args = self.retry_args[id]
2173 id_temp = self.submit2(**args)
2174 del self.retry_args[id]
2175 self.submitted_ids.remove(id)
2176 status_out = 'I'
2177 elif status_out != 'done':
2178 status_out = 'F'
2179 elif status_out == 'done':
2180 status_out = 'C'
2181
2182 return status_out
2183
2184
2185 @check_interupt()
2186 @multiple_try(nb_try=15, sleep=10)
2187 def control(self, me_dir):
2188 """ control the status of all the submitted jobs. Return (idle, run, finish, fail) """
2189
2190 if not self.submitted_ids:
2191 return 0, 0, 0, 0
2192
2193 ongoing = []
2194 idle, run, fail = 0, 0, 0
2195
2196 homePath = os.getenv("HOME")
2197 outPath = homePath +"/MG5"
2198
2199 for i in range(len(self.submitted_ids)):
2200 ongoing.append(self.submitted_ids[i])
2201 if self.submitted_ids[i] == '-2' :
2202 return 0,0,0,0
2203 if self.submitted_ids[i] == '0' :
2204
2205 status_out='done'
2206 elif self.submitted_ids[i] == '-1' :
2207 cmd='/bin/bash ' +outPath+'/sub.out'
2208 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE)
2209 status_out=status.stdout.read().strip()
2210 if status_out == 'waiting':
2211 idle += 1
2212 elif status_out == 'preparing':
2213 run += 1
2214 elif status_out == 'running':
2215 run += 1
2216 elif status_out != 'done':
2217 fail += 1
2218 else :
2219 args = self.retry_args[str(self.submitted_ids[i])]
2220 if 'required_output'in args and not args['required_output']:
2221 args['required_output'] = args['output_files']
2222 self.retry_args[str(self.submitted_ids[i])] = args
2223
2224 cmd = "htcaas-job-status -m " + self.submitted_ids[i] + " -s | grep Status "
2225 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2226 status_out= status.stdout.read().strip()
2227 status_out= status_out.split(":",1)[1]
2228 if status_out == 'waiting':
2229 idle += 1
2230 elif status_out == 'preparing':
2231 run += 1
2232 elif status_out == 'running':
2233 run += 1
2234 elif status_out == 'failed' or status_out == 'canceled':
2235 id = self.submit2(**args)
2236
2237 del self.retry_args[self.submitted_ids[i]]
2238 self.submitted_ids.remove(self.submitted_ids[i])
2239 self.submitted-=1
2240 idle += 1
2241 elif status_out != 'done':
2242 fail += 1
2243 if status_out == 'done':
2244 cmd2='/bin/bash '+ outPath+'/sub.'+self.submitted_ids[i]
2245 status2 = misc.Popen([cmd2], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE)
2246 aa= status2.stdout.read().strip()
2247
2248
2249
2250
2251
2252
2253
2254 for path in args['required_output']:
2255 if args['cwd']:
2256 path = pjoin(args['cwd'], path)
2257
2258 temp1=os.path.exists(path)
2259 temp2=os.stat(path).st_size
2260 if not (os.path.exists(path) and os.stat(path).st_size != 0) :
2261 status2 = misc.Popen([cmd2], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE)
2262 aa= status2.stdout.read().strip()
2263 if aa == 'done':
2264 self.submitted_ids[i] = '0'
2265 elif aa == 'running':
2266 run += 1
2267 else :
2268 self.submitted_ids[i]='0'
2269
2270
2271 for i in range(len(self.submitted_ids)):
2272 if str(self.submitted_ids[i]) not in ongoing:
2273 status2= self.check_termination(str(self.submitted_ids[i]))
2274 if status2 == 'wait':
2275 run += 1
2276 elif status2 == 'resubmit':
2277 idle += 1
2278
2279 return idle, run, self.submitted - (idle+run+fail), fail
2280
2281 @multiple_try()
2282 def remove(self, *args, **opts):
2283 """Clean the jobs on the cluster"""
2284
2285 if not self.submitted_ids:
2286 return
2287 for i in range(len(self.submitted_ids)):
2288 cmd = "htcaas-job-cancel -m %s" % ' '.join(self.submitted_ids[i])
2289 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
2290 self.submitted_ids = []
2291
2292
2293 from_name = {'condor':CondorCluster, 'pbs': PBSCluster, 'sge': SGECluster,
2294 'lsf': LSFCluster, 'ge':GECluster, 'slurm': SLURMCluster,
2295 'htcaas':HTCaaSCluster, 'htcaas2':HTCaaS2Cluster}
2296
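# Example (illustrative): pick the scheduler implementation from the run options
# and drive it through the common Cluster interface.
#
#   mycluster = from_name['slurm'](cluster_queue='madgraph', cluster_nb_retry=1)
#   mycluster.submit('./ajob1', cwd='SubProcesses/P0_gg_ttx')
#   mycluster.wait('SubProcesses/P0_gg_ttx',
#                  lambda idle, run, done: logger.info('%s %s %s' % (idle, run, done)))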