import subprocess
import logging
import os
import time
import re
import glob
import inspect
import sys

logger = logging.getLogger('madgraph.cluster')

try:
    from madgraph import MadGraph5Error
    import madgraph.various.misc as misc
except Exception, error:
    if __debug__:
        print str(error)
    from internal import MadGraph5Error
    import internal.misc as misc

pjoin = os.path.join

class ClusterManagmentError(MadGraph5Error):
    pass

multiple_try = misc.multiple_try
pjoin = os.path.join

def check_interupt(error=KeyboardInterrupt):
    def deco_interupt(f):
        def deco_f_interupt(self, *args, **opt):
            try:
                return f(self, *args, **opt)
            except error:
                try:
                    self.remove(*args, **opt)
                except Exception:
                    pass
                raise error
        return deco_f_interupt
    return deco_interupt

            return deco_f_store
        return deco_store

78 """ This function checks whether compression of input files are necessary
79 given the running options given. """
80
81 if options['run_mode'] != 1 and options['cluster_temp_path'] is None:
82 return False
83 else:
84 return True


class Cluster(object):
    """Basic Class for all cluster type submission"""
    name = 'mother class'
    identifier_length = 14

    def __init__(self, *args, **opts):
        """Init the cluster"""

        self.submitted = 0
        self.submitted_ids = []
        self.finish = 0
        self.submitted_dirs = []
        self.submitted_exes = []
        self.submitted_args = []

        if 'cluster_queue' in opts:
            self.cluster_queue = opts['cluster_queue']
        else:
            self.cluster_queue = 'madgraph'
        if 'cluster_temp_path' in opts:
            self.temp_dir = opts['cluster_temp_path']
        else:
            self.temp_dir = None
        self.options = {'cluster_status_update': (600, 30)}
        for key, value in opts.items():
            self.options[key] = value
        self.nb_retry = opts['cluster_nb_retry'] if 'cluster_nb_retry' in opts else 0
        self.cluster_retry_wait = float(opts['cluster_retry_wait']) if 'cluster_retry_wait' in opts else 300
        self.options = dict(opts)
        self.retry_args = {}

        self.packet = {}
        self.id_to_packet = {}
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
               log=None, required_output=[], nb_submit=0):
        """How to make one submission. Return status id on the cluster."""
        raise NotImplementedError, 'No implementation of how to submit a job to cluster \'%s\'' % self.name


    @store_input()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                log=None, input_files=[], output_files=[], required_output=[],
                nb_submit=0):
        """How to make one submission. Return status id on the cluster.
        NO SHARED DISK"""

        if cwd is None:
            cwd = os.getcwd()
        if not os.path.exists(prog):
            prog = os.path.join(cwd, prog)

        if not required_output and output_files:
            required_output = output_files

        if not hasattr(self, 'temp_dir') or not self.temp_dir or \
                               (input_files == [] == output_files):
            return self.submit(prog, argument, cwd, stdout, stderr, log,
                               required_output=required_output, nb_submit=nb_submit)

        if not input_files and not output_files:
            return self.submit(prog, argument, cwd, stdout, stderr, log,
                               required_output=required_output, nb_submit=nb_submit)

        if cwd is None:
            cwd = os.getcwd()
        if not os.path.exists(prog):
            prog = os.path.join(cwd, prog)
        temp_file_name = "sub." + os.path.basename(prog) + '.'.join([str(a) for a in argument])

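        # When a cluster-local scratch directory (cluster_temp_path) is in use,
        # the job is wrapped in a small bash script: the input files are copied
        # to a per-job directory under the scratch path, the executable runs
        # there, and the requested output files are copied back to the
        # submission directory before the scratch area is removed.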
157 text = """#!/bin/bash
158 MYTMP=%(tmpdir)s/run$%(job_id)s
159 MYPWD=%(cwd)s
160 mkdir -p $MYTMP
161 cd $MYPWD
162 input_files=( %(input_files)s )
163 for i in ${input_files[@]}
164 do
165 cp -R -L $i $MYTMP
166 done
167 cd $MYTMP
168 echo '%(arguments)s' > arguments
169 chmod +x ./%(script)s
170 %(program)s ./%(script)s %(arguments)s
171 exit=$?
172 output_files=( %(output_files)s )
173 for i in ${output_files[@]}
174 do
175 cp -r $MYTMP/$i $MYPWD
176 done
177 # if [ "$exit" -eq "0" ]
178 # then
179 rm -rf $MYTMP
180 # fi
181 """
182
183 dico = {'tmpdir' : self.temp_dir, 'script': os.path.basename(prog),
184 'cwd': cwd, 'job_id': self.job_id,
185 'input_files': ' '.join(input_files + [prog]),
186 'output_files': ' '.join(output_files),
187 'arguments': ' '.join([str(a) for a in argument]),
188 'program': ' ' if '.py' in prog else 'bash'}
189
190
191 new_prog = pjoin(cwd, temp_file_name)
192 open(new_prog, 'w').write(text % dico)
193 misc.Popen(['chmod','+x',new_prog],cwd=cwd)
194
195 return self.submit(new_prog, argument, cwd, stdout, stderr, log,
196 required_output=required_output, nb_submit=nb_submit)
197
198
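    # cluster_submit adds one layer on top of submit2: jobs can be grouped in a
    # Packet, and once the last job of a packet finishes, the packet's callback
    # function is executed (used e.g. to combine the results of a set of jobs).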
    def cluster_submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                       log=None, input_files=[], output_files=[], required_output=[],
                       nb_submit=0, packet_member=None):
        """This function wraps the cluster submission in a cluster-independent
        way. It should not be overwritten (except for DAG-type submission)."""

        id = self.submit2(prog, argument, cwd, stdout, stderr, log, input_files,
                          output_files, required_output, nb_submit)

        if not packet_member:
            return id
        else:
            if isinstance(packet_member, Packet):
                self.id_to_packet[id] = packet_member
                packet_member.put(id)
                if packet_member.tag not in self.packet:
                    self.packet[packet_member.tag] = packet_member
            else:
                if packet_member in self.packet:
                    packet = self.packet[packet_member]
                    packet.put(id)
                    self.id_to_packet[id] = packet
            return id


    def control(self, me_dir=None):
        """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)"""
        if not self.submitted_ids:
            raise NotImplementedError, 'No implementation of how to control the job status to cluster \'%s\'' % self.name
        idle, run, fail = 0, 0, 0
        for pid in self.submitted_ids[:]:
            status = self.control_one_job(pid)
            if status == 'I':
                idle += 1
            elif status == 'R':
                run += 1
            elif status == 'F':
                self.finish += 1
                self.submitted_ids.remove(pid)
            else:
                fail += 1

        return idle, run, self.finish, fail

244 """ control the status of a single job with it's cluster id """
245 raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name
246
248 """get a unique run_name for all the jobs helps to identify the runs
249 in the controller for some cluster."""
250
251 if second_path:
252 path = os.path.realpath(pjoin(path, second_path))
253 elif not os.path.exists(path):
254 return path
255
256 if 'SubProcesses' in path:
257 target = path.rsplit('/SubProcesses',1)[0]
258 elif 'MCatNLO' in path:
259 target = path.rsplit('/MCatNLO',1)[0]
260 elif second_path:
261 target=path
262 logger.warning("cluster.get_job_identifier runs unexpectedly. This should be fine but report this message if you have problem.")
263 else:
264 target = path
265
266 if target.endswith('/'):
267 target = target[:-1]
268
269 target = misc.digest(target)[-self.identifier_length:]
270 if not target[0].isalpha():
271 target = 'a' + target[1:]
272
273 return target
274
275
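    # wait() polls the scheduler with two different frequencies: a short
    # interval (cluster_status_update[1]) at the beginning and while the job
    # population is still changing, and a long interval
    # (cluster_status_update[0]) once things are stable, so that the scheduler
    # is not hammered. Ctrl-C during a long sleep forces an immediate update.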
    @check_interupt()
    def wait(self, me_dir, fct, minimal_job=0, update_first=None):
        """Wait for all jobs to finish.
        If minimal_job is set, return as soon as idle + run is lower than that number."""

        mode = 1
        nb_iter = 0
        nb_short = 0
        change_at = 5

        if update_first:
            idle, run, finish, fail = self.control(me_dir)
            update_first(idle, run, finish)

        longtime, shorttime = self.options['cluster_status_update']

        nb_job = 0

        if self.options['cluster_type'] == 'htcaas2':
            me_dir = self.metasubmit(self)

        while 1:
            old_mode = mode
            nb_iter += 1
            idle, run, finish, fail = self.control(me_dir)
            if nb_job:
                if idle + run + finish + fail != nb_job:
                    nb_job = idle + run + finish + fail
                    nb_iter = 1
            else:
                nb_job = idle + run + finish + fail
            if fail:
                raise ClusterManagmentError('Some jobs are in a Hold/... state. Please try to investigate or contact the IT team')
            if idle + run == 0:
                logger.info('All jobs finished')
                fct(idle, run, finish)
                break
            if idle + run < minimal_job:
                return
            fct(idle, run, finish)

            if nb_iter < change_at:
                mode = 1
            elif idle < run:
                if old_mode == 0:
                    if nb_short:
                        mode = 0
                    elif idle:
                        if nb_iter > change_at + int(longtime) // shorttime:
                            mode = 0
                        else:
                            mode = 1
                            nb_short = 0
                    else:
                        mode = 1
                        nb_short = 0
                elif old_mode == 1:
                    nb_short += 1
                    if nb_short > 3 * max(change_at, int(longtime) // shorttime):
                        mode = 0
            else:
                mode = 0

            if old_mode > mode:
                logger.info('''Start to wait %ss between checking status.
Note that you can change this time in the configuration file.
Press ctrl-C to force the update.''' % self.options['cluster_status_update'][0])

            if mode == 0:
                try:
                    time.sleep(self.options['cluster_status_update'][0])
                except KeyboardInterrupt:
                    logger.info('start to update the status')
                    nb_iter = min(0, change_at - 2)
                    nb_short = 0
            else:
                time.sleep(self.options['cluster_status_update'][1])

        self.submitted = 0
        self.submitted_ids = []

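    # check_termination is called for jobs that the scheduler no longer reports:
    # it verifies that all required output files exist and are non-empty, waits
    # for a grace period (cluster_retry_wait) in case the shared file system is
    # slow, and otherwise resubmits the job up to cluster_nb_retry times.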
365 """Check the termination of the jobs with job_id and relaunch it if needed."""
366
367
368 if job_id not in self.retry_args:
369 return True
370
371 args = self.retry_args[job_id]
372 if 'time_check' in args:
373 time_check = args['time_check']
374 else:
375 time_check = 0
376
377 for path in args['required_output']:
378 if args['cwd']:
379 path = pjoin(args['cwd'], path)
380
381 if not (os.path.exists(path) and os.stat(path).st_size != 0) :
382 break
383 else:
384
385 if time_check > 0:
386 logger.info('Job %s Finally found the missing output.' % (job_id))
387 del self.retry_args[job_id]
388 self.submitted_ids.remove(job_id)
389
390 if job_id in self.id_to_packet:
391 nb_in_packet = self.id_to_packet[job_id].remove_one()
392 if nb_in_packet == 0:
393
394 packet = self.id_to_packet[job_id]
395
396 packet.queue.join()
397
398 packet.fct(*packet.args)
399 del self.id_to_packet[job_id]
400 return 'resubmit'
401
402 return 'done'
403
404 if time_check == 0:
405 logger.debug('''Job %s: missing output:%s''' % (job_id,path))
406 args['time_check'] = time.time()
407 return 'wait'
408 elif self.cluster_retry_wait > time.time() - time_check:
409 return 'wait'
410
411
412 if self.nb_retry < 0:
413 logger.critical('''Fail to run correctly job %s.
414 with option: %s
415 file missing: %s''' % (job_id, args, path))
416 raw_input('press enter to continue.')
417 elif self.nb_retry == 0:
418 logger.critical('''Fail to run correctly job %s.
419 with option: %s
420 file missing: %s.
421 Stopping all runs.''' % (job_id, args, path))
422 self.remove()
423 elif args['nb_submit'] >= self.nb_retry:
424 logger.critical('''Fail to run correctly job %s.
425 with option: %s
426 file missing: %s
427 Fails %s times
428 No resubmition. ''' % (job_id, args, path, args['nb_submit']))
429 self.remove()
430 else:
431 args['nb_submit'] += 1
432 logger.warning('resubmit job (for the %s times)' % args['nb_submit'])
433 del self.retry_args[job_id]
434 self.submitted_ids.remove(job_id)
435 if 'time_check' in args:
436 del args['time_check']
437 if job_id in self.id_to_packet:
438 self.id_to_packet[job_id].remove_one()
439 args['packet_member'] = self.id_to_packet[job_id]
440 del self.id_to_packet[job_id]
441 self.cluster_submit(**args)
442 else:
443 self.submit2(**args)
444 return 'resubmit'
445 return 'done'
446
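    # launch_and_wait submits a single job and blocks until it finishes,
    # polling control_one_job and, once the scheduler reports it done,
    # re-checking the required output files via check_termination.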
    @check_interupt()
    def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None,
                        stderr=None, log=None, required_output=[], nb_submit=0,
                        input_files=[], output_files=[]):
        """launch one job on the cluster and wait for it"""

        special_output = False
        if stderr == -2 and stdout:
            special_output = True
            stderr = stdout + '.err'

        id = self.submit2(prog, argument, cwd, stdout, stderr, log,
                          required_output=required_output, input_files=input_files,
                          output_files=output_files)

        if self.options['cluster_type'] == 'htcaas2':
            if self.submitted == self.submitted_ids[-1]:
                id = self.metasubmit(self)

        frame = inspect.currentframe()
        args, _, _, values = inspect.getargvalues(frame)
        args = dict([(i, values[i]) for i in args if i != 'self'])
        self.retry_args[id] = args

        nb_wait = 0
        while 1:
            nb_wait += 1
            status = self.control_one_job(id)
            if not status in ['R', 'I']:
                status = self.check_termination(id)
                if status in ['wait']:
                    time.sleep(30)
                    continue
                elif status in ['resubmit']:
                    id = self.submitted_ids[0]
                    time.sleep(30)
                    continue
                time.sleep(30)
                break
            time.sleep(self.options['cluster_status_update'][1])

        if required_output:
            status = self.check_termination(id)

        if special_output:
            for i in range(5):
                if os.path.exists(stdout):
                    if not os.path.exists(stderr):
                        time.sleep(5)
                    if os.path.exists(stderr):
                        err_text = open(stderr).read()
                        if not err_text:
                            return
                        logger.warning(err_text)
                        text = open(stdout).read()
                        open(stdout, 'w').write(text + err_text)
                    else:
                        return
                time.sleep(10)

    def remove(self, *args, **opts):
        """ """
        logger.warning("""This cluster does not support job removal,
    the jobs are still running on the cluster.""")

    @store_input()

527 """ an object for handling packet of job, it is designed to be thread safe
528 """
529
530 - def __init__(self, name, fct, args, opts={}):
531 import Queue
532 import threading
533 self.queue = Queue.Queue()
534 self.tag = name
535 self.fct = fct
536 self.args = args
537 self.opts = opts
538 self.done = threading.Event()
539
540 - def put(self, *args, **opts):
542
543 append = put
544

    def remove_one(self):
        self.queue.get(True)
        self.queue.task_done()
        return self.queue.qsize()

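# Typical use of Packet (a sketch, not taken verbatim from the run drivers; the
# names combine_fct and run_name are placeholders): a set of related jobs is
# submitted through cluster_submit with the same Packet instance, e.g.
#     packet = Packet('combine_run', combine_fct, (run_name,))
#     for job in jobs:
#         cluster.cluster_submit(prog, args, packet_member=packet)
# and combine_fct(run_name) is called automatically once every job of the
# packet has terminated successfully.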
551 """class for dealing with the submission in multiple node"""
552
553 job_id = "$"
554
556 """Init the cluster """
557
558
559 super(MultiCore, self).__init__(self, *args, **opt)
560
561 import Queue
562 import threading
563 import thread
564 self.queue = Queue.Queue()
565 self.done = Queue.Queue()
566 self.submitted = Queue.Queue()
567 self.stoprequest = threading.Event()
568 self.demons = []
569 self.nb_done =0
570 if 'nb_core' in opt:
571 self.nb_core = opt['nb_core']
572 elif isinstance(args[0],int):
573 self.nb_core = args[0]
574 else:
575 self.nb_core = 1
576 self.update_fct = None
577
578 self.lock = threading.Event()
579 self.pids = Queue.Queue()
580 self.done_pid = []
581 self.done_pid_queue = Queue.Queue()
582 self.fail_msg = None
583
584
585 for _ in range(self.nb_core):
586 self.start_demon()
587
588
    def start_demon(self):
        import threading
        t = threading.Thread(target=self.worker)
        t.daemon = True
        t.start()
        self.demons.append(t)

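    # Each daemon thread runs worker(): it pops (tag, exe, argument, opt) tuples
    # from self.queue, runs the executable (or calls the Python function) and
    # reports completion through self.done / self.done_pid_queue. Setting
    # self.lock wakes up the wait() loop so that the status display is updated.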
    def worker(self):
        import Queue
        import thread
        while not self.stoprequest.isSet():
            try:
                args = self.queue.get()
                tag, exe, arg, opt = args
                try:
                    if isinstance(exe, str):
                        if os.path.exists(exe) and not exe.startswith('/'):
                            exe = './' + exe
                        if opt['stderr'] is None:
                            opt['stderr'] = subprocess.STDOUT
                        proc = misc.Popen([exe] + arg, **opt)
                        pid = proc.pid
                        self.pids.put(pid)
                        proc.wait()
                        if proc.returncode not in [0, 143, -15] and not self.stoprequest.isSet():
                            fail_msg = 'program %s launch ends with non zero status: %s. Stop all computation' % \
                                       (' '.join([exe] + arg), proc.returncode)
                            logger.warning(fail_msg)
                            self.stoprequest.set()
                            self.remove(fail_msg)
                    else:
                        pid = tag
                        self.pids.put(pid)
                        returncode = exe(*arg, **opt)
                        if returncode != 0:
                            logger.warning("fct %s did not return 0. Stopping the code in a clean way.", exe)
                            self.stoprequest.set()
                            self.remove("fct %s does not return 0:\n %s" % (exe, returncode))
                except Exception, error:
                    self.fail_msg = sys.exc_info()
                    logger.warning(str(error))
                    self.stoprequest.set()
                    self.remove(error)

                    if __debug__:
                        raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2]

                self.queue.task_done()
                self.done.put(tag)
                self.done_pid_queue.put(pid)

                try:
                    self.lock.set()
                except thread.error:
                    continue
            except Queue.Empty:
                continue

    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
               log=None, required_output=[], nb_submit=0):
        """submit a job on the multicore machine"""

        tag = (prog, tuple(argument), cwd, nb_submit)
        if isinstance(prog, str):
            opt = {'cwd': cwd,
                   'stdout': stdout,
                   'stderr': stderr}
            self.queue.put((tag, prog, argument, opt))
            self.submitted.put(1)
            return tag
        else:
            self.queue.put((tag, prog, argument, {}))
            self.submitted.put(1)
            return tag

    def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None,
                        stderr=None, log=None, **opts):
        """launch one job and wait for it"""
        if isinstance(stdout, str):
            stdout = open(stdout, 'w')
        if isinstance(stderr, str):
            stderr = open(stderr, 'w')
        return misc.call([prog] + argument, stdout=stdout, stderr=stderr, cwd=cwd)

    def remove(self, error=None):
        """Ensure that all threads are killed"""

        self.stoprequest.set()
        if error and not self.fail_msg:
            self.fail_msg = error

        while not self.done_pid_queue.empty():
            pid = self.done_pid_queue.get()
            self.done_pid.append(pid)

        while not self.pids.empty():
            pid = self.pids.get()
            self.pids.task_done()
            if isinstance(pid, tuple):
                continue
            if pid in self.done_pid:
                continue
            out = os.system('CPIDS=$(pgrep -P %(pid)s); kill -15 $CPIDS > /dev/null 2>&1'
                            % {'pid': pid})
            out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid': pid})

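    # wait() drains the 'done' queue, fires the packet callbacks when a packet
    # is complete, and refreshes the (Idle, Running, Done) counters for the
    # status display. It first relies on self.lock (set by the workers) to be
    # woken up as soon as a job finishes, and falls back to a slowly increasing
    # sleep once the lock stops being useful.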
    def wait(self, me_dir, update_status, update_first=None):
        """Wait for all the jobs to be done. This function also ensures that
        the submissions by packet are handled correctly (i.e. it submits the
        packet function once the packet is complete)."""

        import Queue
        import threading

        try:
            last_status = (0, 0, 0)
            sleep_time = 1
            use_lock = True
            first = True
            while True:
                force_one_more_loop = False

                # handle the jobs that have just finished
                while self.done.qsize():
                    try:
                        tag = self.done.get(True, 1)
                    except Queue.Empty:
                        pass
                    else:
                        if self.id_to_packet and tuple(tag) in self.id_to_packet:
                            packet = self.id_to_packet[tuple(tag)]
                            remaining = packet.remove_one()
                            if remaining == 0:
                                packet.queue.join()
                                self.submit(packet.fct, packet.args)
                                force_one_more_loop = True
                        self.nb_done += 1
                        self.done.task_done()

                # compute the current status
                Idle = self.queue.qsize()
                Done = self.nb_done + self.done.qsize()
                Running = max(0, self.submitted.qsize() - Idle - Done)

                if Idle + Running <= 0 and not force_one_more_loop:
                    update_status(Idle, Running, Done)
                    # wait for the last job to be fully finished
                    self.queue.join()
                    break

                if (Idle, Running, Done) != last_status:
                    if first and update_first:
                        update_first(Idle, Running, Done)
                        first = False
                    else:
                        update_status(Idle, Running, Done)
                    last_status = (Idle, Running, Done)

                # flush the queue of finished pids
                while not self.done_pid_queue.empty():
                    pid = self.done_pid_queue.get()
                    self.done_pid.append(pid)
                    self.done_pid_queue.task_done()

                if use_lock:
                    # wait for a worker to signal that a job has finished
                    use_lock = self.lock.wait(300)
                    self.lock.clear()
                    if not use_lock and Idle > 0:
                        use_lock = True
                else:
                    # the lock is not useful anymore: fall back to a plain sleep
                    time.sleep(sleep_time)
                    sleep_time = min(sleep_time + 2, 180)
                    if update_first:
                        update_first(Idle, Running, Done)

            if self.stoprequest.isSet():
                if isinstance(self.fail_msg, Exception):
                    raise self.fail_msg
                elif isinstance(self.fail_msg, str):
                    raise Exception, self.fail_msg
                else:
                    raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2]

            # reset the shared state for the next round of submissions
            try:
                self.lock.clear()
            except Exception:
                pass
            self.done = Queue.Queue()
            self.done_pid = []
            self.done_pid_queue = Queue.Queue()
            self.nb_done = 0
            self.submitted = Queue.Queue()
            self.pids = Queue.Queue()
            self.stoprequest.clear()

        except KeyboardInterrupt:
            # report any stored failure and propagate the interrupt
            if isinstance(self.fail_msg, Exception):
                raise self.fail_msg
            elif isinstance(self.fail_msg, str):
                raise Exception, self.fail_msg
            elif self.fail_msg:
                raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2]

            raise

822 """Basic class for dealing with cluster submission"""
823
824 name = 'condor'
825 job_id = 'CONDOR_ID'
826
827
828
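    # Submission works by writing a condor submit description and piping it to
    # 'condor_submit' on stdin; the job id is recovered by parsing the
    # "submitted to cluster NNN" line printed by condor_submit.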
    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
        """Submit a job prog to a Condor cluster"""

        text = """Executable = %(prog)s
output = %(stdout)s
error = %(stderr)s
log = %(log)s
%(argument)s
environment = CONDOR_ID=$(Cluster).$(Process)
Universe = vanilla
notification = Error
Initialdir = %(cwd)s
%(requirement)s
getenv=True
queue 1
"""

        if self.cluster_queue not in ['None', None]:
            requirement = 'Requirements = %s=?=True' % self.cluster_queue
        else:
            requirement = ''

        if cwd is None:
            cwd = os.getcwd()
        if stdout is None:
            stdout = '/dev/null'
        if stderr is None:
            stderr = '/dev/null'
        if log is None:
            log = '/dev/null'
        if not os.path.exists(prog):
            prog = os.path.join(cwd, prog)
        if argument:
            argument = 'Arguments = %s' % ' '.join(argument)
        else:
            argument = ''

        dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout,
                'stderr': stderr, 'log': log, 'argument': argument,
                'requirement': requirement}

        a = misc.Popen(['condor_submit'], stdout=subprocess.PIPE,
                       stdin=subprocess.PIPE)
        output, _ = a.communicate(text % dico)

        pat = re.compile("submitted to cluster (\d*)", re.MULTILINE)
        try:
            id = pat.search(output).groups()[0]
        except:
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                                                                        % output
        self.submitted += 1
        self.submitted_ids.append(id)
        return id

    @store_input()
    @multiple_try()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                log=None, input_files=[], output_files=[], required_output=[],
                nb_submit=0):
        """Submit a job on the cluster with NO SHARED DISK.
        Input/output files should be given relative to cwd.
        """

        if not required_output and output_files:
            required_output = output_files

        if (input_files == [] == output_files):
            # no file to transfer: fall back to the plain submission
            return self.submit(prog, argument, cwd, stdout, stderr, log,
                               required_output=required_output, nb_submit=nb_submit)

        text = """Executable = %(prog)s
output = %(stdout)s
error = %(stderr)s
log = %(log)s
%(argument)s
should_transfer_files = YES
when_to_transfer_output = ON_EXIT
transfer_input_files = %(input_files)s
%(output_files)s
Universe = vanilla
notification = Error
Initialdir = %(cwd)s
%(requirement)s
getenv=True
queue 1
"""

        if self.cluster_queue not in ['None', None]:
            requirement = 'Requirements = %s=?=True' % self.cluster_queue
        else:
            requirement = ''

        if cwd is None:
            cwd = os.getcwd()
        if stdout is None:
            stdout = '/dev/null'
        if stderr is None:
            stderr = '/dev/null'
        if log is None:
            log = '/dev/null'
        if not os.path.exists(prog):
            prog = os.path.join(cwd, prog)
        if argument:
            argument = 'Arguments = %s' % ' '.join([str(a) for a in argument])
        else:
            argument = ''

        if input_files:
            input_files = ','.join(input_files)
        else:
            input_files = ''
        if output_files:
            output_files = 'transfer_output_files = %s' % ','.join(output_files)
        else:
            output_files = ''

        dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout,
                'stderr': stderr, 'log': log, 'argument': argument,
                'requirement': requirement, 'input_files': input_files,
                'output_files': output_files}

        a = subprocess.Popen(['condor_submit'], stdout=subprocess.PIPE,
                             stdin=subprocess.PIPE)
        output, _ = a.communicate(text % dico)

        pat = re.compile("submitted to cluster (\d*)", re.MULTILINE)
        try:
            id = pat.search(output).groups()[0]
        except:
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                                                                        % output
        self.submitted += 1
        self.submitted_ids.append(id)
        return id

    @multiple_try(nb_try=10, sleep=10)
    def control_one_job(self, id):
        """ control the status of a single job with its cluster id """
        cmd = 'condor_q '+str(id)+" -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'"
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)

        error = status.stderr.read()
        if status.returncode or error:
            raise ClusterManagmentError, 'condor_q returns error: %s' % error

        return status.stdout.readline().strip()

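    # control() queries all submitted ids at once, splitting the list into
    # chunks of 15000 ids per condor_q call so that each generated command line
    # stays at a manageable length.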
    @check_interupt()
    @multiple_try(nb_try=10, sleep=10)
    def control(self, me_dir=None):
        """ control the status of all the jobs submitted to the cluster """

        if not self.submitted_ids:
            return 0, 0, 0, 0

        packet = 15000
        idle, run, fail = 0, 0, 0
        ongoing = []
        for i in range(1 + (len(self.submitted_ids) - 1) // packet):
            start = i * packet
            stop = (i + 1) * packet
            cmd = "condor_q " + ' '.join(self.submitted_ids[start:stop]) + \
                  " -format \'%-2s \' \'ClusterId\' " + \
                  " -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'"

            status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
            error = status.stderr.read()
            if status.returncode or error:
                raise ClusterManagmentError, 'condor_q returns error: %s' % error

            for line in status.stdout:
                id, status = line.strip().split()
                ongoing.append(int(id))
                if status in ['I', 'U']:
                    idle += 1
                elif status == 'R':
                    run += 1
                elif status != 'C':
                    fail += 1

        for id in list(self.submitted_ids):
            if int(id) not in ongoing:
                status = self.check_termination(id)
                if status == 'wait':
                    run += 1
                elif status == 'resubmit':
                    idle += 1

        return idle, run, self.submitted - (idle + run + fail), fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""

        if not self.submitted_ids:
            return
        cmd = "condor_rm %s" % ' '.join(self.submitted_ids)

        status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))
        self.submitted_ids = []

1051 """Basic class for dealing with cluster submission"""
1052
1053 name = 'pbs'
1054 job_id = 'PBS_JOBID'
1055 idle_tag = ['Q']
1056 running_tag = ['T','E','R']
1057 complete_tag = ['C']
1058
1059 maximum_submited_jobs = 2500
1060
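    # To avoid flooding the scheduler, submit() waits (via wait()) until the
    # number of tracked ids drops below maximum_submited_jobs before sending
    # more jobs to the queue.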
    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
        """Submit a job prog to a PBS cluster"""

        me_dir = self.get_jobs_identifier(cwd, prog)

        if len(self.submitted_ids) > self.maximum_submited_jobs:
            fct = lambda idle, run, finish: logger.info('Waiting for free slot: %s %s %s' % (idle, run, finish))
            self.wait(me_dir, fct, self.maximum_submited_jobs)

        text = ""
        if cwd is None:
            cwd = os.getcwd()
        else:
            text = " cd %s;" % cwd
        if stdout is None:
            stdout = '/dev/null'
        if stderr is None:
            stderr = '/dev/null'
        elif stderr == -2:  # -2 is the flag to send stderr to stdout
            stderr = stdout
        if log is None:
            log = '/dev/null'

        if not os.path.isabs(prog):
            text += "./%s" % prog
        else:
            text += prog

        if argument:
            text += ' ' + ' '.join(argument)

        command = ['qsub', '-o', stdout,
                   '-N', me_dir,
                   '-e', stderr,
                   '-V']

        if self.cluster_queue and self.cluster_queue != 'None':
            command.extend(['-q', self.cluster_queue])

        a = misc.Popen(command, stdout=subprocess.PIPE,
                       stderr=subprocess.STDOUT,
                       stdin=subprocess.PIPE, cwd=cwd)

        output = a.communicate(text)[0]
        id = output.split('.')[0]
        if not id.isdigit() or a.returncode != 0:
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                                                                        % output

        self.submitted += 1
        self.submitted_ids.append(id)
        return id

    @multiple_try()
    def control_one_job(self, id):
        """ control the status of a single job with its cluster id """
        cmd = 'qstat ' + str(id)
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)

        for line in status.stdout:
            line = line.strip()
            if 'cannot connect to server' in line or 'cannot read reply' in line:
                raise ClusterManagmentError, 'server disconnected'
            if 'Unknown' in line:
                return 'F'
            elif line.startswith(str(id)):
                jobstatus = line.split()[4]
            else:
                jobstatus = ""

        if status.returncode != 0 and status.returncode is not None:
            raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode
        if jobstatus in self.idle_tag:
            return 'I'
        elif jobstatus in self.running_tag:
            return 'R'
        return 'F'

    @multiple_try()
    def control(self, me_dir):
        """ control the status of all the jobs associated to me_dir """
        cmd = "qstat"
        status = misc.Popen([cmd], stdout=subprocess.PIPE)

        me_dir = self.get_jobs_identifier(me_dir)

        ongoing = []

        idle, run, fail = 0, 0, 0
        for line in status.stdout:
            if 'cannot connect to server' in line or 'cannot read reply' in line:
                raise ClusterManagmentError, 'server disconnected'
            if me_dir in line:
                ongoing.append(line.split()[0].split('.')[0])
                status2 = line.split()[4]
                if status2 in self.idle_tag:
                    idle += 1
                elif status2 in self.running_tag:
                    run += 1
                elif status2 in self.complete_tag:
                    if not self.check_termination(line.split()[0].split('.')[0]):
                        idle += 1
                else:
                    fail += 1

        if status.returncode != 0 and status.returncode is not None:
            raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode

        for id in list(self.submitted_ids):
            if id not in ongoing:
                status2 = self.check_termination(id)
                if status2 == 'wait':
                    run += 1
                elif status2 == 'resubmit':
                    idle += 1

        return idle, run, self.submitted - (idle + run + fail), fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""

        if not self.submitted_ids:
            return
        cmd = "qdel %s" % ' '.join(self.submitted_ids)
        status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))
        self.submitted_ids = []

1196 """Basic class for dealing with cluster submission"""
1197
1198
1199 name = 'sge'
1200 job_id = 'JOB_ID'
1201 idle_tag = ['qw', 'hqw','hRqw','w']
1202 running_tag = ['r','t','Rr','Rt']
1203 identifier_length = 10
1204
1206 """replace string for path issues"""
1207 location = os.path.realpath(location)
1208 homePath = os.getenv("HOME")
1209 if homePath:
1210 location = location.replace(homePath,'$HOME')
1211 return location
1212
    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
        """Submit a job prog to an SGE cluster"""

        me_dir = self.get_jobs_identifier(cwd, prog)

        if cwd is None:
            cwd = self.def_get_path(os.getcwd())
        cwd1 = self.def_get_path(cwd)
        text = " cd %s;" % cwd1
        if stdout is None:
            stdout = '/dev/null'
        else:
            stdout = self.def_get_path(stdout)
        if stderr is None:
            stderr = '/dev/null'
        elif stderr == -2:
            stderr = stdout
        else:
            stderr = self.def_get_path(stderr)

        if log is None:
            log = '/dev/null'
        else:
            log = self.def_get_path(log)

        text += prog
        if argument:
            text += ' ' + ' '.join(argument)

        homePath = os.getenv("HOME")
        if homePath:
            text = text.replace(homePath, '$HOME')

        logger.debug("!=== input %s" % text)
        logger.debug("!=== output %s" % stdout)
        logger.debug("!=== error %s" % stderr)
        logger.debug("!=== logs %s" % log)

        command = ['qsub', '-o', stdout,
                   '-N', me_dir,
                   '-e', stderr,
                   '-V']

        if self.cluster_queue and self.cluster_queue != 'None':
            command.extend(['-q', self.cluster_queue])

        a = misc.Popen(command, stdout=subprocess.PIPE,
                       stderr=subprocess.STDOUT,
                       stdin=subprocess.PIPE, cwd=cwd)

        output = a.communicate(text)[0]
        id = output.split(' ')[2]
        if not id.isdigit():
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                                                                        % output
        self.submitted += 1
        self.submitted_ids.append(id)
        logger.debug(output)

        return id

    @multiple_try()
    def control_one_job(self, id):
        """ control the status of a single job with its cluster id """

        cmd = 'qstat '
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
        for line in status.stdout:
            if str(id) in line:
                status = line.split()[4]

        if status in self.idle_tag:
            return 'I'
        elif status in self.running_tag:
            return 'R'
        return 'F'

    @multiple_try()
    def control(self, me_dir):
        """ control the status of all the jobs associated to me_dir """
        cmd = "qstat "
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)

        me_dir = self.get_jobs_identifier(me_dir)

        finished = list(self.submitted_ids)

        idle, run, fail = 0, 0, 0
        for line in status.stdout:
            if me_dir in line:
                id, _, _, _, status = line.split()[:5]
                if status in self.idle_tag:
                    idle += 1
                    finished.remove(id)
                elif status in self.running_tag:
                    run += 1
                    finished.remove(id)
                else:
                    logger.debug(line)
                    fail += 1
                    finished.remove(id)

        for id in finished:
            self.check_termination(id)

        return idle, run, self.submitted - (idle + run + fail), fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""

        if not self.submitted_ids:
            return
        cmd = "qdel %s" % ' '.join(self.submitted_ids)
        status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))
        self.submitted_ids = []

1347 """Basic class for dealing with cluster submission"""
1348
1349 name = 'lsf'
1350 job_id = 'LSB_JOBID'
1351
    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
        """Submit the job prog to an LSF cluster"""

        me_dir = self.get_jobs_identifier(cwd, prog)

        text = ""
        command = ['bsub', '-C0', '-J', me_dir]
        if cwd is None:
            cwd = os.getcwd()
        else:
            text = " cd %s;" % cwd
        if stdout and isinstance(stdout, str):
            command.extend(['-o', stdout])
        if stderr and isinstance(stderr, str):
            command.extend(['-e', stderr])
        elif stderr == -2:
            pass
        if log is None:
            log = '/dev/null'

        text += prog
        if argument:
            text += ' ' + ' '.join(argument)

        if self.cluster_queue and self.cluster_queue != 'None':
            command.extend(['-q', self.cluster_queue])

        a = misc.Popen(command, stdout=subprocess.PIPE,
                       stderr=subprocess.STDOUT,
                       stdin=subprocess.PIPE, cwd=cwd)

        output = a.communicate(text)[0]

        try:
            id = output.split('>', 1)[0].split('<')[1]
        except:
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                                                                        % output
        if not id.isdigit():
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                                                                        % output
        self.submitted += 1
        self.submitted_ids.append(id)
        return id

    @multiple_try()
    def control_one_job(self, id):
        """ control the status of a single job with its cluster id """

        cmd = 'bjobs ' + str(id)
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)

        for line in status.stdout:
            line = line.strip().upper()
            if 'JOBID' in line:
                continue
            elif str(id) not in line:
                continue
            status = line.split()[2]
            if status == 'RUN':
                return 'R'
            elif status == 'PEND':
                return 'I'
            elif status == 'DONE':
                return 'F'
            else:
                return 'H'
        return 'F'

    @multiple_try()
    def control(self, me_dir=None):
        """ control the status of all the submitted jobs """

        if not self.submitted_ids:
            return 0, 0, 0, 0

        cmd = "bjobs " + ' '.join(self.submitted_ids)
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)

        jobstatus = {}
        for line in status.stdout:
            line = line.strip()
            if 'JOBID' in line:
                continue
            splitline = line.split()
            id = splitline[0]
            if id not in self.submitted_ids:
                continue
            jobstatus[id] = splitline[2]

        idle, run, fail = 0, 0, 0
        for id in self.submitted_ids[:]:
            if id in jobstatus:
                status = jobstatus[id]
            else:
                status = 'MISSING'
            if status == 'RUN':
                run += 1
            elif status == 'PEND':
                idle += 1
            else:
                status = self.check_termination(id)
                if status == 'wait':
                    run += 1
                elif status == 'resubmit':
                    idle += 1

        return idle, run, self.submitted - (idle + run + fail), fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""

        if not self.submitted_ids:
            return
        cmd = "bkill %s" % ' '.join(self.submitted_ids)
        status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))
        self.submitted_ids = []

1476 """Class for dealing with cluster submission on a GE cluster"""
1477
1478 name = 'ge'
1479 job_id = 'JOB_ID'
1480 idle_tag = ['qw']
1481 running_tag = ['r']
1482
    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
        """Submit a job prog to a GE cluster"""

        text = ""
        if cwd is None:
            cwd = os.getcwd()
        else:
            text = " cd %s; bash " % cwd
        if stdout is None:
            stdout = os.path.join(cwd, "log.%s" % prog.split('/')[-1])
        if stderr is None:
            stderr = os.path.join(cwd, "err.%s" % prog.split('/')[-1])
        elif stderr == -2:
            stderr = stdout
        if log is None:
            log = '/dev/null'

        text += prog
        if argument:
            text += ' ' + ' '.join(argument)
        text += '\n'
        tmp_submit = os.path.join(cwd, 'tmp_submit')
        open(tmp_submit, 'w').write(text)

        a = misc.Popen(['qsub', '-o', stdout,
                        '-e', stderr,
                        tmp_submit],
                       stdout=subprocess.PIPE,
                       stderr=subprocess.STDOUT,
                       stdin=subprocess.PIPE, cwd=cwd)

        output = a.communicate()[0]

        pat = re.compile("Your job (\d*) \(", re.MULTILINE)
        try:
            id = pat.search(output).groups()[0]
        except:
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                                                                        % output
        self.submitted += 1
        self.submitted_ids.append(id)
        return id

    @multiple_try()
    def control_one_job(self, id):
        """ control the status of a single job with its cluster id """
        cmd = 'qstat | grep ' + str(id)
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
        if not status:
            return 'F'

        pat = re.compile("^(\d+)\s+[\d\.]+\s+[\w\d\.]+\s+[\w\d\.]+\s+(\w+)\s")
        stat = ''
        for line in status.stdout.read().split('\n'):
            if not line:
                continue
            line = line.strip()
            try:
                groups = pat.search(line).groups()
            except:
                raise ClusterManagmentError, 'bad syntax for stat: \n\"%s\"' % line
            if groups[0] != id:
                continue
            stat = groups[1]
        if not stat:
            return 'F'
        if stat in self.idle_tag:
            return 'I'
        if stat in self.running_tag:
            return 'R'

    @multiple_try()
    def control(self, me_dir=None):
        """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)"""
        if not self.submitted_ids:
            return 0, 0, 0, 0
        idle, run, fail = 0, 0, 0
        ongoing = []
        for statusflag in ['p', 'r', 'sh']:
            cmd = 'qstat -s %s' % statusflag
            status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)

            pat = re.compile("^(\d+)")
            for line in status.stdout.read().split('\n'):
                line = line.strip()
                try:
                    id = pat.search(line).groups()[0]
                except Exception:
                    pass
                else:
                    if id not in self.submitted_ids:
                        continue
                    ongoing.append(id)
                    if statusflag == 'p':
                        idle += 1
                    if statusflag == 'r':
                        run += 1
                    if statusflag == 'sh':
                        fail += 1
        for id in list(self.submitted_ids):
            if id not in ongoing:
                self.check_termination(id)

        return idle, run, self.submitted - idle - run - fail, fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""

        if not self.submitted_ids:
            return
        cmd = "qdel %s" % ' '.join(self.submitted_ids)
        status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))
        self.submitted_ids = []


def asyncrone_launch(exe, cwd=None, stdout=None, argument=[], **opt):
    """Start a computation and do not wait for it to finish.
    This function returns a lock which is locked as long as the job is running."""

    mc = MultiCore(1)
    mc.submit(exe, argument, cwd, stdout, **opt)
    mc.need_waiting = True
    return mc.lock

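# Minimal usage sketch (the script name and path are placeholders; the returned
# object is a threading.Event that is set when the background job finishes):
#     lock = asyncrone_launch('./run.sh', cwd='/path/to/workdir', stdout='run.log')
#     ...  # do other work while the job runs
#     lock.wait()  # block until the background job has finished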
1612 """Basic class for dealing with cluster submission"""
1613
1614 name = 'slurm'
1615 job_id = 'SLURM_JOBID'
1616 idle_tag = ['Q','PD','S','CF']
1617 running_tag = ['R', 'CG']
1618 complete_tag = ['C']
1619 identifier_length = 8
1620
    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
        """Submit a job prog to a SLURM cluster"""

        me_dir = self.get_jobs_identifier(cwd, prog)

        if cwd is None:
            cwd = os.getcwd()
        if stdout is None:
            stdout = '/dev/null'
        if stderr is None:
            stderr = '/dev/null'
        elif stderr == -2:
            stderr = stdout
        if log is None:
            log = '/dev/null'

        command = ['sbatch', '-o', stdout,
                   '-J', me_dir,
                   '-e', stderr, prog] + argument

        if self.cluster_queue and self.cluster_queue != 'None':
            command.insert(1, '-p')
            command.insert(2, self.cluster_queue)

        a = misc.Popen(command, stdout=subprocess.PIPE,
                       stderr=subprocess.STDOUT,
                       stdin=subprocess.PIPE, cwd=cwd)

        output = a.communicate()
        output_arr = output[0].split(' ')
        id = output_arr[3].rstrip()

        if not id.isdigit():
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                                                                        % str(output)

        self.submitted += 1
        self.submitted_ids.append(id)
        return id

    @multiple_try()
    def control_one_job(self, id):
        """ control the status of a single job with its cluster id """
        cmd = 'squeue -j ' + str(id)
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
                            stderr=open(os.devnull, 'w'))

        for line in status.stdout:
            line = line.strip()
            if 'Invalid' in line:
                return 'F'
            elif line.startswith(str(id)):
                status = line.split()[4]
                if status in self.idle_tag:
                    return 'I'
                elif status in self.running_tag:
                    return 'R'
        return 'F'

    @multiple_try()
    def control(self, me_dir):
        """ control the status of all the jobs associated to me_dir """
        cmd = "squeue"
        pstatus = misc.Popen([cmd], stdout=subprocess.PIPE)

        me_dir = self.get_jobs_identifier(me_dir)

        idle, run, fail = 0, 0, 0
        ongoing = []
        for line in pstatus.stdout:
            if me_dir in line:
                id, _, _, _, status, _ = line.split(None, 5)
                ongoing.append(id)
                if status in self.idle_tag:
                    idle += 1
                elif status in self.running_tag:
                    run += 1
                elif status in self.complete_tag:
                    status = self.check_termination(id)
                    if status == 'wait':
                        run += 1
                    elif status == 'resubmit':
                        idle += 1
                else:
                    fail += 1

        for id in list(self.submitted_ids):
            if id not in ongoing:
                status = self.check_termination(id)
                if status == 'wait':
                    run += 1
                elif status == 'resubmit':
                    idle += 1

        return idle, run, self.submitted - (idle + run + fail), fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""

        if not self.submitted_ids:
            return
        cmd = "scancel %s" % ' '.join(self.submitted_ids)
        status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))
        self.submitted_ids = []

1732 """Class for dealing with cluster submission on a HTCaaS cluster using GPFS """
1733
1734 name= 'htcaas'
1735 job_id = 'HTCAAS_JOBID'
1736 idle_tag = ['waiting']
1737 running_tag = ['preparing','running']
1738 complete_tag = ['done']
1739
    @store_input()
    @multiple_try()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                log=None, input_files=[], output_files=[], required_output=[],
                nb_submit=0):
        """Submit the HTCaaS job on the cluster with NO SHARED DISK.
        Input/output files should be given relative to cwd.
        """

        cur_usr = os.getenv('USER')

        if cwd is None:
            cwd = os.getcwd()

        cwd_cp = cwd.rsplit("/", 2)

        if not stdout is None:
            print "stdout: %s" % stdout

        if not os.path.exists(prog):
            prog = os.path.join(cwd, prog)

        if not required_output and output_files:
            required_output = output_files

        logger.debug(prog)
        if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog:
            cwd_arg = cwd + "/arguments"
            temp = ' '.join([str(a) for a in argument])
            arg_cmd = "echo '" + temp + "' > " + cwd_arg
            command = ['htcaas-mgjob-submit', '-d', cwd, '-e', os.path.basename(prog)]
            if argument:
                command.extend(['-a ', '='.join([str(a) for a in argument])])
            a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
            id = a.stdout.read().strip()

        else:
            cwd_arg = cwd + "/arguments"
            temp = ' '.join([str(a) for a in argument])
            temp_file_name = "sub." + os.path.basename(prog)
            text = """#!/bin/bash
MYPWD=%(cwd)s
cd $MYPWD
input_files=( %(input_files)s )
for i in ${input_files[@]}
do
    chmod -f +x $i
done
/bin/bash %(prog)s %(arguments)s > %(stdout)s
"""
            dico = {'cwd': cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog': prog,
                    'arguments': ' '.join([str(a) for a in argument]),
                    'program': ' ' if '.py' in prog else 'bash'}

            new_prog = pjoin(cwd, temp_file_name)
            open(new_prog, 'w').write(text % dico)
            misc.Popen(['chmod', '+x', new_prog], cwd=cwd)
            command = ['htcaas-mgjob-submit', '-d', cwd, '-e', temp_file_name]
            a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
            id = a.stdout.read().strip()
            logger.debug(id)

        nb_try = 0
        nb_limit = 5
        if not id.isdigit():
            print "[ID is not digit]:" + id

        while not id.isdigit():
            nb_try += 1
            print "[fail_retry]:" + str(nb_try)
            a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
            id = a.stdout.read().strip()
            if nb_try > nb_limit:
                raise ClusterManagmentError, 'fail to submit to the HTCaaS cluster: \n %s' % id
                break

        self.submitted += 1
        self.submitted_ids.append(id)

        return id

    @multiple_try(nb_try=10, sleep=5)
    def control_one_job(self, id):
        """ control the status of a single job with its cluster id """

        if id == 0:
            status_out = 'C'
        else:
            cmd = 'htcaas-job-status -m ' + str(id) + " -s | grep Status "
            status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
            error = status.stderr.read()
            if status.returncode or error:
                raise ClusterManagmentError, 'htcaas-job-status returns error: %s' % error
            status_out = status.stdout.read().strip()
            status_out = status_out.split(":", 1)[1]
            if status_out == 'waiting':
                status_out = 'I'
            elif status_out == 'preparing' or status_out == 'running':
                status_out = 'R'
            elif status_out != 'done':
                status_out = 'F'
            elif status_out == 'done':
                status_out = 'C'

        return status_out

    @multiple_try()
    def control(self, me_dir=None):
        """ control the status of all the submitted jobs """
        if not self.submitted_ids:
            logger.debug("self.submitted_ids does not exist")
            return 0, 0, 0, 0

        ongoing = []
        idle, run, fail = 0, 0, 0

        start = self.submitted_ids[0]
        end = self.submitted_ids[-1]

        cmd = "htcaas-job-status -c " + str(start) + "-" + str(end)
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)

        for line in status.stdout:
            status2 = line.split()[-1]
            if status2 != 'null' or line.split()[0].strip() != '0':
                ongoing.append(line.split()[0].strip())
                logger.debug("[" + line.split()[0].strip() + "]" + status2)
            if status2 == 'null' or line.split()[0].strip() == '0':
                idle += 1
            elif status2 in self.idle_tag:
                idle += 1
            elif status2 in self.running_tag:
                run += 1
            elif status2 in self.complete_tag:
                if not self.check_termination(line.split()[0]):
                    idle += 1
            else:
                fail += 1

        return idle, run, self.submitted - (idle + run + fail), fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""

        if not self.submitted_ids:
            return
        for i in range(len(self.submitted_ids)):
            cmd = "htcaas-job-cancel -m %s" % self.submitted_ids[i]
            status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))

1895 """Class for dealing with cluster submission on a HTCaaS cluster without GPFS """
1896
1897 name= 'htcaas2'
1898 job_id = 'HTCAAS2_JOBID'
1899 idle_tag = ['waiting']
1900 running_tag = ['preparing','running']
1901 complete_tag = ['done']
1902
    @store_input()
    @multiple_try()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                log=None, input_files=[], output_files=[], required_output=[],
                nb_submit=0):
        """Submit the HTCaaS job on the cluster with NO SHARED DISK.
        Input/output files should be given relative to cwd.
        """
        if cwd is None:
            cwd = os.getcwd()

        if not os.path.exists(prog):
            prog = os.path.join(cwd, prog)

        if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog:
            if cwd or prog:
                self.submitted_dirs.append(cwd)
                self.submitted_exes.append(prog)
            else:
                logger.debug("cwd and prog do not exist -> " + cwd + " / " + os.path.basename(prog))

            if argument:
                self.submitted_args.append('='.join([str(a) for a in argument]))

            if cwd or prog:
                self.submitted += 1
                id = self.submitted
                self.submitted_ids.append(id)
            else:
                logger.debug("cwd and prog do not exist!")
                id = 0

        else:
            temp_file_name = "sub." + os.path.basename(prog)
            text = """#!/bin/bash
MYPWD=%(cwd)s
cd $MYPWD
input_files=( %(input_files)s )
for i in ${input_files[@]}
do
    chmod -f +x $i
done
/bin/bash %(prog)s %(arguments)s > %(stdout)s
"""
            dico = {'cwd': cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog': prog,
                    'arguments': ' '.join([str(a) for a in argument]),
                    'program': ' ' if '.py' in prog else 'bash'}

            new_prog = pjoin(cwd, temp_file_name)
            open(new_prog, 'w').write(text % dico)
            misc.Popen(['chmod', '+x', new_prog], cwd=cwd)
            command = ['htcaas-mgjob-submit', '-d', cwd, '-e', new_prog]
            a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
            id = a.stdout.read().strip()
            logger.debug("[mode2]-[" + str(id) + "]")
            if cwd and prog:
                self.submitted += 1
                self.submitted_ids.append(id)
            else:
                logger.debug("cwd and prog do not exist!")
                id = 0

        return id


    @multiple_try(nb_try=10, sleep=5)
2016 """ control the status of a single job with it's cluster id """
2017
2018 if self.submitted == self.submitted_ids[-1] :
2019 id = self.metasubmit(self)
2020 tempid = self.submitted_ids[-1]
2021 self.submitted_ids.remove(self.submitted_ids[-1])
2022 self.submitted_ids.append(id)
2023 logger.debug(str(id)+" // "+str(self.submitted_ids[-1]))
2024
2025 if id == 0 :
2026 status_out ='C'
2027 else:
2028 cmd = 'htcaas-job-status -m '+ str(id) + " -s | grep Status "
2029 status = misc.Popen([cmd],shell=True,stdout=subprocess.PIPE,
2030 stderr=subprocess.PIPE)
2031 error = status.stderr.read()
2032 if status.returncode or error:
2033 raise ClusterManagmentError, 'htcaas-job-status returns error: %s' % error
2034 status_out= status.stdout.read().strip()
2035 status_out= status_out.split(":",1)[1]
2036 logger.debug("[["+str(id)+"]]"+status_out)
2037 if status_out == 'waiting':
2038 status_out='I'
2039 elif status_out == 'preparing' or status_out == 'running':
2040 status_out = 'R'
2041 elif status_out != 'done':
2042 status_out = 'F'
2043 elif status_out == 'done':
2044 status_out = 'C'
2045 self.submitted -= 1
2046
2047 return status_out
2048
    @multiple_try()
    def control(self, me_dir):
        """ control the status of all the jobs submitted through HTCaaS """
        if not self.submitted_ids:
            logger.debug("self.submitted_ids does not exist")
            return 0, 0, 0, 0

        if "//" in me_dir:
            if int(me_dir.split("//")[0]) < int(me_dir.split("//")[1]):
                start = me_dir.split("//")[0]
                end = me_dir.split("//")[1]
            else:
                start = me_dir.split("//")[1]
                end = me_dir.split("//")[0]
        elif "/" in me_dir:
            start = 0
            end = 0
        elif me_dir.isdigit():
            start = me_dir
            end = me_dir
        elif not me_dir.isdigit():
            me_dir = self.submitted_ids[0]
            logger.debug("Meta_ID is not a digit (control), self.submitted_ids[0]: " + str(me_dir))

        ongoing = []
        idle, run, fail, done = 0, 0, 0, 0

        cmd = "htcaas-job-status -c " + str(start) + "-" + str(end) + " -ac"
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)

        for line in status.stdout:
            status2 = line.split()[-1]
            if status2 != 'null' or line.split()[0].strip() != '0':
                ongoing.append(str(line.split()[0].strip()) + "-" + str(line.split()[1].strip()))
                logger.debug("[" + line.split()[0].strip() + "-" + line.split()[1].strip() + "]" + status2)

            if status2 == 'null' or line.split()[0].strip() == '0':
                idle += 1
            elif status2 in self.idle_tag:
                idle += 1
            elif status2 in self.running_tag:
                run += 1
            elif status2 in self.complete_tag:
                done += 1
                self.submitted -= 1
                if not self.check_termination(line.split()[1]):
                    idle += 1
            else:
                fail += 1

        return idle, run, self.submitted - (idle + run + fail), fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""

        if not self.submitted_ids:
            return
        id = self.submitted_ids[0]
        if id != 0:
            cmd = "htcaas-job-cancel -m %s" % str(id)
            status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))


from_name = {'condor': CondorCluster, 'pbs': PBSCluster, 'sge': SGECluster,
             'lsf': LSFCluster, 'ge': GECluster, 'slurm': SLURMCluster,
             'htcaas': HTCaaSCluster, 'htcaas2': HTCaaS2Cluster}
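
# Minimal usage sketch (not part of the original module). The option keys below
# mirror those read in Cluster.__init__ and wait(); the script names and paths
# are placeholders.
#     cluster = from_name['condor'](cluster_type='condor',
#                                   cluster_queue='madgraph',
#                                   cluster_nb_retry=2,
#                                   cluster_retry_wait=300,
#                                   cluster_status_update=(600, 30),
#                                   cluster_temp_path=None)
#     job_id = cluster.submit('./run.sh', cwd='/path/to/SubProcesses/P0_dir')
#     cluster.wait('/path/to/SubProcesses/P0_dir', lambda idle, run, done: None)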