import subprocess
import logging
import os
import time
import re
import glob
import inspect
import sys

logger = logging.getLogger('madgraph.cluster')

try:
    from madgraph import MadGraph5Error
    import madgraph.various.misc as misc
except Exception, error:
    if __debug__:
        print str(error)
    from internal import MadGraph5Error
    import internal.misc as misc

pjoin = os.path.join


class ClusterManagmentError(MadGraph5Error):
    pass


class NotImplemented(MadGraph5Error):
    pass

multiple_try = misc.multiple_try
pjoin = os.path.join


def check_interupt(error=KeyboardInterrupt):

    def deco_interupt(f):
        def deco_f_interupt(self, *args, **opt):
            try:
                return f(self, *args, **opt)
            except error:
                try:
                    self.remove(*args, **opt)
                except Exception:
                    pass
                raise error
        return deco_f_interupt
    return deco_interupt
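
# Usage sketch for the decorator above (hypothetical subclass): if the wrapped
# call is interrupted (KeyboardInterrupt by default), self.remove() is invoked
# to clean the submitted jobs before the exception is re-raised.
#
#   class MyCluster(Cluster):
#       @check_interupt()
#       def wait_for_jobs(self, me_dir):
#           ...  # a ctrl-C here removes the submitted jobs first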

def store_input(arg=''):

    def deco_store(f):
        def deco_f_store(self, *args, **opts):
            # rebuild the full keyword dictionary of the call (defaults
            # included) so the submission can be replayed via self.retry_args
            names = inspect.getargspec(f)[0]
            defaults = inspect.getargspec(f)[3] or ()
            stored = dict(zip(names[len(names) - len(defaults):], defaults))
            stored.update(dict(zip(names[1:], args)))
            stored.update(opts)
            id = f(self, **stored)
            if id:
                self.retry_args[id] = stored
            return id
        return deco_f_store
    return deco_store

def need_transfer(options):
    """This function checks whether compression of input files is necessary
    given the running options."""

    if options['run_mode'] != 1 and options['cluster_temp_path'] is None:
        return False
    else:
        return True
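
# Behaviour sketch: a transfer is needed either in cluster mode (run_mode == 1)
# or whenever a node-local scratch directory is configured.
#
#   assert need_transfer({'run_mode': 1, 'cluster_temp_path': None}) == True
#   assert need_transfer({'run_mode': 2, 'cluster_temp_path': None}) == False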


class Cluster(object):
    """Basic Class for all cluster type submission"""

    name = 'mother class'
    identifier_length = 14

    def __init__(self, *args, **opts):
        """Init the cluster"""

        self.submitted = 0
        self.submitted_ids = []
        self.finish = 0
        self.submitted_dirs = []
        self.submitted_exes = []
        self.submitted_args = []

        if 'cluster_queue' in opts:
            self.cluster_queue = opts['cluster_queue']
        else:
            self.cluster_queue = 'madgraph'
        if 'cluster_temp_path' in opts:
            self.temp_dir = opts['cluster_temp_path']
        else:
            self.temp_dir = None
        # default options, overwritten by anything passed by the caller
        self.options = {'cluster_status_update': (600, 30)}
        for key, value in opts.items():
            self.options[key] = value
        self.nb_retry = opts['cluster_nb_retry'] if 'cluster_nb_retry' in opts else 0
        self.cluster_retry_wait = float(opts['cluster_retry_wait']) if 'cluster_retry_wait' in opts else 300
        self.retry_args = {}

        self.packet = {}
        self.id_to_packet = {}
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
               log=None, required_output=[], nb_submit=0):
        """How to make one submission. Return status id on the cluster."""
        raise NotImplemented, 'No implementation of how to submit a job to cluster \'%s\'' % self.name

    @store_input()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                log=None, input_files=[], output_files=[], required_output=[],
                nb_submit=0):
        """How to make one submission. Return status id on the cluster.
        NO SHARE DISK"""

        if cwd is None:
            cwd = os.getcwd()
        if not os.path.exists(prog):
            prog = os.path.join(cwd, prog)

        if not required_output and output_files:
            required_output = output_files

        # fall back on the plain submission when no file transfer is needed
        if not hasattr(self, 'temp_dir') or not self.temp_dir or \
                (input_files == [] == output_files):
            return self.submit(prog, argument, cwd, stdout, stderr, log,
                               required_output=required_output, nb_submit=nb_submit)

        temp_file_name = "sub." + os.path.basename(prog) + '.'.join(argument)

        text = """#!/bin/bash
        MYTMP=%(tmpdir)s/run$%(job_id)s
        MYPWD=%(cwd)s
        mkdir -p $MYTMP
        cd $MYPWD
        input_files=( %(input_files)s )
        for i in ${input_files[@]}
        do
            cp -R -L $i $MYTMP
        done
        cd $MYTMP
        echo '%(arguments)s' > arguments
        chmod +x ./%(script)s
        %(program)s ./%(script)s %(arguments)s
        exit=$?
        output_files=( %(output_files)s )
        for i in ${output_files[@]}
        do
            cp -r $MYTMP/$i $MYPWD
        done
        # if [ "$exit" -eq "0" ]
        # then
        rm -rf $MYTMP
        # fi
        """

        dico = {'tmpdir': self.temp_dir, 'script': os.path.basename(prog),
                'cwd': cwd, 'job_id': self.job_id,
                'input_files': ' '.join(input_files + [prog]),
                'output_files': ' '.join(output_files),
                'arguments': ' '.join([str(a) for a in argument]),
                'program': ' ' if '.py' in prog else 'bash'}

        # write the wrapper which stages files in/out of the scratch directory
        new_prog = pjoin(cwd, temp_file_name)
        open(new_prog, 'w').write(text % dico)
        misc.Popen(['chmod', '+x', new_prog], cwd=cwd)

        return self.submit(new_prog, argument, cwd, stdout, stderr, log,
                           required_output=required_output, nb_submit=nb_submit)
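
    # Usage sketch (hypothetical paths, a configured back-end with
    # cluster_temp_path set): stage input.dat on the node-local scratch,
    # run "run.sh 1 2" there and copy out.dat back afterwards.
    #
    #   cid = cluster.submit2('run.sh', argument=['1', '2'], cwd='/path/to/job',
    #                         input_files=['input.dat'], output_files=['out.dat'])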

    def cluster_submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                       log=None, input_files=[], output_files=[], required_output=[],
                       nb_submit=0, packet_member=None):
        """This function wraps the cluster submission in a cluster-independent
        way. It should not be overwritten (except for DAG-type submission)."""

        id = self.submit2(prog, argument, cwd, stdout, stderr, log, input_files,
                          output_files, required_output, nb_submit)

        if not packet_member:
            return id
        else:
            if isinstance(packet_member, Packet):
                self.id_to_packet[id] = packet_member
                packet_member.put(id)
                if packet_member.tag not in self.packet:
                    self.packet[packet_member.tag] = packet_member
            else:
                if packet_member in self.packet:
                    packet = self.packet[packet_member]
                    packet.put(id)
                    self.id_to_packet[id] = packet
            return id
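
    # Packet sketch (hypothetical combine callback): jobs sharing a Packet are
    # grouped; when the last one terminates, combine('/path/to/G1') is called.
    #
    #   pack = Packet('G1', combine, ('/path/to/G1',))
    #   for card in cards:
    #       cluster.cluster_submit('run.sh', [card], packet_member=pack)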

    def control(self, me_dir=None):
        """Check the status of the jobs associated to the directory me_dir.
        Return (idle, run, finish, fail)."""

        if not self.submitted_ids:
            raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name
        idle, run, fail = 0, 0, 0
        for pid in self.submitted_ids[:]:
            status = self.control_one_job(pid)
            if status == 'I':
                idle += 1
            elif status == 'R':
                run += 1
            elif status == 'F':
                self.finish += 1
                self.submitted_ids.remove(pid)
            else:
                fail += 1

        return idle, run, self.finish, fail

    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
        raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name

    def get_jobs_identifier(self, path, second_path=None):
        """Get a unique run name for all the jobs; this helps to identify the
        runs in the controller on some clusters."""

        if second_path:
            path = os.path.realpath(pjoin(path, second_path))
        elif not os.path.exists(path):
            return path

        if 'SubProcesses' in path:
            target = path.rsplit('/SubProcesses', 1)[0]
        elif 'MCatNLO' in path:
            target = path.rsplit('/MCatNLO', 1)[0]
        elif second_path:
            target = path
            logger.warning("cluster.get_jobs_identifier runs unexpectedly. This should be fine but report this message if you have a problem.")
        else:
            target = path

        if target.endswith('/'):
            target = target[:-1]

        target = misc.digest(target)[-self.identifier_length:]
        if not target[0].isalpha():
            target = 'a' + target[1:]

        return target
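
    # Sketch (hypothetical, existing paths): all jobs below one process
    # directory share an identifier, so schedulers can be queried per run.
    #
    #   tag1 = cluster.get_jobs_identifier('/mg5/proc/SubProcesses/P1_gg_h')
    #   tag2 = cluster.get_jobs_identifier('/mg5/proc/SubProcesses/P2_qq_h')
    #   # if both directories exist, tag1 == tag2 (hash of '/mg5/proc')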

    @check_interupt()
    def wait(self, me_dir, fct, minimal_job=0, update_first=None):
        """Wait for all jobs to finish.
        If minimal_job is set, return once idle + run drops below that number."""

        mode = 1  # 0 = long delay between checks, 1 = short one
        nb_iter = 0
        nb_short = 0
        change_at = 5  # number of iterations before switching to the long mode

        if update_first:
            idle, run, finish, fail = self.control(me_dir)
            update_first(idle, run, finish)

        longtime, shorttime = self.options['cluster_status_update']

        nb_job = 0

        if self.options['cluster_type'] == 'htcaas2':
            me_dir = self.metasubmit(self)

        while 1:
            old_mode = mode
            nb_iter += 1
            idle, run, finish, fail = self.control(me_dir)
            if nb_job:
                if idle + run + finish + fail != nb_job:
                    nb_job = idle + run + finish + fail
                    nb_iter = 1  # some jobs have changed status
            else:
                nb_job = idle + run + finish + fail
            if fail:
                raise ClusterManagmentError('Some jobs are in a Hold/... state. Please try to investigate or contact the IT team')
            if idle + run == 0:
                logger.info('All jobs finished')
                fct(idle, run, finish)
                break
            if idle + run < minimal_job:
                return
            fct(idle, run, finish)

            # determine how long to wait (mode 0 -> long, mode 1 -> short)
            if nb_iter < change_at:
                mode = 1
            elif idle < run:
                if old_mode == 0:
                    if nb_short:
                        mode = 0  # already switched back once: stay in long mode
                    elif idle:
                        if nb_iter > change_at + int(longtime) // shorttime:
                            mode = 0  # stay in long waiting mode
                        else:
                            mode = 1  # pass in short waiting mode
                            nb_short = 0
                    else:
                        mode = 1  # pass in short waiting mode
                        nb_short = 0
                elif old_mode == 1:
                    nb_short += 1
                    if nb_short > 3 * max(change_at, int(longtime) // shorttime):
                        mode = 0
            else:
                mode = 0

            if old_mode > mode:
                logger.info('''Start to wait %ss between checking status.
Note that you can change this time in the configuration file.
Press ctrl-C to force the update.''' % self.options['cluster_status_update'][0])

            if mode == 0:
                try:
                    time.sleep(self.options['cluster_status_update'][0])
                except KeyboardInterrupt:
                    logger.info('start to update the status')
                    nb_iter = min(0, change_at - 2)
                    nb_short = 0
            else:
                time.sleep(self.options['cluster_status_update'][1])

        self.submitted = 0
        self.submitted_ids = []
    def check_termination(self, job_id):
        """Check the termination of the jobs with job_id and relaunch it if needed."""

        if job_id not in self.retry_args:
            if job_id in self.id_to_packet:
                nb_in_packet = self.id_to_packet[job_id].remove_one()
                if nb_in_packet == 0:
                    # packet done: run the associated function
                    packet = self.id_to_packet[job_id]
                    # fully ensure that the packet is finished (thread safe)
                    packet.queue.join()
                    packet.fct(*packet.args)
                del self.id_to_packet[job_id]
                return 'resubmit'
            else:
                return True

        args = self.retry_args[job_id]
        if 'time_check' in args:
            time_check = args['time_check']
        else:
            time_check = 0

        for path in args['required_output']:
            if args['cwd']:
                path = pjoin(args['cwd'], path)
            # a missing or empty file means the job is not really done
            if not (os.path.exists(path) and os.stat(path).st_size != 0):
                break
        else:
            # all the required output is present
            if time_check > 0:
                logger.info('Job %s finally found the missing output.' % (job_id))
            del self.retry_args[job_id]
            self.submitted_ids.remove(job_id)

            if job_id in self.id_to_packet:
                nb_in_packet = self.id_to_packet[job_id].remove_one()
                if nb_in_packet == 0:
                    # packet done: run the associated function
                    packet = self.id_to_packet[job_id]
                    # fully ensure that the packet is finished (thread safe)
                    packet.queue.join()
                    packet.fct(*packet.args)
                del self.id_to_packet[job_id]
                return 'resubmit'

            return 'done'

        if time_check == 0:
            logger.debug('''Job %s: missing output: %s''' % (job_id, path))
            args['time_check'] = time.time()
            return 'wait'
        elif self.cluster_retry_wait > time.time() - time_check:
            return 'wait'

        if self.nb_retry < 0:
            logger.critical('''Fail to run correctly job %s.
            with option: %s
            file missing: %s''' % (job_id, args, path))
            raw_input('press enter to continue.')
        elif self.nb_retry == 0:
            logger.critical('''Fail to run correctly job %s.
            with option: %s
            file missing: %s.
            Stopping all runs.''' % (job_id, args, path))
            self.remove()
        elif args['nb_submit'] >= self.nb_retry:
            logger.critical('''Fail to run correctly job %s.
            with option: %s
            file missing: %s
            Fails %s times
            No resubmission.''' % (job_id, args, path, args['nb_submit']))
            self.remove()
        else:
            args['nb_submit'] += 1
            logger.warning('resubmit job (for the %s time)' % args['nb_submit'])
            del self.retry_args[job_id]
            self.submitted_ids.remove(job_id)
            if 'time_check' in args:
                del args['time_check']
            if job_id in self.id_to_packet:
                self.id_to_packet[job_id].remove_one()
                args['packet_member'] = self.id_to_packet[job_id]
                del self.id_to_packet[job_id]
                self.cluster_submit(**args)
            else:
                self.submit2(**args)
            return 'resubmit'
        return 'done'
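
    # Polling sketch built on the protocol above: 'wait' means the output is
    # not yet on disk, 'resubmit' means the job was sent again, 'done'/True
    # means the job is fully accounted for.
    #
    #   status = cluster.check_termination(job_id)
    #   if status == 'wait':
    #       time.sleep(30)
    #   elif status == 'resubmit':
    #       pass  # keep polling with the new id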

    @check_interupt()
    def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None,
                        stderr=None, log=None, required_output=[], nb_submit=0,
                        input_files=[], output_files=[]):
        """launch one job on the cluster and wait for it"""

        special_output = False
        if stderr == -2 and stdout:
            # the stderr is requested to be merged into the stdout
            special_output = True
            stderr = stdout + '.err'

        id = self.submit2(prog, argument, cwd, stdout, stderr, log,
                          required_output=required_output, input_files=input_files,
                          output_files=output_files)

        if self.options['cluster_type'] == 'htcaas2':
            if self.submitted == self.submitted_ids[-1]:
                id = self.metasubmit(self)

        # store the arguments of the call to allow a resubmission if needed
        frame = inspect.currentframe()
        args, _, _, values = inspect.getargvalues(frame)
        args = dict([(i, values[i]) for i in args if i != 'self'])
        self.retry_args[id] = args

        nb_wait = 0
        while 1:
            nb_wait += 1
            status = self.control_one_job(id)
            if status not in ['R', 'I']:
                status = self.check_termination(id)
                if status in ['wait']:
                    time.sleep(30)
                    continue
                elif status in ['resubmit']:
                    id = self.submitted_ids[0]
                    time.sleep(30)
                    continue
                # the job is done
                time.sleep(30)  # security to ensure that the file is written on the disk
                break
            time.sleep(self.options['cluster_status_update'][1])

        idle, run = 0, 0  # counters for the final output check
        if required_output:
            status = self.check_termination(id)
            if status == 'wait':
                run += 1
            elif status == 'resubmit':
                idle += 1

        if special_output:
            # combine the stdout and the stderr: wait up to ~50 s for the
            # files to show up on the disk
            for i in range(5):
                if os.path.exists(stdout):
                    if not os.path.exists(stderr):
                        time.sleep(5)
                    if os.path.exists(stderr):
                        err_text = open(stderr).read()
                        if not err_text:
                            return
                        logger.warning(err_text)
                        text = open(stdout).read()
                        open(stdout, 'w').write(text + err_text)
                    else:
                        return
                time.sleep(10)

    def remove(self, *args, **opts):
        """Dummy routine: this cluster type does not support job removal."""
        logger.warning("""This cluster does not support job removal,
        the jobs are still running on the cluster.""")


class Packet(object):
    """An object for handling a packet of jobs; it is designed to be thread safe."""

    def __init__(self, name, fct, args, opts={}):
        import Queue
        import threading
        self.queue = Queue.Queue()
        self.tag = name
        self.fct = fct
        self.args = args
        self.opts = opts
        self.done = threading.Event()

    def put(self, *args, **opts):
        self.queue.put(*args, **opts)

    append = put

    def remove_one(self):
        """mark one job of the packet as done and return how many are left"""
        self.queue.get(True)
        self.queue.task_done()
        return self.queue.qsize()
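
# Usage sketch: a Packet ties N jobs to one follow-up call (hypothetical
# combine_results callback). Each finished job triggers remove_one(); when the
# count reaches zero the owner runs packet.fct(*packet.args).
#
#   pack = Packet('G1', combine_results, ('/path/to/G1',))
#   pack.put(job_id)           # register one job
#   left = pack.remove_one()   # job finished; left == 0 -> run combine_results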


class MultiCore(Cluster):
    """Class for dealing with the submission on a multicore machine"""

    job_id = "$"

    def __init__(self, *args, **opt):
        """Init the cluster"""

        super(MultiCore, self).__init__(*args, **opt)

        import Queue
        import threading
        import thread
        self.queue = Queue.Queue()      # jobs to run
        self.done = Queue.Queue()       # jobs finished
        self.submitted = Queue.Queue()  # one entry by job submitted
        self.stoprequest = threading.Event()  # flag used to stop everything
        self.demons = []
        self.nb_done = 0
        if 'nb_core' in opt:
            self.nb_core = opt['nb_core']
        elif isinstance(args[0], int):
            self.nb_core = args[0]
        else:
            self.nb_core = 1
        self.update_fct = None

        self.lock = threading.Event()  # allows to wake up the main thread
        self.pids = Queue.Queue()      # pids of the jobs started by the workers
        self.done_pid = []             # pids that are finished
        self.done_pid_queue = Queue.Queue()
        self.fail_msg = None

        # start the worker threads
        for _ in range(self.nb_core):
            self.start_demon()

    def start_demon(self):
        import threading
        t = threading.Thread(target=self.worker)
        t.daemon = True
        t.start()
        self.demons.append(t)

    def worker(self):
        import Queue
        import thread
        while not self.stoprequest.isSet():
            try:
                args = self.queue.get()
                tag, exe, arg, opt = args
                try:
                    # the job can be a path to an executable or a Python function
                    if isinstance(exe, str):
                        if os.path.exists(exe) and not exe.startswith('/'):
                            exe = './' + exe
                        if isinstance(opt['stdout'], str):
                            opt['stdout'] = open(opt['stdout'], 'w')
                        if opt['stderr'] is None:
                            opt['stderr'] = subprocess.STDOUT
                        proc = misc.Popen([exe] + arg, **opt)
                        pid = proc.pid
                        self.pids.put(pid)
                        proc.wait()
                        if proc.returncode not in [0, 143, -15] and not self.stoprequest.isSet():
                            fail_msg = 'program %s launch ends with non zero status: %s. Stop all computation' % \
                                       (' '.join([exe] + arg), proc.returncode)
                            logger.warning(fail_msg)
                            self.stoprequest.set()
                            self.remove(fail_msg)
                    else:
                        # the job is a Python function
                        pid = tag
                        self.pids.put(pid)
                        returncode = exe(*arg, **opt)
                        if returncode != 0:
                            logger.warning("fct %s does not return 0. Stopping the code in a clean way. The error was:\n%s", exe, returncode)
                            self.stoprequest.set()
                            self.remove("fct %s does not return 0:\n %s" % (exe, returncode))
                except Exception, error:
                    self.fail_msg = sys.exc_info()
                    logger.warning(str(error))
                    self.stoprequest.set()
                    self.remove(error)

                    if __debug__:
                        raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2]

                self.queue.task_done()
                self.done.put(tag)
                self.done_pid_queue.put(pid)

                try:
                    self.lock.set()
                except thread.error:
                    continue
            except Queue.Empty:
                continue

    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
               log=None, required_output=[], nb_submit=0):
        """submit a job on the multicore machine"""

        tag = (prog, tuple(argument), cwd, nb_submit)
        if isinstance(prog, str):
            opt = {'cwd': cwd,
                   'stdout': stdout,
                   'stderr': stderr}
            self.queue.put((tag, prog, argument, opt))
            self.submitted.put(1)
            return tag
        else:
            # python function
            self.queue.put((tag, prog, argument, {}))
            self.submitted.put(1)
            return tag

    def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None,
                        stderr=None, log=None, **opts):
        """launch one job and wait for it"""
        if isinstance(stdout, str):
            stdout = open(stdout, 'w')
        if isinstance(stderr, str):
            stderr = open(stderr, 'w')
        return misc.call([prog] + argument, stdout=stdout, stderr=stderr, cwd=cwd)
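
    # Usage sketch (hypothetical script): run one local job synchronously.
    #
    #   mc = MultiCore(4)
    #   rc = mc.launch_and_wait('./run.sh', ['1'], cwd='/tmp/job',
    #                           stdout='run.log', stderr='run.err')
    #   assert rc == 0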

    def remove(self, error=None):
        """Ensure that all threads are killed"""

        # prevent any new job from starting
        self.stoprequest.set()
        if error and not self.fail_msg:
            self.fail_msg = error

        # flush the queue of pids that are already finished
        while not self.done_pid_queue.empty():
            pid = self.done_pid_queue.get()
            self.done_pid.append(pid)

        # kill the remaining child processes
        while not self.pids.empty():
            pid = self.pids.get()
            self.pids.task_done()
            if isinstance(pid, tuple):
                continue
            if pid in self.done_pid:
                continue
            out = os.system('CPIDS=$(pgrep -P %(pid)s); kill -15 $CPIDS > /dev/null 2>&1' \
                            % {'pid': pid})
            out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid': pid})

    def wait(self, me_dir, update_status, update_first=None):
        """Wait until all the jobs are done. This function also checks that
        the submissions by packet are handled correctly (i.e. it submits the
        packet function once a packet is complete)."""

        import Queue
        import threading

        try:
            last_status = (0, 0, 0)
            sleep_time = 1
            use_lock = True
            first = True
            while True:
                force_one_more_loop = False

                # check the jobs that have just terminated
                while self.done.qsize():
                    try:
                        tag = self.done.get(True, 1)
                    except Queue.Empty:
                        pass
                    else:
                        if self.id_to_packet and tuple(tag) in self.id_to_packet:
                            packet = self.id_to_packet[tuple(tag)]
                            remaining = packet.remove_one()
                            if remaining == 0:
                                # fully ensure that the packet is finished (thread safe)
                                packet.queue.join()
                                self.submit(packet.fct, packet.args)
                                force_one_more_loop = True
                        self.nb_done += 1
                        self.done.task_done()

                Idle = self.queue.qsize()
                Done = self.nb_done + self.done.qsize()
                Running = max(0, self.submitted.qsize() - Idle - Done)

                if Idle + Running <= 0 and not force_one_more_loop:
                    update_status(Idle, Running, Done)
                    # run the exit routine
                    self.queue.join()
                    break

                if (Idle, Running, Done) != last_status:
                    if first and update_first:
                        update_first(Idle, Running, Done)
                        first = False
                    else:
                        update_status(Idle, Running, Done)
                    last_status = (Idle, Running, Done)

                # clean the queue of finished pids
                while not self.done_pid_queue.empty():
                    pid = self.done_pid_queue.get()
                    self.done_pid.append(pid)
                    self.done_pid_queue.task_done()

                # wait for a job to finish: first via the lock (a worker sets
                # it as soon as a job ends), then by simple polling
                if use_lock:
                    use_lock = self.lock.wait(300)
                    self.lock.clear()
                    if not use_lock and Idle > 0:
                        use_lock = True
                else:
                    time.sleep(sleep_time)
                    sleep_time = min(sleep_time + 2, 180)
            if update_first:
                update_first(Idle, Running, Done)

            if self.stoprequest.isSet():
                if isinstance(self.fail_msg, Exception):
                    raise self.fail_msg
                elif isinstance(self.fail_msg, str):
                    raise Exception, self.fail_msg
                else:
                    misc.sprint(self.fail_msg)
                    raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2]

            # reset the queues for the next round of submissions
            try:
                self.lock.clear()
            except Exception:
                pass
            self.done = Queue.Queue()
            self.done_pid = []
            self.done_pid_queue = Queue.Queue()
            self.nb_done = 0
            self.submitted = Queue.Queue()
            self.pids = Queue.Queue()
            self.stoprequest.clear()

        except KeyboardInterrupt:
            # forward the exception raised by a worker if any
            if isinstance(self.fail_msg, Exception):
                raise self.fail_msg
            elif isinstance(self.fail_msg, str):
                raise Exception, self.fail_msg
            elif self.fail_msg:
                raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2]
            raise

class CondorCluster(Cluster):
    """Basic class for dealing with cluster submission"""

    name = 'condor'
    job_id = 'CONDOR_ID'

    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
        """Submit a job prog to a Condor cluster"""

        text = """Executable = %(prog)s
                  output = %(stdout)s
                  error = %(stderr)s
                  log = %(log)s
                  %(argument)s
                  environment = CONDOR_ID=$(Cluster).$(Process)
                  Universe = vanilla
                  notification = Error
                  Initialdir = %(cwd)s
                  %(requirement)s
                  getenv=True
                  queue 1
               """

        if self.cluster_queue not in ['None', None]:
            requirement = 'Requirements = %s=?=True' % self.cluster_queue
        else:
            requirement = ''

        if cwd is None:
            cwd = os.getcwd()
        if stdout is None:
            stdout = '/dev/null'
        if stderr is None:
            stderr = '/dev/null'
        if log is None:
            log = '/dev/null'
        if not os.path.exists(prog):
            prog = os.path.join(cwd, prog)
        if argument:
            argument = 'Arguments = %s' % ' '.join(argument)
        else:
            argument = ''

        dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout,
                'stderr': stderr, 'log': log, 'argument': argument,
                'requirement': requirement}

        # pipe the submission file to condor_submit
        a = misc.Popen(['condor_submit'], stdout=subprocess.PIPE,
                       stdin=subprocess.PIPE)
        output, _ = a.communicate(text % dico)

        # extract the job id from the "... submitted to cluster NNN" line
        pat = re.compile("submitted to cluster (\d*)", re.MULTILINE)
        try:
            id = pat.search(output).groups()[0]
        except Exception:
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                                         % output
        self.submitted += 1
        self.submitted_ids.append(id)
        return id

    @store_input()
    @multiple_try()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                log=None, input_files=[], output_files=[], required_output=[],
                nb_submit=0):
        """Submit the job on the cluster. NO SHARE DISK:
        input/output files should be given relative to cwd.
        """

        if not required_output and output_files:
            required_output = output_files

        if (input_files == [] == output_files):
            return self.submit(prog, argument, cwd, stdout, stderr, log,
                               required_output=required_output, nb_submit=nb_submit)

        text = """Executable = %(prog)s
                  output = %(stdout)s
                  error = %(stderr)s
                  log = %(log)s
                  %(argument)s
                  should_transfer_files = YES
                  when_to_transfer_output = ON_EXIT
                  transfer_input_files = %(input_files)s
                  %(output_files)s
                  Universe = vanilla
                  notification = Error
                  Initialdir = %(cwd)s
                  %(requirement)s
                  getenv=True
                  queue 1
               """

        if self.cluster_queue not in ['None', None]:
            requirement = 'Requirements = %s=?=True' % self.cluster_queue
        else:
            requirement = ''

        if cwd is None:
            cwd = os.getcwd()
        if stdout is None:
            stdout = '/dev/null'
        if stderr is None:
            stderr = '/dev/null'
        if log is None:
            log = '/dev/null'
        if not os.path.exists(prog):
            prog = os.path.join(cwd, prog)
        if argument:
            argument = 'Arguments = %s' % ' '.join([str(a) for a in argument])
        else:
            argument = ''

        if input_files:
            input_files = ','.join(input_files)
        else:
            input_files = ''
        if output_files:
            output_files = 'transfer_output_files = %s' % ','.join(output_files)
        else:
            output_files = ''

        dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout,
                'stderr': stderr, 'log': log, 'argument': argument,
                'requirement': requirement, 'input_files': input_files,
                'output_files': output_files}

        # pipe the submission file to condor_submit
        a = subprocess.Popen(['condor_submit'], stdout=subprocess.PIPE,
                             stdin=subprocess.PIPE)
        output, _ = a.communicate(text % dico)

        # extract the job id from the "... submitted to cluster NNN" line
        pat = re.compile("submitted to cluster (\d*)", re.MULTILINE)
        try:
            id = pat.search(output).groups()[0]
        except Exception:
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                                         % output
        self.submitted += 1
        self.submitted_ids.append(id)
        return id

    @multiple_try(nb_try=10, sleep=10)
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
        cmd = 'condor_q ' + str(id) + " -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'"
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)

        error = status.stderr.read()
        if status.returncode or error:
            raise ClusterManagmentError, 'condor_q returns error: %s' % error

        return status.stdout.readline().strip()

    @check_interupt()
    @multiple_try(nb_try=10, sleep=10)
    def control(self, me_dir):
        """Control the status of all the submitted jobs.
        Return (idle, run, finish, fail)."""

        if not self.submitted_ids:
            return 0, 0, 0, 0

        packet = 15000  # query condor_q by batches of 15000 job ids
        idle, run, fail = 0, 0, 0
        ongoing = []
        for i in range(1 + (len(self.submitted_ids) - 1) // packet):
            start = i * packet
            stop = (i + 1) * packet
            cmd = "condor_q " + ' '.join(self.submitted_ids[start:stop]) + \
                  " -format \'%-2s \' \'ClusterId\' " + \
                  " -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'"

            status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
            error = status.stderr.read()
            if status.returncode or error:
                raise ClusterManagmentError, 'condor_q returns error: %s' % error

            for line in status.stdout:
                id, status = line.strip().split()
                ongoing.append(int(id))
                if status in ['I', 'U']:
                    idle += 1
                elif status == 'R':
                    run += 1
                elif status != 'C':
                    fail += 1

        for id in list(self.submitted_ids):
            if int(id) not in ongoing:
                status = self.check_termination(id)
                if status == 'wait':
                    run += 1
                elif status == 'resubmit':
                    idle += 1

        return idle, run, self.submitted - (idle + run + fail), fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""

        if not self.submitted_ids:
            return
        cmd = "condor_rm %s" % ' '.join(self.submitted_ids)

        status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))
        self.submitted_ids = []


class PBSCluster(Cluster):
    """Basic class for dealing with cluster submission"""

    name = 'pbs'
    job_id = 'PBS_JOBID'
    idle_tag = ['Q']
    running_tag = ['T', 'E', 'R']
    complete_tag = ['C']

    maximum_submited_jobs = 2500

    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
        """Submit a job prog to a PBS cluster"""

        me_dir = self.get_jobs_identifier(cwd, prog)

        if len(self.submitted_ids) > self.maximum_submited_jobs:
            fct = lambda idle, run, finish: logger.info('Waiting for free slot: %s %s %s' % (idle, run, finish))
            self.wait(me_dir, fct, self.maximum_submited_jobs)

        text = ""
        if cwd is None:
            cwd = os.getcwd()
        else:
            text = " cd %s;" % cwd
        if stdout is None:
            stdout = '/dev/null'
        if stderr is None:
            stderr = '/dev/null'
        elif stderr == -2:  # -2 means merge with stdout
            stderr = stdout
        if log is None:
            log = '/dev/null'

        if not os.path.isabs(prog):
            text += "./%s" % prog
        else:
            text += prog

        if argument:
            text += ' ' + ' '.join(argument)

        command = ['qsub', '-o', stdout,
                   '-N', me_dir,
                   '-e', stderr,
                   '-V']

        if self.cluster_queue and self.cluster_queue != 'None':
            command.extend(['-q', self.cluster_queue])

        a = misc.Popen(command, stdout=subprocess.PIPE,
                       stderr=subprocess.STDOUT,
                       stdin=subprocess.PIPE, cwd=cwd)

        output = a.communicate(text)[0]
        id = output.split('.')[0]
        if not id.isdigit() or a.returncode != 0:
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                                         % output

        self.submitted += 1
        self.submitted_ids.append(id)
        return id

    @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
        cmd = 'qstat ' + str(id)
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)

        jobstatus = ""
        for line in status.stdout:
            line = line.strip()
            if 'cannot connect to server' in line or 'cannot read reply' in line:
                raise ClusterManagmentError, 'server disconnected'
            if 'Unknown' in line:
                return 'F'
            elif line.startswith(str(id)):
                jobstatus = line.split()[4]
            else:
                jobstatus = ""

        if status.returncode != 0 and status.returncode is not None:
            raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode
        if jobstatus in self.idle_tag:
            return 'I'
        elif jobstatus in self.running_tag:
            return 'R'
        return 'F'

    @multiple_try()
    def control(self, me_dir):
        """Check the status of the jobs associated to me_dir.
        Return (idle, run, finish, fail)."""
        cmd = "qstat"
        status = misc.Popen([cmd], stdout=subprocess.PIPE)

        me_dir = self.get_jobs_identifier(me_dir)

        ongoing = []

        idle, run, fail = 0, 0, 0
        for line in status.stdout:
            if 'cannot connect to server' in line or 'cannot read reply' in line:
                raise ClusterManagmentError, 'server disconnected'
            if me_dir in line:
                ongoing.append(line.split()[0].split('.')[0])
                status2 = line.split()[4]
                if status2 in self.idle_tag:
                    idle += 1
                elif status2 in self.running_tag:
                    run += 1
                elif status2 in self.complete_tag:
                    if not self.check_termination(line.split()[0].split('.')[0]):
                        idle += 1
                else:
                    fail += 1

        if status.returncode != 0 and status.returncode is not None:
            raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode

        for id in list(self.submitted_ids):
            if id not in ongoing:
                status2 = self.check_termination(id)
                if status2 == 'wait':
                    run += 1
                elif status2 == 'resubmit':
                    idle += 1

        return idle, run, self.submitted - (idle + run + fail), fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""

        if not self.submitted_ids:
            return
        cmd = "qdel %s" % ' '.join(self.submitted_ids)
        status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))
        self.submitted_ids = []


class SGECluster(Cluster):
    """Basic class for dealing with cluster submission"""

    name = 'sge'
    job_id = 'JOB_ID'
    idle_tag = ['qw', 'hqw', 'hRqw', 'w']
    running_tag = ['r', 't', 'Rr', 'Rt']
    identifier_length = 10

    def def_get_path(self, location):
        """replace string for path issues"""
        location = os.path.realpath(location)
        homePath = os.getenv("HOME")
        if homePath:
            location = location.replace(homePath, '$HOME')
        return location

    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
        """Submit a job prog to an SGE cluster"""

        me_dir = self.get_jobs_identifier(cwd, prog)

        if cwd is None:
            cwd = self.def_get_path(os.getcwd())
        cwd1 = self.def_get_path(cwd)
        text = " cd %s;" % cwd1
        if stdout is None:
            stdout = '/dev/null'
        else:
            stdout = self.def_get_path(stdout)
        if stderr is None:
            stderr = '/dev/null'
        elif stderr == -2:  # -2 means merge with stdout
            stderr = stdout
        else:
            stderr = self.def_get_path(stderr)

        if log is None:
            log = '/dev/null'
        else:
            log = self.def_get_path(log)

        text += prog
        if argument:
            text += ' ' + ' '.join(argument)

        # replace the home directory by $HOME to avoid path issues
        homePath = os.getenv("HOME")
        if homePath:
            text = text.replace(homePath, '$HOME')

        logger.debug("!=== input %s" % text)
        logger.debug("!=== output %s" % stdout)
        logger.debug("!=== error %s" % stderr)
        logger.debug("!=== logs %s" % log)

        command = ['qsub', '-o', stdout,
                   '-N', me_dir,
                   '-e', stderr,
                   '-V']

        if self.cluster_queue and self.cluster_queue != 'None':
            command.extend(['-q', self.cluster_queue])

        a = misc.Popen(command, stdout=subprocess.PIPE,
                       stderr=subprocess.STDOUT,
                       stdin=subprocess.PIPE, cwd=cwd)

        output = a.communicate(text)[0]
        id = output.split(' ')[2]
        if not id.isdigit():
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                                         % output
        self.submitted += 1
        self.submitted_ids.append(id)
        logger.debug(output)

        return id

    @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""

        cmd = 'qstat '
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
        jobstatus = ''
        for line in status.stdout:
            if str(id) in line:
                jobstatus = line.split()[4]

        if jobstatus in self.idle_tag:
            return 'I'
        elif jobstatus in self.running_tag:
            return 'R'
        return 'F'

    @multiple_try()
    def control(self, me_dir):
        """Check the status of the jobs associated to me_dir.
        Return (idle, run, finish, fail)."""
        cmd = "qstat "
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)

        me_dir = self.get_jobs_identifier(me_dir)

        finished = list(self.submitted_ids)

        idle, run, fail = 0, 0, 0
        for line in status.stdout:
            if me_dir in line:
                id, _, _, _, stat = line.split()[:5]
                if stat in self.idle_tag:
                    idle += 1
                    finished.remove(id)
                elif stat in self.running_tag:
                    run += 1
                    finished.remove(id)
                else:
                    logger.debug(line)
                    fail += 1
                    finished.remove(id)

        for id in finished:
            self.check_termination(id)

        return idle, run, self.submitted - (idle + run + fail), fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""

        if not self.submitted_ids:
            return
        cmd = "qdel %s" % ' '.join(self.submitted_ids)
        status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))
        self.submitted_ids = []


class LSFCluster(Cluster):
    """Basic class for dealing with cluster submission"""

    name = 'lsf'
    job_id = 'LSB_JOBID'

    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
        """Submit the job prog to an LSF cluster"""

        me_dir = self.get_jobs_identifier(cwd, prog)

        text = ""
        command = ['bsub', '-C0', '-J', me_dir]
        if cwd is None:
            cwd = os.getcwd()
        else:
            text = " cd %s;" % cwd
        if stdout and isinstance(stdout, str):
            command.extend(['-o', stdout])
        if stderr and isinstance(stderr, str):
            command.extend(['-e', stderr])
        elif stderr == -2:  # -2 means merge with stdout
            pass
        if log is None:
            log = '/dev/null'

        text += prog
        if argument:
            text += ' ' + ' '.join(argument)

        if self.cluster_queue and self.cluster_queue != 'None':
            command.extend(['-q', self.cluster_queue])

        a = misc.Popen(command, stdout=subprocess.PIPE,
                       stderr=subprocess.STDOUT,
                       stdin=subprocess.PIPE, cwd=cwd)

        output = a.communicate(text)[0]

        # extract the id from a line like "Job <123> is submitted to queue <...>"
        try:
            id = output.split('>', 1)[0].split('<')[1]
        except Exception:
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                                         % output
        if not id.isdigit():
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                                         % output
        self.submitted += 1
        self.submitted_ids.append(id)
        return id

    @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""

        cmd = 'bjobs ' + str(id)
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)

        for line in status.stdout:
            line = line.strip().upper()
            if 'JOBID' in line:
                continue
            elif str(id) not in line:
                continue
            status = line.split()[2]
            if status == 'RUN':
                return 'R'
            elif status == 'PEND':
                return 'I'
            elif status == 'DONE':
                return 'F'
            else:
                return 'H'
        return 'F'

    @multiple_try()
    def control(self, me_dir):
        """Check the status of all the submitted jobs.
        Return (idle, run, finish, fail)."""

        if not self.submitted_ids:
            return 0, 0, 0, 0

        cmd = "bjobs " + ' '.join(self.submitted_ids)
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)

        jobstatus = {}
        for line in status.stdout:
            line = line.strip()
            if 'JOBID' in line:
                continue
            splitline = line.split()
            id = splitline[0]
            if id not in self.submitted_ids:
                continue
            jobstatus[id] = splitline[2]

        idle, run, fail = 0, 0, 0
        for id in self.submitted_ids[:]:
            if id in jobstatus:
                status = jobstatus[id]
            else:
                status = 'MISSING'
            if status == 'RUN':
                run += 1
            elif status == 'PEND':
                idle += 1
            else:
                status = self.check_termination(id)
                if status == 'wait':
                    run += 1
                elif status == 'resubmit':
                    idle += 1

        return idle, run, self.submitted - (idle + run + fail), fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""

        if not self.submitted_ids:
            return
        cmd = "bkill %s" % ' '.join(self.submitted_ids)
        status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))
        self.submitted_ids = []


class GECluster(Cluster):
    """Class for dealing with cluster submission on a GE cluster"""

    name = 'ge'
    job_id = 'JOB_ID'
    idle_tag = ['qw']
    running_tag = ['r']

    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
        """Submit a job prog to a GE cluster"""

        text = ""
        if cwd is None:
            cwd = os.getcwd()
        else:
            text = " cd %s; bash " % cwd
        if stdout is None:
            stdout = os.path.join(cwd, "log.%s" % prog.split('/')[-1])
        if stderr is None:
            stderr = os.path.join(cwd, "err.%s" % prog.split('/')[-1])
        elif stderr == -2:  # -2 means merge with stdout
            stderr = stdout
        if log is None:
            log = '/dev/null'

        text += prog
        if argument:
            text += ' ' + ' '.join(argument)
        text += '\n'
        tmp_submit = os.path.join(cwd, 'tmp_submit')
        open(tmp_submit, 'w').write(text)

        a = misc.Popen(['qsub', '-o', stdout,
                        '-e', stderr,
                        tmp_submit],
                       stdout=subprocess.PIPE,
                       stderr=subprocess.STDOUT,
                       stdin=subprocess.PIPE, cwd=cwd)

        output = a.communicate()[0]

        pat = re.compile("Your job (\d*) \(", re.MULTILINE)
        try:
            id = pat.search(output).groups()[0]
        except Exception:
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                                         % output
        self.submitted += 1
        self.submitted_ids.append(id)
        return id

    @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
        cmd = 'qstat | grep ' + str(id)
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
        if not status:
            return 'F'

        # expected qstat line: "id priority name user state date/time ..."
        pat = re.compile("^(\d+)\s+[\d\.]+\s+[\w\d\.]+\s+[\w\d\.]+\s+(\w+)\s")
        stat = ''
        for line in status.stdout.read().split('\n'):
            if not line:
                continue
            line = line.strip()
            try:
                groups = pat.search(line).groups()
            except Exception:
                raise ClusterManagmentError, 'bad syntax for stat: \n\"%s\"' % line
            if groups[0] != id:
                continue
            stat = groups[1]
        if not stat:
            return 'F'
        if stat in self.idle_tag:
            return 'I'
        if stat in self.running_tag:
            return 'R'

    @multiple_try()
    def control(self, me_dir=None):
        """Check the status of the jobs associated to directory me_dir.
        Return (idle, run, finish, fail)."""
        if not self.submitted_ids:
            return 0, 0, 0, 0
        idle, run, fail = 0, 0, 0
        ongoing = []
        # query each scheduler state separately:
        # p = pending, r = running, sh = suspended/hold
        for statusflag in ['p', 'r', 'sh']:
            cmd = 'qstat -s %s' % statusflag
            status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)

            pat = re.compile("^(\d+)")
            for line in status.stdout.read().split('\n'):
                line = line.strip()
                try:
                    id = pat.search(line).groups()[0]
                except Exception:
                    pass
                else:
                    if id not in self.submitted_ids:
                        continue
                    ongoing.append(id)
                    if statusflag == 'p':
                        idle += 1
                    if statusflag == 'r':
                        run += 1
                    if statusflag == 'sh':
                        fail += 1
        for id in list(self.submitted_ids):
            if id not in ongoing:
                self.check_termination(id)

        return idle, run, self.submitted - idle - run - fail, fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""

        if not self.submitted_ids:
            return
        cmd = "qdel %s" % ' '.join(self.submitted_ids)
        status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))
        self.submitted_ids = []

def asyncrone_launch(exe, cwd=None, stdout=None, argument=[], nb_submit=0,
                     **opt):
    """Start a computation and do not wait for it to finish.
    This function returns a lock which is locked as long as the job is
    running."""

    mc = MultiCore(1)
    mc.submit(exe, argument, cwd, stdout, **opt)
    mc.need_waiting = True
    return mc.lock
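
# Usage sketch (hypothetical script): fire-and-forget a job, then block on the
# returned lock (a threading.Event set by the worker once the job has ended).
#
#   lock = asyncrone_launch('./run.sh', cwd='/tmp/job', stdout='run.log')
#   ...  # do something else
#   lock.wait()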


class SLURMCluster(Cluster):
    """Basic class for dealing with cluster submission"""

    name = 'slurm'
    job_id = 'SLURM_JOBID'
    idle_tag = ['Q', 'PD', 'S', 'CF']
    running_tag = ['R', 'CG']
    complete_tag = ['C']
    identifier_length = 8

    @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
               required_output=[], nb_submit=0):
        """Submit a job prog to a SLURM cluster"""

        me_dir = self.get_jobs_identifier(cwd, prog)

        if cwd is None:
            cwd = os.getcwd()
        if stdout is None:
            stdout = '/dev/null'
        if stderr is None:
            stderr = '/dev/null'
        elif stderr == -2:  # -2 means merge with stdout
            stderr = stdout
        if log is None:
            log = '/dev/null'

        command = ['sbatch', '-o', stdout,
                   '-J', me_dir,
                   '-e', stderr, prog] + argument

        if self.cluster_queue and self.cluster_queue != 'None':
            command.insert(1, '-p')
            command.insert(2, self.cluster_queue)

        a = misc.Popen(command, stdout=subprocess.PIPE,
                       stderr=subprocess.STDOUT,
                       stdin=subprocess.PIPE, cwd=cwd)

        output = a.communicate()
        output_arr = output[0].split(' ')
        id = output_arr[3].rstrip()

        if not id.isdigit():
            raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
                                         % output[0]

        self.submitted += 1
        self.submitted_ids.append(id)
        return id

    @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
        cmd = 'squeue -j ' + str(id)
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
                            stderr=open(os.devnull, 'w'))

        for line in status.stdout:
            line = line.strip()
            if 'Invalid' in line:
                return 'F'
            elif line.startswith(str(id)):
                status = line.split()[4]
                if status in self.idle_tag:
                    return 'I'
                elif status in self.running_tag:
                    return 'R'
        return 'F'

    @multiple_try()
    def control(self, me_dir):
        """Check the status of the jobs associated to me_dir.
        Return (idle, run, finish, fail)."""
        cmd = "squeue"
        pstatus = misc.Popen([cmd], stdout=subprocess.PIPE)

        me_dir = self.get_jobs_identifier(me_dir)

        idle, run, fail = 0, 0, 0
        ongoing = []
        for line in pstatus.stdout:
            if me_dir in line:
                id, _, _, _, status, _ = line.split(None, 5)
                ongoing.append(id)
                if status in self.idle_tag:
                    idle += 1
                elif status in self.running_tag:
                    run += 1
                elif status in self.complete_tag:
                    status = self.check_termination(id)
                    if status == 'wait':
                        run += 1
                    elif status == 'resubmit':
                        idle += 1
                else:
                    fail += 1

        for id in list(self.submitted_ids):
            if id not in ongoing:
                status = self.check_termination(id)
                if status == 'wait':
                    run += 1
                elif status == 'resubmit':
                    idle += 1

        return idle, run, self.submitted - (idle + run + fail), fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""

        if not self.submitted_ids:
            return
        cmd = "scancel %s" % ' '.join(self.submitted_ids)
        status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))
        self.submitted_ids = []


class HTCaaSCluster(Cluster):
    """Class for dealing with cluster submission on a HTCaaS cluster using GPFS"""

    name = 'htcaas'
    job_id = 'HTCAAS_JOBID'
    idle_tag = ['waiting']
    running_tag = ['preparing', 'running']
    complete_tag = ['done']

    @store_input()
    @multiple_try()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                log=None, input_files=[], output_files=[], required_output=[],
                nb_submit=0):
        """Submit the HTCaaS job on the cluster with NO SHARE DISK.
        input/output files should be given relative to cwd.
        """

        cur_usr = os.getenv('USER')

        if cwd is None:
            cwd = os.getcwd()

        cwd_cp = cwd.rsplit("/", 2)

        if stdout is not None:
            print "stdout: %s" % stdout

        if not os.path.exists(prog):
            prog = os.path.join(cwd, prog)

        if not required_output and output_files:
            required_output = output_files

        logger.debug(prog)
        if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog:
            cwd_arg = cwd + "/arguments"
            temp = ' '.join([str(a) for a in argument])
            arg_cmd = "echo '" + temp + "' > " + cwd_arg
            command = ['htcaas-mgjob-submit', '-d', cwd, '-e', os.path.basename(prog)]
            if argument:
                command.extend(['-a ', '='.join([str(a) for a in argument])])
            a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
            id = a.stdout.read().strip()

        else:
            cwd_arg = cwd + "/arguments"
            temp = ' '.join([str(a) for a in argument])
            temp_file_name = "sub." + os.path.basename(prog)
            text = """#!/bin/bash
MYPWD=%(cwd)s
cd $MYPWD
input_files=(%(input_files)s )
for i in ${input_files[@]}
do
    chmod -f +x $i
done
/bin/bash %(prog)s %(arguments)s > %(stdout)s
"""
            dico = {'cwd': cwd, 'input_files': ' '.join(input_files + [prog]),
                    'stdout': stdout, 'prog': prog,
                    'arguments': ' '.join([str(a) for a in argument]),
                    'program': ' ' if '.py' in prog else 'bash'}

            new_prog = pjoin(cwd, temp_file_name)
            open(new_prog, 'w').write(text % dico)
            misc.Popen(['chmod', '+x', new_prog], cwd=cwd)
            command = ['htcaas-mgjob-submit', '-d', cwd, '-e', temp_file_name]
            a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
            id = a.stdout.read().strip()
            logger.debug(id)

        nb_try = 0
        nb_limit = 5
        if not id.isdigit():
            print "[ID is not digit]:" + id

        while not id.isdigit():
            nb_try += 1
            print "[fail_retry]:" + str(nb_try)
            a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
            id = a.stdout.read().strip()
            if nb_try > nb_limit:
                raise ClusterManagmentError, 'fail to submit to the HTCaaS cluster: \n %s' % id

        self.submitted += 1
        self.submitted_ids.append(id)

        return id

    @multiple_try(nb_try=10, sleep=5)
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""

        if id == 0:
            status_out = 'C'
        else:
            cmd = 'htcaas-job-status -m ' + str(id) + " -s | grep Status "
            status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
            error = status.stderr.read()
            if status.returncode or error:
                raise ClusterManagmentError, 'htcaas-job-status returns error: %s' % error
            status_out = status.stdout.read().strip()
            status_out = status_out.split(":", 1)[1]
            if status_out == 'waiting':
                status_out = 'I'
            elif status_out == 'preparing' or status_out == 'running':
                status_out = 'R'
            elif status_out != 'done':
                status_out = 'F'
            elif status_out == 'done':
                status_out = 'C'

        return status_out

    @multiple_try()
    def control(self, me_dir=None):
        """Check the status of all the submitted jobs.
        Return (idle, run, finish, fail)."""
        if not self.submitted_ids:
            logger.debug("self.submitted_ids does not exist")
            return 0, 0, 0, 0

        ongoing = []
        idle, run, fail = 0, 0, 0

        start = self.submitted_ids[0]
        end = self.submitted_ids[-1]

        cmd = "htcaas-job-status -c " + str(start) + "-" + str(end)
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)

        for line in status.stdout:
            status2 = line.split()[-1]
            if status2 != 'null' or line.split()[0].strip() != '0':
                ongoing.append(line.split()[0].strip())
                logger.debug("[" + line.split()[0].strip() + "]" + status2)
            if status2 == 'null' or line.split()[0].strip() == '0':
                idle += 1
            elif status2 in self.idle_tag:
                idle += 1
            elif status2 in self.running_tag:
                run += 1
            elif status2 in self.complete_tag:
                if not self.check_termination(line.split()[0]):
                    idle += 1
            else:
                fail += 1

        return idle, run, self.submitted - (idle + run + fail), fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""

        if not self.submitted_ids:
            return
        for i in range(len(self.submitted_ids)):
            cmd = "htcaas-job-cancel -m %s" % self.submitted_ids[i]
            status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))

class HTCaaS2Cluster(Cluster):
    """Class for dealing with cluster submission on a HTCaaS cluster without GPFS"""

    name = 'htcaas2'
    job_id = 'HTCAAS2_JOBID'
    idle_tag = ['waiting']
    running_tag = ['preparing', 'running']
    complete_tag = ['done']

    @store_input()
    @multiple_try()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                log=None, input_files=[], output_files=[], required_output=[],
                nb_submit=0):
        """Submit the HTCaaS job on the cluster with NO SHARE DISK.
        input/output files should be given relative to cwd.
        """
        if cwd is None:
            cwd = os.getcwd()

        if not os.path.exists(prog):
            prog = os.path.join(cwd, prog)

        if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog:
            # normal jobs are only recorded here; they are submitted
            # collectively later via metasubmit
            if cwd or prog:
                self.submitted_dirs.append(cwd)
                self.submitted_exes.append(prog)
            else:
                logger.debug("cwd and prog do not exist -> " + cwd + " / " + os.path.basename(prog))

            if argument:
                self.submitted_args.append('='.join([str(a) for a in argument]))

            if cwd or prog:
                self.submitted += 1
                id = self.submitted
                self.submitted_ids.append(id)
            else:
                logger.debug("cwd and prog do not exist!")
                id = 0

        else:
            # combine/pythia/shower jobs are submitted immediately
            temp_file_name = "sub." + os.path.basename(prog)
            text = """#!/bin/bash
MYPWD=%(cwd)s
cd $MYPWD
input_files=(%(input_files)s )
for i in ${input_files[@]}
do
    chmod -f +x $i
done
/bin/bash %(prog)s %(arguments)s > %(stdout)s
"""
            dico = {'cwd': cwd, 'input_files': ' '.join(input_files + [prog]),
                    'stdout': stdout, 'prog': prog,
                    'arguments': ' '.join([str(a) for a in argument]),
                    'program': ' ' if '.py' in prog else 'bash'}

            new_prog = pjoin(cwd, temp_file_name)
            open(new_prog, 'w').write(text % dico)
            misc.Popen(['chmod', '+x', new_prog], cwd=cwd)
            command = ['htcaas-mgjob-submit', '-d', cwd, '-e', new_prog]
            a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
            id = a.stdout.read().strip()
            logger.debug("[mode2]-[" + str(id) + "]")
            if cwd and prog:
                self.submitted += 1
                self.submitted_ids.append(id)
            else:
                logger.debug("cwd and prog do not exist!")
                id = 0

        return id

    @multiple_try(nb_try=10, sleep=5)
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""

        if self.submitted == self.submitted_ids[-1]:
            id = self.metasubmit(self)
            tempid = self.submitted_ids[-1]
            self.submitted_ids.remove(self.submitted_ids[-1])
            self.submitted_ids.append(id)
            logger.debug(str(id) + " // " + str(self.submitted_ids[-1]))

        if id == 0:
            status_out = 'C'
        else:
            cmd = 'htcaas-job-status -m ' + str(id) + " -s | grep Status "
            status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
            error = status.stderr.read()
            if status.returncode or error:
                raise ClusterManagmentError, 'htcaas-job-status returns error: %s' % error
            status_out = status.stdout.read().strip()
            status_out = status_out.split(":", 1)[1]
            logger.debug("[[" + str(id) + "]]" + status_out)
            if status_out == 'waiting':
                status_out = 'I'
            elif status_out == 'preparing' or status_out == 'running':
                status_out = 'R'
            elif status_out != 'done':
                status_out = 'F'
            elif status_out == 'done':
                status_out = 'C'
                self.submitted -= 1

        return status_out

    @multiple_try()
    def control(self, me_dir=None):
        """Check the status of all the submitted jobs.
        Return (idle, run, finish, fail)."""
        if not self.submitted_ids:
            logger.debug("self.submitted_ids does not exist")
            return 0, 0, 0, 0

        if "//" in me_dir:
            if int(me_dir.split("//")[0]) < int(me_dir.split("//")[1]):
                start = me_dir.split("//")[0]
                end = me_dir.split("//")[1]
            else:
                start = me_dir.split("//")[1]
                end = me_dir.split("//")[0]
        elif "/" in me_dir:
            start = 0
            end = 0
        elif me_dir.isdigit():
            start = me_dir
            end = me_dir
        else:
            # the meta-id is unknown: fall back on the first submitted id
            me_dir = self.submitted_ids[0]
            logger.debug("Meta_ID is not digit (control), self.submitted_ids[0]: " + str(me_dir))
            start = me_dir
            end = me_dir

        ongoing = []
        idle, run, fail, done = 0, 0, 0, 0

        cmd = "htcaas-job-status -c " + str(start) + "-" + str(end) + " -ac"
        status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)

        for line in status.stdout:
            status2 = line.split()[-1]
            if status2 != 'null' or line.split()[0].strip() != '0':
                ongoing.append(str(line.split()[0].strip()) + "-" + str(line.split()[1].strip()))
                logger.debug("[" + line.split()[0].strip() + "-" + line.split()[1].strip() + "]" + status2)

            if status2 == 'null' or line.split()[0].strip() == '0':
                idle += 1
            elif status2 in self.idle_tag:
                idle += 1
            elif status2 in self.running_tag:
                run += 1
            elif status2 in self.complete_tag:
                done += 1
                self.submitted -= 1
                if not self.check_termination(line.split()[1]):
                    idle += 1
            else:
                fail += 1

        return idle, run, self.submitted - (idle + run + fail), fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""

        if not self.submitted_ids:
            return
        id = self.submitted_ids[0]
        if id != 0:
            cmd = "htcaas-job-cancel -m %s" % str(id)
            status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))

from_name = {'condor': CondorCluster, 'pbs': PBSCluster, 'sge': SGECluster,
             'lsf': LSFCluster, 'ge': GECluster, 'slurm': SLURMCluster,
             'htcaas': HTCaaSCluster, 'htcaas2': HTCaaS2Cluster}

onecore = MultiCore(1)
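
# Usage sketch: select a back-end by name and drive one job through it
# (hypothetical paths; the keyword options mirror what Cluster.__init__ reads).
#
#   cluster = from_name['slurm'](cluster_type='slurm', cluster_queue='madgraph',
#                                cluster_nb_retry=1)
#   cid = cluster.cluster_submit('/path/to/run.sh', argument=['0'])
#   cluster.wait('/path/to/run_dir', lambda idle, run, finish: None)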