1
2
3
4
5
6
7
8
9
10
11
12
13
14 import subprocess
15 import logging
16 import os
17 import time
18 import re
19 import glob
20 import inspect
21
22 logger = logging.getLogger('madgraph.cluster')
23
24 try:
25 from madgraph import MadGraph5Error
26 import madgraph.various.misc as misc
27 except Exception, error:
28 if __debug__:
29 print str(error)
30 from internal import MadGraph5Error
31 import internal.misc as misc
32
33 pjoin = os.path.join
37
40
41
42 multiple_try = misc.multiple_try
43 pjoin = os.path.join
def check_interupt(error=KeyboardInterrupt):
    def deco_interupt(f):
        def deco_f_interupt(self, *args, **opt):
            try:
                return f(self, *args, **opt)
            except error:
                try:
                    self.remove(*args, **opt)
                except Exception:
                    pass
                raise error
        return deco_f_interupt
    return deco_interupt
60
73 return deco_f_store
74 return deco_store
75
class Cluster(object):
    """Basic class for all cluster-type submission."""
    name = 'mother class'
    identifier_length = 14
81
    def __init__(self, *args, **opts):
        """Init the cluster"""
84
85 self.submitted = 0
86 self.submitted_ids = []
87 self.finish = 0
88 if 'cluster_queue' in opts:
89 self.cluster_queue = opts['cluster_queue']
90 else:
91 self.cluster_queue = 'madgraph'
92 if 'cluster_temp_path' in opts:
93 self.temp_dir = opts['cluster_temp_path']
94 else:
95 self.temp_dir = None
96 self.options = {'cluster_status_update': (600, 30)}
97 for key,value in opts.items():
98 self.options[key] = value
99 self.nb_retry = opts['cluster_nb_retry'] if 'cluster_nb_retry' in opts else 0
100 self.cluster_retry_wait = opts['cluster_retry_wait'] if 'cluster_retry_wait' in opts else 300
101 self.options = dict(opts)
102 self.retry_args = {}
103
104
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
               log=None, required_output=[], nb_submit=0):
        """How to make one submission. Return status id on the cluster."""
        raise NotImplementedError, 'No implementation of how to submit a job to cluster \'%s\'' % self.name
109
110 @store_input()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                log=None, input_files=[], output_files=[], required_output=[], nb_submit=0):
113 """How to make one submission. Return status id on the cluster.
114 NO SHARE DISK"""
115
116 if cwd is None:
117 cwd = os.getcwd()
118 if not os.path.exists(prog):
119 prog = os.path.join(cwd, prog)
120
121 if not required_output and output_files:
122 required_output = output_files
123
124 if not hasattr(self, 'temp_dir') or not self.temp_dir or \
125 (input_files == [] == output_files):
126 return self.submit(prog, argument, cwd, stdout, stderr, log,
127 required_output=required_output, nb_submit=nb_submit)
128
129 if not input_files and not output_files:
130
131 return self.submit(prog, argument, cwd, stdout, stderr, log,
132 required_output=required_output, nb_submit=nb_submit)
133
134 if cwd is None:
135 cwd = os.getcwd()
136 if not os.path.exists(prog):
137 prog = os.path.join(cwd, prog)
138 temp_file_name = "sub." + os.path.basename(prog) + '.'.join(argument)
139
140 text = """#!/bin/bash
141 MYTMP=%(tmpdir)s/run$%(job_id)s
142 MYPWD=%(cwd)s
143 mkdir -p $MYTMP
144 cd $MYPWD
145 input_files=( %(input_files)s )
146 for i in ${input_files[@]}
147 do
148 cp -R -L $i $MYTMP
149 done
150 cd $MYTMP
151 echo '%(arguments)s' > arguments
152 chmod +x ./%(script)s
153 %(program)s ./%(script)s %(arguments)s
154 exit=$?
155 output_files=( %(output_files)s )
156 for i in ${output_files[@]}
157 do
158 cp -r $MYTMP/$i $MYPWD
159 done
160 # if [ "$exit" -eq "0" ]
161 # then
162 rm -rf $MYTMP
163 # fi
164 """
165 dico = {'tmpdir' : self.temp_dir, 'script': os.path.basename(prog),
166 'cwd': cwd, 'job_id': self.job_id,
167 'input_files': ' '.join(input_files + [prog]),
168 'output_files': ' '.join(output_files),
169 'arguments': ' '.join([str(a) for a in argument]),
170 'program': ' ' if '.py' in prog else 'bash'}
171
172
173 new_prog = pjoin(cwd, temp_file_name)
174 open(new_prog, 'w').write(text % dico)
175 misc.Popen(['chmod','+x',new_prog],cwd=cwd)
176
177 return self.submit(new_prog, argument, cwd, stdout, stderr, log,
178 required_output=required_output, nb_submit=nb_submit)
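    # Illustrative usage note (not part of the original code): submit2 is the
    # entry point to use when worker nodes do not share a disk with the front
    # end.  Assuming a hypothetical executable 'ajob1' that reads 'input.dat'
    # and writes 'result.dat', a call could look like
    #
    #     cluster.submit2('ajob1', argument=['1'], cwd=subproc_dir,
    #                     input_files=['input.dat'], output_files=['result.dat'])
    #
    # The generated 'sub.*' wrapper copies the inputs into cluster_temp_path on
    # the node, runs the job there, then copies the outputs back and cleans up.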
179
180
    def control(self, me_dir):
        """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)"""
183 if not self.submitted_ids:
raise NotImplementedError, 'No implementation of how to control the job status to cluster \'%s\'' % self.name
185 idle, run, fail = 0, 0, 0
186 for pid in self.submitted_ids[:]:
status = self.control_one_job(pid)
188 if status == 'I':
189 idle += 1
190 elif status == 'R':
191 run += 1
192 elif status == 'F':
193 self.finish +=1
194 self.submitted_ids.remove(pid)
195 else:
196 fail += 1
197
198 return idle, run, self.finish, fail
199
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
        raise NotImplementedError, 'No implementation of how to control the job status to cluster \'%s\'' % self.name
203
    def get_jobs_identifier(self, path, second_path=None):
        """Get a unique run_name for all the jobs; helps to identify the runs
        in the controller for some clusters."""
207
208 if second_path:
209 path = os.path.realpath(pjoin(path, second_path))
210 elif not os.path.exists(path):
211 return path
212
213 if 'SubProcesses' in path:
214 target = path.rsplit('/SubProcesses',1)[0]
215 elif 'MCatNLO' in path:
216 target = path.rsplit('/MCatNLO',1)[0]
217 elif second_path:
218 target=path
219 logger.warning("cluster.get_job_identifier runs unexpectedly. This should be fine but report this message if you have problem.")
220 else:
221 target = path
222
223 if target.endswith('/'):
224 target = target[:-1]
225
226 target = misc.digest(target)[-self.identifier_length:]
227 if not target[0].isalpha():
228 target = 'a' + target[1:]
229
230 return target
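    # The identifier returned above is the last `identifier_length` characters
    # of a hash (misc.digest) of the run directory, forced to start with a
    # letter so that batch systems which require alphabetic job names (the
    # -N/-J options used by the subclasses below) accept it.  All jobs launched
    # from the same SubProcesses tree therefore share one identifier, which is
    # what lets control(me_dir) find them again.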
231
232
233 @check_interupt()
    def wait(self, me_dir, fct, minimal_job=0):
        """Wait until all jobs are finished.
        If minimal_job is set, return as soon as idle + run drops below that number."""
237
238
239 mode = 1
240 nb_iter = 0
241 nb_short = 0
242 change_at = 5
243
244 longtime, shorttime = self.options['cluster_status_update']
245
246 while 1:
247 old_mode = mode
248 nb_iter += 1
249 idle, run, finish, fail = self.control(me_dir)
250 if fail:
251 raise ClusterManagmentError('Some Jobs are in a Hold/... state. Please try to investigate or contact the IT team')
252 if idle + run == 0:
253
254 logger.info('All jobs finished')
255 break
256 if idle + run < minimal_job:
257 return
258 fct(idle, run, finish)
259
260 if nb_iter < change_at:
261 mode = 1
262 elif idle < run:
263 if old_mode == 0:
264 if nb_short:
265 mode = 0
266
267 elif idle:
268 if nb_iter > change_at + int(longtime)//shorttime:
269 mode = 0
270 else:
271 mode = 1
272 nb_short =0
273 else:
274 mode = 1
275 nb_short = 0
276 elif old_mode == 1:
277 nb_short +=1
278 if nb_short > 3* max(change_at, int(longtime)//shorttime):
279 mode = 0
280 else:
281 mode = 0
282
283
284 if old_mode > mode:
285 logger.info('''Start to wait %ss between checking status.
286 Note that you can change this time in the configuration file.
287 Press ctrl-C to force the update.''' % self.options['cluster_status_update'][0])
288
289
290 if mode == 0:
291 try:
292 time.sleep(self.options['cluster_status_update'][0])
293 except KeyboardInterrupt:
294 logger.info('start to update the status')
295 nb_iter = min(0, change_at -2)
296 nb_short = 0
297 else:
298 time.sleep(self.options['cluster_status_update'][1])
299
300
301 self.submitted = 0
302 self.submitted_ids = []
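    # Illustrative usage sketch (not in the original source): callers usually
    # pass a status-printing callback, e.g.
    #
    #     def print_status(idle, run, finish):
    #         logger.info('idle: %s running: %s finished: %s', idle, run, finish)
    #     cluster.wait(me_dir, print_status)
    #
    # wait() alternates between the short and long polling intervals stored in
    # self.options['cluster_status_update'] and returns once control() reports
    # no idle or running jobs (or fewer than `minimal_job` of them).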
303
    def check_termination(self, job_id):
        """Check the termination of the job with job_id and relaunch it if needed."""
306
307
308 if job_id not in self.retry_args:
309 return True
310
311 args = self.retry_args[job_id]
312 if 'time_check' in args:
313 time_check = args['time_check']
314 else:
315 time_check = 0
316
317 for path in args['required_output']:
318 if args['cwd']:
319 path = pjoin(args['cwd'], path)
320
321 if not (os.path.exists(path) and os.stat(path).st_size != 0) :
322 break
323 else:
324
325 if time_check > 0:
326 logger.info('Job %s Finally found the missing output.' % (job_id))
327 del self.retry_args[job_id]
328 self.submitted_ids.remove(job_id)
329 return 'done'
330
331 if time_check == 0:
332 logger.debug('''Job %s: missing output:%s''' % (job_id,path))
333 args['time_check'] = time.time()
334 return 'wait'
335 elif self.cluster_retry_wait > time.time() - time_check:
336 return 'wait'
337
338
339 if self.nb_retry < 0:
340 logger.critical('''Fail to run correctly job %s.
341 with option: %s
342 file missing: %s''' % (job_id, args, path))
343 raw_input('press enter to continue.')
344 elif self.nb_retry == 0:
345 logger.critical('''Fail to run correctly job %s.
346 with option: %s
347 file missing: %s.
348 Stopping all runs.''' % (job_id, args, path))
349
350 elif args['nb_submit'] >= self.nb_retry:
351 logger.critical('''Fail to run correctly job %s.
352 with option: %s
353 file missing: %s
354 Fails %s times
355 No resubmition. ''' % (job_id, args, path, args['nb_submit']))
356
357 else:
358 args['nb_submit'] += 1
359 logger.warning('resubmit job (for the %s times)' % args['nb_submit'])
360 del self.retry_args[job_id]
361 self.submitted_ids.remove(job_id)
362 if 'time_check' in args:
363 del args['time_check']
364 self.submit2(**args)
365 return 'resubmit'
366 return 'done'
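    # Apart from the early `True` returned when job_id is unknown,
    # check_termination reports one of three strings: 'done' when every file in
    # required_output exists and is non-empty, 'wait' while it still gives a
    # slow filesystem up to cluster_retry_wait seconds to expose the output,
    # and 'resubmit' when the job has been re-queued through submit2.  control()
    # and launch_and_wait() translate these into the idle/run counters.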
367
368
369
370 @check_interupt()
    def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None,
372 stderr=None, log=None, required_output=[], nb_submit=0,
373 input_files=[], output_files=[]):
374 """launch one job on the cluster and wait for it"""
375
376 special_output = False
377 if stderr == -2 and stdout:
378
379 special_output = True
380 stderr = stdout + '.err'
381
382 id = self.submit2(prog, argument, cwd, stdout, stderr, log,
383 required_output=required_output, input_files=input_files,
384 output_files=output_files)
385
386 frame = inspect.currentframe()
387 args, _, _, values = inspect.getargvalues(frame)
388 args = dict([(i, values[i]) for i in args if i != 'self'])
389 self.retry_args[id] = args
390
391 nb_wait=0
392 while 1:
393 nb_wait+=1
394 status = self.control_one_job(id)
395 if not status in ['R','I']:
396 status = self.check_termination(id)
397 if status in ['wait']:
398 time.sleep(30)
399 continue
400 elif status in ['resubmit']:
401 id = self.submitted_ids[0]
402 time.sleep(30)
403 continue
404
405 time.sleep(30)
406 break
407 time.sleep(self.options['cluster_status_update'][1])
408
409 if required_output:
410 status = self.check_termination(id)
411 if status == 'wait':
412 run += 1
413 elif status == 'resubmit':
414 idle += 1
415
416
417 if special_output:
418
419
420 for i in range(5):
421 if os.path.exists(stdout):
422 if not os.path.exists(stderr):
423 time.sleep(5)
424 if os.path.exists(stderr):
425 err_text = open(stderr).read()
426 if not err_text:
427 return
428 logger.warning(err_text)
429 text = open(stdout).read()
430 open(stdout,'w').write(text + err_text)
431 else:
432 return
433 time.sleep(10)
434
    def remove(self, *args, **opts):
        """Default implementation: job removal is not supported for this cluster type."""
437 logger.warning("""This cluster didn't support job removal,
438 the jobs are still running on the cluster.""")
439
class MultiCore(Cluster):
    """Class for dealing with the submission on a multi-core machine."""
442
443 job_id = '$'
444
    def __init__(self, *args, **opt):
        """Init the cluster"""
447 import thread
super(MultiCore, self).__init__(*args, **opt)
449
450
451 self.submitted = 0
452 self.finish = 0
453 if 'nb_core' in opt:
454 self.nb_core = opt['nb_core']
455 elif isinstance(args[0],int):
456 self.nb_core = args[0]
457 else:
458 self.nb_core = 1
459 self.update_fct = None
460
461
462 self.need_waiting = False
463 self.nb_used = 0
464 self.lock = thread.allocate_lock()
465 self.done = 0
466 self.waiting_submission = []
467 self.pids = []
468 self.fail_msg = None
469
    def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None,
471 stderr=None, log=None, **opts):
472 """launch one job and wait for it"""
473 if isinstance(stdout, str):
474 stdout = open(stdout, 'w')
475 if isinstance(stderr, str):
stderr = open(stderr, 'w')
477 return misc.call([prog] + argument, stdout=stdout, stderr=stderr, cwd=cwd)
478
479
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
481 log=None, required_output=[], nb_submit=0):
482 """submit a job on multicore machine"""
483
484 self.submitted +=1
485 if cwd is None:
486 cwd = os.getcwd()
487 if isinstance(prog, str):
488 if not os.path.exists(prog) and not misc.which(prog):
489 prog = os.path.join(cwd, prog)
490
491 import thread
492 if self.waiting_submission or self.nb_used == self.nb_core:
493 self.waiting_submission.append((prog, argument,cwd, stdout))
494
495 while self.nb_used < self.nb_core and self.waiting_submission:
496 arg = self.waiting_submission.pop(0)
497 self.nb_used += 1
498 thread.start_new_thread(self.launch, arg)
499 elif self.nb_used < self.nb_core -1:
500 self.nb_used += 1
501 thread.start_new_thread(self.launch, (prog, argument, cwd, stdout))
502 elif self.nb_used == self.nb_core -1:
503 self.nb_used += 1
504 thread.start_new_thread(self.launch, (prog, argument, cwd, stdout))
505
506
    def launch(self, exe, argument, cwd, stdout):
508 """ way to launch for multicore. If exe is a string then treat it as
509 an executable. Otherwise treat it as a function"""
510 import thread
511 def end(self, pid):
512 self.nb_used -= 1
513 self.done += 1
514 try:
515 self.pids.remove(pid)
516 except:
517 pass
518
519 fail_msg = None
520 try:
521 if isinstance(exe,str):
522 if os.path.exists(exe) and not exe.startswith('/'):
523 exe = './' + exe
524 proc = misc.Popen([exe] + argument, cwd=cwd, stdout=stdout,
525 stderr=subprocess.STDOUT)
526 pid = proc.pid
527 self.pids.append(pid)
528 proc.wait()
529 if proc.returncode not in [0, 143, -15]:
530 fail_msg = 'program %s launch ends with non zero status: %s. Stop all computation' % \
531 (' '.join([exe]+argument), proc.returncode)
532
533 logger.warning(fail_msg)
534 try:
535 log = open(glob.glob(pjoin(cwd,'*','log.txt'))[0]).read()
536 logger.warning('Last 15 lines of logfile %s:\n%s\n' % \
537 (pjoin(cwd,'*','log.txt'), '\n'.join(log.split('\n')[-15:-1]) + '\n'))
538 except (IOError, AttributeError, IndexError):
539 logger.warning('Please look for possible logfiles in %s' % cwd)
540 pass
541 self.remove(fail_msg)
542 else:
543 pid = tuple([id(o) for o in [exe] + argument])
544 self.pids.append(pid)
545
546
547 returncode = exe(argument)
548 if returncode != 0:
549 logger.warning(returncode)
550 self.remove()
551
552
553
554
555 security = 0
556
557 while 1:
558 while not self.lock.locked():
559 if not self.need_waiting:
560
561 end(self, pid)
562 return
563 elif security > 60:
564 end(self, pid)
565 return
566 security += 1
567 time.sleep(1)
568 try:
569 self.lock.release()
570 except thread.error:
571 continue
572 break
573 end(self, pid)
574
575
576 except Exception, error:
577
578 self.remove()
579 raise
580
581
582
583
    def wait(self, me_dir, update_status):
        """Wait until all threads finish.
        self.nb_used and self.done are updated by each job (thread and local).
        self.submitted is the number of times submit has been called (local).
        remaining is the number of jobs we still have to wait for (local).
        self.pids is the list of the bash pids of the submitted jobs (thread).

        WARNING: In principle all those values are coherent, but since some are
        modified in various threads the data can be corrupted (not the local
        ones). nb_used in particular shouldn't be trusted too much.
        This code checks in several different ways that all jobs have finished.

        In principle the statements related to '#security #X' are not needed;
        in practice they are triggered from time to time.
        """
599
600 import thread
601
602 remaining = self.submitted - self.done
603
604 while self.nb_used < self.nb_core:
605 if self.waiting_submission:
606 arg = self.waiting_submission.pop(0)
607 thread.start_new_thread(self.launch, arg)
608 self.nb_used += 1
609 else:
610 break
611
612 try:
613 self.need_waiting = True
614 self.lock.acquire()
615 no_in_queue = 0
616 secure_mode = False
617 while self.waiting_submission or self.nb_used:
618 if self.fail_msg:
619 msg, self.fail_msg = self.fail_msg, None
620 self.remove()
621 raise Exception, msg
622 if update_status:
623 update_status(len(self.waiting_submission), self.nb_used, self.done)
624
625
626 if len(self.waiting_submission) == 0 == remaining :
627 self.done = self.submitted
628 break
629
630
631 if len(self.waiting_submission) == 0 and len(self.pids) == 0:
632 if self.submitted == self.done:
633 break
634 logger.debug('Found too many jobs. Recovering')
635 no_in_queue += 1
636 time.sleep(min(180, 5 * no_in_queue))
637 if no_in_queue > 3:
638 logger.debug('Still too many jobs. Continue')
639 break
640 continue
641
642
643 if not secure_mode and len(self.waiting_submission) != 0:
644 if self.nb_used != self.nb_core:
645 if self.nb_used != len(self.pids):
646 secure_mode = True
647
648 if secure_mode and not self.waiting_submission:
649 self.need_waiting = False
650 if self.lock.locked():
651 self.lock.release()
652 break
653
654
655 self.lock.acquire()
656 remaining -=1
657
658 if self.waiting_submission:
659 arg = self.waiting_submission.pop(0)
660 thread.start_new_thread(self.launch, arg)
661 self.nb_used += 1
662
663 if self.fail_msg:
664 msg, self.fail_msg = self.fail_msg, None
665 self.remove()
666 raise Exception, msg
667
668
669 no_in_queue = 0
670 while self.submitted > self.done:
671 if self.fail_msg:
672 msg, self.fail_msg = self.fail_msg, None
673 self.remove()
674 raise Exception, msg
675 if no_in_queue == 0:
676 logger.debug('Some jobs have been lost. Try to recover')
677
678 if not len(self.pids):
679
680 logger.critical('Some jobs have been lost in the multicore treatment.')
681 logger.critical('The results might be incomplete. (Trying to continue anyway)')
682 break
683 elif update_status:
684 update_status(len(self.waiting_submission), len(self.pids) ,
685 self.done)
686
687 if not secure_mode:
688 self.lock.acquire()
689 else:
690 no_in_queue += 1
691 try:
692 time.sleep(min(180,5*no_in_queue))
693 if no_in_queue > 5 * 3600.0 / 162:
694 break
695 except KeyboardInterrupt:
696 logger.warning('CTRL-C assumes that all jobs are done. Continue the code')
697 self.pids = []
698 break
699
700
701 no_in_queue = 0
702 while len(self.pids):
703 if self.fail_msg:
704 msg, self.fail_msg = self.fail_msg, None
705 self.remove()
706 raise Exception, msg
707 self.need_waiting = False
708 if self.lock.locked():
709 self.lock.release()
710 secure_mode = True
711 if no_in_queue == 0 :
712 logger.warning('Some jobs have been lost. Try to recover.')
713 logger.warning('Hitting ctrl-c will consider that all jobs are done and continue the code.')
714 try:
715
716 if update_status:
717 update_status(len(self.waiting_submission), len(self.pids) ,
718 self.done)
719 time.sleep(min(5*no_in_queue, 180))
720 no_in_queue += 1
721 if no_in_queue > 5 * 3600.0 / 162:
722 break
723 except KeyboardInterrupt:
724 break
725
726
727 if update_status:
728 self.next_update = 0
729 update_status(len(self.waiting_submission), 0, self.done)
730
731
732 self.need_waiting = False
733 security = 0
734 while not self.lock.locked() and security < 10:
735
736 if secure_mode:
737 security = 10
738 security +=1
739 time.sleep(1)
740 if security < 10:
741 self.lock.release()
742 self.done = 0
743 self.nb_used = 0
744 self.submitted = 0
745 self.pids = []
746
747 except KeyboardInterrupt:
748 self.remove()
749 raise
750 if self.fail_msg:
751 msg, self.fail_msg = self.fail_msg, None
752 self.remove()
753 raise Exception, msg
754
755
    def remove(self, error=None):
        """Ensure that all threads are killed."""
758 logger.info('remove job currently running')
759 self.waiting_submission = []
760 if error:
761 self.fail_msg = error
762 for pid in list(self.pids):
763 if isinstance(pid, tuple):
764 continue
765 out = os.system('CPIDS=$(pgrep -P %(pid)s); kill -15 $CPIDS > /dev/null 2>&1' \
766 % {'pid':pid} )
767 out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid':pid} )
768 if out == 0:
769 try:
770 self.pids.remove(pid)
771 except:
772 pass
773
774
775 time.sleep(1)
776 for pid in list(self.pids):
777 if isinstance(pid, tuple):
778 continue
779 out = os.system('CPIDS=$(pgrep -P %s); kill -15 $CPIDS > /dev/null 2>&1' % pid )
780 out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid':pid} )
781 if out == 0:
782 try:
783 self.pids.remove(pid)
784 except:
785 pass
786
class CondorCluster(Cluster):
    """Basic class for dealing with cluster submission"""
789
790 name = 'condor'
791 job_id = 'CONDOR_ID'
792
793
794
795 @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
797 required_output=[], nb_submit=0):
798 """Submit a job prog to a Condor cluster"""
799
800 text = """Executable = %(prog)s
801 output = %(stdout)s
802 error = %(stderr)s
803 log = %(log)s
804 %(argument)s
805 environment = CONDOR_ID=$(Cluster).$(Process)
806 Universe = vanilla
807 notification = Error
808 Initialdir = %(cwd)s
809 %(requirement)s
810 getenv=True
811 queue 1
812 """
813
814 if self.cluster_queue not in ['None', None]:
815 requirement = 'Requirements = %s=?=True' % self.cluster_queue
816 else:
817 requirement = ''
818
819 if cwd is None:
820 cwd = os.getcwd()
821 if stdout is None:
822 stdout = '/dev/null'
823 if stderr is None:
824 stderr = '/dev/null'
825 if log is None:
826 log = '/dev/null'
827 if not os.path.exists(prog):
828 prog = os.path.join(cwd, prog)
829 if argument:
830 argument = 'Arguments = %s' % ' '.join(argument)
831 else:
832 argument = ''
833
834
835 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout,
836 'stderr': stderr,'log': log,'argument': argument,
837 'requirement': requirement}
838
839 open('submit_condor','w').write(text % dico)
840 a = misc.Popen(['condor_submit','submit_condor'], stdout=subprocess.PIPE)
841 output = a.stdout.read()
842
843
844
845 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE)
846 try:
847 id = pat.search(output).groups()[0]
848 except:
849 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
850 % output
851 self.submitted += 1
852 self.submitted_ids.append(id)
853 return id
854
855 @store_input()
856 @multiple_try()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
858 log=None, input_files=[], output_files=[], required_output=[],
859 nb_submit=0):
860 """Submit the job on the cluster NO SHARE DISK
input/output files should be given relative to cwd
862 """
863
864 if not required_output and output_files:
865 required_output = output_files
866
867 if (input_files == [] == output_files):
868 return self.submit(prog, argument, cwd, stdout, stderr, log,
869 required_output=required_output, nb_submit=nb_submit)
870
871 text = """Executable = %(prog)s
872 output = %(stdout)s
873 error = %(stderr)s
874 log = %(log)s
875 %(argument)s
876 should_transfer_files = YES
877 when_to_transfer_output = ON_EXIT
878 transfer_input_files = %(input_files)s
879 %(output_files)s
880 Universe = vanilla
881 notification = Error
882 Initialdir = %(cwd)s
883 %(requirement)s
884 getenv=True
885 queue 1
886 """
887
888 if self.cluster_queue not in ['None', None]:
889 requirement = 'Requirements = %s=?=True' % self.cluster_queue
890 else:
891 requirement = ''
892
893 if cwd is None:
894 cwd = os.getcwd()
895 if stdout is None:
896 stdout = '/dev/null'
897 if stderr is None:
898 stderr = '/dev/null'
899 if log is None:
900 log = '/dev/null'
901 if not os.path.exists(prog):
902 prog = os.path.join(cwd, prog)
903 if argument:
904 argument = 'Arguments = %s' % ' '.join([str(a) for a in argument])
905 else:
906 argument = ''
907
908 if input_files:
909 input_files = ','.join(input_files)
910 else:
911 input_files = ''
912 if output_files:
913 output_files = 'transfer_output_files = %s' % ','.join(output_files)
914 else:
915 output_files = ''
916
917
918
919 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout,
920 'stderr': stderr,'log': log,'argument': argument,
921 'requirement': requirement, 'input_files':input_files,
922 'output_files':output_files}
923
924 open('submit_condor','w').write(text % dico)
925 a = subprocess.Popen(['condor_submit','submit_condor'], stdout=subprocess.PIPE)
926 output = a.stdout.read()
927
928
929
930 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE)
931 try:
932 id = pat.search(output).groups()[0]
933 except:
934 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
935 % output
936 self.submitted += 1
937 self.submitted_ids.append(id)
938 return id
939
940
941
942
943
944 @multiple_try(nb_try=10, sleep=10)
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
947 cmd = 'condor_q '+str(id)+" -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'"
948 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
949 stderr=subprocess.PIPE)
950
951 error = status.stderr.read()
952 if status.returncode or error:
953 raise ClusterManagmentError, 'condor_q returns error: %s' % error
954
955 return status.stdout.readline().strip()
956
957 @check_interupt()
958 @multiple_try(nb_try=10, sleep=10)
    def control(self, me_dir):
        """Check the status of all submitted jobs. Return (idle, run, finish, fail)."""
961
962 if not self.submitted_ids:
963 return 0, 0, 0, 0
964
965 packet = 15000
966 idle, run, fail = 0, 0, 0
967 ongoing = []
968 for i in range(1+(len(self.submitted_ids)-1)//packet):
969 start = i * packet
970 stop = (i+1) * packet
971 cmd = "condor_q " + ' '.join(self.submitted_ids[start:stop]) + \
" -format \'%-2s \' \'ClusterId\' " + \
973 " -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'"
974
975 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
976 stderr=subprocess.PIPE)
977 error = status.stderr.read()
978 if status.returncode or error:
979 raise ClusterManagmentError, 'condor_q returns error: %s' % error
980
981 for line in status.stdout:
982 id, status = line.strip().split()
983 ongoing.append(int(id))
984 if status in ['I','U']:
985 idle += 1
986 elif status == 'R':
987 run += 1
988 elif status != 'C':
989 fail += 1
990
991 for id in list(self.submitted_ids):
992 if int(id) not in ongoing:
993 status = self.check_termination(id)
994 if status == 'wait':
995 run += 1
996 elif status == 'resubmit':
997 idle += 1
998
999 return idle, run, self.submitted - (idle+run+fail), fail
1000
1001 @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""
1004
1005 if not self.submitted_ids:
1006 return
1007 cmd = "condor_rm %s" % ' '.join(self.submitted_ids)
1008
1009 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1010
class PBSCluster(Cluster):
    """Basic class for dealing with cluster submission"""
1013
1014 name = 'pbs'
1015 job_id = 'PBS_JOBID'
1016 idle_tag = ['Q']
1017 running_tag = ['T','E','R']
1018 complete_tag = ['C']
1019
1020 maximum_submited_jobs = 2500
1021
1022 @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1024 required_output=[], nb_submit=0):
1025 """Submit a job prog to a PBS cluster"""
1026
1027 me_dir = self.get_jobs_identifier(cwd, prog)
1028
1029 if len(self.submitted_ids) > self.maximum_submited_jobs:
1030 fct = lambda idle, run, finish: logger.info('Waiting for free slot: %s %s %s' % (idle, run, finish))
1031 self.wait(me_dir, fct, self.maximum_submited_jobs)
1032
1033
1034 text = ""
1035 if cwd is None:
1036 cwd = os.getcwd()
1037 else:
1038 text = " cd %s;" % cwd
1039 if stdout is None:
1040 stdout = '/dev/null'
1041 if stderr is None:
1042 stderr = '/dev/null'
1043 elif stderr == -2:
1044 stderr = stdout
1045 if log is None:
1046 log = '/dev/null'
1047
1048 if not os.path.isabs(prog):
1049 text += "./%s" % prog
1050 else:
1051 text+= prog
1052
1053 if argument:
1054 text += ' ' + ' '.join(argument)
1055
1056 command = ['qsub','-o', stdout,
1057 '-N', me_dir,
1058 '-e', stderr,
1059 '-V']
1060
1061 if self.cluster_queue and self.cluster_queue != 'None':
1062 command.extend(['-q', self.cluster_queue])
1063
1064 a = misc.Popen(command, stdout=subprocess.PIPE,
1065 stderr=subprocess.STDOUT,
1066 stdin=subprocess.PIPE, cwd=cwd)
1067
1068 output = a.communicate(text)[0]
1069 id = output.split('.')[0]
1070 if not id.isdigit() or a.returncode !=0:
1071 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1072 % output
1073
1074 self.submitted += 1
1075 self.submitted_ids.append(id)
1076 return id
1077
1078 @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
1081 cmd = 'qstat '+str(id)
1082 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
1083 stderr=subprocess.STDOUT)
1084
1085 for line in status.stdout:
1086 line = line.strip()
1087 if 'cannot connect to server' in line or 'cannot read reply' in line:
1088 raise ClusterManagmentError, 'server disconnected'
1089 if 'Unknown' in line:
1090 return 'F'
1091 elif line.startswith(str(id)):
1092 jobstatus = line.split()[4]
1093 else:
1094 jobstatus=""
1095
1096 if status.returncode != 0 and status.returncode is not None:
1097 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode
1098 if jobstatus in self.idle_tag:
1099 return 'I'
1100 elif jobstatus in self.running_tag:
1101 return 'R'
1102 return 'F'
1103
1104
1105 @multiple_try()
    def control(self, me_dir):
        """Check the status of all submitted jobs. Return (idle, run, finish, fail)."""
1108 cmd = "qstat"
1109 status = misc.Popen([cmd], stdout=subprocess.PIPE)
1110
1111 me_dir = self.get_jobs_identifier(me_dir)
1112
1113 ongoing = []
1114
1115 idle, run, fail = 0, 0, 0
1116 for line in status.stdout:
1117 if 'cannot connect to server' in line or 'cannot read reply' in line:
1118 raise ClusterManagmentError, 'server disconnected'
1119 if me_dir in line:
1120 ongoing.append(line.split()[0].split('.')[0])
1121 status2 = line.split()[4]
1122 if status2 in self.idle_tag:
1123 idle += 1
1124 elif status2 in self.running_tag:
1125 run += 1
1126 elif status2 in self.complete_tag:
1127 if not self.check_termination(line.split()[0].split('.')[0]):
1128 idle += 1
1129 else:
1130 fail += 1
1131
1132 if status.returncode != 0 and status.returncode is not None:
1133 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode
1134
1135 for id in list(self.submitted_ids):
1136 if id not in ongoing:
1137 status2 = self.check_termination(id)
1138 if status2 == 'wait':
1139 run += 1
1140 elif status2 == 'resubmit':
1141 idle += 1
1142
1143 return idle, run, self.submitted - (idle+run+fail), fail
1144
1145 @multiple_try()
    def remove(self, *args, **opts):
1147 """Clean the jobs on the cluster"""
1148
1149 if not self.submitted_ids:
1150 return
1151 cmd = "qdel %s" % ' '.join(self.submitted_ids)
1152 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1153
class SGECluster(Cluster):
    """Basic class for dealing with cluster submission"""
1157
1158
1159 name = 'sge'
1160 job_id = 'JOB_ID'
1161 idle_tag = ['qw', 'hqw','hRqw','w']
1162 running_tag = ['r','t','Rr','Rt']
1163 identifier_length = 10
1164
    def def_get_path(self, location):
        """Replace strings for path issues (substitute $HOME for the home directory)."""
1167 location = os.path.realpath(location)
1168 homePath = os.getenv("HOME")
1169 if homePath:
1170 location = location.replace(homePath,'$HOME')
1171 return location
1172
1173 @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1175 required_output=[], nb_submit=0):
1176 """Submit a job prog to an SGE cluster"""
1177
1178 me_dir = self.get_jobs_identifier(cwd, prog)
1179
1180
1181 if cwd is None:
1182
1183 cwd = self.def_get_path(os.getcwd())
1184 cwd1 = self.def_get_path(cwd)
1185 text = " cd %s;" % cwd1
1186 if stdout is None:
1187 stdout = '/dev/null'
1188 else:
1189 stdout = self.def_get_path(stdout)
1190 if stderr is None:
1191 stderr = '/dev/null'
1192 elif stderr == -2:
1193 stderr = stdout
1194 else:
1195 stderr = self.def_get_path(stderr)
1196
1197 if log is None:
1198 log = '/dev/null'
1199 else:
1200 log = self.def_get_path(log)
1201
1202 text += prog
1203 if argument:
1204 text += ' ' + ' '.join(argument)
1205
1206
1207
1208
1209 homePath = os.getenv("HOME")
1210 if homePath:
1211 text = text.replace(homePath,'$HOME')
1212
1213 logger.debug("!=== input %s" % text)
1214 logger.debug("!=== output %s" % stdout)
1215 logger.debug("!=== error %s" % stderr)
1216 logger.debug("!=== logs %s" % log)
1217
1218 command = ['qsub','-o', stdout,
1219 '-N', me_dir,
1220 '-e', stderr,
1221 '-V']
1222
1223 if self.cluster_queue and self.cluster_queue != 'None':
1224 command.extend(['-q', self.cluster_queue])
1225
1226 a = misc.Popen(command, stdout=subprocess.PIPE,
1227 stderr=subprocess.STDOUT,
1228 stdin=subprocess.PIPE, cwd=cwd)
1229
1230 output = a.communicate(text)[0]
1231 id = output.split(' ')[2]
1232 if not id.isdigit():
1233 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1234 % output
1235 self.submitted += 1
1236 self.submitted_ids.append(id)
1237 logger.debug(output)
1238
1239 return id
1240
1241 @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
1244
1245 cmd = 'qstat '
1246 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1247 for line in status.stdout:
1248
1249
1250
1251
1252
1253
1254 if str(id) in line:
1255 status = line.split()[4]
1256
1257 if status in self.idle_tag:
1258 return 'I'
1259 elif status in self.running_tag:
1260 return 'R'
1261 return 'F'
1262
1263 @multiple_try()
    def control(self, me_dir):
        """Check the status of all submitted jobs. Return (idle, run, finish, fail)."""
1266 cmd = "qstat "
1267 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1268
1269 me_dir = self.get_jobs_identifier(me_dir)
1270
1271 idle, run, fail = 0, 0, 0
1272 for line in status.stdout:
1273 if me_dir in line:
1274 status = line.split()[4]
1275 if status in self.idle_tag:
1276 idle += 1
1277 elif status in self.running_tag:
1278 run += 1
1279 else:
1280 logger.debug(line)
1281 fail += 1
1282
1283 return idle, run, self.submitted - (idle+run+fail), fail
1284
1285
1286
1287 @multiple_try()
    def remove(self, *args, **opts):
1289 """Clean the jobs on the cluster"""
1290
1291 if not self.submitted_ids:
1292 return
1293 cmd = "qdel %s" % ' '.join(self.submitted_ids)
1294 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1295
class LSFCluster(Cluster):
    """Basic class for dealing with cluster submission"""
1299
1300 name = 'lsf'
1301 job_id = 'LSB_JOBID'
1302
1303 @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1305 required_output=[], nb_submit=0):
1306 """Submit the job prog to an LSF cluster"""
1307
1308
1309 me_dir = self.get_jobs_identifier(cwd, prog)
1310
1311 text = ""
1312 command = ['bsub', '-C0', '-J', me_dir]
1313 if cwd is None:
1314 cwd = os.getcwd()
1315 else:
1316 text = " cd %s;" % cwd
1317 if stdout and isinstance(stdout, str):
1318 command.extend(['-o', stdout])
if stderr and isinstance(stderr, str):
1320 command.extend(['-e', stderr])
1321 elif stderr == -2:
1322 pass
1323 if log is None:
1324 log = '/dev/null'
1325
1326 text += prog
1327 if argument:
1328 text += ' ' + ' '.join(argument)
1329
1330 if self.cluster_queue and self.cluster_queue != 'None':
1331 command.extend(['-q', self.cluster_queue])
1332
1333 a = misc.Popen(command, stdout=subprocess.PIPE,
1334 stderr=subprocess.STDOUT,
1335 stdin=subprocess.PIPE, cwd=cwd)
1336
1337 output = a.communicate(text)[0]
1338
1339 try:
1340 id = output.split('>',1)[0].split('<')[1]
1341 except:
1342 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1343 % output
1344 if not id.isdigit():
1345 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1346 % output
1347 self.submitted += 1
1348 self.submitted_ids.append(id)
1349 return id
1350
1351
1352 @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
1355
1356 cmd = 'bjobs '+str(id)
1357 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1358
1359 for line in status.stdout:
1360 line = line.strip().upper()
1361 if 'JOBID' in line:
1362 continue
1363 elif str(id) not in line:
1364 continue
1365 status = line.split()[2]
1366 if status == 'RUN':
1367 return 'R'
1368 elif status == 'PEND':
1369 return 'I'
1370 elif status == 'DONE':
1371 return 'F'
1372 else:
1373 return 'H'
1374 return 'F'
1375
1376 @multiple_try()
    def control(self, me_dir):
        """Check the status of all submitted jobs. Return (idle, run, finish, fail)."""
1379
1380 if not self.submitted_ids:
1381 return 0, 0, 0, 0
1382
1383 cmd = "bjobs " + ' '.join(self.submitted_ids)
1384 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1385
1386 jobstatus = {}
1387 for line in status.stdout:
1388 line = line.strip()
1389 if 'JOBID' in line:
1390 continue
1391 splitline = line.split()
1392 id = splitline[0]
1393 if id not in self.submitted_ids:
1394 continue
1395 jobstatus[id] = splitline[2]
1396
1397 idle, run, fail = 0, 0, 0
1398 for id in self.submitted_ids[:]:
1399 if id in jobstatus:
1400 status = jobstatus[id]
1401 else:
1402 status = 'MISSING'
1403 if status == 'RUN':
1404 run += 1
1405 elif status == 'PEND':
1406 idle += 1
1407 else:
1408 status = self.check_termination(id)
1409 if status == 'wait':
1410 run += 1
1411 elif status == 'resubmit':
1412 idle += 1
1413
1414 return idle, run, self.submitted - (idle+run+fail), fail
1415
1416 @multiple_try()
    def remove(self, *args, **opts):
1418 """Clean the jobs on the cluster"""
1419
1420 if not self.submitted_ids:
1421 return
1422 cmd = "bkill %s" % ' '.join(self.submitted_ids)
1423 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1424
class GECluster(Cluster):
    """Class for dealing with cluster submission on a GE cluster"""
1427
1428 name = 'ge'
1429 job_id = 'JOB_ID'
1430 idle_tag = ['qw']
1431 running_tag = ['r']
1432
1433 @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1435 required_output=[], nb_submit=0):
1436 """Submit a job prog to a GE cluster"""
1437
1438 text = ""
1439 if cwd is None:
1440 cwd = os.getcwd()
1441 else:
1442 text = " cd %s; bash " % cwd
1443 if stdout is None:
1444 stdout = os.path.join(cwd, "log.%s" % prog.split('/')[-1])
1445 if stderr is None:
1446 stderr = os.path.join(cwd, "err.%s" % prog.split('/')[-1])
1447 elif stderr == -2:
1448 stderr = stdout
1449 if log is None:
1450 log = '/dev/null'
1451
1452 text += prog
1453 if argument:
1454 text += ' ' + ' '.join(argument)
1455 text += '\n'
1456 tmp_submit = os.path.join(cwd, 'tmp_submit')
1457 open(tmp_submit,'w').write(text)
1458
1459 a = misc.Popen(['qsub','-o', stdout,
1460 '-e', stderr,
1461 tmp_submit],
1462 stdout=subprocess.PIPE,
1463 stderr=subprocess.STDOUT,
1464 stdin=subprocess.PIPE, cwd=cwd)
1465
1466 output = a.communicate()[0]
1467
1468 pat = re.compile("Your job (\d*) \(",re.MULTILINE)
1469 try:
1470 id = pat.search(output).groups()[0]
1471 except:
1472 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1473 % output
1474 self.submitted += 1
1475 self.submitted_ids.append(id)
1476 return id
1477
1478 @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
1481 cmd = 'qstat | grep '+str(id)
1482 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1483 if not status:
1484 return 'F'
1485
1486 pat = re.compile("^(\d+)\s+[\d\.]+\s+[\w\d\.]+\s+[\w\d\.]+\s+(\w+)\s")
1487 stat = ''
1488 for line in status.stdout.read().split('\n'):
1489 if not line:
1490 continue
1491 line = line.strip()
1492 try:
1493 groups = pat.search(line).groups()
1494 except:
1495 raise ClusterManagmentError, 'bad syntax for stat: \n\"%s\"' % line
1496 if groups[0] != id: continue
1497 stat = groups[1]
1498 if not stat:
1499 return 'F'
1500 if stat in self.idle_tag:
1501 return 'I'
1502 if stat in self.running_tag:
1503 return 'R'
1504
1505 @multiple_try()
    def control(self, me_dir):
        """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)"""
1508 if not self.submitted_ids:
1509 return 0, 0, 0, 0
1510 idle, run, fail = 0, 0, 0
1511 ongoing = []
1512 for statusflag in ['p', 'r', 'sh']:
1513 cmd = 'qstat -s %s' % statusflag
1514 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1515
1516 pat = re.compile("^(\d+)")
1517 for line in status.stdout.read().split('\n'):
1518 line = line.strip()
1519 try:
1520 id = pat.search(line).groups()[0]
1521 except Exception:
1522 pass
1523 else:
1524 if id not in self.submitted_ids:
1525 continue
1526 ongoing.append(id)
1527 if statusflag == 'p':
1528 idle += 1
1529 if statusflag == 'r':
1530 run += 1
1531 if statusflag == 'sh':
1532 fail += 1
1533 for id in list(self.submitted_ids):
1534 if id not in ongoing:
1535 self.check_termination(id)
1536
1537
1538 return idle, run, self.submitted - idle - run - fail, fail
1539
1540 @multiple_try()
    def remove(self, *args, **opts):
1542 """Clean the jobs on the cluster"""
1543
1544 if not self.submitted_ids:
1545 return
1546 cmd = "qdel %s" % ' '.join(self.submitted_ids)
1547 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1548
def asyncrone_launch(exe, argument=[], cwd=None, stdout=None, **opt):
    """Start a computation and do not wait for it to finish.
    This function returns a lock which is locked as long as the job is
    running."""
1553
1554 mc = MultiCore(1)
1555 mc.submit(exe, argument, cwd, stdout, **opt)
1556 mc.need_waiting = True
1557 mc.lock.acquire()
1558 return mc.lock
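# Minimal usage sketch for asyncrone_launch (illustrative only: the script name
# and the polling loop are assumptions, not part of the original module):
def _example_asyncrone_launch():  # hypothetical helper, never called by MadGraph
    lock = asyncrone_launch('./my_script.sh', argument=['1'], cwd=os.getcwd(),
                            stdout=open('my_script.log', 'w'))
    # The lock stays acquired while the job runs, so poll it instead of blocking.
    while lock.locked():
        time.sleep(5)
    logger.info('asynchronous job finished')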
1559
class SLURMCluster(Cluster):
    """Basic class for dealing with cluster submission"""
1563
1564 name = 'slurm'
1565 job_id = 'SLURM_JOBID'
1566 idle_tag = ['Q','PD','S','CF']
1567 running_tag = ['R', 'CG']
1568 complete_tag = ['C']
1569 identification_length = 8
1570
1571 @multiple_try()
    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1573 required_output=[], nb_submit=0):
1574 """Submit a job prog to a SLURM cluster"""
1575
1576 me_dir = self.get_jobs_identifier(cwd, prog)
1577
1578
1579 if cwd is None:
1580 cwd = os.getcwd()
1581 if stdout is None:
1582 stdout = '/dev/null'
1583 if stderr is None:
1584 stderr = '/dev/null'
1585 elif stderr == -2:
1586 stderr = stdout
1587 if log is None:
1588 log = '/dev/null'
1589
1590 command = ['sbatch', '-o', stdout,
1591 '-J', me_dir,
1592 '-e', stderr, prog] + argument
1593
1594 if self.cluster_queue and self.cluster_queue != 'None':
1595 command.insert(1, '-p')
1596 command.insert(2, self.cluster_queue)
1597
1598 a = misc.Popen(command, stdout=subprocess.PIPE,
1599 stderr=subprocess.STDOUT,
1600 stdin=subprocess.PIPE, cwd=cwd)
1601
1602 output = a.communicate()
1603 output_arr = output[0].split(' ')
1604 id = output_arr[3].rstrip()
1605
1606 if not id.isdigit():
raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
% output
1609 self.submitted += 1
1610 self.submitted_ids.append(id)
1611 return id
1612
1613 @multiple_try()
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
cmd = 'squeue -j '+str(id)
1617 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
1618 stderr=open(os.devnull,'w'))
1619
1620 for line in status.stdout:
1621 line = line.strip()
1622 if 'Invalid' in line:
1623 return 'F'
1624 elif line.startswith(str(id)):
1625 status = line.split()[4]
1626 if status in self.idle_tag:
1627 return 'I'
1628 elif status in self.running_tag:
1629 return 'R'
1630 return 'F'
1631
1632 @multiple_try()
    def control(self, me_dir):
        """Check the status of all submitted jobs. Return (idle, run, finish, fail)."""
1635 cmd = "squeue"
1636 status = misc.Popen([cmd], stdout=subprocess.PIPE)
1637
1638 me_dir = self.get_jobs_identifier(me_dir)
1639
1640 idle, run, fail = 0, 0, 0
1641 ongoing=[]
1642 for line in status.stdout:
1643 if me_dir in line:
1644 id, _, _,_ , status,_ = line.split(None,5)
1645 ongoing.append(id)
1646 if status in self.idle_tag:
1647 idle += 1
1648 elif status in self.running_tag:
1649 run += 1
1650 elif status in self.complete_tag:
1651 status = self.check_termination(id)
1652 if status == 'wait':
1653 run += 1
1654 elif status == 'resubmit':
1655 idle += 1
1656 else:
1657 fail += 1
1658
1659
1660 for id in list(self.submitted_ids):
1661 if id not in ongoing:
1662 status = self.check_termination(id)
1663 if status == 'wait':
1664 run += 1
1665 elif status == 'resubmit':
1666 idle += 1
1667
1668
1669 return idle, run, self.submitted - (idle+run+fail), fail
1670
1671 @multiple_try()
    def remove(self, *args, **opts):
1673 """Clean the jobs on the cluster"""
1674
1675 if not self.submitted_ids:
1676 return
1677 cmd = "scancel %s" % ' '.join(self.submitted_ids)
1678 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1679
class HTCaaSCluster(Cluster):
    """Class for dealing with cluster submission on a HTCaaS cluster using GPFS"""
1682
1683 name= 'htcaas'
1684 job_id = 'HTCAAS_JOBID'
1685
1686 @store_input()
1687 @multiple_try()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
1689 log=None, input_files=[], output_files=[], required_output=[],
1690 nb_submit=0):
1691 """Submit the HTCaaS job on the cluster with NO SHARE DISK
input/output files should be given relative to cwd
1693 """
1694
1695 if 'ajob' in prog:
1696 prog_num = prog.rsplit("ajob",1)[1]
1697 else:
1698 prog_num = '0'
1699
1700 cur_usr = os.getenv('USER')
1701
1702 if cwd is None:
1703 cwd = os.getcwd()
1704
1705 cwd_cp = cwd.rsplit("/",2)
1706
1707
1708 if not stdout is None:
1709 print "stdout: %s" % stdout
1710
1711 if not os.path.exists(prog):
1712 prog = os.path.join(cwd, prog)
1713
1714 if not required_output and output_files:
1715 required_output = output_files
1716
1717
if 'combine' not in prog and 'pythia' not in prog:
1719 cwd_arg = cwd+"/arguments"
1720 temp = ' '.join([str(a) for a in argument])
1721 arg_cmd="echo '"+temp+"' > " + cwd_arg
1722
1723
1724 command = ['htcaas-mgjob-submit','-d',cwd,'-e',os.path.basename(prog)]
1725 if argument :
1726 command.extend(['-a ', '='.join([str(a) for a in argument])])
1727 print command
1728 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1729 id = a.stdout.read().strip()
1730
1731 else:
1732 cwd_arg = cwd+"/arguments"
1733 temp = ' '.join([str(a) for a in argument])
1734
1735
1736
1737
1738 temp_file_name = "sub." + os.path.basename(prog)
1739 text = """#!/bin/bash
1740 MYPWD=%(cwd)s
1741 cd $MYPWD
1742 input_files=(%(input_files)s )
1743 for i in ${input_files[@]}
1744 do
1745 chmod -f +x $i
1746 done
1747 /bin/bash %(prog)s %(arguments)s > %(stdout)s
1748 """
1749 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog,
1750 'arguments': ' '.join([str(a) for a in argument]),
1751 'program': ' ' if '.py' in prog else 'bash'}
1752
1753
1754 new_prog = pjoin(cwd, temp_file_name)
1755 open(new_prog, 'w').write(text % dico)
1756 misc.Popen(['chmod','+x',new_prog],cwd=cwd)
1757 command = ['htcaas-mgjob-submit','-d',cwd,'-e',temp_file_name]
1758 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1759 id = a.stdout.read().strip()
1760
1761 nb_try=0
1762 nb_limit=5
1763 if not id.isdigit() :
1764 print "[ID is not digit]:" + id
1765
1766 while not id.isdigit() :
1767 nb_try+=1
print "[fail_retry]:" + str(nb_try)
1769 a=misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1770 id = a.stdout.read().strip()
1771 if nb_try > nb_limit :
raise ClusterManagmentError, 'fail to submit to the HTCaaS cluster: \n %s' % id
1773 break
1774
1775 self.submitted += 1
1776 self.submitted_ids.append(id)
1777
1778 return id
1779
1780 @multiple_try(nb_try=10, sleep=10)
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
1783
1784 if id == 0 :
1785 status_out ='C'
1786 else :
1787 cmd = 'htcaas-job-status -m '+str(id)+ " -s | grep Status "
1788 status = misc.Popen([cmd], shell=True,stdout=subprocess.PIPE,
1789 stderr=subprocess.PIPE)
1790 error = status.stderr.read()
1791 if status.returncode or error:
1792 raise ClusterManagmentError, 'htcaas-job-submit returns error: %s' % error
1793 status_out= status.stdout.read().strip()
1794 status_out= status_out.split(":",1)[1]
1795 if status_out == 'waiting':
1796 status_out='I'
1797 elif status_out == 'preparing' or status_out == 'running':
1798 status_out = 'R'
1799 elif status_out != 'done':
1800 status_out = 'F'
1801 elif status_out == 'done':
1802 status_out = 'C'
1803
1804 return status_out
1805
1806 @multiple_try(nb_try=15, sleep=1)
    def control(self, me_dir):
        """Check the status of all submitted jobs. Return (idle, run, finish, fail)."""
1809
1810 if not self.submitted_ids:
1811 return 0, 0, 0, 0
1812
1813 ongoing = []
1814 idle, run, fail = 0, 0, 0
1815
1816 if id == 0 :
1817 return 0 , 0, 0, 0
1818 else :
1819 for i in range(len(self.submitted_ids)):
1820 ongoing.append(int(self.submitted_ids[i]))
1821 cmd = "htcaas-job-status -m " + self.submitted_ids[i] + " -s | grep Status "
1822 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1823 status_out= status.stdout.read().strip()
1824 status_out= status_out.split(":",1)[1]
1825 if status_out == 'waiting':
1826 idle += 1
1827 elif status_out == 'preparing':
1828 run += 1
1829 elif status_out == 'running':
1830 run += 1
1831 elif status_out != 'done':
1832 fail += 1
1833
1834 if status_out != 'done':
1835 print "["+ self.submitted_ids[i] + "] " + status_out
1836 '''
1837 for i in range(len(self.submitted_ids)):
1838 if int(self.submitted_ids[i]) not in ongoing:
1839 status = self.check_termination(int(self.submitted_ids[i]))
1840 if status = 'waiting':
1841 idle += 1
1842 elif status == 'resubmit':
1843 idle += 1
1844 elif status == 'failed':
1845 fail += 1
1846 '''
1847
1848 return idle, run, self.submitted - (idle+run+fail), fail
1849
1850 @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""
1853
1854 if not self.submitted_ids:
1855 return
1856 for i in range(len(self.submitted_ids)):
cmd = "htcaas-job-cancel -m %s" % self.submitted_ids[i]
1858 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1859
class HTCaaS2Cluster(Cluster):
    """Class for dealing with cluster submission on a HTCaaS cluster"""
1863
1864 name= 'htcaas2'
1865 job_id = 'HTCAAS2_JOBID'
1866
1867 @store_input()
1868 @multiple_try()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
1870 log=None, input_files=[], output_files=[], required_output=[],
1871 nb_submit=0):
1872 """Submit the job on the cluster NO SHARE DISK
input/output files should be given relative to cwd
1874 """
1875
1876 if 'ajob' in prog:
1877 prog_num = prog.rsplit("ajob",1)[1]
1878 elif 'run_combine' in prog:
1879 prog_num = '0'
1880 else:
1881 prog_num = prog
1882
1883 cur_usr = os.getenv('USER')
1884
1885 import uuid
1886 dir = str(uuid.uuid4().hex)
1887
1888 prog_dir = '_run%s'% prog_num
1889 prog_dir = dir+prog_dir
1890
1891 if cwd is None:
1892 cwd = os.getcwd()
1893
1894 cwd_cp = cwd.rsplit("/",2)
1895
1896 if stdout is None:
1897 stdout='/dev/null'
1898
1899 if not os.path.exists(prog):
1900 prog = os.path.join(cwd, prog)
1901
1902 if not required_output and output_files:
1903 required_output = output_files
1904
1905 if '/' in argument :
1906 temp_file_name = "sub." + os.path.basename(prog)
1907 else :
1908 temp_file_name = "sub." + os.path.basename(prog) + '.'.join(argument)
1909
1910
1911 if 'combine' in prog or 'pythia' in prog :
1912 text = """#!/bin/bash
1913 MYPWD=%(cwd)s
1914 cd $MYPWD
1915 script=%(script)s
1916 input_files=(%(input_files)s )
1917 if [ $# -ge 1 ]; then
1918 arg1=$1
1919 else
1920 arg1=''
1921 fi
1922 args=' %(arguments)s'
1923 for i in ${input_files[@]}; do
1924 if [[ "$i" == *$script* ]]; then
1925 script=$i
1926 fi
1927 chmod -f +x $i
1928 done
1929 /bin/bash ${script} ${args} > %(stdout)s
1930 """
1931
1932 elif 'shower' in prog :
1933 text = """#!/bin/bash
1934 MYPWD=%(cwd)s
1935 cd $MYPWD
1936 args=' %(arguments)s'
1937 input_files=( %(input_files)s )
1938 for i in ${input_files[@]}
1939 do
1940 chmod -f +x $i
1941 done
1942 /bin/bash %(script)s ${args} > $MYPWD/done
1943 """
1944
1945 else :
1946 text = """#!/bin/bash
1947 MYPWD=%(cwd)s
1948 #mkdir -p $MYTMP
1949 cd $MYPWD
1950 input_files=( %(input_files)s )
1951 for i in ${input_files[@]}
1952 do
1953 if [[ $i != */*/* ]]; then
1954 i=$PWD"/"$i
1955 fi
1956 echo $i
1957 if [ -d $i ]; then
1958 htcaas-file-put -l $i -r /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/ -i %(cur_usr)s
1959 else
1960 htcaas-file-put -f $i -r /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/ -i %(cur_usr)s
1961 fi
1962 done
1963 """
1964
1965 dico = {'cur_usr' : cur_usr, 'script': os.path.basename(prog),
1966 'cwd': cwd, 'job_id': self.job_id, 'prog_dir': prog_dir,
1967 'input_files': ' '.join(input_files + [prog]),
1968 'output_files': ' '.join(output_files), 'stdout': stdout,
1969 'arguments': ' '.join([str(a) for a in argument]),
1970 'program': ' ' if '.py' in prog else 'bash'}
1971
1972
1973 new_prog = pjoin(cwd, temp_file_name)
1974 open(new_prog, 'w').write(text % dico)
1975 misc.Popen(['chmod','+x',new_prog],cwd=cwd)
1976
1977
1978 cmd1='/bin/bash '+ cwd+'/'+temp_file_name
1979 status1 = misc.Popen([cmd1], shell=True, stdout=subprocess.PIPE,
1980 stderr=subprocess.PIPE)
1981
1982
1983
1984 if not 'combine' in prog and not 'shower' in prog and not 'pythia' in prog:
1985
1986 cmd3 = """htcaas-mgjob-submit -d /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/ -e %(script)s %(arguments)s"""
1987 dico3 = {'cur_usr' : cur_usr, 'script': os.path.basename(prog),
1988 'arguments': ' ' if not argument else "-a " + '='.join([str(a) for a in argument]) ,
1989 'prog_dir': prog_dir }
1990 status3 = misc.Popen([cmd3 % dico3], shell=True, stdout=subprocess.PIPE,
1991 stderr=subprocess.PIPE)
1992 id = status3.stdout.read().strip()
1993
1994 nb_try=0
1995 nb_limit=5
1996 while not id.isdigit() :
1997 nb_try+=1
1998 a=misc.Popen( [cmd3 % dico3], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1999 id = a.stdout.read().strip()
2000 if nb_try > nb_limit :
2001 raise ClusterManagmentError, 'Fail to submit to the HTCaaS cluster: \n %s' % id
2002 break
2003
2004 temp_file_name2 = "sub." +id
2005 text2 = """#!/bin/bash
2006 MYPWD=%(cwd)s
2007 output_files=( %(output_files)s )
2008 result=done
2009 if [ ! -e ${MYPWD}/done.%(job_id)s ]; then
2010 for i in ${output_files[@]}
2011 do
2012 htcaas-file-get -l ${MYPWD}/$i -r /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/$i -i %(cur_usr)s
2013 chmod -Rf 777 ${MYPWD}/$i
2014 done
2015 for i in ${output_files[@]}; do
2016 if [[ -e ${MYPWD}/$i ]]; then
2017 result=done
2018 else
2019 result=running
2020 echo $result
2021 exit 0
2022 fi
2023 done
2024 echo $result
2025 touch ${MYPWD}/done.%(job_id)s
2026 else
2027 for i in ${output_files[@]}; do
2028 if [ -e ${MYPWD}/$i ]; then
2029 result=done
2030 else
2031 rm -f ${MYPWD}/done.%(job_id)s
2032 result=running
2033 echo $result
2034 exit 0
2035 fi
2036 done
2037 echo $result
2038
2039 fi
2040
2041 """
2042 dico2 = {'cur_usr' : cur_usr, 'script': os.path.basename(prog),
2043 'cwd': cwd, 'job_id': self.job_id, 'prog_dir': prog_dir,
2044 'output_files': ' '.join(output_files), 'job_id': id,
2045 'program': ' ' if '.py' in prog else 'bash'}
2046
2047 homePath = os.getenv("HOME")
2048 outPath = homePath +"/MG5"
2049
2050 new_prog2 = pjoin(outPath, temp_file_name2)
2051 open(new_prog2, 'w').write(text2 % dico2)
2052 misc.Popen(['chmod','+x',new_prog2],cwd=cwd)
2053
2054
2055 self.submitted += 1
2056 self.submitted_ids.append(id)
2057
2058 elif 'combine' in prog or 'shower' in prog or 'pythia' in prog:
2059 if '/dev/null' in stdout :
2060 stdout=''
2061
2062 temp_file_shower = "sub.out"
2063 text_shower = """#!/bin/bash
2064 MYPWD=%(cwd)s
2065 result=done
2066 output_files=(%(output_files)s)
2067 for i in ${output_files[@]}; do
2068 if [ -e $MYPWD/$i -o -e $i ]; then
2069 result=done
2070 else
2071 result=running
2072 echo $result
2073 exit 0
2074 fi
2075 done
2076 echo $result
2077 """
2078 dico_shower = { 'cwd': cwd, 'output_files': ' '.join([stdout]+output_files),
2079 'program': ' ' if '.py' in prog else 'bash'}
2080 homePath = os.getenv("HOME")
2081 outPath = homePath +"/MG5"
2082 new_prog_shower = pjoin(outPath, temp_file_shower)
2083 open(new_prog_shower, 'w').write(text_shower % dico_shower)
2084 misc.Popen(['chmod','+x',new_prog_shower],cwd=cwd)
2085
2086 id='-1'
2087 self.submitted += 1
2088 self.submitted_ids.append(id)
2089
2090 else :
2091 id='-2'
2092 self.submitted += 1
2093 self.submitted_ids.append(id)
2094
2095 return id
2096
2097 @multiple_try(nb_try=10, sleep=10)
    def control_one_job(self, id):
        """Control the status of a single job with its cluster id."""
2100
2101 homePath = os.getenv("HOME")
2102 outPath = homePath +"/MG5"
2103
2104
2105 if id == '0' or id=='-2' :
2106 status_out ='done'
2107 elif id == '-1' :
2108 cmd='/bin/bash ' +outPath+'/sub.out'
2109 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE)
2110 status_out=status.stdout.read().strip()
2111 print "["+id+"]" + status_out
2112 if status_out == 'waiting':
2113 status_out='wait'
2114 elif status_out == 'preparing' or status_out == 'running':
2115 status_out = 'R'
2116 elif status_out != 'done':
2117 status_out = 'F'
2118 elif status_out == 'done':
2119 status_out = 'C'
2120
2121 print "["+id+"]" + status_out
2122 else :
2123 cmd = 'htcaas-job-status -m '+str(id)+ " -s | grep Status "
2124 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
2125 stderr=subprocess.PIPE)
2126 error = status.stderr.read()
2127 if status.returncode or error:
2128 raise ClusterManagmentError, 'htcaas-job-submit returns error: %s' % error
2129 status_out= status.stdout.read().strip()
2130 status_out= status_out.split(":",1)[1]
2131 print "["+id+"]" + status_out
2132 if status_out == 'waiting':
2133 status_out='wait'
2134 elif status_out == 'preparing' or status_out == 'running':
2135 status_out = 'R'
2136 elif status_out == 'failed' :
2137 args = self.retry_args[id]
2138 id_temp = self.submit2(**args)
2139 del self.retry_args[id]
2140 self.submitted_ids.remove(id)
2141 status_out = 'I'
2142 elif status_out != 'done':
2143 status_out = 'F'
2144 elif status_out == 'done':
2145 status_out = 'C'
2146
2147 return status_out
2148
2149
2150 @check_interupt()
2151 @multiple_try(nb_try=15, sleep=10)
    def control(self, me_dir):
        """Check the status of all submitted jobs. Return (idle, run, finish, fail)."""
2154
2155 if not self.submitted_ids:
2156 return 0, 0, 0, 0
2157
2158 ongoing = []
2159 idle, run, fail = 0, 0, 0
2160
2161 homePath = os.getenv("HOME")
2162 outPath = homePath +"/MG5"
2163
2164 for i in range(len(self.submitted_ids)):
2165 ongoing.append(self.submitted_ids[i])
2166 if self.submitted_ids[i] == '-2' :
2167 return 0,0,0,0
2168 if self.submitted_ids[i] == '0' :
2169
2170 status_out='done'
2171 elif self.submitted_ids[i] == '-1' :
2172 cmd='/bin/bash ' +outPath+'/sub.out'
2173 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE)
2174 status_out=status.stdout.read().strip()
2175 if status_out == 'waiting':
2176 idle += 1
2177 elif status_out == 'preparing':
2178 run += 1
2179 elif status_out == 'running':
2180 run += 1
2181 elif status_out != 'done':
2182 fail += 1
2183 else :
2184 args = self.retry_args[str(self.submitted_ids[i])]
2185 if 'required_output'in args and not args['required_output']:
2186 args['required_output'] = args['output_files']
2187 self.retry_args[str(self.submitted_ids[i])] = args
2188
2189 cmd = "htcaas-job-status -m " + self.submitted_ids[i] + " -s | grep Status "
2190 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2191 status_out= status.stdout.read().strip()
2192 status_out= status_out.split(":",1)[1]
2193 if status_out == 'waiting':
2194 idle += 1
2195 elif status_out == 'preparing':
2196 run += 1
2197 elif status_out == 'running':
2198 run += 1
2199 elif status_out == 'failed' or status_out == 'canceled':
2200 id = self.submit2(**args)
2201
2202 del self.retry_args[self.submitted_ids[i]]
2203 self.submitted_ids.remove(self.submitted_ids[i])
2204 self.submitted-=1
2205 idle += 1
2206 elif status_out != 'done':
2207 fail += 1
2208 if status_out == 'done':
2209 cmd2='/bin/bash '+ outPath+'/sub.'+self.submitted_ids[i]
2210 status2 = misc.Popen([cmd2], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE)
2211 aa= status2.stdout.read().strip()
2212
2213
2214
2215
2216
2217
2218
2219 for path in args['required_output']:
2220 if args['cwd']:
2221 path = pjoin(args['cwd'], path)
2222
2223 temp1=os.path.exists(path)
2224 temp2=os.stat(path).st_size
2225 if not (os.path.exists(path) and os.stat(path).st_size != 0) :
2226 status2 = misc.Popen([cmd2], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE)
2227 aa= status2.stdout.read().strip()
2228 if aa == 'done':
2229 self.submitted_ids[i] = '0'
2230 elif aa == 'running':
2231 run += 1
2232 else :
2233 self.submitted_ids[i]='0'
2234
2235
2236 for i in range(len(self.submitted_ids)):
2237 if str(self.submitted_ids[i]) not in ongoing:
2238 status2= self.check_termination(str(self.submitted_ids[i]))
2239 if status2 == 'wait':
2240 run += 1
2241 elif status2 == 'resubmit':
2242 idle += 1
2243
2244 return idle, run, self.submitted - (idle+run+fail), fail
2245
2246 @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""
2249
2250 if not self.submitted_ids:
2251 return
2252 for i in range(len(self.submitted_ids)):
cmd = "htcaas-job-cancel -m %s" % self.submitted_ids[i]
2254 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
2255
2256
2257 from_name = {'condor':CondorCluster, 'pbs': PBSCluster, 'sge': SGECluster,
2258 'lsf': LSFCluster, 'ge':GECluster, 'slurm': SLURMCluster,
2259 'htcaas':HTCaaSCluster, 'htcaas2':HTCaaS2Cluster}
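# Illustrative end-to-end sketch (assumptions: a 'condor' backend selected from
# the mapping above and a hypothetical './run.sh' script; this helper is not
# part of the original module and is never called by it):
def _example_run_on_cluster(cluster_type='condor'):
    cluster = from_name[cluster_type](cluster_queue='madgraph',
                                      cluster_nb_retry=1,
                                      cluster_retry_wait=300,
                                      cluster_status_update=(600, 30))
    cluster.submit2('./run.sh', argument=['0'], cwd=os.getcwd(),
                    stdout='run.log', required_output=['run.log'])
    # Block until control() reports that no job is idle or running any more.
    cluster.wait(os.getcwd(), lambda idle, run, finish:
                 logger.info('idle %s / running %s / done %s', idle, run, finish))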
2260