
Source Code for Module madgraph.various.cluster

   1  ################################################################################ 
   2  # Copyright (c) 2009 The MadGraph5_aMC@NLO Development team and Contributors              
   3  # 
   4  # This file is a part of the MadGraph5_aMC@NLO project, an application which            
   5  # automatically generates Feynman diagrams and matrix elements for arbitrary     
   6  # high-energy processes in the Standard Model and beyond.                        
   7  # 
   8  # It is subject to the MadGraph5_aMC@NLO license which should accompany this              
   9  # distribution.                                                                  
  10  #                                                                                
  11  # For more information, visit madgraph.phys.ucl.ac.be and amcatnlo.web.cern.ch             
  12  #                                                                                
  13  ################################################################################ 
  14  import subprocess 
  15  import logging 
  16  import os 
  17  import time 
  18  import re 
  19  import glob 
  20  import inspect 
  21  import sys 
  22   
  23  logger = logging.getLogger('madgraph.cluster')  
  24   
  25  try: 
  26      from madgraph import MadGraph5Error 
  27      import madgraph.various.misc as misc 
  28  except Exception, error: 
  29      if __debug__: 
  30          print  str(error) 
  31      from internal import MadGraph5Error 
  32      import internal.misc as misc 
  33   
  34  pjoin = os.path.join 
  35
  36  class ClusterManagmentError(MadGraph5Error):
  37      pass
  38
  39  class NotImplemented(MadGraph5Error):
  40      pass
  41
  42
  43  multiple_try = misc.multiple_try
  44  pjoin = os.path.join
  45
  46
  47  def check_interupt(error=KeyboardInterrupt):
  48
  49      def deco_interupt(f):
  50          def deco_f_interupt(self, *args, **opt):
  51              try:
  52                  return f(self, *args, **opt)
  53              except error:
  54                  try:
  55                      self.remove(*args, **opt)
  56                  except Exception:
  57                      pass
  58                  raise error
  59          return deco_f_interupt
  60      return deco_interupt
  61
  62  def store_input(arg=''):
  63
  64      def deco_store(f):
  65          def deco_f_store(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
  66                           input_files=[], output_files=[], required_output=[], nb_submit=0):
  67              frame = inspect.currentframe()
  68              args, _, _, values = inspect.getargvalues(frame)
  69              args = dict([(i, values[i]) for i in args if i != 'self'])
  70              id = f(self, **args)
  71              if self.nb_retry > 0:
  72                  self.retry_args[id] = args
  73              return id
  74          return deco_f_store
  75      return deco_store
  76
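
The two decorators above are used by the Cluster class further down: check_interupt() catches a KeyboardInterrupt raised while waiting on or controlling jobs and calls self.remove() before re-raising, while store_input() records the arguments of each submit2() call in self.retry_args so that a failed job can later be resubmitted with exactly the same options. A minimal sketch of the same argument-caching pattern, using a hypothetical ToyCluster class that is not part of this module:

    import inspect

    def record_call(f):
        # capture the keyword arguments of the wrapped call, as store_input() does
        def wrapper(self, prog, argument=[], cwd=None):
            frame = inspect.currentframe()
            args, _, _, values = inspect.getargvalues(frame)
            stored = dict((i, values[i]) for i in args if i != 'self')
            job_id = f(self, **stored)
            self.retry_args[job_id] = stored   # kept around for a later resubmission
            return job_id
        return wrapper

    class ToyCluster(object):
        def __init__(self):
            self.retry_args = {}
            self.counter = 0
        @record_call
        def submit(self, prog, argument=[], cwd=None):
            self.counter += 1
            return self.counter

    toy = ToyCluster()
    job = toy.submit('run.sh', argument=['1'], cwd='/tmp')
    # toy.retry_args[job] now holds {'prog': 'run.sh', 'argument': ['1'], 'cwd': '/tmp'}
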
  77  def need_transfer(options):
  78      """ This function checks whether transfer of the input files is necessary
  79      given the running options. """
  80
  81      if options['run_mode'] != 1 and options['cluster_temp_path'] is None:
  82          return False
  83      else:
  84          return True
85
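
As the test above shows, input files only need to be shipped with the job when running in cluster mode (run_mode equal to 1) or when a temporary cluster path is configured; a single-core or multicore run on a shared disk transfers nothing. An illustrative call, with the option dictionaries reduced to the two keys the function actually reads:

    opts_multicore = {'run_mode': 2, 'cluster_temp_path': None}
    opts_cluster   = {'run_mode': 1, 'cluster_temp_path': None}
    opts_tmpdir    = {'run_mode': 2, 'cluster_temp_path': '/scratch'}

    print(need_transfer(opts_multicore))   # False: shared disk, nothing to ship
    print(need_transfer(opts_cluster))     # True: cluster mode
    print(need_transfer(opts_tmpdir))      # True: a cluster_temp_path is set
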
  86  class Cluster(object):
  87      """Basic Class for all cluster type submission"""
  88      name = 'mother class'
  89      identifier_length = 14
  90
  91      def __init__(self, *args, **opts):
  92          """Init the cluster"""
  93
  94          self.submitted = 0
  95          self.submitted_ids = []
  96          self.finish = 0
  97          if 'cluster_queue' in opts:
  98              self.cluster_queue = opts['cluster_queue']
  99          else:
 100              self.cluster_queue = 'madgraph'
 101          if 'cluster_temp_path' in opts:
 102              self.temp_dir = opts['cluster_temp_path']
 103          else:
 104              self.temp_dir = None
 105          self.options = {'cluster_status_update': (600, 30)}
 106          for key, value in opts.items():
 107              self.options[key] = value
 108          self.nb_retry = opts['cluster_nb_retry'] if 'cluster_nb_retry' in opts else 0
 109          self.cluster_retry_wait = opts['cluster_retry_wait'] if 'cluster_retry_wait' in opts else 300
 110          self.options = dict(opts)
 111          self.retry_args = {}
 112          # controlling jobs in controlled type submission
 113          self.packet = {}
 114          self.id_to_packet = {}
115
 116      def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
 117                 log=None, required_output=[], nb_submit=0):
 118          """How to make one submission. Return status id on the cluster."""
 119          raise NotImplemented, 'No implementation of how to submit a job to cluster \'%s\'' % self.name
120 121 122 @store_input()
123 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 124 log=None, input_files=[], output_files=[], required_output=[], 125 nb_submit=0):
126 """How to make one submission. Return status id on the cluster. 127 NO SHARE DISK""" 128 129 if cwd is None: 130 cwd = os.getcwd() 131 if not os.path.exists(prog): 132 prog = os.path.join(cwd, prog) 133 134 if not required_output and output_files: 135 required_output = output_files 136 137 if not hasattr(self, 'temp_dir') or not self.temp_dir or \ 138 (input_files == [] == output_files): 139 return self.submit(prog, argument, cwd, stdout, stderr, log, 140 required_output=required_output, nb_submit=nb_submit) 141 142 if not input_files and not output_files: 143 # not input/output so not using submit2 144 return self.submit(prog, argument, cwd, stdout, stderr, log, 145 required_output=required_output, nb_submit=nb_submit) 146 147 if cwd is None: 148 cwd = os.getcwd() 149 if not os.path.exists(prog): 150 prog = os.path.join(cwd, prog) 151 temp_file_name = "sub." + os.path.basename(prog) + '.'.join(argument) 152 153 text = """#!/bin/bash 154 MYTMP=%(tmpdir)s/run$%(job_id)s 155 MYPWD=%(cwd)s 156 mkdir -p $MYTMP 157 cd $MYPWD 158 input_files=( %(input_files)s ) 159 for i in ${input_files[@]} 160 do 161 cp -R -L $i $MYTMP 162 done 163 cd $MYTMP 164 echo '%(arguments)s' > arguments 165 chmod +x ./%(script)s 166 %(program)s ./%(script)s %(arguments)s 167 exit=$? 168 output_files=( %(output_files)s ) 169 for i in ${output_files[@]} 170 do 171 cp -r $MYTMP/$i $MYPWD 172 done 173 # if [ "$exit" -eq "0" ] 174 # then 175 rm -rf $MYTMP 176 # fi 177 """ 178 179 dico = {'tmpdir' : self.temp_dir, 'script': os.path.basename(prog), 180 'cwd': cwd, 'job_id': self.job_id, 181 'input_files': ' '.join(input_files + [prog]), 182 'output_files': ' '.join(output_files), 183 'arguments': ' '.join([str(a) for a in argument]), 184 'program': ' ' if '.py' in prog else 'bash'} 185 186 # writing a new script for the submission 187 new_prog = pjoin(cwd, temp_file_name) 188 open(new_prog, 'w').write(text % dico) 189 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 190 191 return self.submit(new_prog, argument, cwd, stdout, stderr, log, 192 required_output=required_output, nb_submit=nb_submit)
193 194
 195      def cluster_submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
 196                         log=None, input_files=[], output_files=[], required_output=[],
 197                         nb_submit=0, packet_member=None):
 198          """This function wraps the cluster submission in a cluster-independent
 199          way. It should not be overwritten (except for DAG type submission)"""
 200
 201          id = self.submit2(prog, argument, cwd, stdout, stderr, log, input_files,
 202                            output_files, required_output, nb_submit)
 203
 204
 205          if not packet_member:
 206              return id
 207          else:
 208              if isinstance(packet_member, Packet):
 209                  self.id_to_packet[id] = packet_member
 210                  packet_member.put(id)
 211                  if packet_member.tag not in self.packet:
 212                      self.packet[packet_member.tag] = packet_member
 213              else:
 214                  if packet_member in self.packet:
 215                      packet = self.packet[packet_member]
 216                      packet.put(id)
 217                      self.id_to_packet[id] = packet
 218              return id
219
 220      def control(self, me_dir=None):
 221          """Check the status of the jobs associated to the directory me_dir. Return (idle, run, finish, fail)"""
 222          if not self.submitted_ids:
 223              raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name
 224          idle, run, fail = 0, 0, 0
 225          for pid in self.submitted_ids[:]:
 226              status = self.control_one_job(pid)
 227              if status == 'I':
 228                  idle += 1
 229              elif status == 'R':
 230                  run += 1
 231              elif status == 'F':
 232                  self.finish += 1
 233                  self.submitted_ids.remove(pid)
 234              else:
 235                  fail += 1
 236
 237          return idle, run, self.finish, fail
238
 239      def control_one_job(self, pid):
 240          """ control the status of a single job with its cluster id """
 241          raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name
242
243 - def get_jobs_identifier(self, path, second_path=None):
244 """get a unique run_name for all the jobs helps to identify the runs 245 in the controller for some cluster.""" 246 247 if second_path: 248 path = os.path.realpath(pjoin(path, second_path)) 249 elif not os.path.exists(path): 250 return path # job already done 251 252 if 'SubProcesses' in path: 253 target = path.rsplit('/SubProcesses',1)[0] 254 elif 'MCatNLO' in path: 255 target = path.rsplit('/MCatNLO',1)[0] 256 elif second_path: 257 target=path 258 logger.warning("cluster.get_job_identifier runs unexpectedly. This should be fine but report this message if you have problem.") 259 else: 260 target = path 261 262 if target.endswith('/'): 263 target = target[:-1] 264 265 target = misc.digest(target)[-self.identifier_length:] 266 if not target[0].isalpha(): 267 target = 'a' + target[1:] 268 269 return target
270 271 272 @check_interupt()
273 - def wait(self, me_dir, fct, minimal_job=0, update_first=None):
274 """Wait that all job are finish. 275 if minimal_job set, then return if idle + run is lower than that number""" 276 277 278 mode = 1 # 0 is long waiting/ 1 is short waiting 279 nb_iter = 0 280 nb_short = 0 281 change_at = 5 # number of iteration from which we wait longer between update. 282 283 if update_first: 284 idle, run, finish, fail = self.control(me_dir) 285 update_first(idle, run, finish) 286 287 #usefull shortcut for readibility 288 longtime, shorttime = self.options['cluster_status_update'] 289 290 nb_job = 0 291 while 1: 292 old_mode = mode 293 nb_iter += 1 294 idle, run, finish, fail = self.control(me_dir) 295 if nb_job: 296 if idle + run + finish + fail != nb_job: 297 nb_job = idle + run + finish + fail 298 nb_iter = 1 # since some packet finish prevent to pass in long waiting mode 299 else: 300 nb_job = idle + run + finish + fail 301 if fail: 302 raise ClusterManagmentError('Some Jobs are in a Hold/... state. Please try to investigate or contact the IT team') 303 if idle + run == 0: 304 #time.sleep(20) #security to ensure that the file are really written on the disk 305 logger.info('All jobs finished') 306 fct(idle, run, finish) 307 break 308 if idle + run < minimal_job: 309 return 310 fct(idle, run, finish) 311 #Determine how much we have to wait (mode=0->long time, mode=1->short time) 312 if nb_iter < change_at: 313 mode = 1 314 elif idle < run: 315 if old_mode == 0: 316 if nb_short: 317 mode = 0 #we already be back from short to long so stay in long 318 #check if we need to go back to short mode 319 elif idle: 320 if nb_iter > change_at + int(longtime)//shorttime: 321 mode = 0 #stay in long waiting mode 322 else: 323 mode = 1 # pass in short waiting mode 324 nb_short =0 325 else: 326 mode = 1 # pass in short waiting mode 327 nb_short = 0 328 elif old_mode == 1: 329 nb_short +=1 330 if nb_short > 3* max(change_at, int(longtime)//shorttime): 331 mode = 0 #go back in slow waiting 332 else: 333 mode = 0 334 335 #if pass from fast(mode=1) to slow(mode=0) make a print statement: 336 if old_mode > mode: 337 logger.info('''Start to wait %ss between checking status. 338 Note that you can change this time in the configuration file. 339 Press ctrl-C to force the update.''' % self.options['cluster_status_update'][0]) 340 341 #now Waiting! 342 if mode == 0: 343 try: 344 time.sleep(self.options['cluster_status_update'][0]) 345 except KeyboardInterrupt: 346 logger.info('start to update the status') 347 nb_iter = min(0, change_at -2) 348 nb_short = 0 349 else: 350 time.sleep(self.options['cluster_status_update'][1]) 351 352 353 self.submitted = 0 354 self.submitted_ids = []
355
356 - def check_termination(self, job_id):
357 """Check the termination of the jobs with job_id and relaunch it if needed.""" 358 359 360 if job_id not in self.retry_args: 361 return True 362 363 args = self.retry_args[job_id] 364 if 'time_check' in args: 365 time_check = args['time_check'] 366 else: 367 time_check = 0 368 369 for path in args['required_output']: 370 if args['cwd']: 371 path = pjoin(args['cwd'], path) 372 # check that file exists and is not empty. 373 if not (os.path.exists(path) and os.stat(path).st_size != 0) : 374 break 375 else: 376 # all requested output are present 377 if time_check > 0: 378 logger.info('Job %s Finally found the missing output.' % (job_id)) 379 del self.retry_args[job_id] 380 self.submitted_ids.remove(job_id) 381 # check if the job_id is in a packet 382 if job_id in self.id_to_packet: 383 nb_in_packet = self.id_to_packet[job_id].remove_one() 384 if nb_in_packet == 0: 385 # packet done run the associate function 386 packet = self.id_to_packet[job_id] 387 # fully ensure that the packet is finished (thread safe) 388 packet.queue.join() 389 #running the function 390 packet.fct(*packet.args) 391 del self.id_to_packet[job_id] 392 return 'resubmit' 393 394 return 'done' 395 396 if time_check == 0: 397 logger.debug('''Job %s: missing output:%s''' % (job_id,path)) 398 args['time_check'] = time.time() 399 return 'wait' 400 elif self.cluster_retry_wait > time.time() - time_check: 401 return 'wait' 402 403 #jobs failed to be completed even after waiting time!! 404 if self.nb_retry < 0: 405 logger.critical('''Fail to run correctly job %s. 406 with option: %s 407 file missing: %s''' % (job_id, args, path)) 408 raw_input('press enter to continue.') 409 elif self.nb_retry == 0: 410 logger.critical('''Fail to run correctly job %s. 411 with option: %s 412 file missing: %s. 413 Stopping all runs.''' % (job_id, args, path)) 414 self.remove() 415 elif args['nb_submit'] >= self.nb_retry: 416 logger.critical('''Fail to run correctly job %s. 417 with option: %s 418 file missing: %s 419 Fails %s times 420 No resubmition. ''' % (job_id, args, path, args['nb_submit'])) 421 self.remove() 422 else: 423 args['nb_submit'] += 1 424 logger.warning('resubmit job (for the %s times)' % args['nb_submit']) 425 del self.retry_args[job_id] 426 self.submitted_ids.remove(job_id) 427 if 'time_check' in args: 428 del args['time_check'] 429 if job_id in self.id_to_packet: 430 self.id_to_packet[job_id].remove_one() 431 args['packet_member'] = self.id_to_packet[job_id] 432 del self.id_to_packet[job_id] 433 self.cluster_submit(**args) 434 else: 435 self.submit2(**args) 436 return 'resubmit' 437 return 'done'
438 439 @check_interupt()
440 - def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None, 441 stderr=None, log=None, required_output=[], nb_submit=0, 442 input_files=[], output_files=[]):
443 """launch one job on the cluster and wait for it""" 444 445 special_output = False # tag for concatenate the error with the output. 446 if stderr == -2 and stdout: 447 #We are suppose to send the output to stdout 448 special_output = True 449 stderr = stdout + '.err' 450 451 id = self.submit2(prog, argument, cwd, stdout, stderr, log, 452 required_output=required_output, input_files=input_files, 453 output_files=output_files) 454 455 frame = inspect.currentframe() 456 args, _, _, values = inspect.getargvalues(frame) 457 args = dict([(i, values[i]) for i in args if i != 'self']) 458 self.retry_args[id] = args 459 460 nb_wait=0 461 while 1: 462 nb_wait+=1 463 status = self.control_one_job(id) 464 if not status in ['R','I']: 465 status = self.check_termination(id) 466 if status in ['wait']: 467 time.sleep(30) 468 continue 469 elif status in ['resubmit']: 470 id = self.submitted_ids[0] 471 time.sleep(30) 472 continue 473 #really stop! 474 time.sleep(30) #security to ensure that the file are really written on the disk 475 break 476 time.sleep(self.options['cluster_status_update'][1]) 477 478 if required_output: 479 status = self.check_termination(id) 480 if status == 'wait': 481 run += 1 482 elif status == 'resubmit': 483 idle += 1 484 485 486 if special_output: 487 # combine the stdout and the stderr 488 #wait up to 50 s to see if those files exists 489 for i in range(5): 490 if os.path.exists(stdout): 491 if not os.path.exists(stderr): 492 time.sleep(5) 493 if os.path.exists(stderr): 494 err_text = open(stderr).read() 495 if not err_text: 496 return 497 logger.warning(err_text) 498 text = open(stdout).read() 499 open(stdout,'w').write(text + err_text) 500 else: 501 return 502 time.sleep(10)
503
 504      def remove(self, *args, **opts):
 505          """ """
 506          logger.warning("""This cluster does not support job removal,
 507      the jobs are still running on the cluster.""")
508
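
The base class above fixes the interface that every scheduler back-end implements: submit() sends one job and returns the scheduler id, control_one_job() and control() map the scheduler status codes onto 'I', 'R' and 'F', and remove() kills the remaining jobs, while submit2(), wait() and check_termination() provide the shared bookkeeping and retry machinery. A hedged sketch of the minimal hooks a new back-end would provide; EchoCluster is hypothetical and blocks inside submit(), which a real back-end would not do:

    import subprocess

    class EchoCluster(Cluster):
        """toy back-end: runs the job immediately and reports it as finished"""
        name = 'echo'

        def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                   log=None, required_output=[], nb_submit=0):
            proc = subprocess.Popen([prog] + [str(a) for a in argument], cwd=cwd)
            proc.wait()
            job_id = str(proc.pid)
            self.submitted += 1
            self.submitted_ids.append(job_id)
            return job_id

        def control_one_job(self, pid):
            return 'F'

        def remove(self, *args, **opts):
            self.submitted_ids = []
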
 509  class Packet(object):
 510      """ an object for handling a packet of jobs; it is designed to be thread safe
 511      """
 512
 513      def __init__(self, name, fct, args, opts={}):
 514          import Queue
 515          import threading
 516          self.queue = Queue.Queue()
 517          self.tag = name
 518          self.fct = fct
 519          self.args = args
 520          self.opts = opts
 521          self.done = threading.Event()
522
 523      def put(self, *args, **opts):
 524          self.queue.put(*args, **opts)
 525
 526      append = put
 527
 528      def remove_one(self):
 529          self.queue.get(True)
 530          self.queue.task_done()
 531          return self.queue.qsize()
532
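
A Packet ties a group of job ids to a callback: cluster_submit() (defined above) registers every id submitted with the same packet_member, check_termination() and MultiCore.wait() call remove_one() as jobs finish, and once the internal queue is empty the stored function is invoked with its arguments. A usage sketch; the cluster instance, script name and paths are placeholders:

    # my_cluster is any configured Cluster subclass (e.g. one of the back-ends below)
    def combine_results(run_dir):
        # called exactly once, after the last job of the packet has terminated
        print('all jobs in %s are done, combining results' % run_dir)

    packet = Packet('channel_P1', combine_results, ('/path/to/P1',))
    for card in ['input_1.dat', 'input_2.dat', 'input_3.dat']:
        my_cluster.cluster_submit('./ajob1', argument=[card], cwd='/path/to/P1',
                                  packet_member=packet)
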
 533  class MultiCore(Cluster):
 534      """class for dealing with the submission on a multicore machine"""
 535
 536      job_id = "$"
 537
538 - def __init__(self, *args, **opt):
539 """Init the cluster """ 540 541 542 super(MultiCore, self).__init__(self, *args, **opt) 543 544 import Queue 545 import threading 546 import thread 547 self.queue = Queue.Queue() # list of job to do 548 self.done = Queue.Queue() # list of job finisned 549 self.submitted = Queue.Queue() # one entry by job submitted 550 self.stoprequest = threading.Event() #flag to ensure everything to close 551 self.demons = [] 552 self.nb_done =0 553 if 'nb_core' in opt: 554 self.nb_core = opt['nb_core'] 555 elif isinstance(args[0],int): 556 self.nb_core = args[0] 557 else: 558 self.nb_core = 1 559 self.update_fct = None 560 561 self.lock = threading.Event() # allow nice lock of the main thread 562 self.pids = Queue.Queue() # allow to clean jobs submit via subprocess 563 self.done_pid = [] # list of job finisned 564 self.done_pid_queue = Queue.Queue() 565 self.fail_msg = None 566 567 # starting the worker node 568 for _ in range(self.nb_core): 569 self.start_demon()
570 571
 572      def start_demon(self):
 573          import threading
 574          t = threading.Thread(target=self.worker)
 575          t.daemon = True
 576          t.start()
 577          self.demons.append(t)
578 579
580 - def worker(self):
581 import Queue 582 import thread 583 while not self.stoprequest.isSet(): 584 try: 585 args = self.queue.get() 586 tag, exe, arg, opt = args 587 try: 588 # check for executable case 589 if isinstance(exe,str): 590 if os.path.exists(exe) and not exe.startswith('/'): 591 exe = './' + exe 592 if opt['stderr'] == None: 593 opt['stderr'] = subprocess.STDOUT 594 proc = misc.Popen([exe] + arg, **opt) 595 pid = proc.pid 596 self.pids.put(pid) 597 proc.wait() 598 if proc.returncode not in [0, 143, -15] and not self.stoprequest.isSet(): 599 fail_msg = 'program %s launch ends with non zero status: %s. Stop all computation' % \ 600 (' '.join([exe]+arg), proc.returncode) 601 logger.warning(fail_msg) 602 self.stoprequest.set() 603 self.remove(fail_msg) 604 # handle the case when this is a python function. Note that 605 # this use Thread so they are NO built-in parralelization this is 606 # going to work on a single core! (but this is fine for IO intensive 607 # function. for CPU intensive fct this will slow down the computation 608 else: 609 pid = tag 610 self.pids.put(pid) 611 # the function should return 0 if everything is fine 612 # the error message otherwise 613 returncode = exe(*arg, **opt) 614 if returncode != 0: 615 logger.warning("fct %s does not return 0. Starts to stop the code in a clean way.", exe) 616 self.stoprequest.set() 617 self.remove("fct %s does not return 0:\n %s" % (exe, returncode)) 618 except Exception,error: 619 self.fail_msg = sys.exc_info() 620 logger.warning(str(error)) 621 self.stoprequest.set() 622 self.remove(error) 623 624 if __debug__: 625 raise self.fail_msg[0], self.fail_msg[1],self.fail_msg[2] 626 627 self.queue.task_done() 628 self.done.put(tag) 629 self.done_pid_queue.put(pid) 630 #release the mother to print the status on the screen 631 try: 632 self.lock.set() 633 except thread.error: 634 continue 635 except Queue.Empty: 636 continue
637 638 639 640
 641      def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
 642                 log=None, required_output=[], nb_submit=0):
 643          """submit a job on multicore machine"""
 644
 645          tag = (prog, tuple(argument), cwd, nb_submit)
 646          if isinstance(prog, str):
 647
 648
 649              opt = {'cwd': cwd,
 650                     'stdout': stdout,
 651                     'stderr': stderr}
 652              self.queue.put((tag, prog, argument, opt))
 653              self.submitted.put(1)
 654              return tag
 655          else:
 656              # python function
 657              self.queue.put((tag, prog, argument, {}))
 658              self.submitted.put(1)
 659              return tag
660
 661      def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None,
 662                          stderr=None, log=None, **opts):
 663          """launch one job and wait for it"""
 664          if isinstance(stdout, str):
 665              stdout = open(stdout, 'w')
 666          if isinstance(stderr, str):
 667              stderr = open(stderr, 'w')
 668          return misc.call([prog] + argument, stdout=stdout, stderr=stderr, cwd=cwd)
669
670 - def remove(self, error=None):
671 """Ensure that all thread are killed""" 672 673 # ensure the worker to stop 674 self.stoprequest.set() 675 if error and not self.fail_msg: 676 self.fail_msg = error 677 678 # cleaning the queue done_pid_queue and move them to done_pid 679 while not self.done_pid_queue.empty(): 680 pid = self.done_pid_queue.get() 681 self.done_pid.append(pid) 682 # self.done_pid_queue.task_done() 683 684 while not self.pids.empty(): 685 pid = self.pids.get() 686 self.pids.task_done() 687 if isinstance(pid, tuple): 688 continue 689 if pid in self.done_pid: 690 continue 691 out = os.system('CPIDS=$(pgrep -P %(pid)s); kill -15 $CPIDS > /dev/null 2>&1' \ 692 % {'pid':pid} ) 693 out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid':pid} )
694 695
696 - def wait(self, me_dir, update_status, update_first=None):
697 """Waiting that all the jobs are done. This function also control that 698 the submission by packet are handle correctly (i.e. submit the function)""" 699 700 import Queue 701 import threading 702 703 try: # to catch KeyBoardInterupt to see which kind of error to display 704 last_status = (0, 0, 0) 705 sleep_time = 1 706 use_lock = True 707 first = True 708 while True: 709 force_one_more_loop = False # some security 710 711 # Loop over the job tagged as done to check if some packet of jobs 712 # are finished in case, put the associate function in the queue 713 while self.done.qsize(): 714 try: 715 tag = self.done.get(True, 1) 716 except Queue.Empty: 717 pass 718 else: 719 if self.id_to_packet and tuple(tag) in self.id_to_packet: 720 packet = self.id_to_packet[tuple(tag)] 721 remaining = packet.remove_one() 722 if remaining == 0: 723 # fully ensure that the packet is finished (thread safe) 724 packet.queue.join() 725 self.submit(packet.fct, packet.args) 726 force_one_more_loop = True 727 self.nb_done += 1 728 self.done.task_done() 729 730 # Get from the various queue the Idle/Done/Running information 731 # Those variable should be thread safe but approximate. 732 Idle = self.queue.qsize() 733 Done = self.nb_done + self.done.qsize() 734 Running = max(0, self.submitted.qsize() - Idle - Done) 735 736 if Idle + Running <= 0 and not force_one_more_loop: 737 update_status(Idle, Running, Done) 738 # Going the quit since everything is done 739 # Fully Ensure that everything is indeed done. 740 self.queue.join() 741 break 742 743 if (Idle, Running, Done) != last_status: 744 if first and update_first: 745 update_first(Idle, Running, Done) 746 first = False 747 else: 748 update_status(Idle, Running, Done) 749 last_status = (Idle, Running, Done) 750 751 # cleaning the queue done_pid_queue and move them to done_pid 752 while not self.done_pid_queue.empty(): 753 pid = self.done_pid_queue.get() 754 self.done_pid.append(pid) 755 self.done_pid_queue.task_done() 756 757 758 # Define how to wait for the next iteration 759 if use_lock: 760 # simply wait that a worker release the lock 761 use_lock = self.lock.wait(300) 762 self.lock.clear() 763 if not use_lock and Idle > 0: 764 use_lock = True 765 else: 766 # to be sure that we will never fully lock at the end pass to 767 # a simple time.sleep() 768 time.sleep(sleep_time) 769 sleep_time = min(sleep_time + 2, 180) 770 if update_first: 771 update_first(Idle, Running, Done) 772 773 if self.stoprequest.isSet(): 774 if isinstance(self.fail_msg, Exception): 775 raise self.fail_msg 776 elif isinstance(self.fail_msg, str): 777 raise Exception, self.fail_msg 778 else: 779 raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2] 780 # reset variable for next submission 781 try: 782 self.lock.clear() 783 except Exception: 784 pass 785 self.done = Queue.Queue() 786 self.done_pid = [] 787 self.done_pid_queue = Queue.Queue() 788 self.nb_done = 0 789 self.submitted = Queue.Queue() 790 self.pids = Queue.Queue() 791 self.stoprequest.clear() 792 793 except KeyboardInterrupt: 794 # if one of the node fails -> return that error 795 if isinstance(self.fail_msg, Exception): 796 raise self.fail_msg 797 elif isinstance(self.fail_msg, str): 798 raise Exception, self.fail_msg 799 elif self.fail_msg: 800 raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2] 801 # else return orignal error 802 raise
803
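
MultiCore reproduces the cluster interface on the local machine: submit() only queues a tag, the daemon threads started in __init__() pop the queue and run either a shell command through misc.Popen or a plain Python callable, and wait() blocks until the submitted and done counters match. A minimal usage sketch; the status callback is ad hoc and the command output goes to the terminal:

    def print_status(idle, running, done):
        print('idle %s  running %s  done %s' % (idle, running, done))

    mc = MultiCore(nb_core=2)
    for i in range(4):
        mc.submit('/bin/echo', argument=['job %s' % i])
    mc.wait(None, print_status)    # me_dir is not used by MultiCore.wait
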
 804  class CondorCluster(Cluster):
 805      """Basic class for dealing with cluster submission"""
 806
 807      name = 'condor'
 808      job_id = 'CONDOR_ID'
 809
 810
 811
 812      @multiple_try()
813 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 814 required_output=[], nb_submit=0):
815 """Submit a job prog to a Condor cluster""" 816 817 text = """Executable = %(prog)s 818 output = %(stdout)s 819 error = %(stderr)s 820 log = %(log)s 821 %(argument)s 822 environment = CONDOR_ID=$(Cluster).$(Process) 823 Universe = vanilla 824 notification = Error 825 Initialdir = %(cwd)s 826 %(requirement)s 827 getenv=True 828 queue 1 829 """ 830 831 if self.cluster_queue not in ['None', None]: 832 requirement = 'Requirements = %s=?=True' % self.cluster_queue 833 else: 834 requirement = '' 835 836 if cwd is None: 837 cwd = os.getcwd() 838 if stdout is None: 839 stdout = '/dev/null' 840 if stderr is None: 841 stderr = '/dev/null' 842 if log is None: 843 log = '/dev/null' 844 if not os.path.exists(prog): 845 prog = os.path.join(cwd, prog) 846 if argument: 847 argument = 'Arguments = %s' % ' '.join(argument) 848 else: 849 argument = '' 850 851 852 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout, 853 'stderr': stderr,'log': log,'argument': argument, 854 'requirement': requirement} 855 856 #open('submit_condor','w').write(text % dico) 857 a = misc.Popen(['condor_submit'], stdout=subprocess.PIPE, 858 stdin=subprocess.PIPE) 859 output, _ = a.communicate(text % dico) 860 #output = a.stdout.read() 861 #Submitting job(s). 862 #Logging submit event(s). 863 #1 job(s) submitted to cluster 2253622. 864 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE) 865 try: 866 id = pat.search(output).groups()[0] 867 except: 868 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 869 % output 870 self.submitted += 1 871 self.submitted_ids.append(id) 872 return id
873 874 @store_input() 875 @multiple_try()
876 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 877 log=None, input_files=[], output_files=[], required_output=[], 878 nb_submit=0):
879 """Submit the job on the cluster NO SHARE DISK 880 input/output file should be give relative to cwd 881 """ 882 883 if not required_output and output_files: 884 required_output = output_files 885 886 if (input_files == [] == output_files): 887 return self.submit(prog, argument, cwd, stdout, stderr, log, 888 required_output=required_output, nb_submit=nb_submit) 889 890 text = """Executable = %(prog)s 891 output = %(stdout)s 892 error = %(stderr)s 893 log = %(log)s 894 %(argument)s 895 should_transfer_files = YES 896 when_to_transfer_output = ON_EXIT 897 transfer_input_files = %(input_files)s 898 %(output_files)s 899 Universe = vanilla 900 notification = Error 901 Initialdir = %(cwd)s 902 %(requirement)s 903 getenv=True 904 queue 1 905 """ 906 907 if self.cluster_queue not in ['None', None]: 908 requirement = 'Requirements = %s=?=True' % self.cluster_queue 909 else: 910 requirement = '' 911 912 if cwd is None: 913 cwd = os.getcwd() 914 if stdout is None: 915 stdout = '/dev/null' 916 if stderr is None: 917 stderr = '/dev/null' 918 if log is None: 919 log = '/dev/null' 920 if not os.path.exists(prog): 921 prog = os.path.join(cwd, prog) 922 if argument: 923 argument = 'Arguments = %s' % ' '.join([str(a) for a in argument]) 924 else: 925 argument = '' 926 # input/output file treatment 927 if input_files: 928 input_files = ','.join(input_files) 929 else: 930 input_files = '' 931 if output_files: 932 output_files = 'transfer_output_files = %s' % ','.join(output_files) 933 else: 934 output_files = '' 935 936 937 938 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout, 939 'stderr': stderr,'log': log,'argument': argument, 940 'requirement': requirement, 'input_files':input_files, 941 'output_files':output_files} 942 943 #open('submit_condor','w').write(text % dico) 944 a = subprocess.Popen(['condor_submit'], stdout=subprocess.PIPE, 945 stdin=subprocess.PIPE) 946 output, _ = a.communicate(text % dico) 947 #output = a.stdout.read() 948 #Submitting job(s). 949 #Logging submit event(s). 950 #1 job(s) submitted to cluster 2253622. 951 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE) 952 try: 953 id = pat.search(output).groups()[0] 954 except: 955 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 956 % output 957 self.submitted += 1 958 self.submitted_ids.append(id) 959 return id
960 961 962 963 964 965 @multiple_try(nb_try=10, sleep=10)
966 - def control_one_job(self, id):
967 """ control the status of a single job with it's cluster id """ 968 cmd = 'condor_q '+str(id)+" -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'" 969 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 970 stderr=subprocess.PIPE) 971 972 error = status.stderr.read() 973 if status.returncode or error: 974 raise ClusterManagmentError, 'condor_q returns error: %s' % error 975 976 return status.stdout.readline().strip()
977 978 @check_interupt() 979 @multiple_try(nb_try=10, sleep=10)
980 - def control(self, me_dir):
981 """ control the status of a single job with it's cluster id """ 982 983 if not self.submitted_ids: 984 return 0, 0, 0, 0 985 986 packet = 15000 987 idle, run, fail = 0, 0, 0 988 ongoing = [] 989 for i in range(1+(len(self.submitted_ids)-1)//packet): 990 start = i * packet 991 stop = (i+1) * packet 992 cmd = "condor_q " + ' '.join(self.submitted_ids[start:stop]) + \ 993 " -format \'%-2s\ ' \'ClusterId\' " + \ 994 " -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'" 995 996 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 997 stderr=subprocess.PIPE) 998 error = status.stderr.read() 999 if status.returncode or error: 1000 raise ClusterManagmentError, 'condor_q returns error: %s' % error 1001 1002 for line in status.stdout: 1003 id, status = line.strip().split() 1004 ongoing.append(int(id)) 1005 if status in ['I','U']: 1006 idle += 1 1007 elif status == 'R': 1008 run += 1 1009 elif status != 'C': 1010 fail += 1 1011 1012 for id in list(self.submitted_ids): 1013 if int(id) not in ongoing: 1014 status = self.check_termination(id) 1015 if status == 'wait': 1016 run += 1 1017 elif status == 'resubmit': 1018 idle += 1 1019 1020 return idle, run, self.submitted - (idle+run+fail), fail
1021
1022      @multiple_try()
1023      def remove(self, *args, **opts):
1024          """Clean the jobs on the cluster"""
1025
1026          if not self.submitted_ids:
1027              return
1028          cmd = "condor_rm %s" % ' '.join(self.submitted_ids)
1029
1030          status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1031          self.submitted_ids = []
1032
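
CondorCluster, like the other back-ends below, builds a submission file, pipes it to the scheduler command (condor_submit here) and parses the returned job id; control_one_job() and control() then translate the scheduler status codes into the 'I'/'R'/'F' convention used by Cluster.wait(). A usage sketch, assuming a working HTCondor pool and placeholder paths:

    import time

    cluster = CondorCluster(cluster_queue=None, cluster_nb_retry=1,
                            cluster_retry_wait=300)
    job_id = cluster.submit('./run.sh', argument=['0'], cwd='/path/to/workdir',
                            stdout='run.log')
    while cluster.control_one_job(job_id) in ('I', 'R'):
        time.sleep(30)
    print('job %s has left the condor queue' % job_id)
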
1033  class PBSCluster(Cluster):
1034      """Basic class for dealing with cluster submission"""
1035
1036      name = 'pbs'
1037      job_id = 'PBS_JOBID'
1038      idle_tag = ['Q']
1039      running_tag = ['T','E','R']
1040      complete_tag = ['C']
1041
1042      maximum_submited_jobs = 2500
1043
1044      @multiple_try()
1045 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1046 required_output=[], nb_submit=0):
1047 """Submit a job prog to a PBS cluster""" 1048 1049 me_dir = self.get_jobs_identifier(cwd, prog) 1050 1051 if len(self.submitted_ids) > self.maximum_submited_jobs: 1052 fct = lambda idle, run, finish: logger.info('Waiting for free slot: %s %s %s' % (idle, run, finish)) 1053 self.wait(me_dir, fct, self.maximum_submited_jobs) 1054 1055 1056 text = "" 1057 if cwd is None: 1058 cwd = os.getcwd() 1059 else: 1060 text = " cd %s;" % cwd 1061 if stdout is None: 1062 stdout = '/dev/null' 1063 if stderr is None: 1064 stderr = '/dev/null' 1065 elif stderr == -2: # -2 is subprocess.STDOUT 1066 stderr = stdout 1067 if log is None: 1068 log = '/dev/null' 1069 1070 if not os.path.isabs(prog): 1071 text += "./%s" % prog 1072 else: 1073 text+= prog 1074 1075 if argument: 1076 text += ' ' + ' '.join(argument) 1077 1078 command = ['qsub','-o', stdout, 1079 '-N', me_dir, 1080 '-e', stderr, 1081 '-V'] 1082 1083 if self.cluster_queue and self.cluster_queue != 'None': 1084 command.extend(['-q', self.cluster_queue]) 1085 1086 a = misc.Popen(command, stdout=subprocess.PIPE, 1087 stderr=subprocess.STDOUT, 1088 stdin=subprocess.PIPE, cwd=cwd) 1089 1090 output = a.communicate(text)[0] 1091 id = output.split('.')[0] 1092 if not id.isdigit() or a.returncode !=0: 1093 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1094 % output 1095 1096 self.submitted += 1 1097 self.submitted_ids.append(id) 1098 return id
1099 1100 @multiple_try()
1101 - def control_one_job(self, id):
1102 """ control the status of a single job with it's cluster id """ 1103 cmd = 'qstat '+str(id) 1104 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1105 stderr=subprocess.STDOUT) 1106 1107 for line in status.stdout: 1108 line = line.strip() 1109 if 'cannot connect to server' in line or 'cannot read reply' in line: 1110 raise ClusterManagmentError, 'server disconnected' 1111 if 'Unknown' in line: 1112 return 'F' 1113 elif line.startswith(str(id)): 1114 jobstatus = line.split()[4] 1115 else: 1116 jobstatus="" 1117 1118 if status.returncode != 0 and status.returncode is not None: 1119 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode 1120 if jobstatus in self.idle_tag: 1121 return 'I' 1122 elif jobstatus in self.running_tag: 1123 return 'R' 1124 return 'F'
1125 1126 1127 @multiple_try()
1128 - def control(self, me_dir):
1129 """ control the status of a single job with it's cluster id """ 1130 cmd = "qstat" 1131 status = misc.Popen([cmd], stdout=subprocess.PIPE) 1132 1133 me_dir = self.get_jobs_identifier(me_dir) 1134 1135 ongoing = [] 1136 1137 idle, run, fail = 0, 0, 0 1138 for line in status.stdout: 1139 if 'cannot connect to server' in line or 'cannot read reply' in line: 1140 raise ClusterManagmentError, 'server disconnected' 1141 if me_dir in line: 1142 ongoing.append(line.split()[0].split('.')[0]) 1143 status2 = line.split()[4] 1144 if status2 in self.idle_tag: 1145 idle += 1 1146 elif status2 in self.running_tag: 1147 run += 1 1148 elif status2 in self.complete_tag: 1149 if not self.check_termination(line.split()[0].split('.')[0]): 1150 idle += 1 1151 else: 1152 fail += 1 1153 1154 if status.returncode != 0 and status.returncode is not None: 1155 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode 1156 1157 for id in list(self.submitted_ids): 1158 if id not in ongoing: 1159 status2 = self.check_termination(id) 1160 if status2 == 'wait': 1161 run += 1 1162 elif status2 == 'resubmit': 1163 idle += 1 1164 1165 return idle, run, self.submitted - (idle+run+fail), fail
1166
1167      @multiple_try()
1168      def remove(self, *args, **opts):
1169          """Clean the jobs on the cluster"""
1170
1171          if not self.submitted_ids:
1172              return
1173          cmd = "qdel %s" % ' '.join(self.submitted_ids)
1174          status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1175          self.submitted_ids = []
1176
1177
1178  class SGECluster(Cluster):
1179      """Basic class for dealing with cluster submission"""
1180      # Class written by Arian Abrahantes.
1181
1182      name = 'sge'
1183      job_id = 'JOB_ID'
1184      idle_tag = ['qw', 'hqw','hRqw','w']
1185      running_tag = ['r','t','Rr','Rt']
1186      identifier_length = 10
1187
1188      def def_get_path(self, location):
1189          """replace string for path issues"""
1190          location = os.path.realpath(location)
1191          homePath = os.getenv("HOME")
1192          if homePath:
1193              location = location.replace(homePath, '$HOME')
1194          return location
1195 1196 @multiple_try()
1197 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1198 required_output=[], nb_submit=0):
1199 """Submit a job prog to an SGE cluster""" 1200 1201 me_dir = self.get_jobs_identifier(cwd, prog) 1202 1203 1204 if cwd is None: 1205 #cwd = os.getcwd() 1206 cwd = self.def_get_path(os.getcwd()) 1207 cwd1 = self.def_get_path(cwd) 1208 text = " cd %s;" % cwd1 1209 if stdout is None: 1210 stdout = '/dev/null' 1211 else: 1212 stdout = self.def_get_path(stdout) 1213 if stderr is None: 1214 stderr = '/dev/null' 1215 elif stderr == -2: # -2 is subprocess.STDOUT 1216 stderr = stdout 1217 else: 1218 stderr = self.def_get_path(stderr) 1219 1220 if log is None: 1221 log = '/dev/null' 1222 else: 1223 log = self.def_get_path(log) 1224 1225 text += prog 1226 if argument: 1227 text += ' ' + ' '.join(argument) 1228 1229 #if anything slips through argument 1230 #print "!=== inteded change ",text.replace('/srv/nfs','') 1231 #text = text.replace('/srv/nfs','') 1232 homePath = os.getenv("HOME") 1233 if homePath: 1234 text = text.replace(homePath,'$HOME') 1235 1236 logger.debug("!=== input %s" % text) 1237 logger.debug("!=== output %s" % stdout) 1238 logger.debug("!=== error %s" % stderr) 1239 logger.debug("!=== logs %s" % log) 1240 1241 command = ['qsub','-o', stdout, 1242 '-N', me_dir, 1243 '-e', stderr, 1244 '-V'] 1245 1246 if self.cluster_queue and self.cluster_queue != 'None': 1247 command.extend(['-q', self.cluster_queue]) 1248 1249 a = misc.Popen(command, stdout=subprocess.PIPE, 1250 stderr=subprocess.STDOUT, 1251 stdin=subprocess.PIPE, cwd=cwd) 1252 1253 output = a.communicate(text)[0] 1254 id = output.split(' ')[2] 1255 if not id.isdigit(): 1256 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1257 % output 1258 self.submitted += 1 1259 self.submitted_ids.append(id) 1260 logger.debug(output) 1261 1262 return id
1263 1264 @multiple_try()
1265 - def control_one_job(self, id):
1266 """ control the status of a single job with it's cluster id """ 1267 #cmd = 'qstat '+str(id) 1268 cmd = 'qstat ' 1269 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1270 for line in status.stdout: 1271 #print "!==",line 1272 #line = line.strip() 1273 #if 'Unknown' in line: 1274 # return 'F' 1275 #elif line.startswith(str(id)): 1276 # status = line.split()[4] 1277 if str(id) in line: 1278 status = line.split()[4] 1279 #print "!=status", status 1280 if status in self.idle_tag: 1281 return 'I' 1282 elif status in self.running_tag: 1283 return 'R' 1284 return 'F'
1285 1286 @multiple_try()
1287 - def control(self, me_dir):
1288 """ control the status of a single job with it's cluster id """ 1289 cmd = "qstat " 1290 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1291 1292 me_dir = self.get_jobs_identifier(me_dir) 1293 1294 finished = list(self.submitted_ids) 1295 1296 idle, run, fail = 0, 0, 0 1297 for line in status.stdout: 1298 if me_dir in line: 1299 id,_,_,_,status = line.split()[:5] 1300 if status in self.idle_tag: 1301 idle += 1 1302 finished.remove(id) 1303 elif status in self.running_tag: 1304 run += 1 1305 finished.remove(id) 1306 else: 1307 logger.debug(line) 1308 fail += 1 1309 finished.remove(id) 1310 1311 for id in finished: 1312 self.check_termination(id) 1313 1314 return idle, run, self.submitted - (idle+run+fail), fail
1315
1316
1317
1318      @multiple_try()
1319      def remove(self, *args, **opts):
1320          """Clean the jobs on the cluster"""
1321
1322          if not self.submitted_ids:
1323              return
1324          cmd = "qdel %s" % ' '.join(self.submitted_ids)
1325          status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1326          self.submitted_ids = []
1327
1328
1329  class LSFCluster(Cluster):
1330      """Basic class for dealing with cluster submission"""
1331
1332      name = 'lsf'
1333      job_id = 'LSB_JOBID'
1334
1335      @multiple_try()
1336 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1337 required_output=[], nb_submit=0):
1338 """Submit the job prog to an LSF cluster""" 1339 1340 1341 me_dir = self.get_jobs_identifier(cwd, prog) 1342 1343 text = "" 1344 command = ['bsub', '-C0', '-J', me_dir] 1345 if cwd is None: 1346 cwd = os.getcwd() 1347 else: 1348 text = " cd %s;" % cwd 1349 if stdout and isinstance(stdout, str): 1350 command.extend(['-o', stdout]) 1351 if stderr and isinstance(stdout, str): 1352 command.extend(['-e', stderr]) 1353 elif stderr == -2: # -2 is subprocess.STDOUT 1354 pass 1355 if log is None: 1356 log = '/dev/null' 1357 1358 text += prog 1359 if argument: 1360 text += ' ' + ' '.join(argument) 1361 1362 if self.cluster_queue and self.cluster_queue != 'None': 1363 command.extend(['-q', self.cluster_queue]) 1364 1365 a = misc.Popen(command, stdout=subprocess.PIPE, 1366 stderr=subprocess.STDOUT, 1367 stdin=subprocess.PIPE, cwd=cwd) 1368 1369 output = a.communicate(text)[0] 1370 #Job <nnnn> is submitted to default queue <normal>. 1371 try: 1372 id = output.split('>',1)[0].split('<')[1] 1373 except: 1374 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1375 % output 1376 if not id.isdigit(): 1377 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1378 % output 1379 self.submitted += 1 1380 self.submitted_ids.append(id) 1381 return id
1382 1383 1384 @multiple_try()
1385 - def control_one_job(self, id):
1386 """ control the status of a single job with it's cluster id """ 1387 1388 cmd = 'bjobs '+str(id) 1389 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1390 1391 for line in status.stdout: 1392 line = line.strip().upper() 1393 if 'JOBID' in line: 1394 continue 1395 elif str(id) not in line: 1396 continue 1397 status = line.split()[2] 1398 if status == 'RUN': 1399 return 'R' 1400 elif status == 'PEND': 1401 return 'I' 1402 elif status == 'DONE': 1403 return 'F' 1404 else: 1405 return 'H' 1406 return 'F'
1407 1408 @multiple_try()
1409 - def control(self, me_dir):
1410 """ control the status of a single job with it's cluster id """ 1411 1412 if not self.submitted_ids: 1413 return 0, 0, 0, 0 1414 1415 cmd = "bjobs " + ' '.join(self.submitted_ids) 1416 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1417 1418 jobstatus = {} 1419 for line in status.stdout: 1420 line = line.strip() 1421 if 'JOBID' in line: 1422 continue 1423 splitline = line.split() 1424 id = splitline[0] 1425 if id not in self.submitted_ids: 1426 continue 1427 jobstatus[id] = splitline[2] 1428 1429 idle, run, fail = 0, 0, 0 1430 for id in self.submitted_ids[:]: 1431 if id in jobstatus: 1432 status = jobstatus[id] 1433 else: 1434 status = 'MISSING' 1435 if status == 'RUN': 1436 run += 1 1437 elif status == 'PEND': 1438 idle += 1 1439 else: 1440 status = self.check_termination(id) 1441 if status == 'wait': 1442 run += 1 1443 elif status == 'resubmit': 1444 idle += 1 1445 1446 return idle, run, self.submitted - (idle+run+fail), fail
1447
1448      @multiple_try()
1449      def remove(self, *args, **opts):
1450          """Clean the jobs on the cluster"""
1451
1452          if not self.submitted_ids:
1453              return
1454          cmd = "bkill %s" % ' '.join(self.submitted_ids)
1455          status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1456          self.submitted_ids = []
1457
1458  class GECluster(Cluster):
1459      """Class for dealing with cluster submission on a GE cluster"""
1460
1461      name = 'ge'
1462      job_id = 'JOB_ID'
1463      idle_tag = ['qw']
1464      running_tag = ['r']
1465
1466      @multiple_try()
1467 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1468 required_output=[], nb_submit=0):
1469 """Submit a job prog to a GE cluster""" 1470 1471 text = "" 1472 if cwd is None: 1473 cwd = os.getcwd() 1474 else: 1475 text = " cd %s; bash " % cwd 1476 if stdout is None: 1477 stdout = os.path.join(cwd, "log.%s" % prog.split('/')[-1]) 1478 if stderr is None: 1479 stderr = os.path.join(cwd, "err.%s" % prog.split('/')[-1]) 1480 elif stderr == -2: # -2 is subprocess.STDOUT 1481 stderr = stdout 1482 if log is None: 1483 log = '/dev/null' 1484 1485 text += prog 1486 if argument: 1487 text += ' ' + ' '.join(argument) 1488 text += '\n' 1489 tmp_submit = os.path.join(cwd, 'tmp_submit') 1490 open(tmp_submit,'w').write(text) 1491 1492 a = misc.Popen(['qsub','-o', stdout, 1493 '-e', stderr, 1494 tmp_submit], 1495 stdout=subprocess.PIPE, 1496 stderr=subprocess.STDOUT, 1497 stdin=subprocess.PIPE, cwd=cwd) 1498 1499 output = a.communicate()[0] 1500 #Your job 874511 ("test.sh") has been submitted 1501 pat = re.compile("Your job (\d*) \(",re.MULTILINE) 1502 try: 1503 id = pat.search(output).groups()[0] 1504 except: 1505 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1506 % output 1507 self.submitted += 1 1508 self.submitted_ids.append(id) 1509 return id
1510 1511 @multiple_try()
1512 - def control_one_job(self, id):
1513 """ control the status of a single job with it's cluster id """ 1514 cmd = 'qstat | grep '+str(id) 1515 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1516 if not status: 1517 return 'F' 1518 #874516 0.00000 test.sh alwall qw 03/04/2012 22:30:35 1 1519 pat = re.compile("^(\d+)\s+[\d\.]+\s+[\w\d\.]+\s+[\w\d\.]+\s+(\w+)\s") 1520 stat = '' 1521 for line in status.stdout.read().split('\n'): 1522 if not line: 1523 continue 1524 line = line.strip() 1525 try: 1526 groups = pat.search(line).groups() 1527 except: 1528 raise ClusterManagmentError, 'bad syntax for stat: \n\"%s\"' % line 1529 if groups[0] != id: continue 1530 stat = groups[1] 1531 if not stat: 1532 return 'F' 1533 if stat in self.idle_tag: 1534 return 'I' 1535 if stat in self.running_tag: 1536 return 'R'
1537 1538 @multiple_try()
1539 - def control(self, me_dir=None):
1540 """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)""" 1541 if not self.submitted_ids: 1542 return 0, 0, 0, 0 1543 idle, run, fail = 0, 0, 0 1544 ongoing = [] 1545 for statusflag in ['p', 'r', 'sh']: 1546 cmd = 'qstat -s %s' % statusflag 1547 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1548 #874516 0.00000 test.sh alwall qw 03/04/2012 22:30:35 1 1549 pat = re.compile("^(\d+)") 1550 for line in status.stdout.read().split('\n'): 1551 line = line.strip() 1552 try: 1553 id = pat.search(line).groups()[0] 1554 except Exception: 1555 pass 1556 else: 1557 if id not in self.submitted_ids: 1558 continue 1559 ongoing.append(id) 1560 if statusflag == 'p': 1561 idle += 1 1562 if statusflag == 'r': 1563 run += 1 1564 if statusflag == 'sh': 1565 fail += 1 1566 for id in list(self.submitted_ids): 1567 if id not in ongoing: 1568 self.check_termination(id) 1569 #self.submitted_ids = ongoing 1570 1571 return idle, run, self.submitted - idle - run - fail, fail
1572
1573      @multiple_try()
1574      def remove(self, *args, **opts):
1575          """Clean the jobs on the cluster"""
1576
1577          if not self.submitted_ids:
1578              return
1579          cmd = "qdel %s" % ' '.join(self.submitted_ids)
1580          status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1581          self.submitted_ids = []
1582
1583  def asyncrone_launch(exe, cwd=None, stdout=None, argument=[], **opt):
1584      """start a computation and do not wait for it to finish.
1585      this function returns a lock which is locked as long as the job is
1586      running."""
1587
1588      mc = MultiCore(1)
1589      mc.submit(exe, argument, cwd, stdout, **opt)
1590      mc.need_waiting = True
1591      return mc.lock
1592
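
The returned lock is the threading.Event of the underlying MultiCore instance; the worker thread sets it once the job has finished, so the caller can poll it or block on it. For example (the sleep command and polling delay are arbitrary):

    import time

    lock = asyncrone_launch('/bin/sleep', argument=['2'])
    while not lock.is_set():
        time.sleep(1)          # do other work while the job runs
    print('background job finished')
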
1593
1594  class SLURMCluster(Cluster):
1595      """Basic class for dealing with cluster submission"""
1596
1597      name = 'slurm'
1598      job_id = 'SLURM_JOBID'
1599      idle_tag = ['Q','PD','S','CF']
1600      running_tag = ['R', 'CG']
1601      complete_tag = ['C']
1602      identification_length = 8
1603
1604      @multiple_try()
1605 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1606 required_output=[], nb_submit=0):
1607 """Submit a job prog to a SLURM cluster""" 1608 1609 me_dir = self.get_jobs_identifier(cwd, prog) 1610 1611 1612 if cwd is None: 1613 cwd = os.getcwd() 1614 if stdout is None: 1615 stdout = '/dev/null' 1616 if stderr is None: 1617 stderr = '/dev/null' 1618 elif stderr == -2: # -2 is subprocess.STDOUT 1619 stderr = stdout 1620 if log is None: 1621 log = '/dev/null' 1622 1623 command = ['sbatch', '-o', stdout, 1624 '-J', me_dir, 1625 '-e', stderr, prog] + argument 1626 1627 if self.cluster_queue and self.cluster_queue != 'None': 1628 command.insert(1, '-p') 1629 command.insert(2, self.cluster_queue) 1630 1631 a = misc.Popen(command, stdout=subprocess.PIPE, 1632 stderr=subprocess.STDOUT, 1633 stdin=subprocess.PIPE, cwd=cwd) 1634 1635 output = a.communicate() 1636 output_arr = output[0].split(' ') 1637 id = output_arr[3].rstrip() 1638 1639 if not id.isdigit(): 1640 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1641 1642 self.submitted += 1 1643 self.submitted_ids.append(id) 1644 return id
1645 1646 @multiple_try()
1647 - def control_one_job(self, id):
1648 """ control the status of a single job with it's cluster id """ 1649 cmd = 'squeue j'+str(id) 1650 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1651 stderr=open(os.devnull,'w')) 1652 1653 for line in status.stdout: 1654 line = line.strip() 1655 if 'Invalid' in line: 1656 return 'F' 1657 elif line.startswith(str(id)): 1658 status = line.split()[4] 1659 if status in self.idle_tag: 1660 return 'I' 1661 elif status in self.running_tag: 1662 return 'R' 1663 return 'F'
1664 1665 @multiple_try()
1666 - def control(self, me_dir):
1667 """ control the status of a single job with it's cluster id """ 1668 cmd = "squeue" 1669 status = misc.Popen([cmd], stdout=subprocess.PIPE) 1670 1671 me_dir = self.get_jobs_identifier(me_dir) 1672 1673 idle, run, fail = 0, 0, 0 1674 ongoing=[] 1675 for line in status.stdout: 1676 if me_dir in line: 1677 id, _, _,_ , status,_ = line.split(None,5) 1678 ongoing.append(id) 1679 if status in self.idle_tag: 1680 idle += 1 1681 elif status in self.running_tag: 1682 run += 1 1683 elif status in self.complete_tag: 1684 status = self.check_termination(id) 1685 if status == 'wait': 1686 run += 1 1687 elif status == 'resubmit': 1688 idle += 1 1689 else: 1690 fail += 1 1691 1692 #control other finished job 1693 for id in list(self.submitted_ids): 1694 if id not in ongoing: 1695 status = self.check_termination(id) 1696 if status == 'wait': 1697 run += 1 1698 elif status == 'resubmit': 1699 idle += 1 1700 1701 1702 return idle, run, self.submitted - (idle+run+fail), fail
1703
1704      @multiple_try()
1705      def remove(self, *args, **opts):
1706          """Clean the jobs on the cluster"""
1707
1708          if not self.submitted_ids:
1709              return
1710          cmd = "scancel %s" % ' '.join(self.submitted_ids)
1711          status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1712          self.submitted_ids = []
1713
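
All of the back-ends above are interchangeable behind the Cluster interface, so calling code can select one by name from the run options. An illustrative mapping (the real module may keep its own registry elsewhere; this one is only a sketch):

    _backends = {'condor': CondorCluster, 'pbs': PBSCluster, 'sge': SGECluster,
                 'lsf': LSFCluster, 'ge': GECluster, 'slurm': SLURMCluster}

    def make_cluster(name, **opts):
        # instantiate the scheduler back-end selected in the run options
        if name not in _backends:
            raise ClusterManagmentError('unknown cluster type: %s' % name)
        return _backends[name](**opts)

    scheduler = make_cluster('slurm', cluster_queue='None')
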
1714  class HTCaaSCluster(Cluster):
1715      """Class for dealing with cluster submission on a HTCaaS cluster using GPFS"""
1716
1717      name = 'htcaas'
1718      job_id = 'HTCAAS_JOBID'
1719
1720      @store_input()
1721      @multiple_try()
1722 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 1723 log=None, input_files=[], output_files=[], required_output=[], 1724 nb_submit=0):
1725 """Submit the HTCaaS job on the cluster with NO SHARE DISK 1726 input/output file should be give relative to cwd 1727 """ 1728 # To make workspace name(temp) 1729 if 'ajob' in prog: 1730 prog_num = prog.rsplit("ajob",1)[1] 1731 else: 1732 prog_num = '0' 1733 1734 cur_usr = os.getenv('USER') 1735 1736 if cwd is None: 1737 cwd = os.getcwd() 1738 1739 cwd_cp = cwd.rsplit("/",2) 1740 #print 'This is HTCaaS Mode' 1741 1742 if not stdout is None: 1743 print "stdout: %s" % stdout 1744 1745 if not os.path.exists(prog): 1746 prog = os.path.join(cwd, prog) 1747 1748 if not required_output and output_files: 1749 required_output = output_files 1750 1751 1752 if not 'combine' and not 'pythia' in prog : 1753 cwd_arg = cwd+"/arguments" 1754 temp = ' '.join([str(a) for a in argument]) 1755 arg_cmd="echo '"+temp+"' > " + cwd_arg 1756 #print arg_cmd 1757 #aa = misc.Popen([arg_cmd], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE) 1758 command = ['htcaas-mgjob-submit','-d',cwd,'-e',os.path.basename(prog)] 1759 if argument : 1760 command.extend(['-a ', '='.join([str(a) for a in argument])]) 1761 print command 1762 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1763 id = a.stdout.read().strip() 1764 1765 else: 1766 cwd_arg = cwd+"/arguments" 1767 temp = ' '.join([str(a) for a in argument]) 1768 #arg_cmd="echo '"+temp+"' > " + cwd_arg 1769 #print arg_cmd 1770 #aa = misc.Popen([arg_cmd], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE) 1771 #print os.path.basename(prog) 1772 temp_file_name = "sub." + os.path.basename(prog) 1773 text = """#!/bin/bash 1774 MYPWD=%(cwd)s 1775 cd $MYPWD 1776 input_files=(%(input_files)s ) 1777 for i in ${input_files[@]} 1778 do 1779 chmod -f +x $i 1780 done 1781 /bin/bash %(prog)s %(arguments)s > %(stdout)s 1782 """ 1783 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog, 1784 'arguments': ' '.join([str(a) for a in argument]), 1785 'program': ' ' if '.py' in prog else 'bash'} 1786 1787 # writing a new script for the submission 1788 new_prog = pjoin(cwd, temp_file_name) 1789 open(new_prog, 'w').write(text % dico) 1790 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 1791 command = ['htcaas-mgjob-submit','-d',cwd,'-e',temp_file_name] 1792 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1793 id = a.stdout.read().strip() 1794 1795 nb_try=0 1796 nb_limit=5 1797 if not id.isdigit() : 1798 print "[ID is not digit]:" + id 1799 1800 while not id.isdigit() : 1801 nb_try+=1 1802 print "[fail_retry]:"+ nb_try 1803 a=misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1804 id = a.stdout.read().strip() 1805 if nb_try > nb_limit : 1806 raise ClusterManagementError, 'fail to submit to the HTCaaS cluster: \n %s' % id 1807 break 1808 1809 self.submitted += 1 1810 self.submitted_ids.append(id) 1811 1812 return id
1813 1814 @multiple_try(nb_try=10, sleep=10)
1815 - def control_one_job(self, id):
1816 """ control the status of a single job with it's cluster id """ 1817 1818 if id == 0 : 1819 status_out ='C' 1820 else : 1821 cmd = 'htcaas-job-status -m '+str(id)+ " -s | grep Status " 1822 status = misc.Popen([cmd], shell=True,stdout=subprocess.PIPE, 1823 stderr=subprocess.PIPE) 1824 error = status.stderr.read() 1825 if status.returncode or error: 1826 raise ClusterManagmentError, 'htcaas-job-submit returns error: %s' % error 1827 status_out= status.stdout.read().strip() 1828 status_out= status_out.split(":",1)[1] 1829 if status_out == 'waiting': 1830 status_out='I' 1831 elif status_out == 'preparing' or status_out == 'running': 1832 status_out = 'R' 1833 elif status_out != 'done': 1834 status_out = 'F' 1835 elif status_out == 'done': 1836 status_out = 'C' 1837 1838 return status_out
1839 1840 @multiple_try(nb_try=15, sleep=1)
1841 - def control(self, me_dir):
1842 """ control the status of a single job with it's cluster id """ 1843 #print "HTCaaS2 Control" 1844 if not self.submitted_ids: 1845 return 0, 0, 0, 0 1846 1847 ongoing = [] 1848 idle, run, fail = 0, 0, 0 1849 1850 if id == 0 : 1851 return 0 , 0, 0, 0 1852 else : 1853 for i in range(len(self.submitted_ids)): 1854 ongoing.append(int(self.submitted_ids[i])) 1855 cmd = "htcaas-job-status -m " + self.submitted_ids[i] + " -s | grep Status " 1856 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 1857 status_out= status.stdout.read().strip() 1858 status_out= status_out.split(":",1)[1] 1859 if status_out == 'waiting': 1860 idle += 1 1861 elif status_out == 'preparing': 1862 run += 1 1863 elif status_out == 'running': 1864 run += 1 1865 elif status_out != 'done': 1866 fail += 1 1867 1868 if status_out != 'done': 1869 print "["+ self.submitted_ids[i] + "] " + status_out 1870 ''' 1871 for i in range(len(self.submitted_ids)): 1872 if int(self.submitted_ids[i]) not in ongoing: 1873 status = self.check_termination(int(self.submitted_ids[i])) 1874 if status = 'waiting': 1875 idle += 1 1876 elif status == 'resubmit': 1877 idle += 1 1878 elif status == 'failed': 1879 fail += 1 1880 ''' 1881 1882 return idle, run, self.submitted - (idle+run+fail), fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""

        if not self.submitted_ids:
            return
        for i in range(len(self.submitted_ids)):
            cmd = "htcaas-job-cancel -m %s" % self.submitted_ids[i]
            status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))
        self.submitted_ids = []


class HTCaaS2Cluster(Cluster):
    """Class for dealing with cluster submission on a HTCaaS cluster"""

    name = 'htcaas2'
    job_id = 'HTCAAS2_JOBID'

    @store_input()
    @multiple_try()
    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                log=None, input_files=[], output_files=[], required_output=[],
                nb_submit=0):
        """Submit the job on the cluster with no shared disk.
           Input/output files should be given relative to cwd.
        """
        # build a workspace name (temp)
        if 'ajob' in prog:
            prog_num = prog.rsplit("ajob", 1)[1]
        elif 'run_combine' in prog:
            prog_num = '0'
        else:
            prog_num = prog

        cur_usr = os.getenv('USER')

        # unique remote workspace directory for this submission
        import uuid
        dir = str(uuid.uuid4().hex)
        prog_dir = '_run%s' % prog_num
        prog_dir = dir + prog_dir

        if cwd is None:
            cwd = os.getcwd()

        cwd_cp = cwd.rsplit("/", 2)

        if stdout is None:
            stdout = '/dev/null'

        if not os.path.exists(prog):
            prog = os.path.join(cwd, prog)

        if not required_output and output_files:
            required_output = output_files

        if any('/' in str(a) for a in argument):
            temp_file_name = "sub." + os.path.basename(prog)
        else:
            temp_file_name = "sub." + os.path.basename(prog) + '.'.join([str(a) for a in argument])

        if 'combine' in prog or 'pythia' in prog:
            text = """#!/bin/bash
MYPWD=%(cwd)s
cd $MYPWD
script=%(script)s
input_files=(%(input_files)s )
if [ $# -ge 1 ]; then
    arg1=$1
else
    arg1=''
fi
args=' %(arguments)s'
for i in ${input_files[@]}; do
    if [[ "$i" == *$script* ]]; then
        script=$i
    fi
    chmod -f +x $i
done
/bin/bash ${script} ${args} > %(stdout)s
"""

        elif 'shower' in prog:
            text = """#!/bin/bash
MYPWD=%(cwd)s
cd $MYPWD
args=' %(arguments)s'
input_files=( %(input_files)s )
for i in ${input_files[@]}
do
    chmod -f +x $i
done
/bin/bash %(script)s ${args} > $MYPWD/done
"""

        else:
            # stage the input files into the remote HTCaaS workspace
            text = """#!/bin/bash
MYPWD=%(cwd)s
cd $MYPWD
input_files=( %(input_files)s )
for i in ${input_files[@]}
do
    if [[ $i != */*/* ]]; then
        i=$PWD"/"$i
    fi
    echo $i
    if [ -d $i ]; then
        htcaas-file-put -l $i -r /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/ -i %(cur_usr)s
    else
        htcaas-file-put -f $i -r /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/ -i %(cur_usr)s
    fi
done
"""

        dico = {'cur_usr': cur_usr, 'script': os.path.basename(prog),
                'cwd': cwd, 'job_id': self.job_id, 'prog_dir': prog_dir,
                'input_files': ' '.join(input_files + [prog]),
                'output_files': ' '.join(output_files), 'stdout': stdout,
                'arguments': ' '.join([str(a) for a in argument]),
                'program': ' ' if '.py' in prog else 'bash'}

        # writing a new script for the submission
        new_prog = pjoin(cwd, temp_file_name)
        open(new_prog, 'w').write(text % dico)
        misc.Popen(['chmod', '+x', new_prog], cwd=cwd)

        cmd1 = '/bin/bash ' + cwd + '/' + temp_file_name
        status1 = misc.Popen([cmd1], shell=True, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)

        if 'combine' not in prog and 'shower' not in prog and 'pythia' not in prog:

            cmd3 = """htcaas-mgjob-submit -d /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/ -e %(script)s %(arguments)s"""
            dico3 = {'cur_usr': cur_usr, 'script': os.path.basename(prog),
                     'arguments': ' ' if not argument else "-a " + '='.join([str(a) for a in argument]),
                     'prog_dir': prog_dir}
            status3 = misc.Popen([cmd3 % dico3], shell=True, stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
            id = status3.stdout.read().strip()

            # retry a few times if the returned id is not a valid (numeric) job id
            nb_try = 0
            nb_limit = 5
            while not id.isdigit():
                nb_try += 1
                a = misc.Popen([cmd3 % dico3], shell=True, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
                id = a.stdout.read().strip()
                if nb_try > nb_limit:
                    raise ClusterManagmentError, 'Fail to submit to the HTCaaS cluster: \n %s' % id

            # helper script run by control(): fetches the output files back and
            # echoes 'done' or 'running'
            temp_file_name2 = "sub." + id
            text2 = """#!/bin/bash
MYPWD=%(cwd)s
output_files=( %(output_files)s )
result=done
if [ ! -e ${MYPWD}/done.%(job_id)s ]; then
    for i in ${output_files[@]}
    do
        htcaas-file-get -l ${MYPWD}/$i -r /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/$i -i %(cur_usr)s
        chmod -Rf 777 ${MYPWD}/$i
    done
    for i in ${output_files[@]}; do
        if [[ -e ${MYPWD}/$i ]]; then
            result=done
        else
            result=running
            echo $result
            exit 0
        fi
    done
    echo $result
    touch ${MYPWD}/done.%(job_id)s
else
    for i in ${output_files[@]}; do
        if [ -e ${MYPWD}/$i ]; then
            result=done
        else
            rm -f ${MYPWD}/done.%(job_id)s
            result=running
            echo $result
            exit 0
        fi
    done
    echo $result
fi
"""
            dico2 = {'cur_usr': cur_usr, 'script': os.path.basename(prog),
                     'cwd': cwd, 'prog_dir': prog_dir,
                     'output_files': ' '.join(output_files), 'job_id': id,
                     'program': ' ' if '.py' in prog else 'bash'}

            homePath = os.getenv("HOME")
            outPath = homePath + "/MG5"

            new_prog2 = pjoin(outPath, temp_file_name2)
            open(new_prog2, 'w').write(text2 % dico2)
            misc.Popen(['chmod', '+x', new_prog2], cwd=cwd)

            self.submitted += 1
            self.submitted_ids.append(id)

        elif 'combine' in prog or 'shower' in prog or 'pythia' in prog:
            if '/dev/null' in stdout:
                stdout = ''

            # helper script run by control(): reports 'done' once the expected
            # output files exist
            temp_file_shower = "sub.out"
            text_shower = """#!/bin/bash
MYPWD=%(cwd)s
result=done
output_files=(%(output_files)s)
for i in ${output_files[@]}; do
    if [ -e $MYPWD/$i -o -e $i ]; then
        result=done
    else
        result=running
        echo $result
        exit 0
    fi
done
echo $result
"""
            dico_shower = {'cwd': cwd, 'output_files': ' '.join([stdout] + output_files),
                           'program': ' ' if '.py' in prog else 'bash'}
            homePath = os.getenv("HOME")
            outPath = homePath + "/MG5"
            new_prog_shower = pjoin(outPath, temp_file_shower)
            open(new_prog_shower, 'w').write(text_shower % dico_shower)
            misc.Popen(['chmod', '+x', new_prog_shower], cwd=cwd)

            id = '-1'
            self.submitted += 1
            self.submitted_ids.append(id)

        else:
            id = '-2'
            self.submitted += 1
            self.submitted_ids.append(id)

        return id

    @multiple_try(nb_try=10, sleep=10)
    def control_one_job(self, id):
        """ control the status of a single job with its cluster id """

        homePath = os.getenv("HOME")
        outPath = homePath + "/MG5"

        if id == '0' or id == '-2':
            status_out = 'C'
        elif id == '-1':
            cmd = '/bin/bash ' + outPath + '/sub.out'
            status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
            status_out = status.stdout.read().strip()
            print "[" + id + "] " + status_out
            if status_out == 'waiting':
                status_out = 'I'
            elif status_out == 'preparing' or status_out == 'running':
                status_out = 'R'
            elif status_out == 'done':
                status_out = 'C'
            else:
                status_out = 'F'
        else:
            cmd = 'htcaas-job-status -m ' + str(id) + " -s | grep Status "
            status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
            error = status.stderr.read()
            if status.returncode or error:
                raise ClusterManagmentError, 'htcaas-job-status returns error: %s' % error
            status_out = status.stdout.read().strip()
            status_out = status_out.split(":", 1)[1]
            print "[" + id + "] " + status_out
            if status_out == 'waiting':
                status_out = 'I'
            elif status_out == 'preparing' or status_out == 'running':
                status_out = 'R'
            elif status_out == 'failed':
                # resubmit the job and report it as idle again
                args = self.retry_args[id]
                id_temp = self.submit2(**args)
                del self.retry_args[id]
                self.submitted_ids.remove(id)
                status_out = 'I'
            elif status_out == 'done':
                status_out = 'C'
            else:
                status_out = 'F'

        return status_out


    @check_interupt()
    @multiple_try(nb_try=15, sleep=10)
    def control(self, me_dir):
        """ control the status of all submitted jobs """

        if not self.submitted_ids:
            return 0, 0, 0, 0

        ongoing = []
        idle, run, fail = 0, 0, 0

        homePath = os.getenv("HOME")
        outPath = homePath + "/MG5"

        for i in range(len(self.submitted_ids)):
            ongoing.append(self.submitted_ids[i])
            if self.submitted_ids[i] == '-2':
                return 0, 0, 0, 0
            if self.submitted_ids[i] == '0':
                status_out = 'done'
            elif self.submitted_ids[i] == '-1':
                cmd = '/bin/bash ' + outPath + '/sub.out'
                status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
                status_out = status.stdout.read().strip()
                if status_out == 'waiting':
                    idle += 1
                elif status_out == 'preparing' or status_out == 'running':
                    run += 1
                elif status_out != 'done':
                    fail += 1
            else:
                args = self.retry_args[str(self.submitted_ids[i])]
                if 'required_output' in args and not args['required_output']:
                    args['required_output'] = args['output_files']
                    self.retry_args[str(self.submitted_ids[i])] = args

                cmd = "htcaas-job-status -m " + self.submitted_ids[i] + " -s | grep Status "
                status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
                status_out = status.stdout.read().strip()
                status_out = status_out.split(":", 1)[1]
                if status_out == 'waiting':
                    idle += 1
                elif status_out == 'preparing' or status_out == 'running':
                    run += 1
                elif status_out == 'failed' or status_out == 'canceled':
                    # resubmit the job and forget about the old id
                    id = self.submit2(**args)
                    del self.retry_args[self.submitted_ids[i]]
                    self.submitted_ids.remove(self.submitted_ids[i])
                    self.submitted -= 1
                    idle += 1
                elif status_out != 'done':
                    fail += 1

                if status_out == 'done':
                    # run the sub.<id> helper script: it fetches the output
                    # files back and echoes 'done' or 'running'
                    cmd2 = '/bin/bash ' + outPath + '/sub.' + self.submitted_ids[i]
                    status2 = misc.Popen([cmd2], shell=True, stdout=subprocess.PIPE,
                                         stderr=subprocess.PIPE)
                    aa = status2.stdout.read().strip()
                    for path in args['required_output']:
                        if args['cwd']:
                            path = pjoin(args['cwd'], path)
                        # check that the file exists and is not empty
                        if not (os.path.exists(path) and os.stat(path).st_size != 0):
                            status2 = misc.Popen([cmd2], shell=True, stdout=subprocess.PIPE,
                                                 stderr=subprocess.PIPE)
                            aa = status2.stdout.read().strip()
                            if aa == 'done':
                                self.submitted_ids[i] = '0'
                            elif aa == 'running':
                                run += 1
                        else:
                            self.submitted_ids[i] = '0'

        for i in range(len(self.submitted_ids)):
            if str(self.submitted_ids[i]) not in ongoing:
                status2 = self.check_termination(str(self.submitted_ids[i]))
                if status2 == 'wait':
                    run += 1
                elif status2 == 'resubmit':
                    idle += 1

        return idle, run, self.submitted - (idle + run + fail), fail

    @multiple_try()
    def remove(self, *args, **opts):
        """Clean the jobs on the cluster"""

        if not self.submitted_ids:
            return
        for i in range(len(self.submitted_ids)):
            cmd = "htcaas-job-cancel -m %s" % self.submitted_ids[i]
            status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))
        self.submitted_ids = []


from_name = {'condor': CondorCluster, 'pbs': PBSCluster, 'sge': SGECluster,
             'lsf': LSFCluster, 'ge': GECluster, 'slurm': SLURMCluster,
             'htcaas': HTCaaSCluster, 'htcaas2': HTCaaS2Cluster}
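

# Illustrative sketch (an assumption, not part of the original module): how the
# `from_name` mapping above might be used to pick a backend.  The helper name
# and the 'cluster_type' option key are examples only; they are not defined
# elsewhere in this file.
def _example_cluster_from_options(options):
    """Instantiate the cluster class registered under options['cluster_type'],
    forwarding the full options dictionary to its constructor."""
    cluster_name = options.get('cluster_type', 'condor')
    if cluster_name not in from_name:
        raise ClusterManagmentError, 'unknown cluster type: %s' % cluster_name
    return from_name[cluster_name](**options)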