
Source Code for Module madgraph.various.cluster

   1  ################################################################################ 
   2  # Copyright (c) 2009 The MadGraph5_aMC@NLO Development team and Contributors              
   3  # 
   4  # This file is a part of the MadGraph5_aMC@NLO project, an application which            
   5  # automatically generates Feynman diagrams and matrix elements for arbitrary     
   6  # high-energy processes in the Standard Model and beyond.                        
   7  # 
   8  # It is subject to the MadGraph5_aMC@NLO license which should accompany this              
   9  # distribution.                                                                  
  10  #                                                                                
  11  # For more information, visit madgraph.phys.ucl.ac.be and amcatnlo.web.cern.ch             
  12  #                                                                                
  13  ################################################################################ 
  14  import subprocess 
  15  import logging 
  16  import os 
  17  import time 
  18  import re 
  19  import glob 
  20  import inspect 
  21   
  22  logger = logging.getLogger('madgraph.cluster')  
  23   
  24  try: 
  25      from madgraph import MadGraph5Error 
  26      import madgraph.various.misc as misc 
  27  except Exception, error: 
  28      if __debug__: 
  29          print  str(error) 
  30      from internal import MadGraph5Error 
  31      import internal.misc as misc 
  32   
  33  pjoin = os.path.join 
34 35 -class ClusterManagmentError(MadGraph5Error):
36 pass
37
38 -class NotImplemented(MadGraph5Error):
39 pass
40 41 42 multiple_try = misc.multiple_try 43 pjoin = os.path.join
44 45 46 -def check_interupt(error=KeyboardInterrupt):
47 48 def deco_interupt(f): 49 def deco_f_interupt(self, *args, **opt): 50 try: 51 return f(self, *args, **opt) 52 except error: 53 try: 54 self.remove(*args, **opt) 55 except Exception: 56 pass 57 raise error
58 return deco_f_interupt 59 return deco_interupt 60
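# Illustrative sketch (editor's addition, not part of the MadGraph source):
# check_interupt wraps a Cluster method so that a KeyboardInterrupt first
# triggers a best-effort cleanup (self.remove) before being re-raised.
# The same pattern in isolation, with a dummy class standing in for a cluster:

def _check_interrupt_sketch(error=KeyboardInterrupt):
    def deco(f):
        def wrapper(self, *args, **opts):
            try:
                return f(self, *args, **opts)
            except error:
                try:
                    self.remove(*args, **opts)   # best-effort cleanup
                except Exception:
                    pass
                raise
        return wrapper
    return deco

class _DummyRunner(object):
    def remove(self, *args, **opts):
        print('cancelling submitted jobs')
    @_check_interrupt_sketch()
    def wait(self):
        raise KeyboardInterrupt              # simulate the user hitting ctrl-C

# _DummyRunner().wait() prints the cleanup message before the interrupt
# propagates to the caller.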
61 -def store_input(arg=''):
62 63 def deco_store(f): 64 def deco_f_store(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 65 input_files=[], output_files=[], required_output=[], nb_submit=0): 66 frame = inspect.currentframe() 67 args, _, _, values = inspect.getargvalues(frame) 68 args = dict([(i, values[i]) for i in args if i != 'self']) 69 id = f(self, **args) 70 if self.nb_retry > 0: 71 self.retry_args[id] = args 72 return id
73 return deco_f_store 74 return deco_store 75
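# Illustrative sketch (editor's addition, not part of the MadGraph source):
# store_input uses inspect.getargvalues to record the full keyword form of
# every submit2 call, so that a lost job can later be replayed with
# self.submit2(**args).  A minimal self-contained version of the idea:

import inspect

def _record_call_sketch(f):
    def wrapper(self, prog, argument=[], cwd=None):
        frame = inspect.currentframe()
        names, _, _, values = inspect.getargvalues(frame)
        call = dict((n, values[n]) for n in names if n != 'self')
        job_id = f(self, **call)
        self.recorded[job_id] = call         # keep the arguments for a retry
        return job_id
    return wrapper

class _FakeScheduler(object):
    def __init__(self):
        self.recorded = {}
        self.counter = 0
    @_record_call_sketch
    def submit(self, prog, argument=[], cwd=None):
        self.counter += 1
        return self.counter

# sched = _FakeScheduler()
# jid = sched.submit('run.sh', argument=['1', '2'], cwd='/tmp')
# sched.recorded[jid] -> {'prog': 'run.sh', 'argument': ['1', '2'], 'cwd': '/tmp'}
# and a lost job could be resubmitted with sched.submit(**sched.recorded[jid]).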
76 77 -class Cluster(object):
78 """Basic Class for all cluster type submission""" 79 name = 'mother class' 80 identifier_length = 14 81
82 - def __init__(self,*args, **opts):
83 """Init the cluster""" 84 85 self.submitted = 0 86 self.submitted_ids = [] 87 self.finish = 0 88 if 'cluster_queue' in opts: 89 self.cluster_queue = opts['cluster_queue'] 90 else: 91 self.cluster_queue = 'madgraph' 92 if 'cluster_temp_path' in opts: 93 self.temp_dir = opts['cluster_temp_path'] 94 else: 95 self.temp_dir = None 96 self.options = {'cluster_status_update': (600, 30)} 97 for key,value in opts.items(): 98 self.options[key] = value 99 self.nb_retry = opts['cluster_nb_retry'] if 'cluster_nb_retry' in opts else 0 100 self.cluster_retry_wait = opts['cluster_retry_wait'] if 'cluster_retry_wait' in opts else 300 101 self.options = dict(opts) 102 self.retry_args = {}
103 104
105 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 106 log=None, required_output=[], nb_submit=0):
107 """How to make one submission. Return status id on the cluster.""" 108 raise NotImplemented, 'No implementation of how to submit a job to cluster \'%s\'' % self.name
109 110 @store_input()
111 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 112 log=None, input_files=[], output_files=[], required_output=[],nb_submit=0):
113 """How to make one submission. Return status id on the cluster. 114 NO SHARE DISK""" 115 116 if cwd is None: 117 cwd = os.getcwd() 118 if not os.path.exists(prog): 119 prog = os.path.join(cwd, prog) 120 121 if not required_output and output_files: 122 required_output = output_files 123 124 if not hasattr(self, 'temp_dir') or not self.temp_dir or \ 125 (input_files == [] == output_files): 126 return self.submit(prog, argument, cwd, stdout, stderr, log, 127 required_output=required_output, nb_submit=nb_submit) 128 129 if not input_files and not output_files: 130 # not input/output so not using submit2 131 return self.submit(prog, argument, cwd, stdout, stderr, log, 132 required_output=required_output, nb_submit=nb_submit) 133 134 if cwd is None: 135 cwd = os.getcwd() 136 if not os.path.exists(prog): 137 prog = os.path.join(cwd, prog) 138 temp_file_name = "sub." + os.path.basename(prog) + '.'.join(argument) 139 140 text = """#!/bin/bash 141 MYTMP=%(tmpdir)s/run$%(job_id)s 142 MYPWD=%(cwd)s 143 mkdir -p $MYTMP 144 cd $MYPWD 145 input_files=( %(input_files)s ) 146 for i in ${input_files[@]} 147 do 148 cp -R -L $i $MYTMP 149 done 150 cd $MYTMP 151 echo '%(arguments)s' > arguments 152 chmod +x ./%(script)s 153 %(program)s ./%(script)s %(arguments)s 154 exit=$? 155 output_files=( %(output_files)s ) 156 for i in ${output_files[@]} 157 do 158 cp -r $MYTMP/$i $MYPWD 159 done 160 # if [ "$exit" -eq "0" ] 161 # then 162 rm -rf $MYTMP 163 # fi 164 """ 165 dico = {'tmpdir' : self.temp_dir, 'script': os.path.basename(prog), 166 'cwd': cwd, 'job_id': self.job_id, 167 'input_files': ' '.join(input_files + [prog]), 168 'output_files': ' '.join(output_files), 169 'arguments': ' '.join([str(a) for a in argument]), 170 'program': ' ' if '.py' in prog else 'bash'} 171 172 # writing a new script for the submission 173 new_prog = pjoin(cwd, temp_file_name) 174 open(new_prog, 'w').write(text % dico) 175 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 176 177 return self.submit(new_prog, argument, cwd, stdout, stderr, log, 178 required_output=required_output, nb_submit=nb_submit)
179 180
181 - def control(self, me_dir=None):
182 """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)""" 183 if not self.submitted_ids: 184 raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name 185 idle, run, fail = 0, 0, 0 186 for pid in self.submitted_ids[:]: 187 status = self.control_one_job(id) 188 if status == 'I': 189 idle += 1 190 elif status == 'R': 191 run += 1 192 elif status == 'F': 193 self.finish +=1 194 self.submitted_ids.remove(pid) 195 else: 196 fail += 1 197 198 return idle, run, self.finish, fail
199
200 - def control_one_job(self, pid):
201 """ control the status of a single job with it's cluster id """ 202 raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name
203
204 - def get_jobs_identifier(self, path, second_path=None):
205 """get a unique run_name for all the jobs helps to identify the runs 206 in the controller for some cluster.""" 207 208 if second_path: 209 path = os.path.realpath(pjoin(path, second_path)) 210 elif not os.path.exists(path): 211 return path # job already done 212 213 if 'SubProcesses' in path: 214 target = path.rsplit('/SubProcesses',1)[0] 215 elif 'MCatNLO' in path: 216 target = path.rsplit('/MCatNLO',1)[0] 217 elif second_path: 218 target=path 219 logger.warning("cluster.get_job_identifier runs unexpectedly. This should be fine but report this message if you have problem.") 220 else: 221 target = path 222 223 if target.endswith('/'): 224 target = target[:-1] 225 226 target = misc.digest(target)[-self.identifier_length:] 227 if not target[0].isalpha(): 228 target = 'a' + target[1:] 229 230 return target
231 232 233 @check_interupt()
234 - def wait(self, me_dir, fct, minimal_job=0):
235 """Wait that all job are finish. 236 if minimal_job set, then return if idle + run is lower than that number""" 237 238 239 mode = 1 # 0 is long waiting/ 1 is short waiting 240 nb_iter = 0 241 nb_short = 0 242 change_at = 5 # number of iteration from which we wait longer between update. 243 #usefull shortcut for readibility 244 longtime, shorttime = self.options['cluster_status_update'] 245 246 while 1: 247 old_mode = mode 248 nb_iter += 1 249 idle, run, finish, fail = self.control(me_dir) 250 if fail: 251 raise ClusterManagmentError('Some Jobs are in a Hold/... state. Please try to investigate or contact the IT team') 252 if idle + run == 0: 253 #time.sleep(20) #security to ensure that the file are really written on the disk 254 logger.info('All jobs finished') 255 break 256 if idle + run < minimal_job: 257 return 258 fct(idle, run, finish) 259 #Determine how much we have to wait (mode=0->long time, mode=1->short time) 260 if nb_iter < change_at: 261 mode = 1 262 elif idle < run: 263 if old_mode == 0: 264 if nb_short: 265 mode = 0 #we already be back from short to long so stay in long 266 #check if we need to go back to short mode 267 elif idle: 268 if nb_iter > change_at + int(longtime)//shorttime: 269 mode = 0 #stay in long waiting mode 270 else: 271 mode = 1 # pass in short waiting mode 272 nb_short =0 273 else: 274 mode = 1 # pass in short waiting mode 275 nb_short = 0 276 elif old_mode == 1: 277 nb_short +=1 278 if nb_short > 3* max(change_at, int(longtime)//shorttime): 279 mode = 0 #go back in slow waiting 280 else: 281 mode = 0 282 283 #if pass from fast(mode=1) to slow(mode=0) make a print statement: 284 if old_mode > mode: 285 logger.info('''Start to wait %ss between checking status. 286 Note that you can change this time in the configuration file. 287 Press ctrl-C to force the update.''' % self.options['cluster_status_update'][0]) 288 289 #now Waiting! 290 if mode == 0: 291 try: 292 time.sleep(self.options['cluster_status_update'][0]) 293 except KeyboardInterrupt: 294 logger.info('start to update the status') 295 nb_iter = min(0, change_at -2) 296 nb_short = 0 297 else: 298 time.sleep(self.options['cluster_status_update'][1]) 299 300 301 self.submitted = 0 302 self.submitted_ids = []
303
304 - def check_termination(self, job_id):
305 """Check the termination of the jobs with job_id and relaunch it if needed.""" 306 307 308 if job_id not in self.retry_args: 309 return True 310 311 args = self.retry_args[job_id] 312 if 'time_check' in args: 313 time_check = args['time_check'] 314 else: 315 time_check = 0 316 317 for path in args['required_output']: 318 if args['cwd']: 319 path = pjoin(args['cwd'], path) 320 # check that file exists and is not empty. 321 if not (os.path.exists(path) and os.stat(path).st_size != 0) : 322 break 323 else: 324 # all requested output are present 325 if time_check > 0: 326 logger.info('Job %s Finally found the missing output.' % (job_id)) 327 del self.retry_args[job_id] 328 self.submitted_ids.remove(job_id) 329 return 'done' 330 331 if time_check == 0: 332 logger.debug('''Job %s: missing output:%s''' % (job_id,path)) 333 args['time_check'] = time.time() 334 return 'wait' 335 elif self.cluster_retry_wait > time.time() - time_check: 336 return 'wait' 337 338 #jobs failed to be completed even after waiting time!! 339 if self.nb_retry < 0: 340 logger.critical('''Fail to run correctly job %s. 341 with option: %s 342 file missing: %s''' % (job_id, args, path)) 343 raw_input('press enter to continue.') 344 elif self.nb_retry == 0: 345 logger.critical('''Fail to run correctly job %s. 346 with option: %s 347 file missing: %s. 348 Stopping all runs.''' % (job_id, args, path)) 349 #self.remove() 350 elif args['nb_submit'] >= self.nb_retry: 351 logger.critical('''Fail to run correctly job %s. 352 with option: %s 353 file missing: %s 354 Fails %s times 355 No resubmition. ''' % (job_id, args, path, args['nb_submit'])) 356 #self.remove() 357 else: 358 args['nb_submit'] += 1 359 logger.warning('resubmit job (for the %s times)' % args['nb_submit']) 360 del self.retry_args[job_id] 361 self.submitted_ids.remove(job_id) 362 if 'time_check' in args: 363 del args['time_check'] 364 self.submit2(**args) 365 return 'resubmit' 366 return 'done'
367 368 369 370 @check_interupt()
371 - def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None, 372 stderr=None, log=None, required_output=[], nb_submit=0, 373 input_files=[], output_files=[]):
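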
374 """launch one job on the cluster and wait for it""" 375 376 special_output = False # tag for concatenate the error with the output. 377 if stderr == -2 and stdout: 378 #We are suppose to send the output to stdout 379 special_output = True 380 stderr = stdout + '.err' 381 382 id = self.submit2(prog, argument, cwd, stdout, stderr, log, 383 required_output=required_output, input_files=input_files, 384 output_files=output_files) 385 386 frame = inspect.currentframe() 387 args, _, _, values = inspect.getargvalues(frame) 388 args = dict([(i, values[i]) for i in args if i != 'self']) 389 self.retry_args[id] = args 390 391 nb_wait=0 392 while 1: 393 nb_wait+=1 394 status = self.control_one_job(id) 395 if not status in ['R','I']: 396 status = self.check_termination(id) 397 if status in ['wait']: 398 time.sleep(30) 399 continue 400 elif status in ['resubmit']: 401 id = self.submitted_ids[0] 402 time.sleep(30) 403 continue 404 #really stop! 405 time.sleep(30) #security to ensure that the file are really written on the disk 406 break 407 time.sleep(self.options['cluster_status_update'][1]) 408 409 if required_output: 410 status = self.check_termination(id) 411 if status == 'wait': 412 run += 1 413 elif status == 'resubmit': 414 idle += 1 415 416 417 if special_output: 418 # combine the stdout and the stderr 419 #wait up to 50 s to see if those files exists 420 for i in range(5): 421 if os.path.exists(stdout): 422 if not os.path.exists(stderr): 423 time.sleep(5) 424 if os.path.exists(stderr): 425 err_text = open(stderr).read() 426 if not err_text: 427 return 428 logger.warning(err_text) 429 text = open(stdout).read() 430 open(stdout,'w').write(text + err_text) 431 else: 432 return 433 time.sleep(10)
434
435 - def remove(self, *args, **opts):
436 """ """ 437 logger.warning("""This cluster didn't support job removal, 438 the jobs are still running on the cluster.""")
439
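# Illustrative sketch (editor's addition, hypothetical scheduler): a new
# batch system is supported by subclassing Cluster and overriding at least
# submit() and control_one_job(); wait(), the retry logic and the submit2()
# wrapping are inherited.  The 'mybatch-*' commands below are made up.

class _MyBatchClusterSketch(Cluster):
    name = 'mybatch'
    job_id = 'MYBATCH_JOBID'

    def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
               log=None, required_output=[], nb_submit=0):
        # run the (hypothetical) submission command and keep the returned id
        out = misc.Popen(['mybatch-submit', prog] + [str(a) for a in argument],
                         cwd=cwd, stdout=subprocess.PIPE).communicate()[0]
        job = out.strip()
        self.submitted += 1
        self.submitted_ids.append(job)
        return job

    def control_one_job(self, pid):
        # map the scheduler's own states onto the 'I'/'R'/'F' convention
        out = misc.Popen(['mybatch-status', str(pid)],
                         stdout=subprocess.PIPE).communicate()[0]
        return {'queued': 'I', 'running': 'R'}.get(out.strip(), 'F')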
440 -class MultiCore(Cluster):
441 """ class for dealing with the submission in multiple node""" 442 443 job_id = '$' 444
445 - def __init__(self, *args, **opt):
446 """Init the cluster""" 447 import thread 448 super(MultiCore, self).__init__(self, *args, **opt) 449 450 451 self.submitted = 0 452 self.finish = 0 453 if 'nb_core' in opt: 454 self.nb_core = opt['nb_core'] 455 elif isinstance(args[0],int): 456 self.nb_core = args[0] 457 else: 458 self.nb_core = 1 459 self.update_fct = None 460 461 # initialize the thread controler 462 self.need_waiting = False 463 self.nb_used = 0 464 self.lock = thread.allocate_lock() 465 self.done = 0 466 self.waiting_submission = [] 467 self.pids = [] 468 self.fail_msg = None
469
470 - def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None, 471 stderr=None, log=None, **opts):
472 """launch one job and wait for it""" 473 if isinstance(stdout, str): 474 stdout = open(stdout, 'w') 475 if isinstance(stderr, str): 476 stdout = open(stderr, 'w') 477 return misc.call([prog] + argument, stdout=stdout, stderr=stderr, cwd=cwd)
478 479
480 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 481 log=None, required_output=[], nb_submit=0):
482 """submit a job on multicore machine""" 483 484 self.submitted +=1 485 if cwd is None: 486 cwd = os.getcwd() 487 if isinstance(prog, str): 488 if not os.path.exists(prog) and not misc.which(prog): 489 prog = os.path.join(cwd, prog) 490 491 import thread 492 if self.waiting_submission or self.nb_used == self.nb_core: 493 self.waiting_submission.append((prog, argument,cwd, stdout)) 494 # check that none submission is already finished 495 while self.nb_used < self.nb_core and self.waiting_submission: 496 arg = self.waiting_submission.pop(0) 497 self.nb_used += 1 # udpate the number of running thread 498 thread.start_new_thread(self.launch, arg) 499 elif self.nb_used < self.nb_core -1: 500 self.nb_used += 1 # upate the number of running thread 501 thread.start_new_thread(self.launch, (prog, argument, cwd, stdout)) 502 elif self.nb_used == self.nb_core -1: 503 self.nb_used += 1 # upate the number of running thread 504 thread.start_new_thread(self.launch, (prog, argument, cwd, stdout))
505 506
507 - def launch(self, exe, argument, cwd, stdout):
508 """ way to launch for multicore. If exe is a string then treat it as 509 an executable. Otherwise treat it as a function""" 510 import thread 511 def end(self, pid): 512 self.nb_used -= 1 513 self.done += 1 514 try: 515 self.pids.remove(pid) 516 except: 517 pass
518 519 fail_msg = None 520 try: 521 if isinstance(exe,str): 522 if os.path.exists(exe) and not exe.startswith('/'): 523 exe = './' + exe 524 proc = misc.Popen([exe] + argument, cwd=cwd, stdout=stdout, 525 stderr=subprocess.STDOUT) 526 pid = proc.pid 527 self.pids.append(pid) 528 proc.wait() 529 if proc.returncode not in [0, 143, -15]: 530 fail_msg = 'program %s launch ends with non zero status: %s. Stop all computation' % \ 531 (' '.join([exe]+argument), proc.returncode) 532 #self.fail_msg = fail_msg 533 logger.warning(fail_msg) 534 try: 535 log = open(glob.glob(pjoin(cwd,'*','log.txt'))[0]).read() 536 logger.warning('Last 15 lines of logfile %s:\n%s\n' % \ 537 (pjoin(cwd,'*','log.txt'), '\n'.join(log.split('\n')[-15:-1]) + '\n')) 538 except (IOError, AttributeError, IndexError): 539 logger.warning('Please look for possible logfiles in %s' % cwd) 540 pass 541 self.remove(fail_msg) 542 else: 543 pid = tuple([id(o) for o in [exe] + argument]) 544 self.pids.append(pid) 545 # the function should return 0 if everything is fine 546 # the error message otherwise 547 returncode = exe(argument) 548 if returncode != 0: 549 logger.warning(returncode) 550 self.remove() 551 552 553 554 # release the lock for allowing to launch the next job 555 security = 0 556 # check that the status is locked to avoid coincidence unlock 557 while 1: 558 while not self.lock.locked(): 559 if not self.need_waiting: 560 # Main is not yet locked 561 end(self, pid) 562 return 563 elif security > 60: 564 end(self, pid) 565 return 566 security += 1 567 time.sleep(1) 568 try: 569 self.lock.release() 570 except thread.error: 571 continue 572 break 573 end(self, pid) 574 575 576 except Exception, error: 577 #logger.critical('one core fails with %s' % error) 578 self.remove() 579 raise
580 581 582 583
584 - def wait(self, me_dir, update_status):
585 """Wait that all thread finish 586 self.nb_used and self.done are update via each jobs (thread and local) 587 self.submitted is the nb of times that submitted has been call (local) 588 remaining is the nb of job that we still have to wait. (local) 589 self.pids is the list of the BASH pid of the submitted jobs. (thread) 590 591 WARNING: In principle all those value are coherent but since some are 592 modified in various thread, those data can be corrupted. (not the local 593 one). Nb_used in particular shouldn't be trusted too much. 594 This code check in different ways that all jobs have finished. 595 596 In principle, the statement related to '#security #X' are not used. 597 In practise they are times to times. 598 """ 599 600 import thread 601 602 remaining = self.submitted - self.done 603 604 while self.nb_used < self.nb_core: 605 if self.waiting_submission: 606 arg = self.waiting_submission.pop(0) 607 thread.start_new_thread(self.launch, arg) 608 self.nb_used += 1 # update the number of running thread 609 else: 610 break 611 612 try: 613 self.need_waiting = True 614 self.lock.acquire() 615 no_in_queue = 0 616 secure_mode = False # forbid final acauire if in securemode 617 while self.waiting_submission or self.nb_used: 618 if self.fail_msg: 619 msg, self.fail_msg = self.fail_msg, None 620 self.remove() 621 raise Exception, msg 622 if update_status: 623 update_status(len(self.waiting_submission), self.nb_used, self.done) 624 # security#1 that all job expected to be launched since 625 # we enter in this function are indeed launched. 626 if len(self.waiting_submission) == 0 == remaining : 627 self.done = self.submitted 628 break 629 630 # security #2: nb_used >0 but nothing remains as BASH PID 631 if len(self.waiting_submission) == 0 and len(self.pids) == 0: 632 if self.submitted == self.done: 633 break 634 logger.debug('Found too many jobs. Recovering') 635 no_in_queue += 1 636 time.sleep(min(180, 5 * no_in_queue)) 637 if no_in_queue > 3: 638 logger.debug('Still too many jobs. Continue') 639 break 640 continue 641 642 # security #3: if nb_used not reliable pass in secure mode 643 if not secure_mode and len(self.waiting_submission) != 0: 644 if self.nb_used != self.nb_core: 645 if self.nb_used != len(self.pids): 646 secure_mode = True 647 # security #4: nb_used not reliable use secure mode to finish the run 648 if secure_mode and not self.waiting_submission: 649 self.need_waiting = False 650 if self.lock.locked(): 651 self.lock.release() 652 break 653 654 # Wait for core to finish 655 self.lock.acquire() 656 remaining -=1 # update remaining job 657 #submit next one 658 if self.waiting_submission: 659 arg = self.waiting_submission.pop(0) 660 thread.start_new_thread(self.launch, arg) 661 self.nb_used += 1 # update the number of running thread 662 663 if self.fail_msg: 664 msg, self.fail_msg = self.fail_msg, None 665 self.remove() 666 raise Exception, msg 667 # security #5: checked that self.nb_used is not lower than expected 668 #This is the most current problem. 669 no_in_queue = 0 670 while self.submitted > self.done: 671 if self.fail_msg: 672 msg, self.fail_msg = self.fail_msg, None 673 self.remove() 674 raise Exception, msg 675 if no_in_queue == 0: 676 logger.debug('Some jobs have been lost. Try to recover') 677 #something bad happens 678 if not len(self.pids): 679 # The job is not running 680 logger.critical('Some jobs have been lost in the multicore treatment.') 681 logger.critical('The results might be incomplete. 
(Trying to continue anyway)') 682 break 683 elif update_status: 684 update_status(len(self.waiting_submission), len(self.pids) , 685 self.done) 686 # waiting that those jobs ends. 687 if not secure_mode: 688 self.lock.acquire() 689 else: 690 no_in_queue += 1 691 try: 692 time.sleep(min(180,5*no_in_queue)) 693 if no_in_queue > 5 * 3600.0 / 162: 694 break 695 except KeyboardInterrupt: 696 logger.warning('CTRL-C assumes that all jobs are done. Continue the code') 697 self.pids = [] # avoid security 6 698 break 699 700 # security #6. check that queue is empty. don't 701 no_in_queue = 0 702 while len(self.pids): 703 if self.fail_msg: 704 msg, self.fail_msg = self.fail_msg, None 705 self.remove() 706 raise Exception, msg 707 self.need_waiting = False 708 if self.lock.locked(): 709 self.lock.release() 710 secure_mode = True 711 if no_in_queue == 0 : 712 logger.warning('Some jobs have been lost. Try to recover.') 713 logger.warning('Hitting ctrl-c will consider that all jobs are done and continue the code.') 714 try: 715 #something very bad happens 716 if update_status: 717 update_status(len(self.waiting_submission), len(self.pids) , 718 self.done) 719 time.sleep(min(5*no_in_queue, 180)) 720 no_in_queue += 1 721 if no_in_queue > 5 * 3600.0 / 162: 722 break 723 except KeyboardInterrupt: 724 break 725 726 # print a last time the status (forcing 0 for the running) 727 if update_status: 728 self.next_update = 0 729 update_status(len(self.waiting_submission), 0, self.done) 730 731 # reset variable for next submission 732 self.need_waiting = False 733 security = 0 734 while not self.lock.locked() and security < 10: 735 # check that the status is locked to avoid coincidence unlock 736 if secure_mode: 737 security = 10 738 security +=1 739 time.sleep(1) 740 if security < 10: 741 self.lock.release() 742 self.done = 0 743 self.nb_used = 0 744 self.submitted = 0 745 self.pids = [] 746 747 except KeyboardInterrupt: 748 self.remove() 749 raise 750 if self.fail_msg: 751 msg, self.fail_msg = self.fail_msg, None 752 self.remove() 753 raise Exception, msg
754 755
756 - def remove(self, error=None):
757 """Ensure that all thread are killed""" 758 logger.info('remove job currently running') 759 self.waiting_submission = [] 760 if error: 761 self.fail_msg = error 762 for pid in list(self.pids): 763 if isinstance(pid, tuple): 764 continue 765 out = os.system('CPIDS=$(pgrep -P %(pid)s); kill -15 $CPIDS > /dev/null 2>&1' \ 766 % {'pid':pid} ) 767 out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid':pid} ) 768 if out == 0: 769 try: 770 self.pids.remove(pid) 771 except: 772 pass 773 #out = os.system('kill -9 %s &> /dev/null' % pid) 774 775 time.sleep(1) # waiting if some were submitting at the time of ctrl-c 776 for pid in list(self.pids): 777 if isinstance(pid, tuple): 778 continue 779 out = os.system('CPIDS=$(pgrep -P %s); kill -15 $CPIDS > /dev/null 2>&1' % pid ) 780 out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid':pid} ) 781 if out == 0: 782 try: 783 self.pids.remove(pid) 784 except: 785 pass
786
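# Illustrative usage (editor's addition, hypothetical script names):
# MultiCore runs the "jobs" as local threads instead of batch submissions,
# so the same driver code works on a laptop and on a cluster:
#
#     cluster = MultiCore(2)                        # at most two jobs at once
#     for seed in range(4):
#         cluster.submit('./ajob1', [str(seed)], cwd='.',
#                        stdout=open('run_%i.log' % seed, 'w'))
#     cluster.wait('.', lambda idle, run, done: None)   # block until all done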
787 -class CondorCluster(Cluster):
788 """Basic class for dealing with cluster submission""" 789 790 name = 'condor' 791 job_id = 'CONDOR_ID' 792 793 794 795 @multiple_try()
796 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 797 required_output=[], nb_submit=0):
798 """Submit a job prog to a Condor cluster""" 799 800 text = """Executable = %(prog)s 801 output = %(stdout)s 802 error = %(stderr)s 803 log = %(log)s 804 %(argument)s 805 environment = CONDOR_ID=$(Cluster).$(Process) 806 Universe = vanilla 807 notification = Error 808 Initialdir = %(cwd)s 809 %(requirement)s 810 getenv=True 811 queue 1 812 """ 813 814 if self.cluster_queue not in ['None', None]: 815 requirement = 'Requirements = %s=?=True' % self.cluster_queue 816 else: 817 requirement = '' 818 819 if cwd is None: 820 cwd = os.getcwd() 821 if stdout is None: 822 stdout = '/dev/null' 823 if stderr is None: 824 stderr = '/dev/null' 825 if log is None: 826 log = '/dev/null' 827 if not os.path.exists(prog): 828 prog = os.path.join(cwd, prog) 829 if argument: 830 argument = 'Arguments = %s' % ' '.join(argument) 831 else: 832 argument = '' 833 834 835 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout, 836 'stderr': stderr,'log': log,'argument': argument, 837 'requirement': requirement} 838 839 open('submit_condor','w').write(text % dico) 840 a = misc.Popen(['condor_submit','submit_condor'], stdout=subprocess.PIPE) 841 output = a.stdout.read() 842 #Submitting job(s). 843 #Logging submit event(s). 844 #1 job(s) submitted to cluster 2253622. 845 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE) 846 try: 847 id = pat.search(output).groups()[0] 848 except: 849 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 850 % output 851 self.submitted += 1 852 self.submitted_ids.append(id) 853 return id
854 855 @store_input() 856 @multiple_try()
857 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 858 log=None, input_files=[], output_files=[], required_output=[], 859 nb_submit=0):
860 """Submit the job on the cluster NO SHARE DISK 861 input/output file should be give relative to cwd 862 """ 863 864 if not required_output and output_files: 865 required_output = output_files 866 867 if (input_files == [] == output_files): 868 return self.submit(prog, argument, cwd, stdout, stderr, log, 869 required_output=required_output, nb_submit=nb_submit) 870 871 text = """Executable = %(prog)s 872 output = %(stdout)s 873 error = %(stderr)s 874 log = %(log)s 875 %(argument)s 876 should_transfer_files = YES 877 when_to_transfer_output = ON_EXIT 878 transfer_input_files = %(input_files)s 879 %(output_files)s 880 Universe = vanilla 881 notification = Error 882 Initialdir = %(cwd)s 883 %(requirement)s 884 getenv=True 885 queue 1 886 """ 887 888 if self.cluster_queue not in ['None', None]: 889 requirement = 'Requirements = %s=?=True' % self.cluster_queue 890 else: 891 requirement = '' 892 893 if cwd is None: 894 cwd = os.getcwd() 895 if stdout is None: 896 stdout = '/dev/null' 897 if stderr is None: 898 stderr = '/dev/null' 899 if log is None: 900 log = '/dev/null' 901 if not os.path.exists(prog): 902 prog = os.path.join(cwd, prog) 903 if argument: 904 argument = 'Arguments = %s' % ' '.join([str(a) for a in argument]) 905 else: 906 argument = '' 907 # input/output file treatment 908 if input_files: 909 input_files = ','.join(input_files) 910 else: 911 input_files = '' 912 if output_files: 913 output_files = 'transfer_output_files = %s' % ','.join(output_files) 914 else: 915 output_files = '' 916 917 918 919 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout, 920 'stderr': stderr,'log': log,'argument': argument, 921 'requirement': requirement, 'input_files':input_files, 922 'output_files':output_files} 923 924 open('submit_condor','w').write(text % dico) 925 a = subprocess.Popen(['condor_submit','submit_condor'], stdout=subprocess.PIPE) 926 output = a.stdout.read() 927 #Submitting job(s). 928 #Logging submit event(s). 929 #1 job(s) submitted to cluster 2253622. 930 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE) 931 try: 932 id = pat.search(output).groups()[0] 933 except: 934 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 935 % output 936 self.submitted += 1 937 self.submitted_ids.append(id) 938 return id
939 940 941 942 943 944 @multiple_try(nb_try=10, sleep=10)
945 - def control_one_job(self, id):
946 """ control the status of a single job with it's cluster id """ 947 cmd = 'condor_q '+str(id)+" -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'" 948 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 949 stderr=subprocess.PIPE) 950 951 error = status.stderr.read() 952 if status.returncode or error: 953 raise ClusterManagmentError, 'condor_q returns error: %s' % error 954 955 return status.stdout.readline().strip()
956 957 @check_interupt() 958 @multiple_try(nb_try=10, sleep=10)
959 - def control(self, me_dir):
960 """ control the status of a single job with it's cluster id """ 961 962 if not self.submitted_ids: 963 return 0, 0, 0, 0 964 965 packet = 15000 966 idle, run, fail = 0, 0, 0 967 ongoing = [] 968 for i in range(1+(len(self.submitted_ids)-1)//packet): 969 start = i * packet 970 stop = (i+1) * packet 971 cmd = "condor_q " + ' '.join(self.submitted_ids[start:stop]) + \ 972 " -format \'%-2s\ ' \'ClusterId\' " + \ 973 " -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'" 974 975 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 976 stderr=subprocess.PIPE) 977 error = status.stderr.read() 978 if status.returncode or error: 979 raise ClusterManagmentError, 'condor_q returns error: %s' % error 980 981 for line in status.stdout: 982 id, status = line.strip().split() 983 ongoing.append(int(id)) 984 if status in ['I','U']: 985 idle += 1 986 elif status == 'R': 987 run += 1 988 elif status != 'C': 989 fail += 1 990 991 for id in list(self.submitted_ids): 992 if int(id) not in ongoing: 993 status = self.check_termination(id) 994 if status == 'wait': 995 run += 1 996 elif status == 'resubmit': 997 idle += 1 998 999 return idle, run, self.submitted - (idle+run+fail), fail
1000 1001 @multiple_try()
1002 - def remove(self, *args, **opts):
1003 """Clean the jobson the cluster""" 1004 1005 if not self.submitted_ids: 1006 return 1007 cmd = "condor_rm %s" % ' '.join(self.submitted_ids) 1008 1009 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1010
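# Illustrative note (editor's addition, hypothetical values): for a call like
# submit('ajob1', cwd='/work/P0_gg_ttx', stdout='log.txt') with no queue
# requirement, the 'submit_condor' file written above would read roughly:
#
#     Executable = /work/P0_gg_ttx/ajob1
#     output = log.txt
#     error = /dev/null
#     log = /dev/null
#     environment = CONDOR_ID=$(Cluster).$(Process)
#     Universe = vanilla
#     notification = Error
#     Initialdir = /work/P0_gg_ttx
#     getenv=True
#     queue 1
#
# and the cluster id is then parsed from condor_submit's
# "1 job(s) submitted to cluster NNNN." reply.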
1011 -class PBSCluster(Cluster):
1012 """Basic class for dealing with cluster submission""" 1013 1014 name = 'pbs' 1015 job_id = 'PBS_JOBID' 1016 idle_tag = ['Q'] 1017 running_tag = ['T','E','R'] 1018 complete_tag = ['C'] 1019 1020 maximum_submited_jobs = 2500 1021 1022 @multiple_try()
1023 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1024 required_output=[], nb_submit=0):
1025 """Submit a job prog to a PBS cluster""" 1026 1027 me_dir = self.get_jobs_identifier(cwd, prog) 1028 1029 if len(self.submitted_ids) > self.maximum_submited_jobs: 1030 fct = lambda idle, run, finish: logger.info('Waiting for free slot: %s %s %s' % (idle, run, finish)) 1031 self.wait(me_dir, fct, self.maximum_submited_jobs) 1032 1033 1034 text = "" 1035 if cwd is None: 1036 cwd = os.getcwd() 1037 else: 1038 text = " cd %s;" % cwd 1039 if stdout is None: 1040 stdout = '/dev/null' 1041 if stderr is None: 1042 stderr = '/dev/null' 1043 elif stderr == -2: # -2 is subprocess.STDOUT 1044 stderr = stdout 1045 if log is None: 1046 log = '/dev/null' 1047 1048 if not os.path.isabs(prog): 1049 text += "./%s" % prog 1050 else: 1051 text+= prog 1052 1053 if argument: 1054 text += ' ' + ' '.join(argument) 1055 1056 command = ['qsub','-o', stdout, 1057 '-N', me_dir, 1058 '-e', stderr, 1059 '-V'] 1060 1061 if self.cluster_queue and self.cluster_queue != 'None': 1062 command.extend(['-q', self.cluster_queue]) 1063 1064 a = misc.Popen(command, stdout=subprocess.PIPE, 1065 stderr=subprocess.STDOUT, 1066 stdin=subprocess.PIPE, cwd=cwd) 1067 1068 output = a.communicate(text)[0] 1069 id = output.split('.')[0] 1070 if not id.isdigit() or a.returncode !=0: 1071 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1072 % output 1073 1074 self.submitted += 1 1075 self.submitted_ids.append(id) 1076 return id
1077 1078 @multiple_try()
1079 - def control_one_job(self, id):
1080 """ control the status of a single job with it's cluster id """ 1081 cmd = 'qstat '+str(id) 1082 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1083 stderr=subprocess.STDOUT) 1084 1085 for line in status.stdout: 1086 line = line.strip() 1087 if 'cannot connect to server' in line or 'cannot read reply' in line: 1088 raise ClusterManagmentError, 'server disconnected' 1089 if 'Unknown' in line: 1090 return 'F' 1091 elif line.startswith(str(id)): 1092 jobstatus = line.split()[4] 1093 else: 1094 jobstatus="" 1095 1096 if status.returncode != 0 and status.returncode is not None: 1097 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode 1098 if jobstatus in self.idle_tag: 1099 return 'I' 1100 elif jobstatus in self.running_tag: 1101 return 'R' 1102 return 'F'
1103 1104 1105 @multiple_try()
1106 - def control(self, me_dir):
1107 """ control the status of a single job with it's cluster id """ 1108 cmd = "qstat" 1109 status = misc.Popen([cmd], stdout=subprocess.PIPE) 1110 1111 me_dir = self.get_jobs_identifier(me_dir) 1112 1113 ongoing = [] 1114 1115 idle, run, fail = 0, 0, 0 1116 for line in status.stdout: 1117 if 'cannot connect to server' in line or 'cannot read reply' in line: 1118 raise ClusterManagmentError, 'server disconnected' 1119 if me_dir in line: 1120 ongoing.append(line.split()[0].split('.')[0]) 1121 status2 = line.split()[4] 1122 if status2 in self.idle_tag: 1123 idle += 1 1124 elif status2 in self.running_tag: 1125 run += 1 1126 elif status2 in self.complete_tag: 1127 if not self.check_termination(line.split()[0].split('.')[0]): 1128 idle += 1 1129 else: 1130 fail += 1 1131 1132 if status.returncode != 0 and status.returncode is not None: 1133 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode 1134 1135 for id in list(self.submitted_ids): 1136 if id not in ongoing: 1137 status2 = self.check_termination(id) 1138 if status2 == 'wait': 1139 run += 1 1140 elif status2 == 'resubmit': 1141 idle += 1 1142 1143 return idle, run, self.submitted - (idle+run+fail), fail
1144 1145 @multiple_try()
1146 - def remove(self, *args, **opts):
1147 """Clean the jobs on the cluster""" 1148 1149 if not self.submitted_ids: 1150 return 1151 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1152 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1153
1154 1155 -class SGECluster(Cluster):
1156 """Basic class for dealing with cluster submission""" 1157 # Class written by Arian Abrahantes. 1158 1159 name = 'sge' 1160 job_id = 'JOB_ID' 1161 idle_tag = ['qw', 'hqw','hRqw','w'] 1162 running_tag = ['r','t','Rr','Rt'] 1163 identifier_length = 10 1164
1165 - def def_get_path(self,location):
1166 """replace string for path issues""" 1167 location = os.path.realpath(location) 1168 homePath = os.getenv("HOME") 1169 if homePath: 1170 location = location.replace(homePath,'$HOME') 1171 return location
1172 1173 @multiple_try()
1174 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1175 required_output=[], nb_submit=0):
1176 """Submit a job prog to an SGE cluster""" 1177 1178 me_dir = self.get_jobs_identifier(cwd, prog) 1179 1180 1181 if cwd is None: 1182 #cwd = os.getcwd() 1183 cwd = self.def_get_path(os.getcwd()) 1184 cwd1 = self.def_get_path(cwd) 1185 text = " cd %s;" % cwd1 1186 if stdout is None: 1187 stdout = '/dev/null' 1188 else: 1189 stdout = self.def_get_path(stdout) 1190 if stderr is None: 1191 stderr = '/dev/null' 1192 elif stderr == -2: # -2 is subprocess.STDOUT 1193 stderr = stdout 1194 else: 1195 stderr = self.def_get_path(stderr) 1196 1197 if log is None: 1198 log = '/dev/null' 1199 else: 1200 log = self.def_get_path(log) 1201 1202 text += prog 1203 if argument: 1204 text += ' ' + ' '.join(argument) 1205 1206 #if anything slips through argument 1207 #print "!=== inteded change ",text.replace('/srv/nfs','') 1208 #text = text.replace('/srv/nfs','') 1209 homePath = os.getenv("HOME") 1210 if homePath: 1211 text = text.replace(homePath,'$HOME') 1212 1213 logger.debug("!=== input %s" % text) 1214 logger.debug("!=== output %s" % stdout) 1215 logger.debug("!=== error %s" % stderr) 1216 logger.debug("!=== logs %s" % log) 1217 1218 command = ['qsub','-o', stdout, 1219 '-N', me_dir, 1220 '-e', stderr, 1221 '-V'] 1222 1223 if self.cluster_queue and self.cluster_queue != 'None': 1224 command.extend(['-q', self.cluster_queue]) 1225 1226 a = misc.Popen(command, stdout=subprocess.PIPE, 1227 stderr=subprocess.STDOUT, 1228 stdin=subprocess.PIPE, cwd=cwd) 1229 1230 output = a.communicate(text)[0] 1231 id = output.split(' ')[2] 1232 if not id.isdigit(): 1233 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1234 % output 1235 self.submitted += 1 1236 self.submitted_ids.append(id) 1237 logger.debug(output) 1238 1239 return id
1240 1241 @multiple_try()
1242 - def control_one_job(self, id):
1243 """ control the status of a single job with it's cluster id """ 1244 #cmd = 'qstat '+str(id) 1245 cmd = 'qstat ' 1246 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1247 for line in status.stdout: 1248 #print "!==",line 1249 #line = line.strip() 1250 #if 'Unknown' in line: 1251 # return 'F' 1252 #elif line.startswith(str(id)): 1253 # status = line.split()[4] 1254 if str(id) in line: 1255 status = line.split()[4] 1256 #print "!=status", status 1257 if status in self.idle_tag: 1258 return 'I' 1259 elif status in self.running_tag: 1260 return 'R' 1261 return 'F'
1262 1263 @multiple_try()
1264 - def control(self, me_dir):
1265 """ control the status of a single job with it's cluster id """ 1266 cmd = "qstat " 1267 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1268 1269 me_dir = self.get_jobs_identifier(me_dir) 1270 1271 idle, run, fail = 0, 0, 0 1272 for line in status.stdout: 1273 if me_dir in line: 1274 status = line.split()[4] 1275 if status in self.idle_tag: 1276 idle += 1 1277 elif status in self.running_tag: 1278 run += 1 1279 else: 1280 logger.debug(line) 1281 fail += 1 1282 1283 return idle, run, self.submitted - (idle+run+fail), fail
1284 1285 1286 1287 @multiple_try()
1288 - def remove(self, *args, **opts):
1289 """Clean the jobs on the cluster""" 1290 1291 if not self.submitted_ids: 1292 return 1293 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1294 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1295
1296 1297 -class LSFCluster(Cluster):
1298 """Basic class for dealing with cluster submission""" 1299 1300 name = 'lsf' 1301 job_id = 'LSB_JOBID' 1302 1303 @multiple_try()
1304 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1305 required_output=[], nb_submit=0):
1306 """Submit the job prog to an LSF cluster""" 1307 1308 1309 me_dir = self.get_jobs_identifier(cwd, prog) 1310 1311 text = "" 1312 command = ['bsub', '-C0', '-J', me_dir] 1313 if cwd is None: 1314 cwd = os.getcwd() 1315 else: 1316 text = " cd %s;" % cwd 1317 if stdout and isinstance(stdout, str): 1318 command.extend(['-o', stdout]) 1319 if stderr and isinstance(stdout, str): 1320 command.extend(['-e', stderr]) 1321 elif stderr == -2: # -2 is subprocess.STDOUT 1322 pass 1323 if log is None: 1324 log = '/dev/null' 1325 1326 text += prog 1327 if argument: 1328 text += ' ' + ' '.join(argument) 1329 1330 if self.cluster_queue and self.cluster_queue != 'None': 1331 command.extend(['-q', self.cluster_queue]) 1332 1333 a = misc.Popen(command, stdout=subprocess.PIPE, 1334 stderr=subprocess.STDOUT, 1335 stdin=subprocess.PIPE, cwd=cwd) 1336 1337 output = a.communicate(text)[0] 1338 #Job <nnnn> is submitted to default queue <normal>. 1339 try: 1340 id = output.split('>',1)[0].split('<')[1] 1341 except: 1342 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1343 % output 1344 if not id.isdigit(): 1345 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1346 % output 1347 self.submitted += 1 1348 self.submitted_ids.append(id) 1349 return id
1350 1351 1352 @multiple_try()
1353 - def control_one_job(self, id):
1354 """ control the status of a single job with it's cluster id """ 1355 1356 cmd = 'bjobs '+str(id) 1357 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1358 1359 for line in status.stdout: 1360 line = line.strip().upper() 1361 if 'JOBID' in line: 1362 continue 1363 elif str(id) not in line: 1364 continue 1365 status = line.split()[2] 1366 if status == 'RUN': 1367 return 'R' 1368 elif status == 'PEND': 1369 return 'I' 1370 elif status == 'DONE': 1371 return 'F' 1372 else: 1373 return 'H' 1374 return 'F'
1375 1376 @multiple_try()
1377 - def control(self, me_dir):
1378 """ control the status of a single job with it's cluster id """ 1379 1380 if not self.submitted_ids: 1381 return 0, 0, 0, 0 1382 1383 cmd = "bjobs " + ' '.join(self.submitted_ids) 1384 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1385 1386 jobstatus = {} 1387 for line in status.stdout: 1388 line = line.strip() 1389 if 'JOBID' in line: 1390 continue 1391 splitline = line.split() 1392 id = splitline[0] 1393 if id not in self.submitted_ids: 1394 continue 1395 jobstatus[id] = splitline[2] 1396 1397 idle, run, fail = 0, 0, 0 1398 for id in self.submitted_ids[:]: 1399 if id in jobstatus: 1400 status = jobstatus[id] 1401 else: 1402 status = 'MISSING' 1403 if status == 'RUN': 1404 run += 1 1405 elif status == 'PEND': 1406 idle += 1 1407 else: 1408 status = self.check_termination(id) 1409 if status == 'wait': 1410 run += 1 1411 elif status == 'resubmit': 1412 idle += 1 1413 1414 return idle, run, self.submitted - (idle+run+fail), fail
1415 1416 @multiple_try()
1417 - def remove(self, *args,**opts):
1418 """Clean the jobs on the cluster""" 1419 1420 if not self.submitted_ids: 1421 return 1422 cmd = "bkill %s" % ' '.join(self.submitted_ids) 1423 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1424
1425 -class GECluster(Cluster):
1426 """Class for dealing with cluster submission on a GE cluster""" 1427 1428 name = 'ge' 1429 job_id = 'JOB_ID' 1430 idle_tag = ['qw'] 1431 running_tag = ['r'] 1432 1433 @multiple_try()
1434 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1435 required_output=[], nb_submit=0):
1436 """Submit a job prog to a GE cluster""" 1437 1438 text = "" 1439 if cwd is None: 1440 cwd = os.getcwd() 1441 else: 1442 text = " cd %s; bash " % cwd 1443 if stdout is None: 1444 stdout = os.path.join(cwd, "log.%s" % prog.split('/')[-1]) 1445 if stderr is None: 1446 stderr = os.path.join(cwd, "err.%s" % prog.split('/')[-1]) 1447 elif stderr == -2: # -2 is subprocess.STDOUT 1448 stderr = stdout 1449 if log is None: 1450 log = '/dev/null' 1451 1452 text += prog 1453 if argument: 1454 text += ' ' + ' '.join(argument) 1455 text += '\n' 1456 tmp_submit = os.path.join(cwd, 'tmp_submit') 1457 open(tmp_submit,'w').write(text) 1458 1459 a = misc.Popen(['qsub','-o', stdout, 1460 '-e', stderr, 1461 tmp_submit], 1462 stdout=subprocess.PIPE, 1463 stderr=subprocess.STDOUT, 1464 stdin=subprocess.PIPE, cwd=cwd) 1465 1466 output = a.communicate()[0] 1467 #Your job 874511 ("test.sh") has been submitted 1468 pat = re.compile("Your job (\d*) \(",re.MULTILINE) 1469 try: 1470 id = pat.search(output).groups()[0] 1471 except: 1472 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1473 % output 1474 self.submitted += 1 1475 self.submitted_ids.append(id) 1476 return id
1477 1478 @multiple_try()
1479 - def control_one_job(self, id):
1480 """ control the status of a single job with it's cluster id """ 1481 cmd = 'qstat | grep '+str(id) 1482 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1483 if not status: 1484 return 'F' 1485 #874516 0.00000 test.sh alwall qw 03/04/2012 22:30:35 1 1486 pat = re.compile("^(\d+)\s+[\d\.]+\s+[\w\d\.]+\s+[\w\d\.]+\s+(\w+)\s") 1487 stat = '' 1488 for line in status.stdout.read().split('\n'): 1489 if not line: 1490 continue 1491 line = line.strip() 1492 try: 1493 groups = pat.search(line).groups() 1494 except: 1495 raise ClusterManagmentError, 'bad syntax for stat: \n\"%s\"' % line 1496 if groups[0] != id: continue 1497 stat = groups[1] 1498 if not stat: 1499 return 'F' 1500 if stat in self.idle_tag: 1501 return 'I' 1502 if stat in self.running_tag: 1503 return 'R'
1504 1505 @multiple_try()
1506 - def control(self, me_dir=None):
1507 """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)""" 1508 if not self.submitted_ids: 1509 return 0, 0, 0, 0 1510 idle, run, fail = 0, 0, 0 1511 ongoing = [] 1512 for statusflag in ['p', 'r', 'sh']: 1513 cmd = 'qstat -s %s' % statusflag 1514 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1515 #874516 0.00000 test.sh alwall qw 03/04/2012 22:30:35 1 1516 pat = re.compile("^(\d+)") 1517 for line in status.stdout.read().split('\n'): 1518 line = line.strip() 1519 try: 1520 id = pat.search(line).groups()[0] 1521 except Exception: 1522 pass 1523 else: 1524 if id not in self.submitted_ids: 1525 continue 1526 ongoing.append(id) 1527 if statusflag == 'p': 1528 idle += 1 1529 if statusflag == 'r': 1530 run += 1 1531 if statusflag == 'sh': 1532 fail += 1 1533 for id in list(self.submitted_ids): 1534 if id not in ongoing: 1535 self.check_termination(id) 1536 #self.submitted_ids = ongoing 1537 1538 return idle, run, self.submitted - idle - run - fail, fail
1539 1540 @multiple_try()
1541 - def remove(self, *args, **opts):
1542 """Clean the jobs on the cluster""" 1543 1544 if not self.submitted_ids: 1545 return 1546 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1547 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1548
1549 -def asyncrone_launch(exe, cwd=None, stdout=None, argument = [], **opt):
1550 """start a computation and not wait for it to finish. 1551 this fonction returns a lock which is locked as long as the job is 1552 running.""" 1553 1554 mc = MultiCore(1) 1555 mc.submit(exe, argument, cwd, stdout, **opt) 1556 mc.need_waiting = True 1557 mc.lock.acquire() 1558 return mc.lock
1559
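# Illustrative usage (editor's addition, hypothetical script name): the
# returned lock stays acquired for as long as the job runs, so a caller can
# poll it instead of blocking:
#
#     lock = asyncrone_launch('./slow_job.sh', cwd='.',
#                             stdout=open('slow.log', 'w'))
#     while lock.locked():
#         pass            # do something useful in the meantime
#     # once the lock is released the job has finished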
1560 1561 -class SLURMCluster(Cluster):
1562 """Basic class for dealing with cluster submission""" 1563 1564 name = 'slurm' 1565 job_id = 'SLURM_JOBID' 1566 idle_tag = ['Q','PD','S','CF'] 1567 running_tag = ['R', 'CG'] 1568 complete_tag = ['C'] 1569 identification_length = 8 1570 1571 @multiple_try()
1572 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1573 required_output=[], nb_submit=0):
1574 """Submit a job prog to a SLURM cluster""" 1575 1576 me_dir = self.get_jobs_identifier(cwd, prog) 1577 1578 1579 if cwd is None: 1580 cwd = os.getcwd() 1581 if stdout is None: 1582 stdout = '/dev/null' 1583 if stderr is None: 1584 stderr = '/dev/null' 1585 elif stderr == -2: # -2 is subprocess.STDOUT 1586 stderr = stdout 1587 if log is None: 1588 log = '/dev/null' 1589 1590 command = ['sbatch', '-o', stdout, 1591 '-J', me_dir, 1592 '-e', stderr, prog] + argument 1593 1594 if self.cluster_queue and self.cluster_queue != 'None': 1595 command.insert(1, '-p') 1596 command.insert(2, self.cluster_queue) 1597 1598 a = misc.Popen(command, stdout=subprocess.PIPE, 1599 stderr=subprocess.STDOUT, 1600 stdin=subprocess.PIPE, cwd=cwd) 1601 1602 output = a.communicate() 1603 output_arr = output[0].split(' ') 1604 id = output_arr[3].rstrip() 1605 1606 if not id.isdigit(): 1607 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1608 1609 self.submitted += 1 1610 self.submitted_ids.append(id) 1611 return id
1612 1613 @multiple_try()
1614 - def control_one_job(self, id):
1615 """ control the status of a single job with it's cluster id """ 1616 cmd = 'squeue j'+str(id) 1617 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1618 stderr=open(os.devnull,'w')) 1619 1620 for line in status.stdout: 1621 line = line.strip() 1622 if 'Invalid' in line: 1623 return 'F' 1624 elif line.startswith(str(id)): 1625 status = line.split()[4] 1626 if status in self.idle_tag: 1627 return 'I' 1628 elif status in self.running_tag: 1629 return 'R' 1630 return 'F'
1631 1632 @multiple_try()
1633 - def control(self, me_dir):
1634 """ control the status of a single job with it's cluster id """ 1635 cmd = "squeue" 1636 status = misc.Popen([cmd], stdout=subprocess.PIPE) 1637 1638 me_dir = self.get_jobs_identifier(me_dir) 1639 1640 idle, run, fail = 0, 0, 0 1641 ongoing=[] 1642 for line in status.stdout: 1643 if me_dir in line: 1644 id, _, _,_ , status,_ = line.split(None,5) 1645 ongoing.append(id) 1646 if status in self.idle_tag: 1647 idle += 1 1648 elif status in self.running_tag: 1649 run += 1 1650 elif status in self.complete_tag: 1651 status = self.check_termination(id) 1652 if status == 'wait': 1653 run += 1 1654 elif status == 'resubmit': 1655 idle += 1 1656 else: 1657 fail += 1 1658 1659 #control other finished job 1660 for id in list(self.submitted_ids): 1661 if id not in ongoing: 1662 status = self.check_termination(id) 1663 if status == 'wait': 1664 run += 1 1665 elif status == 'resubmit': 1666 idle += 1 1667 1668 1669 return idle, run, self.submitted - (idle+run+fail), fail
1670 1671 @multiple_try()
1672 - def remove(self, *args, **opts):
1673 """Clean the jobs on the cluster""" 1674 1675 if not self.submitted_ids: 1676 return 1677 cmd = "scancel %s" % ' '.join(self.submitted_ids) 1678 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1679
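# Illustrative note (editor's addition): sbatch normally replies with
# "Submitted batch job NNNNNN", which is why output_arr[3] above holds the
# job id; control() then matches the fifth column (ST) of plain squeue
# output against idle_tag and running_tag.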
1680 -class HTCaaSCluster(Cluster):
1681 """Class for dealing with cluster submission on a HTCaaS cluster using GPFS """ 1682 1683 name= 'htcaas' 1684 job_id = 'HTCAAS_JOBID' 1685 1686 @store_input() 1687 @multiple_try()
1688 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 1689 log=None, input_files=[], output_files=[], required_output=[], 1690 nb_submit=0):
1691 """Submit the HTCaaS job on the cluster with NO SHARE DISK 1692 input/output file should be give relative to cwd 1693 """ 1694 # To make workspace name(temp) 1695 if 'ajob' in prog: 1696 prog_num = prog.rsplit("ajob",1)[1] 1697 else: 1698 prog_num = '0' 1699 1700 cur_usr = os.getenv('USER') 1701 1702 if cwd is None: 1703 cwd = os.getcwd() 1704 1705 cwd_cp = cwd.rsplit("/",2) 1706 #print 'This is HTCaaS Mode' 1707 1708 if not stdout is None: 1709 print "stdout: %s" % stdout 1710 1711 if not os.path.exists(prog): 1712 prog = os.path.join(cwd, prog) 1713 1714 if not required_output and output_files: 1715 required_output = output_files 1716 1717 1718 if not 'combine' and not 'pythia' in prog : 1719 cwd_arg = cwd+"/arguments" 1720 temp = ' '.join([str(a) for a in argument]) 1721 arg_cmd="echo '"+temp+"' > " + cwd_arg 1722 #print arg_cmd 1723 #aa = misc.Popen([arg_cmd], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE) 1724 command = ['htcaas-mgjob-submit','-d',cwd,'-e',os.path.basename(prog)] 1725 if argument : 1726 command.extend(['-a ', '='.join([str(a) for a in argument])]) 1727 print command 1728 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1729 id = a.stdout.read().strip() 1730 1731 else: 1732 cwd_arg = cwd+"/arguments" 1733 temp = ' '.join([str(a) for a in argument]) 1734 #arg_cmd="echo '"+temp+"' > " + cwd_arg 1735 #print arg_cmd 1736 #aa = misc.Popen([arg_cmd], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE) 1737 #print os.path.basename(prog) 1738 temp_file_name = "sub." + os.path.basename(prog) 1739 text = """#!/bin/bash 1740 MYPWD=%(cwd)s 1741 cd $MYPWD 1742 input_files=(%(input_files)s ) 1743 for i in ${input_files[@]} 1744 do 1745 chmod -f +x $i 1746 done 1747 /bin/bash %(prog)s %(arguments)s > %(stdout)s 1748 """ 1749 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog, 1750 'arguments': ' '.join([str(a) for a in argument]), 1751 'program': ' ' if '.py' in prog else 'bash'} 1752 1753 # writing a new script for the submission 1754 new_prog = pjoin(cwd, temp_file_name) 1755 open(new_prog, 'w').write(text % dico) 1756 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 1757 command = ['htcaas-mgjob-submit','-d',cwd,'-e',temp_file_name] 1758 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1759 id = a.stdout.read().strip() 1760 1761 nb_try=0 1762 nb_limit=5 1763 if not id.isdigit() : 1764 print "[ID is not digit]:" + id 1765 1766 while not id.isdigit() : 1767 nb_try+=1 1768 print "[fail_retry]:"+ nb_try 1769 a=misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1770 id = a.stdout.read().strip() 1771 if nb_try > nb_limit : 1772 raise ClusterManagementError, 'fail to submit to the HTCaaS cluster: \n %s' % id 1773 break 1774 1775 self.submitted += 1 1776 self.submitted_ids.append(id) 1777 1778 return id
1779 1780 @multiple_try(nb_try=10, sleep=10)
1781 - def control_one_job(self, id):
1782 """ control the status of a single job with it's cluster id """ 1783 1784 if id == 0 : 1785 status_out ='C' 1786 else : 1787 cmd = 'htcaas-job-status -m '+str(id)+ " -s | grep Status " 1788 status = misc.Popen([cmd], shell=True,stdout=subprocess.PIPE, 1789 stderr=subprocess.PIPE) 1790 error = status.stderr.read() 1791 if status.returncode or error: 1792 raise ClusterManagmentError, 'htcaas-job-submit returns error: %s' % error 1793 status_out= status.stdout.read().strip() 1794 status_out= status_out.split(":",1)[1] 1795 if status_out == 'waiting': 1796 status_out='I' 1797 elif status_out == 'preparing' or status_out == 'running': 1798 status_out = 'R' 1799 elif status_out != 'done': 1800 status_out = 'F' 1801 elif status_out == 'done': 1802 status_out = 'C' 1803 1804 return status_out
1805 1806 @multiple_try(nb_try=15, sleep=1)
1807 - def control(self, me_dir):
1808 """ control the status of a single job with it's cluster id """ 1809 #print "HTCaaS2 Control" 1810 if not self.submitted_ids: 1811 return 0, 0, 0, 0 1812 1813 ongoing = [] 1814 idle, run, fail = 0, 0, 0 1815 1816 if id == 0 : 1817 return 0 , 0, 0, 0 1818 else : 1819 for i in range(len(self.submitted_ids)): 1820 ongoing.append(int(self.submitted_ids[i])) 1821 cmd = "htcaas-job-status -m " + self.submitted_ids[i] + " -s | grep Status " 1822 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 1823 status_out= status.stdout.read().strip() 1824 status_out= status_out.split(":",1)[1] 1825 if status_out == 'waiting': 1826 idle += 1 1827 elif status_out == 'preparing': 1828 run += 1 1829 elif status_out == 'running': 1830 run += 1 1831 elif status_out != 'done': 1832 fail += 1 1833 1834 if status_out != 'done': 1835 print "["+ self.submitted_ids[i] + "] " + status_out 1836 ''' 1837 for i in range(len(self.submitted_ids)): 1838 if int(self.submitted_ids[i]) not in ongoing: 1839 status = self.check_termination(int(self.submitted_ids[i])) 1840 if status = 'waiting': 1841 idle += 1 1842 elif status == 'resubmit': 1843 idle += 1 1844 elif status == 'failed': 1845 fail += 1 1846 ''' 1847 1848 return idle, run, self.submitted - (idle+run+fail), fail
1849 1850 @multiple_try()
1851 - def remove(self, *args, **opts):
1852 """Clean the jobson the cluster""" 1853 1854 if not self.submitted_ids: 1855 return 1856 for i in range(len(self.submitted_ids)): 1857 cmd = "htcaas-job-cancel -m %s" % ' '.join(self.submitted_ids[i]) 1858 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1859
1860 1861 -class HTCaaS2Cluster(Cluster):
1862 """Class for dealing with cluster submission on a HTCaaS cluster""" 1863 1864 name= 'htcaas2' 1865 job_id = 'HTCAAS2_JOBID' 1866 1867 @store_input() 1868 @multiple_try()
1869 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 1870 log=None, input_files=[], output_files=[], required_output=[], 1871 nb_submit=0):
1872 """Submit the job on the cluster NO SHARE DISK 1873 input/output file should be give relative to cwd 1874 """ 1875 # To make workspace name(temp) 1876 if 'ajob' in prog: 1877 prog_num = prog.rsplit("ajob",1)[1] 1878 elif 'run_combine' in prog: 1879 prog_num = '0' 1880 else: 1881 prog_num = prog 1882 1883 cur_usr = os.getenv('USER') 1884 1885 import uuid 1886 dir = str(uuid.uuid4().hex) 1887 #dir = str(int(time())) 1888 prog_dir = '_run%s'% prog_num 1889 prog_dir = dir+prog_dir 1890 1891 if cwd is None: 1892 cwd = os.getcwd() 1893 1894 cwd_cp = cwd.rsplit("/",2) 1895 1896 if stdout is None: 1897 stdout='/dev/null' 1898 1899 if not os.path.exists(prog): 1900 prog = os.path.join(cwd, prog) 1901 1902 if not required_output and output_files: 1903 required_output = output_files 1904 1905 if '/' in argument : 1906 temp_file_name = "sub." + os.path.basename(prog) 1907 else : 1908 temp_file_name = "sub." + os.path.basename(prog) + '.'.join(argument) 1909 1910 1911 if 'combine' in prog or 'pythia' in prog : 1912 text = """#!/bin/bash 1913 MYPWD=%(cwd)s 1914 cd $MYPWD 1915 script=%(script)s 1916 input_files=(%(input_files)s ) 1917 if [ $# -ge 1 ]; then 1918 arg1=$1 1919 else 1920 arg1='' 1921 fi 1922 args=' %(arguments)s' 1923 for i in ${input_files[@]}; do 1924 if [[ "$i" == *$script* ]]; then 1925 script=$i 1926 fi 1927 chmod -f +x $i 1928 done 1929 /bin/bash ${script} ${args} > %(stdout)s 1930 """ 1931 1932 elif 'shower' in prog : 1933 text = """#!/bin/bash 1934 MYPWD=%(cwd)s 1935 cd $MYPWD 1936 args=' %(arguments)s' 1937 input_files=( %(input_files)s ) 1938 for i in ${input_files[@]} 1939 do 1940 chmod -f +x $i 1941 done 1942 /bin/bash %(script)s ${args} > $MYPWD/done 1943 """ 1944 1945 else : 1946 text = """#!/bin/bash 1947 MYPWD=%(cwd)s 1948 #mkdir -p $MYTMP 1949 cd $MYPWD 1950 input_files=( %(input_files)s ) 1951 for i in ${input_files[@]} 1952 do 1953 if [[ $i != */*/* ]]; then 1954 i=$PWD"/"$i 1955 fi 1956 echo $i 1957 if [ -d $i ]; then 1958 htcaas-file-put -l $i -r /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/ -i %(cur_usr)s 1959 else 1960 htcaas-file-put -f $i -r /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/ -i %(cur_usr)s 1961 fi 1962 done 1963 """ 1964 1965 dico = {'cur_usr' : cur_usr, 'script': os.path.basename(prog), 1966 'cwd': cwd, 'job_id': self.job_id, 'prog_dir': prog_dir, 1967 'input_files': ' '.join(input_files + [prog]), 1968 'output_files': ' '.join(output_files), 'stdout': stdout, 1969 'arguments': ' '.join([str(a) for a in argument]), 1970 'program': ' ' if '.py' in prog else 'bash'} 1971 1972 # writing a new script for the submission 1973 new_prog = pjoin(cwd, temp_file_name) 1974 open(new_prog, 'w').write(text % dico) 1975 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 1976 1977 # print temp_file_name 1978 cmd1='/bin/bash '+ cwd+'/'+temp_file_name 1979 status1 = misc.Popen([cmd1], shell=True, stdout=subprocess.PIPE, 1980 stderr=subprocess.PIPE) 1981 #print '%s' % status1.stdout.read() 1982 1983 1984 if not 'combine' in prog and not 'shower' in prog and not 'pythia' in prog: 1985 1986 cmd3 = """htcaas-mgjob-submit -d /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/ -e %(script)s %(arguments)s""" 1987 dico3 = {'cur_usr' : cur_usr, 'script': os.path.basename(prog), 1988 'arguments': ' ' if not argument else "-a " + '='.join([str(a) for a in argument]) , 1989 'prog_dir': prog_dir } 1990 status3 = misc.Popen([cmd3 % dico3], shell=True, stdout=subprocess.PIPE, 1991 stderr=subprocess.PIPE) 1992 id = status3.stdout.read().strip() 1993 ## exception 1994 nb_try=0 1995 
1995              nb_limit=5
1996              while not id.isdigit() :
1997                  nb_try+=1
1998                  a=misc.Popen( [cmd3 % dico3], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1999                  id = a.stdout.read().strip()
2000                  if nb_try > nb_limit :
2001                      raise ClusterManagmentError, 'Fail to submit to the HTCaaS cluster: \n %s' % id
2002                      break
2003
2004              temp_file_name2 = "sub." +id
2005              text2 = """#!/bin/bash
2006  MYPWD=%(cwd)s
2007  output_files=( %(output_files)s )
2008  result=done
2009  if [ ! -e ${MYPWD}/done.%(job_id)s ]; then
2010  for i in ${output_files[@]}
2011  do
2012  htcaas-file-get -l ${MYPWD}/$i -r /pwork01/%(cur_usr)s/MG5_workspace/%(prog_dir)s/$i -i %(cur_usr)s
2013  chmod -Rf 777 ${MYPWD}/$i
2014  done
2015  for i in ${output_files[@]}; do
2016  if [[ -e ${MYPWD}/$i ]]; then
2017  result=done
2018  else
2019  result=running
2020  echo $result
2021  exit 0
2022  fi
2023  done
2024  echo $result
2025  touch ${MYPWD}/done.%(job_id)s
2026  else
2027  for i in ${output_files[@]}; do
2028  if [ -e ${MYPWD}/$i ]; then
2029  result=done
2030  else
2031  rm -f ${MYPWD}/done.%(job_id)s
2032  result=running
2033  echo $result
2034  exit 0
2035  fi
2036  done
2037  echo $result
2038
2039  fi
2040
2041  """
2042              dico2 = {'cur_usr' : cur_usr, 'script': os.path.basename(prog),
2043                       'cwd': cwd, 'prog_dir': prog_dir,
2044                       'output_files': ' '.join(output_files), 'job_id': id,
2045                       'program': ' ' if '.py' in prog else 'bash'}
2046
2047              homePath = os.getenv("HOME")
2048              outPath = homePath +"/MG5"
2049
2050              new_prog2 = pjoin(outPath, temp_file_name2)
2051              open(new_prog2, 'w').write(text2 % dico2)
2052              misc.Popen(['chmod','+x',new_prog2],cwd=cwd)
2053
2054
2055              self.submitted += 1
2056              self.submitted_ids.append(id)
2057
2058          elif 'combine' in prog or 'shower' in prog or 'pythia' in prog:
2059              if '/dev/null' in stdout :
2060                  stdout=''
2061
2062              temp_file_shower = "sub.out"
2063              text_shower = """#!/bin/bash
2064  MYPWD=%(cwd)s
2065  result=done
2066  output_files=(%(output_files)s)
2067  for i in ${output_files[@]}; do
2068  if [ -e $MYPWD/$i -o -e $i ]; then
2069  result=done
2070  else
2071  result=running
2072  echo $result
2073  exit 0
2074  fi
2075  done
2076  echo $result
2077  """
2078              dico_shower = { 'cwd': cwd, 'output_files': ' '.join([stdout]+output_files),
2079                              'program': ' ' if '.py' in prog else 'bash'}
2080              homePath = os.getenv("HOME")
2081              outPath = homePath +"/MG5"
2082              new_prog_shower = pjoin(outPath, temp_file_shower)
2083              open(new_prog_shower, 'w').write(text_shower % dico_shower)
2084              misc.Popen(['chmod','+x',new_prog_shower],cwd=cwd)
2085
2086              id='-1'
2087              self.submitted += 1
2088              self.submitted_ids.append(id)
2089
2090          else :
2091              id='-2'
2092              self.submitted += 1
2093              self.submitted_ids.append(id)
2094
2095          return id
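The wrapper scripts written by submit2 are produced with plain %-substitution of a dictionary into a bash template, as done for text, text2 and text_shower above. A stripped-down sketch, for illustration only; the directory, script name and argument values are hypothetical:

# Illustration only, not part of the module.
text = """#!/bin/bash
cd %(cwd)s
/bin/bash %(script)s %(arguments)s > %(stdout)s
"""
dico = {'cwd': '/tmp/run_01', 'script': 'ajob1',
        'arguments': '1 born', 'stdout': '/dev/null'}
open('/tmp/run_01/sub.ajob1', 'w').write(text % dico)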
2096 2097 @multiple_try(nb_try=10, sleep=10)
2098 - def control_one_job(self, id):
2099 """ control the status of a single job with it's cluster id """ 2100 2101 homePath = os.getenv("HOME") 2102 outPath = homePath +"/MG5" 2103 2104 2105 if id == '0' or id=='-2' : 2106 status_out ='done' 2107 elif id == '-1' : 2108 cmd='/bin/bash ' +outPath+'/sub.out' 2109 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE) 2110 status_out=status.stdout.read().strip() 2111 print "["+id+"]" + status_out 2112 if status_out == 'waiting': 2113 status_out='wait' 2114 elif status_out == 'preparing' or status_out == 'running': 2115 status_out = 'R' 2116 elif status_out != 'done': 2117 status_out = 'F' 2118 elif status_out == 'done': 2119 status_out = 'C' 2120 2121 print "["+id+"]" + status_out 2122 else : 2123 cmd = 'htcaas-job-status -m '+str(id)+ " -s | grep Status " 2124 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 2125 stderr=subprocess.PIPE) 2126 error = status.stderr.read() 2127 if status.returncode or error: 2128 raise ClusterManagmentError, 'htcaas-job-submit returns error: %s' % error 2129 status_out= status.stdout.read().strip() 2130 status_out= status_out.split(":",1)[1] 2131 print "["+id+"]" + status_out 2132 if status_out == 'waiting': 2133 status_out='wait' 2134 elif status_out == 'preparing' or status_out == 'running': 2135 status_out = 'R' 2136 elif status_out == 'failed' : 2137 args = self.retry_args[id] 2138 id_temp = self.submit2(**args) 2139 del self.retry_args[id] 2140 self.submitted_ids.remove(id) 2141 status_out = 'I' 2142 elif status_out != 'done': 2143 status_out = 'F' 2144 elif status_out == 'done': 2145 status_out = 'C' 2146 2147 return status_out
2148 2149 2150 @check_interupt() 2151 @multiple_try(nb_try=15, sleep=10)
2152 - def control(self, me_dir):
2153 """ control the status of a single job with it's cluster id """ 2154 2155 if not self.submitted_ids: 2156 return 0, 0, 0, 0 2157 2158 ongoing = [] 2159 idle, run, fail = 0, 0, 0 2160 2161 homePath = os.getenv("HOME") 2162 outPath = homePath +"/MG5" 2163 2164 for i in range(len(self.submitted_ids)): 2165 ongoing.append(self.submitted_ids[i]) 2166 if self.submitted_ids[i] == '-2' : 2167 return 0,0,0,0 2168 if self.submitted_ids[i] == '0' : 2169 # ongoing.append('0') 2170 status_out='done' 2171 elif self.submitted_ids[i] == '-1' : 2172 cmd='/bin/bash ' +outPath+'/sub.out' 2173 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE) 2174 status_out=status.stdout.read().strip() 2175 if status_out == 'waiting': 2176 idle += 1 2177 elif status_out == 'preparing': 2178 run += 1 2179 elif status_out == 'running': 2180 run += 1 2181 elif status_out != 'done': 2182 fail += 1 2183 else : 2184 args = self.retry_args[str(self.submitted_ids[i])] 2185 if 'required_output'in args and not args['required_output']: 2186 args['required_output'] = args['output_files'] 2187 self.retry_args[str(self.submitted_ids[i])] = args 2188 2189 cmd = "htcaas-job-status -m " + self.submitted_ids[i] + " -s | grep Status " 2190 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 2191 status_out= status.stdout.read().strip() 2192 status_out= status_out.split(":",1)[1] 2193 if status_out == 'waiting': 2194 idle += 1 2195 elif status_out == 'preparing': 2196 run += 1 2197 elif status_out == 'running': 2198 run += 1 2199 elif status_out == 'failed' or status_out == 'canceled': 2200 id = self.submit2(**args) 2201 #self.submitted_ids[i]=id 2202 del self.retry_args[self.submitted_ids[i]] 2203 self.submitted_ids.remove(self.submitted_ids[i]) 2204 self.submitted-=1 2205 idle += 1 2206 elif status_out != 'done': 2207 fail += 1 2208 if status_out == 'done': 2209 cmd2='/bin/bash '+ outPath+'/sub.'+self.submitted_ids[i] 2210 status2 = misc.Popen([cmd2], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE) 2211 aa= status2.stdout.read().strip() 2212 #result= self.check_termination(str(self.submitted_ids[i])) 2213 #print result 2214 #if not result : 2215 #if not self.check_termination(str(self.submitted_ids[i])): 2216 # print "not_self" + self.submitted_ids[i] 2217 # idle += 1 2218 #else : 2219 for path in args['required_output']: 2220 if args['cwd']: 2221 path = pjoin(args['cwd'], path) 2222 # check that file exists and is not empty. 2223 temp1=os.path.exists(path) 2224 temp2=os.stat(path).st_size 2225 if not (os.path.exists(path) and os.stat(path).st_size != 0) : 2226 status2 = misc.Popen([cmd2], shell=True, stdout=subprocess.PIPE,stderr=subprocess.PIPE) 2227 aa= status2.stdout.read().strip() 2228 if aa == 'done': 2229 self.submitted_ids[i] = '0' 2230 elif aa == 'running': 2231 run += 1 2232 else : 2233 self.submitted_ids[i]='0' 2234 2235 2236 for i in range(len(self.submitted_ids)): 2237 if str(self.submitted_ids[i]) not in ongoing: 2238 status2= self.check_termination(str(self.submitted_ids[i])) 2239 if status2 == 'wait': 2240 run += 1 2241 elif status2 == 'resubmit': 2242 idle += 1 2243 2244 return idle, run, self.submitted - (idle+run+fail), fail
2245 2246 @multiple_try()
2247 - def remove(self, *args, **opts):
2248 """Clean the jobson the cluster""" 2249 2250 if not self.submitted_ids: 2251 return 2252 for i in range(len(self.submitted_ids)): 2253 cmd = "htcaas-job-cancel -m %s" % ' '.join(self.submitted_ids[i]) 2254 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
2255 2256 2257 from_name = {'condor':CondorCluster, 'pbs': PBSCluster, 'sge': SGECluster, 2258 'lsf': LSFCluster, 'ge':GECluster, 'slurm': SLURMCluster, 2259 'htcaas':HTCaaSCluster, 'htcaas2':HTCaaS2Cluster} 2260
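The from_name table is what lets the rest of the code turn a cluster-type configuration string into a scheduler back-end. A hedged usage sketch; the option values are made up and mirror the keys read in Cluster.__init__:

# Illustration only, not part of the module.
cluster_cls = from_name['htcaas2']
cluster = cluster_cls(cluster_queue='madgraph', cluster_nb_retry=3,
                      cluster_retry_wait=300)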