
Source Code for Module madgraph.various.cluster

   1  ################################################################################ 
   2  # Copyright (c) 2009 The MadGraph5_aMC@NLO Development team and Contributors              
   3  # 
   4  # This file is a part of the MadGraph5_aMC@NLO project, an application which            
   5  # automatically generates Feynman diagrams and matrix elements for arbitrary     
   6  # high-energy processes in the Standard Model and beyond.                        
   7  # 
   8  # It is subject to the MadGraph5_aMC@NLO license which should accompany this              
   9  # distribution.                                                                  
  10  #                                                                                
  11  # For more information, visit madgraph.phys.ucl.ac.be and amcatnlo.web.cern.ch             
  12  #                                                                                
  13  ################################################################################ 
  14  import subprocess 
  15  import logging 
  16  import os 
  17  import time 
  18  import re 
  19  import glob 
  20  import inspect 
  21  import sys 
  22   
  23  logger = logging.getLogger('madgraph.cluster')  
  24   
  25  try: 
  26      from madgraph import MadGraph5Error 
  27      import madgraph.various.misc as misc 
  28  except Exception, error: 
  29      if __debug__: 
  30          print  str(error) 
  31      from internal import MadGraph5Error 
  32      import internal.misc as misc 
  33   
  34  pjoin = os.path.join 
  35   
  36  class ClusterManagmentError(MadGraph5Error): 
  37      pass 
  38   
  39  class NotImplemented(MadGraph5Error): 
  40      pass 
  41   
  42   
  43  multiple_try = misc.multiple_try 
  44  pjoin = os.path.join 
  45   
  46   
  47  def check_interupt(error=KeyboardInterrupt): 
  48   
  49      def deco_interupt(f): 
  50          def deco_f_interupt(self, *args, **opt): 
  51              try: 
  52                  return f(self, *args, **opt) 
  53              except error: 
  54                  try: 
  55                      self.remove(*args, **opt) 
  56                  except Exception: 
  57                      pass 
  58                  raise error 
  59          return deco_f_interupt 
  60      return deco_interupt 
  61   
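A minimal sketch of what this decorator provides: if the wrapped method is interrupted, the cluster's remove() is called before the interrupt propagates. The DemoCluster class below is purely hypothetical:

      class DemoCluster(object):
          def remove(self, *args, **opts):
              print 'cleaning up submitted jobs'

          @check_interupt()
          def wait_for_jobs(self):
              raise KeyboardInterrupt   # stands in for a ctrl-C during the wait loop

      # DemoCluster().wait_for_jobs() prints the clean-up message and then
      # re-raises KeyboardInterrupt.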
  62  def store_input(arg=''): 
  63   
  64      def deco_store(f): 
  65          def deco_f_store(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 
  66                           input_files=[], output_files=[], required_output=[], nb_submit=0): 
  67              frame = inspect.currentframe() 
  68              args, _, _, values = inspect.getargvalues(frame) 
  69              args = dict([(i, values[i]) for i in args if i != 'self']) 
  70              id = f(self, **args) 
  71              if self.nb_retry > 0: 
  72                  self.retry_args[id] = args 
  73              return id 
  74          return deco_f_store 
  75      return deco_store 
  76   
  77  def need_transfer(options): 
  78      """ This function checks whether transfer of the input files is necessary 
  79      for the given running options. """ 
  80   
  81      if options['run_mode'] != 1 and options['cluster_temp_path'] is None: 
  82          return False 
  83      else: 
  84          return True 
  85   
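A small illustration of the rule encoded above, using hypothetical options dictionaries (in the MadGraph options, run_mode 1 corresponds to cluster running):

      # multicore run with no node-local scratch directory: nothing to ship
      assert need_transfer({'run_mode': 2, 'cluster_temp_path': None}) is False
      # cluster run, or a run with a node-local temp path: files must be transferred
      assert need_transfer({'run_mode': 1, 'cluster_temp_path': None}) is True
      assert need_transfer({'run_mode': 2, 'cluster_temp_path': '/scratch'}) is True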
  86  class Cluster(object): 
  87      """Basic Class for all cluster type submission""" 
  88      name = 'mother class' 
  89      identifier_length = 14 
  90   
91 - def __init__(self,*args, **opts):
92 """Init the cluster""" 93 94 self.submitted = 0 95 self.submitted_ids = [] 96 self.finish = 0 97 self.submitted_dirs = [] #HTCaaS 98 self.submitted_exes = [] #HTCaaS 99 self.submitted_args = [] #HTCaaS 100 101 if 'cluster_queue' in opts: 102 self.cluster_queue = opts['cluster_queue'] 103 else: 104 self.cluster_queue = 'madgraph' 105 if 'cluster_temp_path' in opts: 106 self.temp_dir = opts['cluster_temp_path'] 107 else: 108 self.temp_dir = None 109 self.options = {'cluster_status_update': (600, 30)} 110 for key,value in opts.items(): 111 self.options[key] = value 112 self.nb_retry = opts['cluster_nb_retry'] if 'cluster_nb_retry' in opts else 0 113 self.cluster_retry_wait = float(opts['cluster_retry_wait']) if 'cluster_retry_wait' in opts else 300 114 self.options = dict(opts) 115 self.retry_args = {} 116 # controlling jobs in controlled type submision 117 self.packet = {} 118 self.id_to_packet = {}
119
 120      def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 
 121                 log=None, required_output=[], nb_submit=0): 
 122          """How to make one submission. Return status id on the cluster.""" 
 123          raise NotImplemented, 'No implementation of how to submit a job to cluster \'%s\'' % self.name 
124 125 126 @store_input()
127 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 128 log=None, input_files=[], output_files=[], required_output=[], 129 nb_submit=0):
130 """How to make one submission. Return status id on the cluster. 131 NO SHARE DISK""" 132 133 if cwd is None: 134 cwd = os.getcwd() 135 if not os.path.exists(prog): 136 prog = os.path.join(cwd, prog) 137 138 if not required_output and output_files: 139 required_output = output_files 140 141 if not hasattr(self, 'temp_dir') or not self.temp_dir or \ 142 (input_files == [] == output_files): 143 return self.submit(prog, argument, cwd, stdout, stderr, log, 144 required_output=required_output, nb_submit=nb_submit) 145 146 if not input_files and not output_files: 147 # not input/output so not using submit2 148 return self.submit(prog, argument, cwd, stdout, stderr, log, 149 required_output=required_output, nb_submit=nb_submit) 150 151 if cwd is None: 152 cwd = os.getcwd() 153 if not os.path.exists(prog): 154 prog = os.path.join(cwd, prog) 155 temp_file_name = "sub." + os.path.basename(prog) + '.'.join(argument) 156 157 text = """#!/bin/bash 158 MYTMP=%(tmpdir)s/run$%(job_id)s 159 MYPWD=%(cwd)s 160 mkdir -p $MYTMP 161 cd $MYPWD 162 input_files=( %(input_files)s ) 163 for i in ${input_files[@]} 164 do 165 cp -R -L $i $MYTMP 166 done 167 cd $MYTMP 168 echo '%(arguments)s' > arguments 169 chmod +x ./%(script)s 170 %(program)s ./%(script)s %(arguments)s 171 exit=$? 172 output_files=( %(output_files)s ) 173 for i in ${output_files[@]} 174 do 175 cp -r $MYTMP/$i $MYPWD 176 done 177 # if [ "$exit" -eq "0" ] 178 # then 179 rm -rf $MYTMP 180 # fi 181 """ 182 183 dico = {'tmpdir' : self.temp_dir, 'script': os.path.basename(prog), 184 'cwd': cwd, 'job_id': self.job_id, 185 'input_files': ' '.join(input_files + [prog]), 186 'output_files': ' '.join(output_files), 187 'arguments': ' '.join([str(a) for a in argument]), 188 'program': ' ' if '.py' in prog else 'bash'} 189 190 # writing a new script for the submission 191 new_prog = pjoin(cwd, temp_file_name) 192 open(new_prog, 'w').write(text % dico) 193 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 194 195 return self.submit(new_prog, argument, cwd, stdout, stderr, log, 196 required_output=required_output, nb_submit=nb_submit)
197 198
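A hedged sketch of how submit2 is typically driven when the worker nodes do not share a disk; the option values, directories and file names below are hypothetical (in real use the full MadGraph options dictionary is passed to the constructor):

      opts = {'cluster_type': 'pbs', 'cluster_queue': 'madgraph',
              'cluster_temp_path': '/scratch', 'cluster_nb_retry': 1,
              'cluster_status_update': (600, 30)}
      pbs = PBSCluster(**opts)
      job_id = pbs.submit2('ajob1', argument=['0'],
                           cwd='/home/user/PROC/SubProcesses/P0_gg_ttx',
                           stdout='log.txt',
                           input_files=['madevent', 'input_app.txt'],
                           output_files=['results.dat'])
      # Since input/output files and a temp path are given, submit2 writes a
      # bash wrapper in cwd that copies the inputs to /scratch/run$PBS_JOBID,
      # runs the job there, copies results.dat back to cwd and removes the
      # scratch area; that wrapper is what the back-end submit() receives.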
 199      def cluster_submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 
 200                         log=None, input_files=[], output_files=[], required_output=[], 
 201                         nb_submit=0, packet_member=None): 
 202          """This function wraps the cluster submission in a cluster-independent 
 203          way. It should not be overwritten (except for DAG-type submission).""" 
 204   
 205          id = self.submit2(prog, argument, cwd, stdout, stderr, log, input_files, 
 206                            output_files, required_output, nb_submit) 
 207   
 208   
 209          if not packet_member: 
 210              return id 
 211          else: 
 212              if isinstance(packet_member, Packet): 
 213                  self.id_to_packet[id] = packet_member 
 214                  packet_member.put(id) 
 215                  if packet_member.tag not in self.packet: 
 216                      self.packet[packet_member.tag] = packet_member 
 217              else: 
 218                  if packet_member in self.packet: 
 219                      packet = self.packet[packet_member] 
 220                      packet.put(id) 
 221                      self.id_to_packet[id] = packet 
 222              return id 
223
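A minimal sketch of the packet mechanism handled above: several jobs submitted against the same Packet, whose callback runs once the last of them has terminated (the callback, channel names and paths are hypothetical, reusing the pbs instance from the previous sketch):

      def combine_results(proc_dir):
          logger.info('all jobs for %s are done' % proc_dir)

      packet = Packet('P0_gg_ttx', combine_results,
                      ('/home/user/PROC/SubProcesses/P0_gg_ttx',))
      for channel in ['G1', 'G2', 'G3']:
          pbs.cluster_submit('ajob1', argument=[channel],
                             cwd='/home/user/PROC/SubProcesses/P0_gg_ttx',
                             packet_member=packet)
      # check_termination() decrements the packet each time one of the three
      # jobs finishes and calls combine_results(...) after the last one.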
 224      def control(self, me_dir=None): 
 225          """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)""" 
 226          if not self.submitted_ids: 
 227              raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name 
 228          idle, run, fail = 0, 0, 0 
 229          for pid in self.submitted_ids[:]: 
 230              status = self.control_one_job(pid) 
 231              if status == 'I': 
 232                  idle += 1 
 233              elif status == 'R': 
 234                  run += 1 
 235              elif status == 'F': 
 236                  self.finish += 1 
 237                  self.submitted_ids.remove(pid) 
 238              else: 
 239                  fail += 1 
 240   
 241          return idle, run, self.finish, fail 
242
 243      def control_one_job(self, pid): 
 244          """ control the status of a single job with its cluster id """ 
 245          raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name 
246
247 - def get_jobs_identifier(self, path, second_path=None):
248 """get a unique run_name for all the jobs helps to identify the runs 249 in the controller for some cluster.""" 250 251 if second_path: 252 path = os.path.realpath(pjoin(path, second_path)) 253 elif not os.path.exists(path): 254 return path # job already done 255 256 if 'SubProcesses' in path: 257 target = path.rsplit('/SubProcesses',1)[0] 258 elif 'MCatNLO' in path: 259 target = path.rsplit('/MCatNLO',1)[0] 260 elif 'PY8_parallelization' in path: 261 target = path.rsplit('/PY8_parallelization',1)[0] 262 elif second_path: 263 target=path 264 logger.warning("cluster.get_job_identifier runs unexpectedly. This should be fine but report this message if you have problem.") 265 else: 266 target = path 267 268 if target.endswith('/'): 269 target = target[:-1] 270 271 target = misc.digest(target)[-self.identifier_length:] 272 if not target[0].isalpha(): 273 target = 'a' + target[1:] 274 275 return target
276 277 278 @check_interupt()
279 - def wait(self, me_dir, fct, minimal_job=0, update_first=None):
280 """Wait that all job are finish. 281 if minimal_job set, then return if idle + run is lower than that number""" 282 283 284 mode = 1 # 0 is long waiting/ 1 is short waiting 285 nb_iter = 0 286 nb_short = 0 287 change_at = 5 # number of iteration from which we wait longer between update. 288 289 if update_first: 290 idle, run, finish, fail = self.control(me_dir) 291 update_first(idle, run, finish) 292 293 #usefull shortcut for readibility 294 longtime, shorttime = self.options['cluster_status_update'] 295 296 nb_job = 0 297 298 if self.options['cluster_type'] == 'htcaas2': 299 me_dir = self.metasubmit(self) 300 301 while 1: 302 old_mode = mode 303 nb_iter += 1 304 idle, run, finish, fail = self.control(me_dir) 305 if nb_job: 306 if idle + run + finish + fail != nb_job: 307 nb_job = idle + run + finish + fail 308 nb_iter = 1 # since some packet finish prevent to pass in long waiting mode 309 else: 310 nb_job = idle + run + finish + fail 311 if fail: 312 raise ClusterManagmentError('Some Jobs are in a Hold/... state. Please try to investigate or contact the IT team') 313 if idle + run == 0: 314 #time.sleep(20) #security to ensure that the file are really written on the disk 315 logger.info('All jobs finished') 316 fct(idle, run, finish) 317 break 318 if idle + run < minimal_job: 319 return 320 fct(idle, run, finish) 321 #Determine how much we have to wait (mode=0->long time, mode=1->short time) 322 if nb_iter < change_at: 323 mode = 1 324 elif idle < run: 325 if old_mode == 0: 326 if nb_short: 327 mode = 0 #we already be back from short to long so stay in long 328 #check if we need to go back to short mode 329 elif idle: 330 if nb_iter > change_at + int(longtime)//shorttime: 331 mode = 0 #stay in long waiting mode 332 else: 333 mode = 1 # pass in short waiting mode 334 nb_short =0 335 else: 336 mode = 1 # pass in short waiting mode 337 nb_short = 0 338 elif old_mode == 1: 339 nb_short +=1 340 if nb_short > 3* max(change_at, int(longtime)//shorttime): 341 mode = 0 #go back in slow waiting 342 else: 343 mode = 0 344 345 #if pass from fast(mode=1) to slow(mode=0) make a print statement: 346 if old_mode > mode: 347 logger.info('''Start to wait %ss between checking status. 348 Note that you can change this time in the configuration file. 349 Press ctrl-C to force the update.''' % self.options['cluster_status_update'][0]) 350 351 #now Waiting! 352 if mode == 0: 353 try: 354 time.sleep(self.options['cluster_status_update'][0]) 355 except KeyboardInterrupt: 356 logger.info('start to update the status') 357 nb_iter = min(0, change_at -2) 358 nb_short = 0 359 else: 360 time.sleep(self.options['cluster_status_update'][1]) 361 362 363 self.submitted = 0 364 self.submitted_ids = []
365
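The callbacks expected by wait() simply receive the (idle, running, finished) counters; a hedged sketch, continuing the hypothetical pbs instance from the earlier sketches:

      def print_status(idle, run, finish):
          logger.info('idle: %s  running: %s  done: %s' % (idle, run, finish))

      # polls control() until no job is idle or running, alternating between the
      # short and long intervals stored in options['cluster_status_update']
      pbs.wait('/home/user/PROC', print_status, update_first=print_status)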
366 - def check_termination(self, job_id):
367 """Check the termination of the jobs with job_id and relaunch it if needed.""" 368 369 370 if job_id not in self.retry_args: 371 if job_id in self.id_to_packet: 372 nb_in_packet = self.id_to_packet[job_id].remove_one() 373 if nb_in_packet == 0: 374 # packet done run the associate function 375 packet = self.id_to_packet[job_id] 376 # fully ensure that the packet is finished (thread safe) 377 packet.queue.join() 378 #running the function 379 packet.fct(*packet.args) 380 del self.id_to_packet[job_id] 381 return 'resubmit' 382 else: 383 return True 384 385 args = self.retry_args[job_id] 386 if 'time_check' in args: 387 time_check = args['time_check'] 388 else: 389 time_check = 0 390 391 for path in args['required_output']: 392 if args['cwd']: 393 path = pjoin(args['cwd'], path) 394 # check that file exists and is not empty. 395 if not (os.path.exists(path) and os.stat(path).st_size != 0) : 396 break 397 else: 398 # all requested output are present 399 if time_check > 0: 400 logger.info('Job %s Finally found the missing output.' % (job_id)) 401 del self.retry_args[job_id] 402 self.submitted_ids.remove(job_id) 403 # check if the job_id is in a packet 404 if job_id in self.id_to_packet: 405 nb_in_packet = self.id_to_packet[job_id].remove_one() 406 if nb_in_packet == 0: 407 # packet done run the associate function 408 packet = self.id_to_packet[job_id] 409 # fully ensure that the packet is finished (thread safe) 410 packet.queue.join() 411 #running the function 412 packet.fct(*packet.args) 413 del self.id_to_packet[job_id] 414 return 'resubmit' 415 416 return 'done' 417 418 if time_check == 0: 419 logger.debug('''Job %s: missing output:%s''' % (job_id,path)) 420 args['time_check'] = time.time() 421 return 'wait' 422 elif self.cluster_retry_wait > time.time() - time_check: 423 return 'wait' 424 425 #jobs failed to be completed even after waiting time!! 426 if self.nb_retry < 0: 427 logger.critical('''Fail to run correctly job %s. 428 with option: %s 429 file missing: %s''' % (job_id, args, path)) 430 raw_input('press enter to continue.') 431 elif self.nb_retry == 0: 432 logger.critical('''Fail to run correctly job %s. 433 with option: %s 434 file missing: %s. 435 Stopping all runs.''' % (job_id, args, path)) 436 self.remove() 437 elif args['nb_submit'] >= self.nb_retry: 438 logger.critical('''Fail to run correctly job %s. 439 with option: %s 440 file missing: %s 441 Fails %s times 442 No resubmition. ''' % (job_id, args, path, args['nb_submit'])) 443 self.remove() 444 else: 445 args['nb_submit'] += 1 446 logger.warning('resubmit job (for the %s times)' % args['nb_submit']) 447 del self.retry_args[job_id] 448 self.submitted_ids.remove(job_id) 449 if 'time_check' in args: 450 del args['time_check'] 451 if job_id in self.id_to_packet: 452 self.id_to_packet[job_id].remove_one() 453 args['packet_member'] = self.id_to_packet[job_id] 454 del self.id_to_packet[job_id] 455 self.cluster_submit(**args) 456 else: 457 self.submit2(**args) 458 return 'resubmit' 459 return 'done'
460 461 @check_interupt()
462 - def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None, 463 stderr=None, log=None, required_output=[], nb_submit=0, 464 input_files=[], output_files=[]):
465 """launch one job on the cluster and wait for it""" 466 467 special_output = False # tag for concatenate the error with the output. 468 if stderr == -2 and stdout: 469 #We are suppose to send the output to stdout 470 special_output = True 471 stderr = stdout + '.err' 472 473 id = self.submit2(prog, argument, cwd, stdout, stderr, log, 474 required_output=required_output, input_files=input_files, 475 output_files=output_files) 476 477 if self.options['cluster_type']=='htcaas2': 478 if self.submitted == self.submitted_ids[-1]: 479 id = self.metasubmit(self) 480 481 frame = inspect.currentframe() 482 args, _, _, values = inspect.getargvalues(frame) 483 args = dict([(i, values[i]) for i in args if i != 'self']) 484 self.retry_args[id] = args 485 486 nb_wait=0 487 while 1: 488 nb_wait+=1 489 status = self.control_one_job(id) 490 if not status in ['R','I']: 491 status = self.check_termination(id) 492 if status in ['wait']: 493 time.sleep(30) 494 continue 495 elif status in ['resubmit']: 496 id = self.submitted_ids[0] 497 time.sleep(30) 498 continue 499 #really stop! 500 time.sleep(30) #security to ensure that the file are really written on the disk 501 break 502 time.sleep(self.options['cluster_status_update'][1]) 503 504 if required_output: 505 status = self.check_termination(id) 506 if status == 'wait': 507 run += 1 508 elif status == 'resubmit': 509 idle += 1 510 511 512 if special_output: 513 # combine the stdout and the stderr 514 #wait up to 50 s to see if those files exists 515 for i in range(5): 516 if os.path.exists(stdout): 517 if not os.path.exists(stderr): 518 time.sleep(5) 519 if os.path.exists(stderr): 520 err_text = open(stderr).read() 521 if not err_text: 522 return 523 logger.warning(err_text) 524 text = open(stdout).read() 525 open(stdout,'w').write(text + err_text) 526 else: 527 return 528 time.sleep(10)
529
 530      def remove(self, *args, **opts): 
 531          """Default, do-nothing job removal.""" 
 532          logger.warning("""This cluster does not support job removal, 
 533          the jobs are still running on the cluster.""") 
 534   
 535      @store_input() 
 536      def metasubmit(self, me_dir): 
 537          logger.warning("""This cluster does not support metajob submission.""") 
 538          return 0 
539
 540      def modify_interface(self, run_interface): 
 541          """routine which allows one to modify the run_card/mg5cmd object to change the 
 542          default behavior of the runs. 
 543          This is called at the time of the compilation of the run_card. 
 544          Note that this function can be called multiple times per run. 
 545          """ 
 546          #run_card = run_interface.run_card 
 547          return 
548
 549  class Packet(object): 
 550      """ an object for handling a packet of jobs; it is designed to be thread safe 
 551      """ 
 552   
 553      def __init__(self, name, fct, args, opts={}): 
 554          import Queue 
 555          import threading 
 556          self.queue = Queue.Queue() 
 557          self.tag = name 
 558          self.fct = fct 
 559          self.args = args 
 560          self.opts = opts 
 561          self.done = threading.Event() 
562
 563      def put(self, *args, **opts): 
 564          self.queue.put(*args, **opts) 
 565   
 566      append = put 
 567   
 568      def remove_one(self): 
 569          self.queue.get(True) 
 570          self.queue.task_done() 
 571          return self.queue.qsize() 
572
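A small illustration of the bookkeeping above: put() registers a pending job, remove_one() retires one and reports how many remain, and queue.join() returns once every entry has been retired:

      p = Packet('demo', fct=lambda tag: None, args=('demo',))
      p.put(1); p.put(2); p.put(3)
      print p.remove_one()   # 2 jobs of the packet still pending
      print p.remove_one()   # 1
      print p.remove_one()   # 0 -> the caller now runs p.fct(*p.args)
      p.queue.join()         # returns at once, every entry was task_done()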
573 -class MultiCore(Cluster):
574 """class for dealing with the submission in multiple node""" 575 576 job_id = "$" 577
578 - def __init__(self, *args, **opt):
579 """Init the cluster """ 580 581 582 super(MultiCore, self).__init__(self, *args, **opt) 583 584 import Queue 585 import threading 586 import thread 587 self.queue = Queue.Queue() # list of job to do 588 self.done = Queue.Queue() # list of job finisned 589 self.submitted = Queue.Queue() # one entry by job submitted 590 self.stoprequest = threading.Event() #flag to ensure everything to close 591 self.demons = [] 592 self.nb_done =0 593 if 'nb_core' in opt: 594 self.nb_core = opt['nb_core'] 595 elif isinstance(args[0],int): 596 self.nb_core = args[0] 597 else: 598 self.nb_core = 1 599 self.update_fct = None 600 601 self.lock = threading.Event() # allow nice lock of the main thread 602 self.pids = Queue.Queue() # allow to clean jobs submit via subprocess 603 self.done_pid = [] # list of job finisned 604 self.done_pid_queue = Queue.Queue() 605 self.fail_msg = None 606 607 # starting the worker node 608 for _ in range(self.nb_core): 609 self.start_demon()
610 611
612 - def start_demon(self):
613 import threading 614 t = threading.Thread(target=self.worker) 615 t.daemon = True 616 t.start() 617 self.demons.append(t)
618 619
620 - def worker(self):
621 import Queue 622 import thread 623 while not self.stoprequest.isSet(): 624 try: 625 args = self.queue.get() 626 tag, exe, arg, opt = args 627 try: 628 # check for executable case 629 if isinstance(exe,str): 630 if os.path.exists(exe) and not exe.startswith('/'): 631 exe = './' + exe 632 if isinstance(opt['stdout'],str): 633 opt['stdout'] = open(opt['stdout'],'w') 634 if opt['stderr'] == None: 635 opt['stderr'] = subprocess.STDOUT 636 proc = misc.Popen([exe] + arg, **opt) 637 pid = proc.pid 638 self.pids.put(pid) 639 proc.wait() 640 if proc.returncode not in [0, 143, -15] and not self.stoprequest.isSet(): 641 fail_msg = 'program %s launch ends with non zero status: %s. Stop all computation' % \ 642 (' '.join([exe]+arg), proc.returncode) 643 logger.warning(fail_msg) 644 self.stoprequest.set() 645 self.remove(fail_msg) 646 # handle the case when this is a python function. Note that 647 # this use Thread so they are NO built-in parralelization this is 648 # going to work on a single core! (but this is fine for IO intensive 649 # function. for CPU intensive fct this will slow down the computation 650 else: 651 pid = tag 652 self.pids.put(pid) 653 # the function should return 0 if everything is fine 654 # the error message otherwise 655 returncode = exe(*arg, **opt) 656 if returncode != 0: 657 logger.warning("fct %s does not return 0. Stopping the code in a clean way. The error was:\n%s", exe, returncode) 658 self.stoprequest.set() 659 self.remove("fct %s does not return 0:\n %s" % (exe, returncode)) 660 except Exception,error: 661 self.fail_msg = sys.exc_info() 662 logger.warning(str(error)) 663 self.stoprequest.set() 664 self.remove(error) 665 666 if __debug__: 667 raise self.fail_msg[0], self.fail_msg[1],self.fail_msg[2] 668 669 self.queue.task_done() 670 self.done.put(tag) 671 self.done_pid_queue.put(pid) 672 #release the mother to print the status on the screen 673 try: 674 self.lock.set() 675 except thread.error: 676 continue 677 except Queue.Empty: 678 continue
679 680 681 682
683 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 684 log=None, required_output=[], nb_submit=0):
685 """submit a job on multicore machine""" 686 687 tag = (prog, tuple(argument), cwd, nb_submit) 688 if isinstance(prog, str): 689 690 opt = {'cwd': cwd, 691 'stdout':stdout, 692 'stderr': stderr} 693 self.queue.put((tag, prog, argument, opt)) 694 self.submitted.put(1) 695 return tag 696 else: 697 # python function 698 self.queue.put((tag, prog, argument, {})) 699 self.submitted.put(1) 700 return tag
701
 702      def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None, 
 703                          stderr=None, log=None, **opts): 
 704          """launch one job and wait for it""" 
 705          if isinstance(stdout, str): 
 706              stdout = open(stdout, 'w') 
 707          if isinstance(stderr, str): 
 708              stderr = open(stderr, 'w') 
 709          return misc.call([prog] + argument, stdout=stdout, stderr=stderr, cwd=cwd) 
710
711 - def remove(self, error=None):
712 """Ensure that all thread are killed""" 713 714 # ensure the worker to stop 715 self.stoprequest.set() 716 if error and not self.fail_msg: 717 self.fail_msg = error 718 719 # cleaning the queue done_pid_queue and move them to done_pid 720 while not self.done_pid_queue.empty(): 721 pid = self.done_pid_queue.get() 722 self.done_pid.append(pid) 723 # self.done_pid_queue.task_done() 724 725 while not self.pids.empty(): 726 pid = self.pids.get() 727 self.pids.task_done() 728 if isinstance(pid, tuple): 729 continue 730 if pid in self.done_pid: 731 continue 732 out = os.system('CPIDS=$(pgrep -P %(pid)s); kill -15 $CPIDS > /dev/null 2>&1' \ 733 % {'pid':pid} ) 734 out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid':pid} )
735 736
737 - def wait(self, me_dir, update_status, update_first=None):
738 """Waiting that all the jobs are done. This function also control that 739 the submission by packet are handle correctly (i.e. submit the function)""" 740 741 import Queue 742 import threading 743 744 try: # to catch KeyBoardInterupt to see which kind of error to display 745 last_status = (0, 0, 0) 746 sleep_time = 1 747 use_lock = True 748 first = True 749 while True: 750 force_one_more_loop = False # some security 751 752 # Loop over the job tagged as done to check if some packet of jobs 753 # are finished in case, put the associate function in the queue 754 while self.done.qsize(): 755 try: 756 tag = self.done.get(True, 1) 757 except Queue.Empty: 758 pass 759 else: 760 if self.id_to_packet and tuple(tag) in self.id_to_packet: 761 packet = self.id_to_packet[tuple(tag)] 762 remaining = packet.remove_one() 763 if remaining == 0: 764 # fully ensure that the packet is finished (thread safe) 765 packet.queue.join() 766 self.submit(packet.fct, packet.args) 767 force_one_more_loop = True 768 self.nb_done += 1 769 self.done.task_done() 770 771 # Get from the various queue the Idle/Done/Running information 772 # Those variable should be thread safe but approximate. 773 Idle = self.queue.qsize() 774 Done = self.nb_done + self.done.qsize() 775 Running = max(0, self.submitted.qsize() - Idle - Done) 776 777 if Idle + Running <= 0 and not force_one_more_loop: 778 update_status(Idle, Running, Done) 779 # Going the quit since everything is done 780 # Fully Ensure that everything is indeed done. 781 self.queue.join() 782 break 783 784 if (Idle, Running, Done) != last_status: 785 if first and update_first: 786 update_first(Idle, Running, Done) 787 first = False 788 else: 789 update_status(Idle, Running, Done) 790 last_status = (Idle, Running, Done) 791 792 # cleaning the queue done_pid_queue and move them to done_pid 793 while not self.done_pid_queue.empty(): 794 pid = self.done_pid_queue.get() 795 self.done_pid.append(pid) 796 self.done_pid_queue.task_done() 797 798 799 # Define how to wait for the next iteration 800 if use_lock: 801 # simply wait that a worker release the lock 802 use_lock = self.lock.wait(300) 803 self.lock.clear() 804 if not use_lock and Idle > 0: 805 use_lock = True 806 else: 807 # to be sure that we will never fully lock at the end pass to 808 # a simple time.sleep() 809 time.sleep(sleep_time) 810 sleep_time = min(sleep_time + 2, 180) 811 if update_first: 812 update_first(Idle, Running, Done) 813 814 if self.stoprequest.isSet(): 815 if isinstance(self.fail_msg, Exception): 816 raise self.fail_msg 817 elif isinstance(self.fail_msg, str): 818 raise Exception, self.fail_msg 819 else: 820 misc.sprint(self.fail_msg) 821 raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2] 822 # reset variable for next submission 823 try: 824 self.lock.clear() 825 except Exception: 826 pass 827 self.done = Queue.Queue() 828 self.done_pid = [] 829 self.done_pid_queue = Queue.Queue() 830 self.nb_done = 0 831 self.submitted = Queue.Queue() 832 self.pids = Queue.Queue() 833 self.stoprequest.clear() 834 835 except KeyboardInterrupt: 836 # if one of the node fails -> return that error 837 if isinstance(self.fail_msg, Exception): 838 raise self.fail_msg 839 elif isinstance(self.fail_msg, str): 840 raise Exception, self.fail_msg 841 elif self.fail_msg: 842 raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2] 843 # else return orignal error 844 raise
845
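A hedged sketch of local running with the class above; the executable, file paths and helper function are hypothetical:

      mc = MultiCore(nb_core=4)
      for i in range(8):
          mc.submit('/bin/echo', argument=['job %s' % i], stdout='/tmp/out_%s' % i)
      mc.wait(None, lambda idle, run, done: logger.info('%s %s %s' % (idle, run, done)))

      # python callables can be queued as well; they run in threads, so this is
      # meant for IO-bound tasks, and must return 0 (anything else is treated
      # as an error message and stops the run)
      def count_lines(path):
          logger.info('%s has %s lines' % (path, len(open(path).readlines())))
          return 0
      mc.submit(count_lines, argument=['/etc/hosts'])
      mc.wait(None, lambda idle, run, done: None)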
846 -class CondorCluster(Cluster):
847 """Basic class for dealing with cluster submission""" 848 849 name = 'condor' 850 job_id = 'CONDOR_ID' 851 852 853 854 @multiple_try()
855 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 856 required_output=[], nb_submit=0):
857 """Submit a job prog to a Condor cluster""" 858 859 text = """Executable = %(prog)s 860 output = %(stdout)s 861 error = %(stderr)s 862 log = %(log)s 863 %(argument)s 864 environment = CONDOR_ID=$(Cluster).$(Process) 865 Universe = vanilla 866 notification = Error 867 Initialdir = %(cwd)s 868 %(requirement)s 869 getenv=True 870 queue 1 871 """ 872 873 if self.cluster_queue not in ['None', None]: 874 requirement = 'Requirements = %s=?=True' % self.cluster_queue 875 else: 876 requirement = '' 877 878 if cwd is None: 879 cwd = os.getcwd() 880 if stdout is None: 881 stdout = '/dev/null' 882 if stderr is None: 883 stderr = '/dev/null' 884 if log is None: 885 log = '/dev/null' 886 if not os.path.exists(prog): 887 prog = os.path.join(cwd, prog) 888 if argument: 889 argument = 'Arguments = %s' % ' '.join(argument) 890 else: 891 argument = '' 892 893 894 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout, 895 'stderr': stderr,'log': log,'argument': argument, 896 'requirement': requirement} 897 898 #open('submit_condor','w').write(text % dico) 899 a = misc.Popen(['condor_submit'], stdout=subprocess.PIPE, 900 stdin=subprocess.PIPE) 901 output, _ = a.communicate(text % dico) 902 #output = a.stdout.read() 903 #Submitting job(s). 904 #Logging submit event(s). 905 #1 job(s) submitted to cluster 2253622. 906 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE) 907 try: 908 id = pat.search(output).groups()[0] 909 except: 910 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 911 % output 912 self.submitted += 1 913 self.submitted_ids.append(id) 914 return id
915 916 @store_input() 917 @multiple_try()
918 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 919 log=None, input_files=[], output_files=[], required_output=[], 920 nb_submit=0):
921 """Submit the job on the cluster NO SHARE DISK 922 input/output file should be give relative to cwd 923 """ 924 925 if not required_output and output_files: 926 required_output = output_files 927 928 if (input_files == [] == output_files): 929 return self.submit(prog, argument, cwd, stdout, stderr, log, 930 required_output=required_output, nb_submit=nb_submit) 931 932 text = """Executable = %(prog)s 933 output = %(stdout)s 934 error = %(stderr)s 935 log = %(log)s 936 %(argument)s 937 should_transfer_files = YES 938 when_to_transfer_output = ON_EXIT 939 transfer_input_files = %(input_files)s 940 %(output_files)s 941 Universe = vanilla 942 notification = Error 943 Initialdir = %(cwd)s 944 %(requirement)s 945 getenv=True 946 queue 1 947 """ 948 949 if self.cluster_queue not in ['None', None]: 950 requirement = 'Requirements = %s=?=True' % self.cluster_queue 951 else: 952 requirement = '' 953 954 if cwd is None: 955 cwd = os.getcwd() 956 if stdout is None: 957 stdout = '/dev/null' 958 if stderr is None: 959 stderr = '/dev/null' 960 if log is None: 961 log = '/dev/null' 962 if not os.path.exists(prog): 963 prog = os.path.join(cwd, prog) 964 if argument: 965 argument = 'Arguments = %s' % ' '.join([str(a) for a in argument]) 966 else: 967 argument = '' 968 # input/output file treatment 969 if input_files: 970 input_files = ','.join(input_files) 971 else: 972 input_files = '' 973 if output_files: 974 output_files = 'transfer_output_files = %s' % ','.join(output_files) 975 else: 976 output_files = '' 977 978 979 980 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout, 981 'stderr': stderr,'log': log,'argument': argument, 982 'requirement': requirement, 'input_files':input_files, 983 'output_files':output_files} 984 985 #open('submit_condor','w').write(text % dico) 986 a = subprocess.Popen(['condor_submit'], stdout=subprocess.PIPE, 987 stdin=subprocess.PIPE) 988 output, _ = a.communicate(text % dico) 989 #output = a.stdout.read() 990 #Submitting job(s). 991 #Logging submit event(s). 992 #1 job(s) submitted to cluster 2253622. 993 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE) 994 try: 995 id = pat.search(output).groups()[0] 996 except: 997 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 998 % output 999 self.submitted += 1 1000 self.submitted_ids.append(id) 1001 return id
1002 1003 1004 1005 1006 1007 @multiple_try(nb_try=10, sleep=10)
1008 - def control_one_job(self, id):
1009 """ control the status of a single job with it's cluster id """ 1010 cmd = 'condor_q '+str(id)+" -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'" 1011 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1012 stderr=subprocess.PIPE) 1013 1014 error = status.stderr.read() 1015 if status.returncode or error: 1016 raise ClusterManagmentError, 'condor_q returns error: %s' % error 1017 1018 return status.stdout.readline().strip()
1019 1020 jobstatus = {'0':'U', '1':'I','2':'R','3':'X','4':'C','5':'H','6':'E'} 1021 @check_interupt() 1022 @multiple_try(nb_try=10, sleep=10)
1023 - def control(self, me_dir):
1024 """ control the status of a single job with it's cluster id """ 1025 1026 if not self.submitted_ids: 1027 return 0, 0, 0, 0 1028 1029 packet = 15000 1030 idle, run, fail = 0, 0, 0 1031 ongoing = [] 1032 for i in range(1+(len(self.submitted_ids)-1)//packet): 1033 start = i * packet 1034 stop = (i+1) * packet 1035 cmd = "condor_q " + ' '.join(self.submitted_ids[start:stop]) + \ 1036 " -format \"%d \" ClusterId " + \ 1037 " -format \"%d\\n\" JobStatus " 1038 1039 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1040 stderr=subprocess.PIPE) 1041 error = status.stderr.read() 1042 if status.returncode or error: 1043 raise ClusterManagmentError, 'condor_q returns error: %s' % error 1044 1045 for line in status.stdout: 1046 id, status = line.strip().split() 1047 status = self.jobstatus[status] 1048 ongoing.append(id) 1049 if status in ['I','U']: 1050 idle += 1 1051 elif status == 'R': 1052 run += 1 1053 elif status != 'C': 1054 fail += 1 1055 1056 for id in list(self.submitted_ids): 1057 if id not in ongoing: 1058 status = self.check_termination(id) 1059 if status == 'wait': 1060 run += 1 1061 elif status == 'resubmit': 1062 idle += 1 1063 1064 return idle, run, self.submitted - (idle+run+fail), fail
1065 1066 @multiple_try()
1067      def remove(self, *args, **opts): 
1068          """Clean the jobs on the cluster""" 
1069   
1070          if not self.submitted_ids: 
1071              return 
1072          cmd = "condor_rm %s" % ' '.join(self.submitted_ids) 
1073   
1074          status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 
1075          self.submitted_ids = [] 
1076
1077 -class PBSCluster(Cluster):
1078 """Basic class for dealing with cluster submission""" 1079 1080 name = 'pbs' 1081 job_id = 'PBS_JOBID' 1082 idle_tag = ['Q'] 1083 running_tag = ['T','E','R'] 1084 complete_tag = ['C'] 1085 1086 maximum_submited_jobs = 2500 1087 1088 @multiple_try()
1089 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1090 required_output=[], nb_submit=0):
1091 """Submit a job prog to a PBS cluster""" 1092 1093 me_dir = self.get_jobs_identifier(cwd, prog) 1094 1095 if len(self.submitted_ids) > self.maximum_submited_jobs: 1096 fct = lambda idle, run, finish: logger.info('Waiting for free slot: %s %s %s' % (idle, run, finish)) 1097 self.wait(me_dir, fct, self.maximum_submited_jobs) 1098 1099 1100 text = "" 1101 if cwd is None: 1102 cwd = os.getcwd() 1103 else: 1104 text = " cd %s;" % cwd 1105 if stdout is None: 1106 stdout = '/dev/null' 1107 if stderr is None: 1108 stderr = '/dev/null' 1109 elif stderr == -2: # -2 is subprocess.STDOUT 1110 stderr = stdout 1111 if log is None: 1112 log = '/dev/null' 1113 1114 if not os.path.isabs(prog): 1115 text += "./%s" % prog 1116 else: 1117 text+= prog 1118 1119 if argument: 1120 text += ' ' + ' '.join(argument) 1121 1122 command = ['qsub','-o', stdout, 1123 '-N', me_dir, 1124 '-e', stderr, 1125 '-V'] 1126 1127 if self.cluster_queue and self.cluster_queue != 'None': 1128 command.extend(['-q', self.cluster_queue]) 1129 1130 a = misc.Popen(command, stdout=subprocess.PIPE, 1131 stderr=subprocess.STDOUT, 1132 stdin=subprocess.PIPE, cwd=cwd) 1133 1134 output = a.communicate(text)[0] 1135 id = output.split('.')[0] 1136 if not id.isdigit() or a.returncode !=0: 1137 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1138 % output 1139 1140 self.submitted += 1 1141 self.submitted_ids.append(id) 1142 return id
1143 1144 @multiple_try()
1145 - def control_one_job(self, id):
1146 """ control the status of a single job with it's cluster id """ 1147 cmd = 'qstat '+str(id) 1148 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1149 stderr=subprocess.STDOUT) 1150 1151 for line in status.stdout: 1152 line = line.strip() 1153 if 'cannot connect to server' in line or 'cannot read reply' in line: 1154 raise ClusterManagmentError, 'server disconnected' 1155 if 'Unknown' in line: 1156 return 'F' 1157 elif line.startswith(str(id)): 1158 jobstatus = line.split()[4] 1159 else: 1160 jobstatus="" 1161 1162 if status.returncode != 0 and status.returncode is not None: 1163 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode 1164 if jobstatus in self.idle_tag: 1165 return 'I' 1166 elif jobstatus in self.running_tag: 1167 return 'R' 1168 return 'F'
1169 1170 1171 @multiple_try()
1172 - def control(self, me_dir):
1173 """ control the status of a single job with it's cluster id """ 1174 cmd = "qstat" 1175 status = misc.Popen([cmd], stdout=subprocess.PIPE) 1176 1177 me_dir = self.get_jobs_identifier(me_dir) 1178 1179 ongoing = [] 1180 1181 idle, run, fail = 0, 0, 0 1182 for line in status.stdout: 1183 if 'cannot connect to server' in line or 'cannot read reply' in line: 1184 raise ClusterManagmentError, 'server disconnected' 1185 if me_dir in line: 1186 ongoing.append(line.split()[0].split('.')[0]) 1187 status2 = line.split()[4] 1188 if status2 in self.idle_tag: 1189 idle += 1 1190 elif status2 in self.running_tag: 1191 run += 1 1192 elif status2 in self.complete_tag: 1193 if not self.check_termination(line.split()[0].split('.')[0]): 1194 idle += 1 1195 else: 1196 fail += 1 1197 1198 if status.returncode != 0 and status.returncode is not None: 1199 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode 1200 1201 for id in list(self.submitted_ids): 1202 if id not in ongoing: 1203 status2 = self.check_termination(id) 1204 if status2 == 'wait': 1205 run += 1 1206 elif status2 == 'resubmit': 1207 idle += 1 1208 1209 return idle, run, self.submitted - (idle+run+fail), fail
1210 1211 @multiple_try()
1212 - def remove(self, *args, **opts):
1213 """Clean the jobs on the cluster""" 1214 1215 if not self.submitted_ids: 1216 return 1217 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1218 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1219 self.submitted_ids = []
1220
1221 1222 -class SGECluster(Cluster):
1223 """Basic class for dealing with cluster submission""" 1224 # Class written by Arian Abrahantes. 1225 1226 name = 'sge' 1227 job_id = 'JOB_ID' 1228 idle_tag = ['qw', 'hqw','hRqw','w'] 1229 running_tag = ['r','t','Rr','Rt'] 1230 identifier_length = 10 1231
1232 - def def_get_path(self,location):
1233 """replace string for path issues""" 1234 location = os.path.realpath(location) 1235 homePath = os.getenv("HOME") 1236 if homePath: 1237 location = location.replace(homePath,'$HOME') 1238 return location
1239 1240 @multiple_try()
1241 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1242 required_output=[], nb_submit=0):
1243 """Submit a job prog to an SGE cluster""" 1244 1245 me_dir = self.get_jobs_identifier(cwd, prog) 1246 1247 1248 if cwd is None: 1249 #cwd = os.getcwd() 1250 cwd = self.def_get_path(os.getcwd()) 1251 cwd1 = self.def_get_path(cwd) 1252 text = " cd %s;" % cwd1 1253 if stdout is None: 1254 stdout = '/dev/null' 1255 else: 1256 stdout = self.def_get_path(stdout) 1257 if stderr is None: 1258 stderr = '/dev/null' 1259 elif stderr == -2: # -2 is subprocess.STDOUT 1260 stderr = stdout 1261 else: 1262 stderr = self.def_get_path(stderr) 1263 1264 if log is None: 1265 log = '/dev/null' 1266 else: 1267 log = self.def_get_path(log) 1268 1269 text += prog 1270 if argument: 1271 text += ' ' + ' '.join(argument) 1272 1273 #if anything slips through argument 1274 #print "!=== inteded change ",text.replace('/srv/nfs','') 1275 #text = text.replace('/srv/nfs','') 1276 homePath = os.getenv("HOME") 1277 if homePath: 1278 text = text.replace(homePath,'$HOME') 1279 1280 logger.debug("!=== input %s" % text) 1281 logger.debug("!=== output %s" % stdout) 1282 logger.debug("!=== error %s" % stderr) 1283 logger.debug("!=== logs %s" % log) 1284 1285 command = ['qsub','-o', stdout, 1286 '-N', me_dir, 1287 '-e', stderr, 1288 '-V'] 1289 1290 if self.cluster_queue and self.cluster_queue != 'None': 1291 command.extend(['-q', self.cluster_queue]) 1292 1293 a = misc.Popen(command, stdout=subprocess.PIPE, 1294 stderr=subprocess.STDOUT, 1295 stdin=subprocess.PIPE, cwd=cwd) 1296 1297 output = a.communicate(text)[0] 1298 id = output.split(' ')[2] 1299 if not id.isdigit(): 1300 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1301 % output 1302 self.submitted += 1 1303 self.submitted_ids.append(id) 1304 logger.debug(output) 1305 1306 return id
1307 1308 @multiple_try()
1309 - def control_one_job(self, id):
1310 """ control the status of a single job with it's cluster id """ 1311 #cmd = 'qstat '+str(id) 1312 cmd = 'qstat ' 1313 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1314 for line in status.stdout: 1315 #print "!==",line 1316 #line = line.strip() 1317 #if 'Unknown' in line: 1318 # return 'F' 1319 #elif line.startswith(str(id)): 1320 # status = line.split()[4] 1321 if str(id) in line: 1322 status = line.split()[4] 1323 #print "!=status", status 1324 if status in self.idle_tag: 1325 return 'I' 1326 elif status in self.running_tag: 1327 return 'R' 1328 return 'F'
1329 1330 @multiple_try()
1331 - def control(self, me_dir):
1332 """ control the status of a single job with it's cluster id """ 1333 cmd = "qstat " 1334 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1335 1336 me_dir = self.get_jobs_identifier(me_dir) 1337 1338 finished = list(self.submitted_ids) 1339 1340 idle, run, fail = 0, 0, 0 1341 for line in status.stdout: 1342 if me_dir in line: 1343 id,_,_,_,status = line.split()[:5] 1344 if status in self.idle_tag: 1345 idle += 1 1346 finished.remove(id) 1347 elif status in self.running_tag: 1348 run += 1 1349 finished.remove(id) 1350 else: 1351 logger.debug(line) 1352 fail += 1 1353 finished.remove(id) 1354 1355 for id in finished: 1356 self.check_termination(id) 1357 1358 return idle, run, self.submitted - (idle+run+fail), fail
1359 1360 1361 1362 @multiple_try()
1363 - def remove(self, *args, **opts):
1364 """Clean the jobs on the cluster""" 1365 1366 if not self.submitted_ids: 1367 return 1368 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1369 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1370 self.submitted_ids = []
1371
1372 1373 -class LSFCluster(Cluster):
1374 """Basic class for dealing with cluster submission""" 1375 1376 name = 'lsf' 1377 job_id = 'LSB_JOBID' 1378 1379 @multiple_try()
1380 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1381 required_output=[], nb_submit=0):
1382 """Submit the job prog to an LSF cluster""" 1383 1384 1385 me_dir = self.get_jobs_identifier(cwd, prog) 1386 1387 text = "" 1388 command = ['bsub', '-C0', '-J', me_dir] 1389 if cwd is None: 1390 cwd = os.getcwd() 1391 else: 1392 text = " cd %s;" % cwd 1393 if stdout and isinstance(stdout, str): 1394 command.extend(['-o', stdout]) 1395 if stderr and isinstance(stdout, str): 1396 command.extend(['-e', stderr]) 1397 elif stderr == -2: # -2 is subprocess.STDOUT 1398 pass 1399 if log is None: 1400 log = '/dev/null' 1401 1402 text += prog 1403 if argument: 1404 text += ' ' + ' '.join(argument) 1405 1406 if self.cluster_queue and self.cluster_queue != 'None': 1407 command.extend(['-q', self.cluster_queue]) 1408 1409 a = misc.Popen(command, stdout=subprocess.PIPE, 1410 stderr=subprocess.STDOUT, 1411 stdin=subprocess.PIPE, cwd=cwd) 1412 1413 output = a.communicate(text)[0] 1414 #Job <nnnn> is submitted to default queue <normal>. 1415 try: 1416 id = output.split('>',1)[0].split('<')[1] 1417 except: 1418 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1419 % output 1420 if not id.isdigit(): 1421 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1422 % output 1423 self.submitted += 1 1424 self.submitted_ids.append(id) 1425 return id
1426 1427 1428 @multiple_try()
1429 - def control_one_job(self, id):
1430 """ control the status of a single job with it's cluster id """ 1431 1432 cmd = 'bjobs '+str(id) 1433 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1434 1435 for line in status.stdout: 1436 line = line.strip().upper() 1437 if 'JOBID' in line: 1438 continue 1439 elif str(id) not in line: 1440 continue 1441 status = line.split()[2] 1442 if status == 'RUN': 1443 return 'R' 1444 elif status == 'PEND': 1445 return 'I' 1446 elif status == 'DONE': 1447 return 'F' 1448 else: 1449 return 'H' 1450 return 'F'
1451 1452 @multiple_try()
1453 - def control(self, me_dir):
1454 """ control the status of a single job with it's cluster id """ 1455 1456 if not self.submitted_ids: 1457 return 0, 0, 0, 0 1458 1459 cmd = "bjobs " + ' '.join(self.submitted_ids) 1460 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1461 1462 jobstatus = {} 1463 for line in status.stdout: 1464 line = line.strip() 1465 if 'JOBID' in line: 1466 continue 1467 splitline = line.split() 1468 id = splitline[0] 1469 if id not in self.submitted_ids: 1470 continue 1471 jobstatus[id] = splitline[2] 1472 1473 idle, run, fail = 0, 0, 0 1474 for id in self.submitted_ids[:]: 1475 if id in jobstatus: 1476 status = jobstatus[id] 1477 else: 1478 status = 'MISSING' 1479 if status == 'RUN': 1480 run += 1 1481 elif status == 'PEND': 1482 idle += 1 1483 else: 1484 status = self.check_termination(id) 1485 if status == 'wait': 1486 run += 1 1487 elif status == 'resubmit': 1488 idle += 1 1489 1490 return idle, run, self.submitted - (idle+run+fail), fail
1491 1492 @multiple_try()
1493 - def remove(self, *args,**opts):
1494 """Clean the jobs on the cluster""" 1495 1496 if not self.submitted_ids: 1497 return 1498 cmd = "bkill %s" % ' '.join(self.submitted_ids) 1499 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1500 self.submitted_ids = []
1501
1502 -class GECluster(Cluster):
1503 """Class for dealing with cluster submission on a GE cluster""" 1504 1505 name = 'ge' 1506 job_id = 'JOB_ID' 1507 idle_tag = ['qw'] 1508 running_tag = ['r'] 1509 1510 @multiple_try()
1511 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1512 required_output=[], nb_submit=0):
1513 """Submit a job prog to a GE cluster""" 1514 1515 text = "" 1516 if cwd is None: 1517 cwd = os.getcwd() 1518 else: 1519 text = " cd %s; bash " % cwd 1520 if stdout is None: 1521 stdout = os.path.join(cwd, "log.%s" % prog.split('/')[-1]) 1522 if stderr is None: 1523 stderr = os.path.join(cwd, "err.%s" % prog.split('/')[-1]) 1524 elif stderr == -2: # -2 is subprocess.STDOUT 1525 stderr = stdout 1526 if log is None: 1527 log = '/dev/null' 1528 1529 text += prog 1530 if argument: 1531 text += ' ' + ' '.join(argument) 1532 text += '\n' 1533 tmp_submit = os.path.join(cwd, 'tmp_submit') 1534 open(tmp_submit,'w').write(text) 1535 1536 a = misc.Popen(['qsub','-o', stdout, 1537 '-e', stderr, 1538 tmp_submit], 1539 stdout=subprocess.PIPE, 1540 stderr=subprocess.STDOUT, 1541 stdin=subprocess.PIPE, cwd=cwd) 1542 1543 output = a.communicate()[0] 1544 #Your job 874511 ("test.sh") has been submitted 1545 pat = re.compile("Your job (\d*) \(",re.MULTILINE) 1546 try: 1547 id = pat.search(output).groups()[0] 1548 except: 1549 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1550 % output 1551 self.submitted += 1 1552 self.submitted_ids.append(id) 1553 return id
1554 1555 @multiple_try()
1556 - def control_one_job(self, id):
1557 """ control the status of a single job with it's cluster id """ 1558 cmd = 'qstat | grep '+str(id) 1559 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1560 if not status: 1561 return 'F' 1562 #874516 0.00000 test.sh alwall qw 03/04/2012 22:30:35 1 1563 pat = re.compile("^(\d+)\s+[\d\.]+\s+[\w\d\.]+\s+[\w\d\.]+\s+(\w+)\s") 1564 stat = '' 1565 for line in status.stdout.read().split('\n'): 1566 if not line: 1567 continue 1568 line = line.strip() 1569 try: 1570 groups = pat.search(line).groups() 1571 except: 1572 raise ClusterManagmentError, 'bad syntax for stat: \n\"%s\"' % line 1573 if groups[0] != id: continue 1574 stat = groups[1] 1575 if not stat: 1576 return 'F' 1577 if stat in self.idle_tag: 1578 return 'I' 1579 if stat in self.running_tag: 1580 return 'R'
1581 1582 @multiple_try()
1583 - def control(self, me_dir=None):
1584 """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)""" 1585 if not self.submitted_ids: 1586 return 0, 0, 0, 0 1587 idle, run, fail = 0, 0, 0 1588 ongoing = [] 1589 for statusflag in ['p', 'r', 'sh']: 1590 cmd = 'qstat -s %s' % statusflag 1591 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1592 #874516 0.00000 test.sh alwall qw 03/04/2012 22:30:35 1 1593 pat = re.compile("^(\d+)") 1594 for line in status.stdout.read().split('\n'): 1595 line = line.strip() 1596 try: 1597 id = pat.search(line).groups()[0] 1598 except Exception: 1599 pass 1600 else: 1601 if id not in self.submitted_ids: 1602 continue 1603 ongoing.append(id) 1604 if statusflag == 'p': 1605 idle += 1 1606 if statusflag == 'r': 1607 run += 1 1608 if statusflag == 'sh': 1609 fail += 1 1610 for id in list(self.submitted_ids): 1611 if id not in ongoing: 1612 self.check_termination(id) 1613 #self.submitted_ids = ongoing 1614 1615 return idle, run, self.submitted - idle - run - fail, fail
1616 1617 @multiple_try()
1618 - def remove(self, *args, **opts):
1619 """Clean the jobs on the cluster""" 1620 1621 if not self.submitted_ids: 1622 return 1623 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1624 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1625 self.submitted_ids = []
1626
1627  def asyncrone_launch(exe, cwd=None, stdout=None, argument=[], **opt): 
1628      """start a computation without waiting for it to finish. 
1629      this function returns a lock which stays unset as long as the job is 
1630      running.""" 
1631   
1632      mc = MultiCore(1) 
1633      mc.submit(exe, argument, cwd, stdout, **opt) 
1634      mc.need_waiting = True 
1635      return mc.lock 
1636
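A hedged usage sketch: the returned object is the MultiCore lock, a threading.Event that the worker sets once the command has finished; the command and path are hypothetical:

      lock = asyncrone_launch('/bin/gzip', cwd='/tmp/run', argument=['events.lhe'])
      # ... do other work while gzip runs in the background ...
      lock.wait()   # blocks until the background job has completed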
1637 1638 -class SLURMCluster(Cluster):
1639 """Basic class for dealing with cluster submission""" 1640 1641 name = 'slurm' 1642 job_id = 'SLURM_JOBID' 1643 idle_tag = ['Q','PD','S','CF'] 1644 running_tag = ['R', 'CG'] 1645 complete_tag = ['C'] 1646 identifier_length = 8 1647 1648 @multiple_try()
1649 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1650 required_output=[], nb_submit=0):
1651 """Submit a job prog to a SLURM cluster""" 1652 1653 me_dir = self.get_jobs_identifier(cwd, prog) 1654 1655 1656 if cwd is None: 1657 cwd = os.getcwd() 1658 if stdout is None: 1659 stdout = '/dev/null' 1660 if stderr is None: 1661 stderr = '/dev/null' 1662 elif stderr == -2: # -2 is subprocess.STDOUT 1663 stderr = stdout 1664 if log is None: 1665 log = '/dev/null' 1666 1667 command = ['sbatch', '-o', stdout, 1668 '-J', me_dir, 1669 '-e', stderr, prog] + argument 1670 1671 if self.cluster_queue and self.cluster_queue != 'None': 1672 command.insert(1, '-p') 1673 command.insert(2, self.cluster_queue) 1674 1675 a = misc.Popen(command, stdout=subprocess.PIPE, 1676 stderr=subprocess.STDOUT, 1677 stdin=subprocess.PIPE, cwd=cwd) 1678 1679 output = a.communicate() 1680 output_arr = output[0].split(' ') 1681 id = output_arr[3].rstrip() 1682 1683 if not id.isdigit(): 1684 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1685 % (output[0] + '\n' + output[1]) 1686 1687 self.submitted += 1 1688 self.submitted_ids.append(id) 1689 return id
1690 1691 @multiple_try()
1692 - def control_one_job(self, id):
1693 """ control the status of a single job with it's cluster id """ 1694 cmd = 'squeue j'+str(id) 1695 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1696 stderr=open(os.devnull,'w')) 1697 1698 for line in status.stdout: 1699 line = line.strip() 1700 if 'Invalid' in line: 1701 return 'F' 1702 elif line.startswith(str(id)): 1703 status = line.split()[4] 1704 if status in self.idle_tag: 1705 return 'I' 1706 elif status in self.running_tag: 1707 return 'R' 1708 return 'F'
1709 1710 @multiple_try()
1711 - def control(self, me_dir):
1712 """ control the status of a single job with it's cluster id """ 1713 cmd = "squeue" 1714 pstatus = misc.Popen([cmd], stdout=subprocess.PIPE) 1715 1716 me_dir = self.get_jobs_identifier(me_dir) 1717 1718 idle, run, fail = 0, 0, 0 1719 ongoing=[] 1720 for line in pstatus.stdout: 1721 if me_dir in line: 1722 id, _, _,_ , status,_ = line.split(None,5) 1723 ongoing.append(id) 1724 if status in self.idle_tag: 1725 idle += 1 1726 elif status in self.running_tag: 1727 run += 1 1728 elif status in self.complete_tag: 1729 status = self.check_termination(id) 1730 if status == 'wait': 1731 run += 1 1732 elif status == 'resubmit': 1733 idle += 1 1734 else: 1735 fail += 1 1736 1737 #control other finished job 1738 for id in list(self.submitted_ids): 1739 if id not in ongoing: 1740 status = self.check_termination(id) 1741 if status == 'wait': 1742 run += 1 1743 elif status == 'resubmit': 1744 idle += 1 1745 1746 1747 return idle, run, self.submitted - (idle+run+fail), fail
1748 1749 @multiple_try()
1750 - def remove(self, *args, **opts):
1751 """Clean the jobs on the cluster""" 1752 1753 if not self.submitted_ids: 1754 return 1755 cmd = "scancel %s" % ' '.join(self.submitted_ids) 1756 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1757 self.submitted_ids = []
1758
1759 -class HTCaaSCluster(Cluster):
1760 """Class for dealing with cluster submission on a HTCaaS cluster using GPFS """ 1761 1762 name= 'htcaas' 1763 job_id = 'HTCAAS_JOBID' 1764 idle_tag = ['waiting'] 1765 running_tag = ['preparing','running'] 1766 complete_tag = ['done'] 1767 1768 @store_input() 1769 @multiple_try()
1770 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 1771 log=None, input_files=[], output_files=[], required_output=[], 1772 nb_submit=0):
1773 """Submit the HTCaaS job on the cluster with NO SHARE DISK 1774 input/output file should be given as relative to CWd 1775 """ 1776 # To make workspace name(temp) 1777 cur_usr = os.getenv('USER') 1778 1779 if cwd is None: 1780 cwd = os.getcwd() 1781 1782 cwd_cp = cwd.rsplit("/",2) 1783 1784 if not stdout is None: 1785 print "stdout: %s" % stdout 1786 1787 if not os.path.exists(prog): 1788 prog = os.path.join(cwd, prog) 1789 1790 if not required_output and output_files: 1791 required_output = output_files 1792 1793 logger.debug(prog) 1794 if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog : 1795 cwd_arg = cwd+"/arguments" 1796 temp = ' '.join([str(a) for a in argument]) 1797 arg_cmd="echo '"+temp+"' > " + cwd_arg 1798 command = ['htcaas-mgjob-submit','-d',cwd,'-e',os.path.basename(prog)] 1799 if argument : 1800 command.extend(['-a ', '='.join([str(a) for a in argument])]) 1801 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1802 id = a.stdout.read().strip() 1803 1804 else: 1805 cwd_arg = cwd+"/arguments" 1806 temp = ' '.join([str(a) for a in argument]) 1807 temp_file_name = "sub." + os.path.basename(prog) 1808 text = """#!/bin/bash 1809 MYPWD=%(cwd)s 1810 cd $MYPWD 1811 input_files=(%(input_files)s ) 1812 for i in ${input_files[@]} 1813 do 1814 chmod -f +x $i 1815 done 1816 /bin/bash %(prog)s %(arguments)s > %(stdout)s 1817 """ 1818 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog, 1819 'arguments': ' '.join([str(a) for a in argument]), 1820 'program': ' ' if '.py' in prog else 'bash'} 1821 1822 # writing a new script for the submission 1823 new_prog = pjoin(cwd, temp_file_name) 1824 open(new_prog, 'w').write(text % dico) 1825 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 1826 command = ['htcaas-mgjob-submit','-d',cwd,'-e',temp_file_name] 1827 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1828 id = a.stdout.read().strip() 1829 logger.debug(id) 1830 1831 nb_try=0 1832 nb_limit=5 1833 if not id.isdigit() : 1834 print "[ID is not digit]:" + id 1835 1836 while not id.isdigit() : 1837 nb_try+=1 1838 print "[fail_retry]:"+ nb_try 1839 a=misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1840 id = a.stdout.read().strip() 1841 if nb_try > nb_limit : 1842 raise ClusterManagementError, 'fail to submit to the HTCaaS cluster: \n %s' % id 1843 break 1844 1845 self.submitted += 1 1846 self.submitted_ids.append(id) 1847 1848 return id
1849 1850 @multiple_try(nb_try=10, sleep=5)
1851 - def control_one_job(self, id):
1852 """ control the status of a single job with it's cluster id """ 1853 1854 if id == 0 : 1855 status_out ='C' 1856 else : 1857 cmd = 'htcaas-job-status -m '+str(id)+ " -s | grep Status " 1858 status = misc.Popen([cmd], shell=True,stdout=subprocess.PIPE, 1859 stderr=subprocess.PIPE) 1860 error = status.stderr.read() 1861 if status.returncode or error: 1862 raise ClusterManagmentError, 'htcaas-job-submit returns error: %s' % error 1863 status_out= status.stdout.read().strip() 1864 status_out= status_out.split(":",1)[1] 1865 if status_out == 'waiting': 1866 status_out='I' 1867 elif status_out == 'preparing' or status_out == 'running': 1868 status_out = 'R' 1869 elif status_out != 'done': 1870 status_out = 'F' 1871 elif status_out == 'done': 1872 status_out = 'C' 1873 1874 return status_out
1875 1876 @multiple_try()
1877 - def control(self, me_dir):
1878          """ Control the status of all submitted jobs on the cluster """
1879          if not self.submitted_ids:
1880              logger.debug("self.submitted_ids is empty")
1881              return 0, 0, 0, 0
1882 
1883          ongoing = []
1884          idle, run, fail = 0, 0, 0
1885 
1886          start = self.submitted_ids[0]
1887          end = self.submitted_ids[-1]
1888 
1889          cmd = "htcaas-job-status -c " + str(start) + "-" + str(end)  #+" -ac"
1890          status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1891 
1892          for line in status.stdout:
1893              #ongoing.append(line.split()[0].strip())
1894              status2 = line.split()[-1]
1895              if status2 != 'null' or line.split()[0].strip() != '0':
1896                  ongoing.append(line.split()[0].strip())
1897                  logger.debug("[" + line.split()[0].strip() + "]" + status2)
1898              if status2 == 'null' or line.split()[0].strip() == '0':
1899                  idle += 1
1900              elif status2 in self.idle_tag:
1901                  idle += 1
1902              elif status2 in self.running_tag:
1903                  run += 1
1904              elif status2 in self.complete_tag:
1905                  if not self.check_termination(line.split()[0]):
1906                      idle += 1
1907              else:
1908                  fail += 1
1909 
1910          return idle, run, self.submitted - (idle+run+fail), fail
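    # ----------------------------------------------------------------------
    # Illustrative sketch (not part of the original module): control() above
    # returns (idle, running, finished, failed).  A hypothetical wait loop
    # built on top of it could look like the helper below; the method name
    # and the 30 s default delay are assumptions for illustration only.
    def _example_wait(self, me_dir, delay=30):
        while True:
            idle, run, finish, fail = self.control(me_dir)
            if fail:
                raise ClusterManagmentError('%s job(s) failed on the cluster' % fail)
            if idle + run == 0:
                # nothing left to wait for: report the number of finished jobs
                return finish
            time.sleep(delay)
    # ----------------------------------------------------------------------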
1911 1912 @multiple_try()
1913 - def remove(self, *args, **opts):
1914          """Clean the jobs on the cluster"""
1915 
1916          if not self.submitted_ids:
1917              return
1918          for job_id in self.submitted_ids:
1919              cmd = "htcaas-job-cancel -m %s" % job_id
1920              status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))
1921
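# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): a minimal driver for
# the HTCaaSCluster class above.  The run directory, script and file names
# are hypothetical placeholders; nothing here is executed at import time.
def _example_htcaas_submission():
    cluster = HTCaaSCluster(cluster_queue='madgraph', cluster_nb_retry=1)
    # submit a hypothetical 'ajob' script living in a hypothetical run directory
    job_id = cluster.submit2('ajob', argument=[0, 1],
                             cwd='/tmp/run_dir',
                             stdout='log.txt',
                             input_files=['input_app.txt'],
                             output_files=['results.dat'])
    # poll the single job until HTCaaS reports it done ('C') or failed ('F')
    while cluster.control_one_job(job_id) in ('I', 'R'):
        time.sleep(30)
    return job_id
# --------------------------------------------------------------------------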
1922 -class HTCaaS2Cluster(Cluster):
1923      """Class for dealing with cluster submission on an HTCaaS cluster without GPFS """
1924 
1925      name = 'htcaas2'
1926      job_id = 'HTCAAS2_JOBID'
1927      idle_tag = ['waiting']
1928      running_tag = ['preparing', 'running']
1929      complete_tag = ['done']
1930 
1931      @store_input()
1932      @multiple_try()
1933 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 1934 log=None, input_files=[], output_files=[], required_output=[], 1935 nb_submit=0):
1936 
1937          """Submit the HTCaaS job on the cluster with NO SHARED DISK.
1938          Input/output files should be given relative to CWD.
1939          """
1940          if cwd is None:
1941              cwd = os.getcwd()
1942 
1943          if not os.path.exists(prog):
1944              prog = os.path.join(cwd, prog)
1945 
1946          if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog:
1947              if cwd or prog:
1948                  self.submitted_dirs.append(cwd)
1949                  self.submitted_exes.append(prog)
1950              else:
1951                  logger.debug("cwd or prog does not exist -> " + cwd + " / " + os.path.basename(prog))
1952 
1953              if argument:
1954                  self.submitted_args.append('='.join([str(a) for a in argument]))
1955 
1956              if cwd or prog:
1957                  self.submitted += 1
1958                  id = self.submitted
1959                  self.submitted_ids.append(id)
1960              else:
1961                  logger.debug("cwd or prog does not exist!")
1962                  id = 0
1963 
1964          else:
1965              temp_file_name = "sub." + os.path.basename(prog)
1966              text = """#!/bin/bash
1967 MYPWD=%(cwd)s
1968 cd $MYPWD
1969 input_files=(%(input_files)s )
1970 for i in ${input_files[@]}
1971 do
1972 chmod -f +x $i
1973 done
1974 /bin/bash %(prog)s %(arguments)s > %(stdout)s
1975 """
1976              dico = {'cwd': cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog': prog,
1977                      'arguments': ' '.join([str(a) for a in argument]),
1978                      'program': ' ' if '.py' in prog else 'bash'}
1979              # writing a new script for the submission
1980              new_prog = pjoin(cwd, temp_file_name)
1981              open(new_prog, 'w').write(text % dico)
1982              misc.Popen(['chmod', '+x', new_prog], cwd=cwd)
1983              command = ['htcaas-mgjob-submit', '-d', cwd, '-e', new_prog]
1984              a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1985              id = a.stdout.read().strip()
1986              logger.debug("[mode2]-[" + str(id) + "]")
1987              if cwd and prog:
1988                  self.submitted += 1
1989                  self.submitted_ids.append(id)
1990              else:
1991                  logger.debug("cwd or prog does not exist!")
1992                  id = 0
1993 
1994          return id
1995 1996 @multiple_try()
1997 - def metasubmit(self, me_dir=None):
1998          if self.submitted > 1100 and self.submitted == len(self.submitted_ids):
1999              tmp_leng = len(self.submitted_ids) / 2
2000              tmp_dirs1 = self.submitted_dirs[0:tmp_leng]
2001              tmp_dirs2 = self.submitted_dirs[tmp_leng:]
2002              tmp_exes1 = self.submitted_exes[0:tmp_leng]
2003              tmp_exes2 = self.submitted_exes[tmp_leng:]
2004              command1 = ['htcaas-mgjob-submit', '-d', ":".join([str(a) for a in tmp_dirs1 if a and a != ' ']),
2005                          '-e', ":".join([str(a) for a in tmp_exes1 if a and a != ' '])]
2006              command2 = ['htcaas-mgjob-submit', '-d', ":".join([str(a) for a in tmp_dirs2 if a and a != ' ']),
2007                          '-e', ":".join([str(a) for a in tmp_exes2 if a and a != ' '])]
2008              if len(self.submitted_args) > 0:
2009                  tmp_args1 = self.submitted_args[0:tmp_leng]
2010                  tmp_args2 = self.submitted_args[tmp_leng:]
2011                  command1.extend(['-a', ':'.join([str(a) for a in tmp_args1])])
2012                  command2.extend(['-a', ':'.join([str(a) for a in tmp_args2])])
2013              result1 = misc.Popen(command1, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
2014              result2 = misc.Popen(command2, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
2015              me_dir = str(result1.stdout.read().strip()) + "//" + str(result2.stdout.read().strip())
2016 
2017          elif self.submitted > 0 and self.submitted == self.submitted_ids[-1]:
2018              command = ['htcaas-mgjob-submit', '-d', ":".join([str(a) for a in self.submitted_dirs if a and a != ' ']),
2019                         '-e', ":".join([str(a) for a in self.submitted_exes if a and a != ' '])]
2020              if len(self.submitted_args) > 0:
2021                  command.extend(['-a', ':'.join([str(a) for a in self.submitted_args])])
2022              if self.submitted_dirs[0] or self.submitted_exes[0]:
2023                  result = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
2024                  me_dir = result.stdout.read().strip()
2025                  self.submitted_ids[0] = me_dir
2026              else:
2027                  me_dir = self.submitted_ids[-1]
2028          elif self.submitted > 0 and self.submitted != self.submitted_ids[-1]:
2029              me_dir = self.submitted_ids[0]
2030          else:
2031              me_dir = -1
2032 
2033          logger.debug("[" + str(me_dir) + "]")
2034 
2035          self.submitted_dirs = []
2036          self.submitted_exes = []
2037          self.submitted_args = []
2038 
2039          return me_dir
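    # ----------------------------------------------------------------------
    # Illustrative sketch (not part of the original module): metasubmit()
    # above collapses all buffered jobs into a single htcaas-mgjob-submit
    # call by colon-joining the directories and executables.  The helper
    # below rebuilds that command for hypothetical buffered lists; its name
    # and signature are assumptions for illustration only.
    @staticmethod
    def _example_metasubmit_command(dirs, exes, args=None):
        command = ['htcaas-mgjob-submit',
                   '-d', ':'.join(str(d) for d in dirs if d and d != ' '),
                   '-e', ':'.join(str(e) for e in exes if e and e != ' ')]
        if args:
            # one colon-separated argument string per buffered job
            command.extend(['-a', ':'.join(str(a) for a in args)])
        return command
    # ----------------------------------------------------------------------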
2040 2041 2042 @multiple_try(nb_try=10, sleep=5)
2043 - def control_one_job(self, id):
2044          """ Control the status of a single job with its cluster id """
2045          #logger.debug("CONTROL ONE JOB MODE")
2046          if self.submitted == self.submitted_ids[-1]:
2047              id = self.metasubmit(self)
2048              tempid = self.submitted_ids[-1]
2049              self.submitted_ids.remove(self.submitted_ids[-1])
2050              self.submitted_ids.append(id)
2051              logger.debug(str(id) + " // " + str(self.submitted_ids[-1]))
2052 
2053          if id == 0:
2054              status_out = 'C'
2055          else:
2056              cmd = 'htcaas-job-status -m ' + str(id) + " -s | grep Status "
2057              status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
2058                                  stderr=subprocess.PIPE)
2059              error = status.stderr.read()
2060              if status.returncode or error:
2061                  raise ClusterManagmentError, 'htcaas-job-status returns error: %s' % error
2062              status_out = status.stdout.read().strip()
2063              status_out = status_out.split(":", 1)[1]
2064              logger.debug("[[" + str(id) + "]]" + status_out)
2065              if status_out == 'waiting':
2066                  status_out = 'I'
2067              elif status_out == 'preparing' or status_out == 'running':
2068                  status_out = 'R'
2069              elif status_out == 'done':
2070                  status_out = 'C'
2071                  self.submitted -= 1
2072              else:
2073                  status_out = 'F'
2074 
2075          return status_out
2076 2077 @multiple_try()
2078 - def control(self, me_dir):
2079          """ Control the status of all submitted jobs on the cluster """
2080          if not self.submitted_ids:
2081              logger.debug("self.submitted_ids is empty")
2082              return 0, 0, 0, 0
2083 
2084          if "//" in me_dir:
2085              if int(me_dir.split("//")[0]) < int(me_dir.split("//")[1]):
2086                  start = me_dir.split("//")[0]
2087                  end = me_dir.split("//")[1]
2088              else:
2089                  start = me_dir.split("//")[1]
2090                  end = me_dir.split("//")[0]
2091          elif "/" in me_dir:  # update
2092              start = 0
2093              end = 0
2094          elif me_dir.isdigit():
2095              start = me_dir
2096              end = me_dir
2097          else:
2098              start = end = me_dir = self.submitted_ids[0]
2099              logger.debug("Meta_ID is not digit(control), self.submitted_ids[0]: " + str(me_dir))
2100 
2101          ongoing = []
2102          idle, run, fail, done = 0, 0, 0, 0
2103 
2104          cmd = "htcaas-job-status -c " + str(start) + "-" + str(end) + " -ac"
2105          status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
2106 
2107          for line in status.stdout:
2108              status2 = line.split()[-1]
2109              if status2 != 'null' or line.split()[0].strip() != '0':
2110                  ongoing.append(str(line.split()[0].strip()) + "-" + str(line.split()[1].strip()))
2111                  logger.debug("[" + line.split()[0].strip() + "-" + line.split()[1].strip() + "]" + status2)
2112 
2113              if status2 == 'null' or line.split()[0].strip() == '0':
2114                  idle += 1
2115              elif status2 in self.idle_tag:
2116                  idle += 1
2117              elif status2 in self.running_tag:
2118                  run += 1
2119              elif status2 in self.complete_tag:
2120                  done += 1
2121                  self.submitted -= 1
2122                  if not self.check_termination(line.split()[1]):
2123                      idle += 1
2124              else:
2125                  fail += 1
2126 
2127          return idle, run, self.submitted - (idle+run+fail), fail
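    # ----------------------------------------------------------------------
    # Illustrative sketch (not part of the original module): how control()
    # above reduces one line of "htcaas-job-status -c <start>-<end> -ac"
    # output to a job category.  The helper name and the returned labels are
    # assumptions for illustration only.
    @staticmethod
    def _example_classify_status_line(line):
        status = line.split()[-1]
        if status == 'null' or line.split()[0].strip() == '0':
            return 'idle'
        elif status in HTCaaS2Cluster.idle_tag:
            return 'idle'
        elif status in HTCaaS2Cluster.running_tag:
            return 'running'
        elif status in HTCaaS2Cluster.complete_tag:
            return 'done'
        return 'failed'
    # ----------------------------------------------------------------------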
2128 2129 @multiple_try()
2130 - def remove(self, *args, **opts):
2131          """Clean the jobs on the cluster"""
2132 
2133          if not self.submitted_ids:
2134              return
2135          id = self.submitted_ids[0]
2136          if id != 0:
2137              cmd = "htcaas-job-cancel -m %s" % str(id)
2138              status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))
2139 
2140  from_name = {'condor': CondorCluster, 'pbs': PBSCluster, 'sge': SGECluster,
2141               'lsf': LSFCluster, 'ge': GECluster, 'slurm': SLURMCluster,
2142               'htcaas': HTCaaSCluster, 'htcaas2': HTCaaS2Cluster}
2143 
2144  onecore = MultiCore(1)  # create a thread to run simple bash jobs without having to
2145                          # fork the main process
2146 
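# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): from_name maps the
# cluster-type run option onto the matching Cluster subclass.  The option
# dictionary, key names and default below are assumptions for illustration.
def _example_cluster_from_options(options):
    cluster_type = options.get('cluster_type', 'htcaas2')
    if cluster_type not in from_name:
        raise ClusterManagmentError('unknown cluster type: %s' % cluster_type)
    # instantiate the selected cluster class with the remaining run options
    return from_name[cluster_type](**options)
# --------------------------------------------------------------------------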