
Source Code for Module madgraph.various.cluster

   1  ################################################################################ 
   2  # Copyright (c) 2009 The MadGraph5_aMC@NLO Development team and Contributors              
   3  # 
   4  # This file is a part of the MadGraph5_aMC@NLO project, an application which            
   5  # automatically generates Feynman diagrams and matrix elements for arbitrary     
   6  # high-energy processes in the Standard Model and beyond.                        
   7  # 
   8  # It is subject to the MadGraph5_aMC@NLO license which should accompany this              
   9  # distribution.                                                                  
  10  #                                                                                
  11  # For more information, visit madgraph.phys.ucl.ac.be and amcatnlo.web.cern.ch             
  12  #                                                                                
  13  ################################################################################ 
  14  import subprocess 
  15  import logging 
  16  import os 
  17  import time 
  18  import re 
  19  import glob 
  20  import inspect 
  21  import sys 
  22   
  23  logger = logging.getLogger('madgraph.cluster')  
  24   
  25  try: 
  26      from madgraph import MadGraph5Error 
  27      import madgraph.various.misc as misc 
  28  except Exception, error: 
  29      if __debug__: 
  30          print  str(error) 
  31      from internal import MadGraph5Error 
  32      import internal.misc as misc 
  33   
  34  pjoin = os.path.join 
  35   
  36  class ClusterManagmentError(MadGraph5Error): 
  37      pass 
  38   
  39  class NotImplemented(MadGraph5Error): 
  40      pass 
  41   
  42   
  43  multiple_try = misc.multiple_try 
  44  pjoin = os.path.join 
  45   
  46   
  47  def check_interupt(error=KeyboardInterrupt): 
  48   
  49      def deco_interupt(f): 
  50          def deco_f_interupt(self, *args, **opt): 
  51              try: 
  52                  return f(self, *args, **opt) 
  53              except error: 
  54                  try: 
  55                      self.remove(*args, **opt) 
  56                  except Exception: 
  57                      pass 
  58                  raise error 
  59          return deco_f_interupt 
  60      return deco_interupt 
  61   
  62  def store_input(arg=''): 
  63   
  64      def deco_store(f): 
  65          def deco_f_store(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 
  66                           input_files=[], output_files=[], required_output=[], nb_submit=0): 
  67              frame = inspect.currentframe() 
  68              args, _, _, values = inspect.getargvalues(frame) 
  69              args = dict([(i, values[i]) for i in args if i != 'self']) 
  70              id = f(self, **args) 
  71              if self.nb_retry > 0: 
  72                  self.retry_args[id] = args 
  73              return id 
  74          return deco_f_store 
  75      return deco_store 
  76   
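
As an illustrative aside (not part of the module), here is a minimal sketch of how these two decorators cooperate on a toy class: check_interupt cleans up via self.remove() when the user hits Ctrl-C, while store_input records the submission arguments in self.retry_args so a failed job can be resubmitted later. The ToyCluster class, its return value and the script name are hypothetical.

    # assumes: from madgraph.various.cluster import check_interupt, store_input
    class ToyCluster(object):
        """Hypothetical class with just enough attributes for the decorators."""
        nb_retry = 1       # store_input only records the arguments when retries are allowed
        retry_args = {}

        def remove(self, *args, **opts):
            print('cleaning up after an interrupt')

        @check_interupt()
        @store_input()
        def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                    log=None, input_files=[], output_files=[], required_output=[],
                    nb_submit=0):
            return 42      # pretend cluster id

    toy = ToyCluster()
    job_id = toy.submit2('run.sh', argument=['1', '2'])
    print(toy.retry_args[job_id]['argument'])    # ['1', '2'], kept for a possible retry
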
  77  def need_transfer(options): 
  78      """Check whether transfer/compression of the input files is necessary 
  79      for the given running options.""" 
  80   
  81      if options['run_mode'] != 1 and options['cluster_temp_path'] is None: 
  82          return False 
  83      else: 
  84          return True 
  85   
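
For illustration, a short sketch of how need_transfer reacts to the run options; the option dictionaries below are hypothetical and only contain the two keys the function reads.

    local_run   = {'run_mode': 2, 'cluster_temp_path': None}        # multicore run, shared disk
    cluster_run = {'run_mode': 1, 'cluster_temp_path': '/scratch'}  # cluster run with a local temp dir

    print(need_transfer(local_run))    # False: nothing has to be shipped around
    print(need_transfer(cluster_run))  # True: input files must be transferred
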
  86  class Cluster(object): 
  87      """Basic class for all cluster-type submission""" 
  88      name = 'mother class' 
  89      identifier_length = 14 
  90   
  91      def __init__(self, *args, **opts): 
  92          """Init the cluster""" 
  93   
  94          self.submitted = 0 
  95          self.submitted_ids = [] 
  96          self.finish = 0 
  97          self.submitted_dirs = [] #HTCaaS 
  98          self.submitted_exes = [] #HTCaaS 
  99          self.submitted_args = [] #HTCaaS 
 100   
 101          if 'cluster_queue' in opts: 
 102              self.cluster_queue = opts['cluster_queue'] 
 103          else: 
 104              self.cluster_queue = 'madgraph' 
 105          if 'cluster_temp_path' in opts: 
 106              self.temp_dir = opts['cluster_temp_path'] 
 107          else: 
 108              self.temp_dir = None 
 109          self.options = {'cluster_status_update': (600, 30)} 
 110          for key, value in opts.items(): 
 111              self.options[key] = value 
 112          self.nb_retry = opts['cluster_nb_retry'] if 'cluster_nb_retry' in opts else 0 
 113          self.cluster_retry_wait = float(opts['cluster_retry_wait']) if 'cluster_retry_wait' in opts else 300 
 114          self.options = dict(opts) 
 115          self.retry_args = {} 
 116          # bookkeeping for jobs submitted in packets 
 117          self.packet = {} 
 118          self.id_to_packet = {} 
 119   
 120      def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 
 121                 log=None, required_output=[], nb_submit=0): 
 122          """How to make one submission. Return status id on the cluster.""" 
 123          raise NotImplemented, 'No implementation of how to submit a job to cluster \'%s\'' % self.name 
 124   
 125   
 126      @store_input() 
127 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 128 log=None, input_files=[], output_files=[], required_output=[], 129 nb_submit=0):
130 """How to make one submission. Return status id on the cluster. 131 NO SHARE DISK""" 132 133 if cwd is None: 134 cwd = os.getcwd() 135 if not os.path.exists(prog): 136 prog = os.path.join(cwd, prog) 137 138 if not required_output and output_files: 139 required_output = output_files 140 141 if not hasattr(self, 'temp_dir') or not self.temp_dir or \ 142 (input_files == [] == output_files): 143 return self.submit(prog, argument, cwd, stdout, stderr, log, 144 required_output=required_output, nb_submit=nb_submit) 145 146 if not input_files and not output_files: 147 # not input/output so not using submit2 148 return self.submit(prog, argument, cwd, stdout, stderr, log, 149 required_output=required_output, nb_submit=nb_submit) 150 151 if cwd is None: 152 cwd = os.getcwd() 153 if not os.path.exists(prog): 154 prog = os.path.join(cwd, prog) 155 temp_file_name = "sub." + os.path.basename(prog) + '.'.join(argument) 156 157 text = """#!/bin/bash 158 MYTMP=%(tmpdir)s/run$%(job_id)s 159 MYPWD=%(cwd)s 160 mkdir -p $MYTMP 161 cd $MYPWD 162 input_files=( %(input_files)s ) 163 for i in ${input_files[@]} 164 do 165 cp -R -L $i $MYTMP 166 done 167 cd $MYTMP 168 echo '%(arguments)s' > arguments 169 chmod +x ./%(script)s 170 %(program)s ./%(script)s %(arguments)s 171 exit=$? 172 output_files=( %(output_files)s ) 173 for i in ${output_files[@]} 174 do 175 cp -r $MYTMP/$i $MYPWD 176 done 177 # if [ "$exit" -eq "0" ] 178 # then 179 rm -rf $MYTMP 180 # fi 181 """ 182 183 dico = {'tmpdir' : self.temp_dir, 'script': os.path.basename(prog), 184 'cwd': cwd, 'job_id': self.job_id, 185 'input_files': ' '.join(input_files + [prog]), 186 'output_files': ' '.join(output_files), 187 'arguments': ' '.join([str(a) for a in argument]), 188 'program': ' ' if '.py' in prog else 'bash'} 189 190 # writing a new script for the submission 191 new_prog = pjoin(cwd, temp_file_name) 192 open(new_prog, 'w').write(text % dico) 193 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 194 195 return self.submit(new_prog, argument, cwd, stdout, stderr, log, 196 required_output=required_output, nb_submit=nb_submit)
197 198
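
To make the template above concrete: for a hypothetical job on a Condor-like cluster (job_id placeholder CONDOR_ID, invented paths and file names), submit2 would write a wrapper of this form next to the original executable and submit it instead:

    #!/bin/bash
    MYTMP=/scratch/mg5tmp/run$CONDOR_ID
    MYPWD=/home/user/PROC/SubProcesses/P1_gg_ttx
    mkdir -p $MYTMP
    cd $MYPWD
    input_files=( input_app.txt symfact.dat /home/user/PROC/SubProcesses/P1_gg_ttx/madevent )
    for i in ${input_files[@]}
    do
    cp -R -L $i $MYTMP
    done
    cd $MYTMP
    echo '0 1' > arguments
    chmod +x ./madevent
    bash ./madevent 0 1
    exit=$?
    output_files=( results.dat events.lhe )
    for i in ${output_files[@]}
    do
    cp -r $MYTMP/$i $MYPWD
    done
    # if [ "$exit" -eq "0" ]
    # then
    rm -rf $MYTMP
    # fi
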
 199      def cluster_submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 
 200                         log=None, input_files=[], output_files=[], required_output=[], 
 201                         nb_submit=0, packet_member=None): 
 202          """Wrap the cluster submission in a cluster-independent way. 
 203          Should not be overwritten (except for DAG-type submission).""" 
 204   
 205          id = self.submit2(prog, argument, cwd, stdout, stderr, log, input_files, 
 206                            output_files, required_output, nb_submit) 
 207   
 208   
 209          if not packet_member: 
 210              return id 
 211          else: 
 212              if isinstance(packet_member, Packet): 
 213                  self.id_to_packet[id] = packet_member 
 214                  packet_member.put(id) 
 215                  if packet_member.tag not in self.packet: 
 216                      self.packet[packet_member.tag] = packet_member 
 217              else: 
 218                  if packet_member in self.packet: 
 219                      packet = self.packet[packet_member] 
 220                      packet.put(id) 
 221                      self.id_to_packet[id] = packet 
 222              return id 
 223   
 224      def control(self, me_dir=None): 
 225          """Check the status of jobs associated to directory me_dir. Return (idle, run, finish, fail)""" 
 226          if not self.submitted_ids: 
 227              raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name 
 228          idle, run, fail = 0, 0, 0 
 229          for pid in self.submitted_ids[:]: 
 230              status = self.control_one_job(pid) 
 231              if status == 'I': 
 232                  idle += 1 
 233              elif status == 'R': 
 234                  run += 1 
 235              elif status == 'F': 
 236                  self.finish += 1 
 237                  self.submitted_ids.remove(pid) 
 238              else: 
 239                  fail += 1 
 240   
 241          return idle, run, self.finish, fail 
 242   
 243      def control_one_job(self, pid): 
 244          """Control the status of a single job with its cluster id""" 
 245          raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name 
 246   
 247      def get_jobs_identifier(self, path, second_path=None): 
 248          """Build a unique run name for all the jobs; it helps to identify the runs 
 249          in the controller of some clusters.""" 
 250   
 251          if second_path: 
 252              path = os.path.realpath(pjoin(path, second_path)) 
 253          elif not os.path.exists(path): 
 254              return path # job already done 
 255   
 256          if 'SubProcesses' in path: 
 257              target = path.rsplit('/SubProcesses',1)[0] 
 258          elif 'MCatNLO' in path: 
 259              target = path.rsplit('/MCatNLO',1)[0] 
 260          elif second_path: 
 261              target = path 
 262              logger.warning("cluster.get_job_identifier runs unexpectedly. This should be fine but report this message if you have problems.") 
 263          else: 
 264              target = path 
 265   
 266          if target.endswith('/'): 
 267              target = target[:-1] 
 268   
 269          target = misc.digest(target)[-self.identifier_length:] 
 270          if not target[0].isalpha(): 
 271              target = 'a' + target[1:] 
 272   
 273          return target 
 274   
 275   
 276      @check_interupt() 
277 - def wait(self, me_dir, fct, minimal_job=0, update_first=None):
278 """Wait that all job are finish. 279 if minimal_job set, then return if idle + run is lower than that number""" 280 281 282 mode = 1 # 0 is long waiting/ 1 is short waiting 283 nb_iter = 0 284 nb_short = 0 285 change_at = 5 # number of iteration from which we wait longer between update. 286 287 if update_first: 288 idle, run, finish, fail = self.control(me_dir) 289 update_first(idle, run, finish) 290 291 #usefull shortcut for readibility 292 longtime, shorttime = self.options['cluster_status_update'] 293 294 nb_job = 0 295 296 if self.options['cluster_type'] == 'htcaas2': 297 me_dir = self.metasubmit(self) 298 299 while 1: 300 old_mode = mode 301 nb_iter += 1 302 idle, run, finish, fail = self.control(me_dir) 303 if nb_job: 304 if idle + run + finish + fail != nb_job: 305 nb_job = idle + run + finish + fail 306 nb_iter = 1 # since some packet finish prevent to pass in long waiting mode 307 else: 308 nb_job = idle + run + finish + fail 309 if fail: 310 raise ClusterManagmentError('Some Jobs are in a Hold/... state. Please try to investigate or contact the IT team') 311 if idle + run == 0: 312 #time.sleep(20) #security to ensure that the file are really written on the disk 313 logger.info('All jobs finished') 314 fct(idle, run, finish) 315 break 316 if idle + run < minimal_job: 317 return 318 fct(idle, run, finish) 319 #Determine how much we have to wait (mode=0->long time, mode=1->short time) 320 if nb_iter < change_at: 321 mode = 1 322 elif idle < run: 323 if old_mode == 0: 324 if nb_short: 325 mode = 0 #we already be back from short to long so stay in long 326 #check if we need to go back to short mode 327 elif idle: 328 if nb_iter > change_at + int(longtime)//shorttime: 329 mode = 0 #stay in long waiting mode 330 else: 331 mode = 1 # pass in short waiting mode 332 nb_short =0 333 else: 334 mode = 1 # pass in short waiting mode 335 nb_short = 0 336 elif old_mode == 1: 337 nb_short +=1 338 if nb_short > 3* max(change_at, int(longtime)//shorttime): 339 mode = 0 #go back in slow waiting 340 else: 341 mode = 0 342 343 #if pass from fast(mode=1) to slow(mode=0) make a print statement: 344 if old_mode > mode: 345 logger.info('''Start to wait %ss between checking status. 346 Note that you can change this time in the configuration file. 347 Press ctrl-C to force the update.''' % self.options['cluster_status_update'][0]) 348 349 #now Waiting! 350 if mode == 0: 351 try: 352 time.sleep(self.options['cluster_status_update'][0]) 353 except KeyboardInterrupt: 354 logger.info('start to update the status') 355 nb_iter = min(0, change_at -2) 356 nb_short = 0 357 else: 358 time.sleep(self.options['cluster_status_update'][1]) 359 360 361 self.submitted = 0 362 self.submitted_ids = []
363
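
The fct (and update_first) callback passed to wait() simply receives the current (idle, run, finish) counters each time the cluster is polled. A minimal sketch of such a reporter; the cluster instance and me_dir path are assumed to exist and are hypothetical:

    def report(idle, run, finish):
        print('idle: %s   running: %s   finished: %s' % (idle, run, finish))

    # my_cluster is any concrete Cluster subclass with jobs already submitted
    # my_cluster.wait('/path/to/me_dir', report, update_first=report)
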
364 - def check_termination(self, job_id):
365 """Check the termination of the jobs with job_id and relaunch it if needed.""" 366 367 368 if job_id not in self.retry_args: 369 if job_id in self.id_to_packet: 370 nb_in_packet = self.id_to_packet[job_id].remove_one() 371 if nb_in_packet == 0: 372 # packet done run the associate function 373 packet = self.id_to_packet[job_id] 374 # fully ensure that the packet is finished (thread safe) 375 packet.queue.join() 376 #running the function 377 packet.fct(*packet.args) 378 del self.id_to_packet[job_id] 379 return 'resubmit' 380 else: 381 return True 382 383 args = self.retry_args[job_id] 384 if 'time_check' in args: 385 time_check = args['time_check'] 386 else: 387 time_check = 0 388 389 for path in args['required_output']: 390 if args['cwd']: 391 path = pjoin(args['cwd'], path) 392 # check that file exists and is not empty. 393 if not (os.path.exists(path) and os.stat(path).st_size != 0) : 394 break 395 else: 396 # all requested output are present 397 if time_check > 0: 398 logger.info('Job %s Finally found the missing output.' % (job_id)) 399 del self.retry_args[job_id] 400 self.submitted_ids.remove(job_id) 401 # check if the job_id is in a packet 402 if job_id in self.id_to_packet: 403 nb_in_packet = self.id_to_packet[job_id].remove_one() 404 if nb_in_packet == 0: 405 # packet done run the associate function 406 packet = self.id_to_packet[job_id] 407 # fully ensure that the packet is finished (thread safe) 408 packet.queue.join() 409 #running the function 410 packet.fct(*packet.args) 411 del self.id_to_packet[job_id] 412 return 'resubmit' 413 414 return 'done' 415 416 if time_check == 0: 417 logger.debug('''Job %s: missing output:%s''' % (job_id,path)) 418 args['time_check'] = time.time() 419 return 'wait' 420 elif self.cluster_retry_wait > time.time() - time_check: 421 return 'wait' 422 423 #jobs failed to be completed even after waiting time!! 424 if self.nb_retry < 0: 425 logger.critical('''Fail to run correctly job %s. 426 with option: %s 427 file missing: %s''' % (job_id, args, path)) 428 raw_input('press enter to continue.') 429 elif self.nb_retry == 0: 430 logger.critical('''Fail to run correctly job %s. 431 with option: %s 432 file missing: %s. 433 Stopping all runs.''' % (job_id, args, path)) 434 self.remove() 435 elif args['nb_submit'] >= self.nb_retry: 436 logger.critical('''Fail to run correctly job %s. 437 with option: %s 438 file missing: %s 439 Fails %s times 440 No resubmition. ''' % (job_id, args, path, args['nb_submit'])) 441 self.remove() 442 else: 443 args['nb_submit'] += 1 444 logger.warning('resubmit job (for the %s times)' % args['nb_submit']) 445 del self.retry_args[job_id] 446 self.submitted_ids.remove(job_id) 447 if 'time_check' in args: 448 del args['time_check'] 449 if job_id in self.id_to_packet: 450 self.id_to_packet[job_id].remove_one() 451 args['packet_member'] = self.id_to_packet[job_id] 452 del self.id_to_packet[job_id] 453 self.cluster_submit(**args) 454 else: 455 self.submit2(**args) 456 return 'resubmit' 457 return 'done'
458 459 @check_interupt()
460 - def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None, 461 stderr=None, log=None, required_output=[], nb_submit=0, 462 input_files=[], output_files=[]):
463 """launch one job on the cluster and wait for it""" 464 465 special_output = False # tag for concatenate the error with the output. 466 if stderr == -2 and stdout: 467 #We are suppose to send the output to stdout 468 special_output = True 469 stderr = stdout + '.err' 470 471 id = self.submit2(prog, argument, cwd, stdout, stderr, log, 472 required_output=required_output, input_files=input_files, 473 output_files=output_files) 474 475 if self.options['cluster_type']=='htcaas2': 476 if self.submitted == self.submitted_ids[-1]: 477 id = self.metasubmit(self) 478 479 frame = inspect.currentframe() 480 args, _, _, values = inspect.getargvalues(frame) 481 args = dict([(i, values[i]) for i in args if i != 'self']) 482 self.retry_args[id] = args 483 484 nb_wait=0 485 while 1: 486 nb_wait+=1 487 status = self.control_one_job(id) 488 if not status in ['R','I']: 489 status = self.check_termination(id) 490 if status in ['wait']: 491 time.sleep(30) 492 continue 493 elif status in ['resubmit']: 494 id = self.submitted_ids[0] 495 time.sleep(30) 496 continue 497 #really stop! 498 time.sleep(30) #security to ensure that the file are really written on the disk 499 break 500 time.sleep(self.options['cluster_status_update'][1]) 501 502 if required_output: 503 status = self.check_termination(id) 504 if status == 'wait': 505 run += 1 506 elif status == 'resubmit': 507 idle += 1 508 509 510 if special_output: 511 # combine the stdout and the stderr 512 #wait up to 50 s to see if those files exists 513 for i in range(5): 514 if os.path.exists(stdout): 515 if not os.path.exists(stderr): 516 time.sleep(5) 517 if os.path.exists(stderr): 518 err_text = open(stderr).read() 519 if not err_text: 520 return 521 logger.warning(err_text) 522 text = open(stdout).read() 523 open(stdout,'w').write(text + err_text) 524 else: 525 return 526 time.sleep(10)
527
 528      def remove(self, *args, **opts): 
 529          """ """ 
 530          logger.warning("""This cluster does not support job removal, 
 531              the jobs are still running on the cluster.""") 
 532   
 533      @store_input() 
 534      def metasubmit(self, me_dir): 
 535          logger.warning("""This cluster does not support metajob submission.""") 
 536          return 0 
 537   
 538  class Packet(object): 
 539      """An object for handling a packet of jobs; it is designed to be thread safe 
 540      """ 
 541   
 542      def __init__(self, name, fct, args, opts={}): 
 543          import Queue 
 544          import threading 
 545          self.queue = Queue.Queue() 
 546          self.tag = name 
 547          self.fct = fct 
 548          self.args = args 
 549          self.opts = opts 
 550          self.done = threading.Event() 
 551   
 552      def put(self, *args, **opts): 
 553          self.queue.put(*args, **opts) 
 554   
 555      append = put 
 556   
 557      def remove_one(self): 
 558          self.queue.get(True) 
 559          self.queue.task_done() 
 560          return self.queue.qsize() 
 561   
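
A small standalone illustration (hypothetical names and ids) of the bookkeeping a Packet provides: each submitted job id is put() into it, and remove_one() reports how many jobs of the packet are still pending, so the caller knows when to run fct.

    def combine_results(run_name):
        print('all jobs of %s are done, combining' % run_name)

    pack = Packet('survey_run01', combine_results, ('run01',))
    for job_id in (101, 102, 103):          # pretend cluster ids
        pack.put(job_id)

    for job_id in (101, 102, 103):          # pretend the jobs finish one by one
        left = pack.remove_one()
        if left == 0:
            pack.queue.join()               # make sure the queue is fully drained
            pack.fct(*pack.args)            # -> "all jobs of run01 are done, combining"
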
 562  class MultiCore(Cluster): 
 563      """Class for dealing with the submission on a multi-core machine""" 
 564   
 565      job_id = "$" 
 566   
 567      def __init__(self, *args, **opt): 
 568          """Init the cluster""" 
 569   
 570   
 571          super(MultiCore, self).__init__(self, *args, **opt) 
 572   
 573          import Queue 
 574          import threading 
 575          import thread 
 576          self.queue = Queue.Queue()     # list of jobs to do 
 577          self.done = Queue.Queue()      # list of jobs finished 
 578          self.submitted = Queue.Queue() # one entry by job submitted 
 579          self.stoprequest = threading.Event() # flag to ensure everything closes 
 580          self.demons = [] 
 581          self.nb_done = 0 
 582          if 'nb_core' in opt: 
 583              self.nb_core = opt['nb_core'] 
 584          elif isinstance(args[0], int): 
 585              self.nb_core = args[0] 
 586          else: 
 587              self.nb_core = 1 
 588          self.update_fct = None 
 589   
 590          self.lock = threading.Event() # allow nice lock of the main thread 
 591          self.pids = Queue.Queue()     # allow to clean jobs submitted via subprocess 
 592          self.done_pid = []            # list of jobs finished 
 593          self.done_pid_queue = Queue.Queue() 
 594          self.fail_msg = None 
 595   
 596          # starting the worker nodes 
 597          for _ in range(self.nb_core): 
 598              self.start_demon() 
 599   
 600   
 601      def start_demon(self): 
 602          import threading 
 603          t = threading.Thread(target=self.worker) 
 604          t.daemon = True 
 605          t.start() 
 606          self.demons.append(t) 
 607   
 608   
609 - def worker(self):
610 import Queue 611 import thread 612 while not self.stoprequest.isSet(): 613 try: 614 args = self.queue.get() 615 tag, exe, arg, opt = args 616 try: 617 # check for executable case 618 if isinstance(exe,str): 619 if os.path.exists(exe) and not exe.startswith('/'): 620 exe = './' + exe 621 if isinstance(opt['stdout'],str): 622 opt['stdout'] = open(opt['stdout'],'w') 623 if opt['stderr'] == None: 624 opt['stderr'] = subprocess.STDOUT 625 proc = misc.Popen([exe] + arg, **opt) 626 pid = proc.pid 627 self.pids.put(pid) 628 proc.wait() 629 if proc.returncode not in [0, 143, -15] and not self.stoprequest.isSet(): 630 fail_msg = 'program %s launch ends with non zero status: %s. Stop all computation' % \ 631 (' '.join([exe]+arg), proc.returncode) 632 logger.warning(fail_msg) 633 self.stoprequest.set() 634 self.remove(fail_msg) 635 # handle the case when this is a python function. Note that 636 # this use Thread so they are NO built-in parralelization this is 637 # going to work on a single core! (but this is fine for IO intensive 638 # function. for CPU intensive fct this will slow down the computation 639 else: 640 pid = tag 641 self.pids.put(pid) 642 # the function should return 0 if everything is fine 643 # the error message otherwise 644 returncode = exe(*arg, **opt) 645 if returncode != 0: 646 logger.warning("fct %s does not return 0. Stopping the code in a clean way. The error was:\n%s", exe, returncode) 647 self.stoprequest.set() 648 self.remove("fct %s does not return 0:\n %s" % (exe, returncode)) 649 except Exception,error: 650 self.fail_msg = sys.exc_info() 651 logger.warning(str(error)) 652 self.stoprequest.set() 653 self.remove(error) 654 655 if __debug__: 656 raise self.fail_msg[0], self.fail_msg[1],self.fail_msg[2] 657 658 self.queue.task_done() 659 self.done.put(tag) 660 self.done_pid_queue.put(pid) 661 #release the mother to print the status on the screen 662 try: 663 self.lock.set() 664 except thread.error: 665 continue 666 except Queue.Empty: 667 continue
668 669 670 671
 672      def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 
 673                 log=None, required_output=[], nb_submit=0): 
 674          """submit a job on multicore machine""" 
 675   
 676          tag = (prog, tuple(argument), cwd, nb_submit) 
 677          if isinstance(prog, str): 
 678   
 679              opt = {'cwd': cwd, 
 680                     'stdout': stdout, 
 681                     'stderr': stderr} 
 682              self.queue.put((tag, prog, argument, opt)) 
 683              self.submitted.put(1) 
 684              return tag 
 685          else: 
 686              # python function 
 687              self.queue.put((tag, prog, argument, {})) 
 688              self.submitted.put(1) 
 689              return tag 
 690   
 691      def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None, 
 692                          stderr=None, log=None, **opts): 
 693          """Launch one job and wait for it""" 
 694          if isinstance(stdout, str): 
 695              stdout = open(stdout, 'w') 
 696          if isinstance(stderr, str): 
 697              stderr = open(stderr, 'w') 
 698          return misc.call([prog] + argument, stdout=stdout, stderr=stderr, cwd=cwd) 
 699   
700 - def remove(self, error=None):
701 """Ensure that all thread are killed""" 702 703 # ensure the worker to stop 704 self.stoprequest.set() 705 if error and not self.fail_msg: 706 self.fail_msg = error 707 708 # cleaning the queue done_pid_queue and move them to done_pid 709 while not self.done_pid_queue.empty(): 710 pid = self.done_pid_queue.get() 711 self.done_pid.append(pid) 712 # self.done_pid_queue.task_done() 713 714 while not self.pids.empty(): 715 pid = self.pids.get() 716 self.pids.task_done() 717 if isinstance(pid, tuple): 718 continue 719 if pid in self.done_pid: 720 continue 721 out = os.system('CPIDS=$(pgrep -P %(pid)s); kill -15 $CPIDS > /dev/null 2>&1' \ 722 % {'pid':pid} ) 723 out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid':pid} )
724 725
726 - def wait(self, me_dir, update_status, update_first=None):
727 """Waiting that all the jobs are done. This function also control that 728 the submission by packet are handle correctly (i.e. submit the function)""" 729 730 import Queue 731 import threading 732 733 try: # to catch KeyBoardInterupt to see which kind of error to display 734 last_status = (0, 0, 0) 735 sleep_time = 1 736 use_lock = True 737 first = True 738 while True: 739 force_one_more_loop = False # some security 740 741 # Loop over the job tagged as done to check if some packet of jobs 742 # are finished in case, put the associate function in the queue 743 while self.done.qsize(): 744 try: 745 tag = self.done.get(True, 1) 746 except Queue.Empty: 747 pass 748 else: 749 if self.id_to_packet and tuple(tag) in self.id_to_packet: 750 packet = self.id_to_packet[tuple(tag)] 751 remaining = packet.remove_one() 752 if remaining == 0: 753 # fully ensure that the packet is finished (thread safe) 754 packet.queue.join() 755 self.submit(packet.fct, packet.args) 756 force_one_more_loop = True 757 self.nb_done += 1 758 self.done.task_done() 759 760 # Get from the various queue the Idle/Done/Running information 761 # Those variable should be thread safe but approximate. 762 Idle = self.queue.qsize() 763 Done = self.nb_done + self.done.qsize() 764 Running = max(0, self.submitted.qsize() - Idle - Done) 765 766 if Idle + Running <= 0 and not force_one_more_loop: 767 update_status(Idle, Running, Done) 768 # Going the quit since everything is done 769 # Fully Ensure that everything is indeed done. 770 self.queue.join() 771 break 772 773 if (Idle, Running, Done) != last_status: 774 if first and update_first: 775 update_first(Idle, Running, Done) 776 first = False 777 else: 778 update_status(Idle, Running, Done) 779 last_status = (Idle, Running, Done) 780 781 # cleaning the queue done_pid_queue and move them to done_pid 782 while not self.done_pid_queue.empty(): 783 pid = self.done_pid_queue.get() 784 self.done_pid.append(pid) 785 self.done_pid_queue.task_done() 786 787 788 # Define how to wait for the next iteration 789 if use_lock: 790 # simply wait that a worker release the lock 791 use_lock = self.lock.wait(300) 792 self.lock.clear() 793 if not use_lock and Idle > 0: 794 use_lock = True 795 else: 796 # to be sure that we will never fully lock at the end pass to 797 # a simple time.sleep() 798 time.sleep(sleep_time) 799 sleep_time = min(sleep_time + 2, 180) 800 if update_first: 801 update_first(Idle, Running, Done) 802 803 if self.stoprequest.isSet(): 804 if isinstance(self.fail_msg, Exception): 805 raise self.fail_msg 806 elif isinstance(self.fail_msg, str): 807 raise Exception, self.fail_msg 808 else: 809 misc.sprint(self.fail_msg) 810 raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2] 811 # reset variable for next submission 812 try: 813 self.lock.clear() 814 except Exception: 815 pass 816 self.done = Queue.Queue() 817 self.done_pid = [] 818 self.done_pid_queue = Queue.Queue() 819 self.nb_done = 0 820 self.submitted = Queue.Queue() 821 self.pids = Queue.Queue() 822 self.stoprequest.clear() 823 824 except KeyboardInterrupt: 825 # if one of the node fails -> return that error 826 if isinstance(self.fail_msg, Exception): 827 raise self.fail_msg 828 elif isinstance(self.fail_msg, str): 829 raise Exception, self.fail_msg 830 elif self.fail_msg: 831 raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2] 832 # else return orignal error 833 raise
834
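
A small end-to-end sketch of the MultiCore backend on the local machine; the shell command, log paths and callback are invented, and the snippet assumes the names of this module are in scope (e.g. via "from madgraph.various.cluster import MultiCore").

    def status(idle, run, finish):
        print('%s idle / %s running / %s done' % (idle, run, finish))

    mc = MultiCore(nb_core=2)
    for i in range(4):
        mc.submit('echo', argument=['job %s' % i], stdout='/tmp/job_%s.log' % i)
    mc.wait(None, status)     # blocks until the four echo commands have finished

Python functions can be submitted the same way (mc.submit(my_fct, argument=[...])); as the worker comments note, those run in threads, so only IO-bound functions gain real concurrency.
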
835 -class CondorCluster(Cluster):
836 """Basic class for dealing with cluster submission""" 837 838 name = 'condor' 839 job_id = 'CONDOR_ID' 840 841 842 843 @multiple_try()
844 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 845 required_output=[], nb_submit=0):
846 """Submit a job prog to a Condor cluster""" 847 848 text = """Executable = %(prog)s 849 output = %(stdout)s 850 error = %(stderr)s 851 log = %(log)s 852 %(argument)s 853 environment = CONDOR_ID=$(Cluster).$(Process) 854 Universe = vanilla 855 notification = Error 856 Initialdir = %(cwd)s 857 %(requirement)s 858 getenv=True 859 queue 1 860 """ 861 862 if self.cluster_queue not in ['None', None]: 863 requirement = 'Requirements = %s=?=True' % self.cluster_queue 864 else: 865 requirement = '' 866 867 if cwd is None: 868 cwd = os.getcwd() 869 if stdout is None: 870 stdout = '/dev/null' 871 if stderr is None: 872 stderr = '/dev/null' 873 if log is None: 874 log = '/dev/null' 875 if not os.path.exists(prog): 876 prog = os.path.join(cwd, prog) 877 if argument: 878 argument = 'Arguments = %s' % ' '.join(argument) 879 else: 880 argument = '' 881 882 883 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout, 884 'stderr': stderr,'log': log,'argument': argument, 885 'requirement': requirement} 886 887 #open('submit_condor','w').write(text % dico) 888 a = misc.Popen(['condor_submit'], stdout=subprocess.PIPE, 889 stdin=subprocess.PIPE) 890 output, _ = a.communicate(text % dico) 891 #output = a.stdout.read() 892 #Submitting job(s). 893 #Logging submit event(s). 894 #1 job(s) submitted to cluster 2253622. 895 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE) 896 try: 897 id = pat.search(output).groups()[0] 898 except: 899 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 900 % output 901 self.submitted += 1 902 self.submitted_ids.append(id) 903 return id
904 905 @store_input() 906 @multiple_try()
907 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 908 log=None, input_files=[], output_files=[], required_output=[], 909 nb_submit=0):
910 """Submit the job on the cluster NO SHARE DISK 911 input/output file should be give relative to cwd 912 """ 913 914 if not required_output and output_files: 915 required_output = output_files 916 917 if (input_files == [] == output_files): 918 return self.submit(prog, argument, cwd, stdout, stderr, log, 919 required_output=required_output, nb_submit=nb_submit) 920 921 text = """Executable = %(prog)s 922 output = %(stdout)s 923 error = %(stderr)s 924 log = %(log)s 925 %(argument)s 926 should_transfer_files = YES 927 when_to_transfer_output = ON_EXIT 928 transfer_input_files = %(input_files)s 929 %(output_files)s 930 Universe = vanilla 931 notification = Error 932 Initialdir = %(cwd)s 933 %(requirement)s 934 getenv=True 935 queue 1 936 """ 937 938 if self.cluster_queue not in ['None', None]: 939 requirement = 'Requirements = %s=?=True' % self.cluster_queue 940 else: 941 requirement = '' 942 943 if cwd is None: 944 cwd = os.getcwd() 945 if stdout is None: 946 stdout = '/dev/null' 947 if stderr is None: 948 stderr = '/dev/null' 949 if log is None: 950 log = '/dev/null' 951 if not os.path.exists(prog): 952 prog = os.path.join(cwd, prog) 953 if argument: 954 argument = 'Arguments = %s' % ' '.join([str(a) for a in argument]) 955 else: 956 argument = '' 957 # input/output file treatment 958 if input_files: 959 input_files = ','.join(input_files) 960 else: 961 input_files = '' 962 if output_files: 963 output_files = 'transfer_output_files = %s' % ','.join(output_files) 964 else: 965 output_files = '' 966 967 968 969 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout, 970 'stderr': stderr,'log': log,'argument': argument, 971 'requirement': requirement, 'input_files':input_files, 972 'output_files':output_files} 973 974 #open('submit_condor','w').write(text % dico) 975 a = subprocess.Popen(['condor_submit'], stdout=subprocess.PIPE, 976 stdin=subprocess.PIPE) 977 output, _ = a.communicate(text % dico) 978 #output = a.stdout.read() 979 #Submitting job(s). 980 #Logging submit event(s). 981 #1 job(s) submitted to cluster 2253622. 982 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE) 983 try: 984 id = pat.search(output).groups()[0] 985 except: 986 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 987 % output 988 self.submitted += 1 989 self.submitted_ids.append(id) 990 return id
991 992 993 994 995 996 @multiple_try(nb_try=10, sleep=10)
997 - def control_one_job(self, id):
998 """ control the status of a single job with it's cluster id """ 999 cmd = 'condor_q '+str(id)+" -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'" 1000 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1001 stderr=subprocess.PIPE) 1002 1003 error = status.stderr.read() 1004 if status.returncode or error: 1005 raise ClusterManagmentError, 'condor_q returns error: %s' % error 1006 1007 return status.stdout.readline().strip()
1008 1009 @check_interupt() 1010 @multiple_try(nb_try=10, sleep=10)
1011 - def control(self, me_dir):
1012 """ control the status of a single job with it's cluster id """ 1013 1014 if not self.submitted_ids: 1015 return 0, 0, 0, 0 1016 1017 packet = 15000 1018 idle, run, fail = 0, 0, 0 1019 ongoing = [] 1020 for i in range(1+(len(self.submitted_ids)-1)//packet): 1021 start = i * packet 1022 stop = (i+1) * packet 1023 cmd = "condor_q " + ' '.join(self.submitted_ids[start:stop]) + \ 1024 " -format \'%-2s\ ' \'ClusterId\' " + \ 1025 " -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'" 1026 1027 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1028 stderr=subprocess.PIPE) 1029 error = status.stderr.read() 1030 if status.returncode or error: 1031 raise ClusterManagmentError, 'condor_q returns error: %s' % error 1032 1033 for line in status.stdout: 1034 id, status = line.strip().split() 1035 ongoing.append(int(id)) 1036 if status in ['I','U']: 1037 idle += 1 1038 elif status == 'R': 1039 run += 1 1040 elif status != 'C': 1041 fail += 1 1042 1043 for id in list(self.submitted_ids): 1044 if int(id) not in ongoing: 1045 status = self.check_termination(id) 1046 if status == 'wait': 1047 run += 1 1048 elif status == 'resubmit': 1049 idle += 1 1050 1051 return idle, run, self.submitted - (idle+run+fail), fail
1052 1053 @multiple_try()
1054      def remove(self, *args, **opts): 
1055          """Clean the jobs on the cluster""" 
1056   
1057          if not self.submitted_ids: 
1058              return 
1059          cmd = "condor_rm %s" % ' '.join(self.submitted_ids) 
1060   
1061          status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 
1062          self.submitted_ids = [] 
1063   
1064 -class PBSCluster(Cluster):
1065 """Basic class for dealing with cluster submission""" 1066 1067 name = 'pbs' 1068 job_id = 'PBS_JOBID' 1069 idle_tag = ['Q'] 1070 running_tag = ['T','E','R'] 1071 complete_tag = ['C'] 1072 1073 maximum_submited_jobs = 2500 1074 1075 @multiple_try()
1076 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1077 required_output=[], nb_submit=0):
1078 """Submit a job prog to a PBS cluster""" 1079 1080 me_dir = self.get_jobs_identifier(cwd, prog) 1081 1082 if len(self.submitted_ids) > self.maximum_submited_jobs: 1083 fct = lambda idle, run, finish: logger.info('Waiting for free slot: %s %s %s' % (idle, run, finish)) 1084 self.wait(me_dir, fct, self.maximum_submited_jobs) 1085 1086 1087 text = "" 1088 if cwd is None: 1089 cwd = os.getcwd() 1090 else: 1091 text = " cd %s;" % cwd 1092 if stdout is None: 1093 stdout = '/dev/null' 1094 if stderr is None: 1095 stderr = '/dev/null' 1096 elif stderr == -2: # -2 is subprocess.STDOUT 1097 stderr = stdout 1098 if log is None: 1099 log = '/dev/null' 1100 1101 if not os.path.isabs(prog): 1102 text += "./%s" % prog 1103 else: 1104 text+= prog 1105 1106 if argument: 1107 text += ' ' + ' '.join(argument) 1108 1109 command = ['qsub','-o', stdout, 1110 '-N', me_dir, 1111 '-e', stderr, 1112 '-V'] 1113 1114 if self.cluster_queue and self.cluster_queue != 'None': 1115 command.extend(['-q', self.cluster_queue]) 1116 1117 a = misc.Popen(command, stdout=subprocess.PIPE, 1118 stderr=subprocess.STDOUT, 1119 stdin=subprocess.PIPE, cwd=cwd) 1120 1121 output = a.communicate(text)[0] 1122 id = output.split('.')[0] 1123 if not id.isdigit() or a.returncode !=0: 1124 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1125 % output 1126 1127 self.submitted += 1 1128 self.submitted_ids.append(id) 1129 return id
1130 1131 @multiple_try()
1132 - def control_one_job(self, id):
1133 """ control the status of a single job with it's cluster id """ 1134 cmd = 'qstat '+str(id) 1135 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1136 stderr=subprocess.STDOUT) 1137 1138 for line in status.stdout: 1139 line = line.strip() 1140 if 'cannot connect to server' in line or 'cannot read reply' in line: 1141 raise ClusterManagmentError, 'server disconnected' 1142 if 'Unknown' in line: 1143 return 'F' 1144 elif line.startswith(str(id)): 1145 jobstatus = line.split()[4] 1146 else: 1147 jobstatus="" 1148 1149 if status.returncode != 0 and status.returncode is not None: 1150 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode 1151 if jobstatus in self.idle_tag: 1152 return 'I' 1153 elif jobstatus in self.running_tag: 1154 return 'R' 1155 return 'F'
1156 1157 1158 @multiple_try()
1159 - def control(self, me_dir):
1160 """ control the status of a single job with it's cluster id """ 1161 cmd = "qstat" 1162 status = misc.Popen([cmd], stdout=subprocess.PIPE) 1163 1164 me_dir = self.get_jobs_identifier(me_dir) 1165 1166 ongoing = [] 1167 1168 idle, run, fail = 0, 0, 0 1169 for line in status.stdout: 1170 if 'cannot connect to server' in line or 'cannot read reply' in line: 1171 raise ClusterManagmentError, 'server disconnected' 1172 if me_dir in line: 1173 ongoing.append(line.split()[0].split('.')[0]) 1174 status2 = line.split()[4] 1175 if status2 in self.idle_tag: 1176 idle += 1 1177 elif status2 in self.running_tag: 1178 run += 1 1179 elif status2 in self.complete_tag: 1180 if not self.check_termination(line.split()[0].split('.')[0]): 1181 idle += 1 1182 else: 1183 fail += 1 1184 1185 if status.returncode != 0 and status.returncode is not None: 1186 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode 1187 1188 for id in list(self.submitted_ids): 1189 if id not in ongoing: 1190 status2 = self.check_termination(id) 1191 if status2 == 'wait': 1192 run += 1 1193 elif status2 == 'resubmit': 1194 idle += 1 1195 1196 return idle, run, self.submitted - (idle+run+fail), fail
1197 1198 @multiple_try()
1199 - def remove(self, *args, **opts):
1200 """Clean the jobs on the cluster""" 1201 1202 if not self.submitted_ids: 1203 return 1204 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1205 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1206 self.submitted_ids = []
1207
1208 1209 -class SGECluster(Cluster):
1210 """Basic class for dealing with cluster submission""" 1211 # Class written by Arian Abrahantes. 1212 1213 name = 'sge' 1214 job_id = 'JOB_ID' 1215 idle_tag = ['qw', 'hqw','hRqw','w'] 1216 running_tag = ['r','t','Rr','Rt'] 1217 identifier_length = 10 1218
1219 - def def_get_path(self,location):
1220 """replace string for path issues""" 1221 location = os.path.realpath(location) 1222 homePath = os.getenv("HOME") 1223 if homePath: 1224 location = location.replace(homePath,'$HOME') 1225 return location
1226 1227 @multiple_try()
1228 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1229 required_output=[], nb_submit=0):
1230 """Submit a job prog to an SGE cluster""" 1231 1232 me_dir = self.get_jobs_identifier(cwd, prog) 1233 1234 1235 if cwd is None: 1236 #cwd = os.getcwd() 1237 cwd = self.def_get_path(os.getcwd()) 1238 cwd1 = self.def_get_path(cwd) 1239 text = " cd %s;" % cwd1 1240 if stdout is None: 1241 stdout = '/dev/null' 1242 else: 1243 stdout = self.def_get_path(stdout) 1244 if stderr is None: 1245 stderr = '/dev/null' 1246 elif stderr == -2: # -2 is subprocess.STDOUT 1247 stderr = stdout 1248 else: 1249 stderr = self.def_get_path(stderr) 1250 1251 if log is None: 1252 log = '/dev/null' 1253 else: 1254 log = self.def_get_path(log) 1255 1256 text += prog 1257 if argument: 1258 text += ' ' + ' '.join(argument) 1259 1260 #if anything slips through argument 1261 #print "!=== inteded change ",text.replace('/srv/nfs','') 1262 #text = text.replace('/srv/nfs','') 1263 homePath = os.getenv("HOME") 1264 if homePath: 1265 text = text.replace(homePath,'$HOME') 1266 1267 logger.debug("!=== input %s" % text) 1268 logger.debug("!=== output %s" % stdout) 1269 logger.debug("!=== error %s" % stderr) 1270 logger.debug("!=== logs %s" % log) 1271 1272 command = ['qsub','-o', stdout, 1273 '-N', me_dir, 1274 '-e', stderr, 1275 '-V'] 1276 1277 if self.cluster_queue and self.cluster_queue != 'None': 1278 command.extend(['-q', self.cluster_queue]) 1279 1280 a = misc.Popen(command, stdout=subprocess.PIPE, 1281 stderr=subprocess.STDOUT, 1282 stdin=subprocess.PIPE, cwd=cwd) 1283 1284 output = a.communicate(text)[0] 1285 id = output.split(' ')[2] 1286 if not id.isdigit(): 1287 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1288 % output 1289 self.submitted += 1 1290 self.submitted_ids.append(id) 1291 logger.debug(output) 1292 1293 return id
1294 1295 @multiple_try()
1296 - def control_one_job(self, id):
1297 """ control the status of a single job with it's cluster id """ 1298 #cmd = 'qstat '+str(id) 1299 cmd = 'qstat ' 1300 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1301 for line in status.stdout: 1302 #print "!==",line 1303 #line = line.strip() 1304 #if 'Unknown' in line: 1305 # return 'F' 1306 #elif line.startswith(str(id)): 1307 # status = line.split()[4] 1308 if str(id) in line: 1309 status = line.split()[4] 1310 #print "!=status", status 1311 if status in self.idle_tag: 1312 return 'I' 1313 elif status in self.running_tag: 1314 return 'R' 1315 return 'F'
1316 1317 @multiple_try()
1318 - def control(self, me_dir):
1319 """ control the status of a single job with it's cluster id """ 1320 cmd = "qstat " 1321 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1322 1323 me_dir = self.get_jobs_identifier(me_dir) 1324 1325 finished = list(self.submitted_ids) 1326 1327 idle, run, fail = 0, 0, 0 1328 for line in status.stdout: 1329 if me_dir in line: 1330 id,_,_,_,status = line.split()[:5] 1331 if status in self.idle_tag: 1332 idle += 1 1333 finished.remove(id) 1334 elif status in self.running_tag: 1335 run += 1 1336 finished.remove(id) 1337 else: 1338 logger.debug(line) 1339 fail += 1 1340 finished.remove(id) 1341 1342 for id in finished: 1343 self.check_termination(id) 1344 1345 return idle, run, self.submitted - (idle+run+fail), fail
1346 1347 1348 1349 @multiple_try()
1350 - def remove(self, *args, **opts):
1351 """Clean the jobs on the cluster""" 1352 1353 if not self.submitted_ids: 1354 return 1355 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1356 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1357 self.submitted_ids = []
1358
1359 1360 -class LSFCluster(Cluster):
1361 """Basic class for dealing with cluster submission""" 1362 1363 name = 'lsf' 1364 job_id = 'LSB_JOBID' 1365 1366 @multiple_try()
1367 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1368 required_output=[], nb_submit=0):
1369 """Submit the job prog to an LSF cluster""" 1370 1371 1372 me_dir = self.get_jobs_identifier(cwd, prog) 1373 1374 text = "" 1375 command = ['bsub', '-C0', '-J', me_dir] 1376 if cwd is None: 1377 cwd = os.getcwd() 1378 else: 1379 text = " cd %s;" % cwd 1380 if stdout and isinstance(stdout, str): 1381 command.extend(['-o', stdout]) 1382 if stderr and isinstance(stdout, str): 1383 command.extend(['-e', stderr]) 1384 elif stderr == -2: # -2 is subprocess.STDOUT 1385 pass 1386 if log is None: 1387 log = '/dev/null' 1388 1389 text += prog 1390 if argument: 1391 text += ' ' + ' '.join(argument) 1392 1393 if self.cluster_queue and self.cluster_queue != 'None': 1394 command.extend(['-q', self.cluster_queue]) 1395 1396 a = misc.Popen(command, stdout=subprocess.PIPE, 1397 stderr=subprocess.STDOUT, 1398 stdin=subprocess.PIPE, cwd=cwd) 1399 1400 output = a.communicate(text)[0] 1401 #Job <nnnn> is submitted to default queue <normal>. 1402 try: 1403 id = output.split('>',1)[0].split('<')[1] 1404 except: 1405 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1406 % output 1407 if not id.isdigit(): 1408 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1409 % output 1410 self.submitted += 1 1411 self.submitted_ids.append(id) 1412 return id
1413 1414 1415 @multiple_try()
1416 - def control_one_job(self, id):
1417 """ control the status of a single job with it's cluster id """ 1418 1419 cmd = 'bjobs '+str(id) 1420 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1421 1422 for line in status.stdout: 1423 line = line.strip().upper() 1424 if 'JOBID' in line: 1425 continue 1426 elif str(id) not in line: 1427 continue 1428 status = line.split()[2] 1429 if status == 'RUN': 1430 return 'R' 1431 elif status == 'PEND': 1432 return 'I' 1433 elif status == 'DONE': 1434 return 'F' 1435 else: 1436 return 'H' 1437 return 'F'
1438 1439 @multiple_try()
1440 - def control(self, me_dir):
1441 """ control the status of a single job with it's cluster id """ 1442 1443 if not self.submitted_ids: 1444 return 0, 0, 0, 0 1445 1446 cmd = "bjobs " + ' '.join(self.submitted_ids) 1447 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1448 1449 jobstatus = {} 1450 for line in status.stdout: 1451 line = line.strip() 1452 if 'JOBID' in line: 1453 continue 1454 splitline = line.split() 1455 id = splitline[0] 1456 if id not in self.submitted_ids: 1457 continue 1458 jobstatus[id] = splitline[2] 1459 1460 idle, run, fail = 0, 0, 0 1461 for id in self.submitted_ids[:]: 1462 if id in jobstatus: 1463 status = jobstatus[id] 1464 else: 1465 status = 'MISSING' 1466 if status == 'RUN': 1467 run += 1 1468 elif status == 'PEND': 1469 idle += 1 1470 else: 1471 status = self.check_termination(id) 1472 if status == 'wait': 1473 run += 1 1474 elif status == 'resubmit': 1475 idle += 1 1476 1477 return idle, run, self.submitted - (idle+run+fail), fail
1478 1479 @multiple_try()
1480 - def remove(self, *args,**opts):
1481 """Clean the jobs on the cluster""" 1482 1483 if not self.submitted_ids: 1484 return 1485 cmd = "bkill %s" % ' '.join(self.submitted_ids) 1486 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1487 self.submitted_ids = []
1488
1489 -class GECluster(Cluster):
1490 """Class for dealing with cluster submission on a GE cluster""" 1491 1492 name = 'ge' 1493 job_id = 'JOB_ID' 1494 idle_tag = ['qw'] 1495 running_tag = ['r'] 1496 1497 @multiple_try()
1498 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1499 required_output=[], nb_submit=0):
1500 """Submit a job prog to a GE cluster""" 1501 1502 text = "" 1503 if cwd is None: 1504 cwd = os.getcwd() 1505 else: 1506 text = " cd %s; bash " % cwd 1507 if stdout is None: 1508 stdout = os.path.join(cwd, "log.%s" % prog.split('/')[-1]) 1509 if stderr is None: 1510 stderr = os.path.join(cwd, "err.%s" % prog.split('/')[-1]) 1511 elif stderr == -2: # -2 is subprocess.STDOUT 1512 stderr = stdout 1513 if log is None: 1514 log = '/dev/null' 1515 1516 text += prog 1517 if argument: 1518 text += ' ' + ' '.join(argument) 1519 text += '\n' 1520 tmp_submit = os.path.join(cwd, 'tmp_submit') 1521 open(tmp_submit,'w').write(text) 1522 1523 a = misc.Popen(['qsub','-o', stdout, 1524 '-e', stderr, 1525 tmp_submit], 1526 stdout=subprocess.PIPE, 1527 stderr=subprocess.STDOUT, 1528 stdin=subprocess.PIPE, cwd=cwd) 1529 1530 output = a.communicate()[0] 1531 #Your job 874511 ("test.sh") has been submitted 1532 pat = re.compile("Your job (\d*) \(",re.MULTILINE) 1533 try: 1534 id = pat.search(output).groups()[0] 1535 except: 1536 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1537 % output 1538 self.submitted += 1 1539 self.submitted_ids.append(id) 1540 return id
1541 1542 @multiple_try()
1543 - def control_one_job(self, id):
1544 """ control the status of a single job with it's cluster id """ 1545 cmd = 'qstat | grep '+str(id) 1546 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1547 if not status: 1548 return 'F' 1549 #874516 0.00000 test.sh alwall qw 03/04/2012 22:30:35 1 1550 pat = re.compile("^(\d+)\s+[\d\.]+\s+[\w\d\.]+\s+[\w\d\.]+\s+(\w+)\s") 1551 stat = '' 1552 for line in status.stdout.read().split('\n'): 1553 if not line: 1554 continue 1555 line = line.strip() 1556 try: 1557 groups = pat.search(line).groups() 1558 except: 1559 raise ClusterManagmentError, 'bad syntax for stat: \n\"%s\"' % line 1560 if groups[0] != id: continue 1561 stat = groups[1] 1562 if not stat: 1563 return 'F' 1564 if stat in self.idle_tag: 1565 return 'I' 1566 if stat in self.running_tag: 1567 return 'R'
1568 1569 @multiple_try()
1570 - def control(self, me_dir=None):
1571 """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)""" 1572 if not self.submitted_ids: 1573 return 0, 0, 0, 0 1574 idle, run, fail = 0, 0, 0 1575 ongoing = [] 1576 for statusflag in ['p', 'r', 'sh']: 1577 cmd = 'qstat -s %s' % statusflag 1578 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1579 #874516 0.00000 test.sh alwall qw 03/04/2012 22:30:35 1 1580 pat = re.compile("^(\d+)") 1581 for line in status.stdout.read().split('\n'): 1582 line = line.strip() 1583 try: 1584 id = pat.search(line).groups()[0] 1585 except Exception: 1586 pass 1587 else: 1588 if id not in self.submitted_ids: 1589 continue 1590 ongoing.append(id) 1591 if statusflag == 'p': 1592 idle += 1 1593 if statusflag == 'r': 1594 run += 1 1595 if statusflag == 'sh': 1596 fail += 1 1597 for id in list(self.submitted_ids): 1598 if id not in ongoing: 1599 self.check_termination(id) 1600 #self.submitted_ids = ongoing 1601 1602 return idle, run, self.submitted - idle - run - fail, fail
1603 1604 @multiple_try()
1605 - def remove(self, *args, **opts):
1606 """Clean the jobs on the cluster""" 1607 1608 if not self.submitted_ids: 1609 return 1610 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1611 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1612 self.submitted_ids = []
1613
1614  def asyncrone_launch(exe, cwd=None, stdout=None, argument=[], **opt): 
1615      """Start a computation without waiting for it to finish. 
1616      This function returns a lock which is locked as long as the job is 
1617      running.""" 
1618   
1619      mc = MultiCore(1) 
1620      mc.submit(exe, argument, cwd, stdout, **opt) 
1621      mc.need_waiting = True 
1622      return mc.lock 
1623
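
A usage sketch (the script name and paths are hypothetical): the returned object is the MultiCore lock, a threading.Event that the worker sets once the job has finished, so the caller can keep working and block on it later.

    lock = asyncrone_launch('./collect_events.sh', cwd='/tmp/run_01', argument=['1'])
    # ... do other work while the job runs in the background ...
    lock.wait()    # returns once the background job has completed
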
1624 1625 -class SLURMCluster(Cluster):
1626 """Basic class for dealing with cluster submission""" 1627 1628 name = 'slurm' 1629 job_id = 'SLURM_JOBID' 1630 idle_tag = ['Q','PD','S','CF'] 1631 running_tag = ['R', 'CG'] 1632 complete_tag = ['C'] 1633 identifier_length = 8 1634 1635 @multiple_try()
1636 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1637 required_output=[], nb_submit=0):
1638 """Submit a job prog to a SLURM cluster""" 1639 1640 me_dir = self.get_jobs_identifier(cwd, prog) 1641 1642 1643 if cwd is None: 1644 cwd = os.getcwd() 1645 if stdout is None: 1646 stdout = '/dev/null' 1647 if stderr is None: 1648 stderr = '/dev/null' 1649 elif stderr == -2: # -2 is subprocess.STDOUT 1650 stderr = stdout 1651 if log is None: 1652 log = '/dev/null' 1653 1654 command = ['sbatch', '-o', stdout, 1655 '-J', me_dir, 1656 '-e', stderr, prog] + argument 1657 1658 if self.cluster_queue and self.cluster_queue != 'None': 1659 command.insert(1, '-p') 1660 command.insert(2, self.cluster_queue) 1661 1662 a = misc.Popen(command, stdout=subprocess.PIPE, 1663 stderr=subprocess.STDOUT, 1664 stdin=subprocess.PIPE, cwd=cwd) 1665 1666 output = a.communicate() 1667 output_arr = output[0].split(' ') 1668 id = output_arr[3].rstrip() 1669 1670 if not id.isdigit(): 1671 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1672 % (output[0] + '\n' + output[1]) 1673 1674 self.submitted += 1 1675 self.submitted_ids.append(id) 1676 return id
1677 1678 @multiple_try()
1679 - def control_one_job(self, id):
1680 """ control the status of a single job with it's cluster id """ 1681 cmd = 'squeue j'+str(id) 1682 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1683 stderr=open(os.devnull,'w')) 1684 1685 for line in status.stdout: 1686 line = line.strip() 1687 if 'Invalid' in line: 1688 return 'F' 1689 elif line.startswith(str(id)): 1690 status = line.split()[4] 1691 if status in self.idle_tag: 1692 return 'I' 1693 elif status in self.running_tag: 1694 return 'R' 1695 return 'F'
1696 1697 @multiple_try()
1698 - def control(self, me_dir):
1699 """ control the status of a single job with it's cluster id """ 1700 cmd = "squeue" 1701 pstatus = misc.Popen([cmd], stdout=subprocess.PIPE) 1702 1703 me_dir = self.get_jobs_identifier(me_dir) 1704 1705 idle, run, fail = 0, 0, 0 1706 ongoing=[] 1707 for line in pstatus.stdout: 1708 if me_dir in line: 1709 id, _, _,_ , status,_ = line.split(None,5) 1710 ongoing.append(id) 1711 if status in self.idle_tag: 1712 idle += 1 1713 elif status in self.running_tag: 1714 run += 1 1715 elif status in self.complete_tag: 1716 status = self.check_termination(id) 1717 if status == 'wait': 1718 run += 1 1719 elif status == 'resubmit': 1720 idle += 1 1721 else: 1722 fail += 1 1723 1724 #control other finished job 1725 for id in list(self.submitted_ids): 1726 if id not in ongoing: 1727 status = self.check_termination(id) 1728 if status == 'wait': 1729 run += 1 1730 elif status == 'resubmit': 1731 idle += 1 1732 1733 1734 return idle, run, self.submitted - (idle+run+fail), fail
1735 1736 @multiple_try()
1737 - def remove(self, *args, **opts):
1738 """Clean the jobs on the cluster""" 1739 1740 if not self.submitted_ids: 1741 return 1742 cmd = "scancel %s" % ' '.join(self.submitted_ids) 1743 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1744 self.submitted_ids = []
1745
1746 -class HTCaaSCluster(Cluster):
1747 """Class for dealing with cluster submission on a HTCaaS cluster using GPFS """ 1748 1749 name= 'htcaas' 1750 job_id = 'HTCAAS_JOBID' 1751 idle_tag = ['waiting'] 1752 running_tag = ['preparing','running'] 1753 complete_tag = ['done'] 1754 1755 @store_input() 1756 @multiple_try()
1757 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 1758 log=None, input_files=[], output_files=[], required_output=[], 1759 nb_submit=0):
1760 """Submit the HTCaaS job on the cluster with NO SHARE DISK 1761 input/output file should be given as relative to CWd 1762 """ 1763 # To make workspace name(temp) 1764 cur_usr = os.getenv('USER') 1765 1766 if cwd is None: 1767 cwd = os.getcwd() 1768 1769 cwd_cp = cwd.rsplit("/",2) 1770 1771 if not stdout is None: 1772 print "stdout: %s" % stdout 1773 1774 if not os.path.exists(prog): 1775 prog = os.path.join(cwd, prog) 1776 1777 if not required_output and output_files: 1778 required_output = output_files 1779 1780 logger.debug(prog) 1781 if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog : 1782 cwd_arg = cwd+"/arguments" 1783 temp = ' '.join([str(a) for a in argument]) 1784 arg_cmd="echo '"+temp+"' > " + cwd_arg 1785 command = ['htcaas-mgjob-submit','-d',cwd,'-e',os.path.basename(prog)] 1786 if argument : 1787 command.extend(['-a ', '='.join([str(a) for a in argument])]) 1788 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1789 id = a.stdout.read().strip() 1790 1791 else: 1792 cwd_arg = cwd+"/arguments" 1793 temp = ' '.join([str(a) for a in argument]) 1794 temp_file_name = "sub." + os.path.basename(prog) 1795 text = """#!/bin/bash 1796 MYPWD=%(cwd)s 1797 cd $MYPWD 1798 input_files=(%(input_files)s ) 1799 for i in ${input_files[@]} 1800 do 1801 chmod -f +x $i 1802 done 1803 /bin/bash %(prog)s %(arguments)s > %(stdout)s 1804 """ 1805 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog, 1806 'arguments': ' '.join([str(a) for a in argument]), 1807 'program': ' ' if '.py' in prog else 'bash'} 1808 1809 # writing a new script for the submission 1810 new_prog = pjoin(cwd, temp_file_name) 1811 open(new_prog, 'w').write(text % dico) 1812 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 1813 command = ['htcaas-mgjob-submit','-d',cwd,'-e',temp_file_name] 1814 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1815 id = a.stdout.read().strip() 1816 logger.debug(id) 1817 1818 nb_try=0 1819 nb_limit=5 1820 if not id.isdigit() : 1821 print "[ID is not digit]:" + id 1822 1823 while not id.isdigit() : 1824 nb_try+=1 1825 print "[fail_retry]:"+ nb_try 1826 a=misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1827 id = a.stdout.read().strip() 1828 if nb_try > nb_limit : 1829 raise ClusterManagementError, 'fail to submit to the HTCaaS cluster: \n %s' % id 1830 break 1831 1832 self.submitted += 1 1833 self.submitted_ids.append(id) 1834 1835 return id
1836 1837 @multiple_try(nb_try=10, sleep=5)
1838 - def control_one_job(self, id):
1839 """ control the status of a single job with it's cluster id """ 1840 1841 if id == 0 : 1842 status_out ='C' 1843 else : 1844 cmd = 'htcaas-job-status -m '+str(id)+ " -s | grep Status " 1845 status = misc.Popen([cmd], shell=True,stdout=subprocess.PIPE, 1846 stderr=subprocess.PIPE) 1847 error = status.stderr.read() 1848 if status.returncode or error: 1849 raise ClusterManagmentError, 'htcaas-job-submit returns error: %s' % error 1850 status_out= status.stdout.read().strip() 1851 status_out= status_out.split(":",1)[1] 1852 if status_out == 'waiting': 1853 status_out='I' 1854 elif status_out == 'preparing' or status_out == 'running': 1855 status_out = 'R' 1856 elif status_out != 'done': 1857 status_out = 'F' 1858 elif status_out == 'done': 1859 status_out = 'C' 1860 1861 return status_out
1862  
1863      @multiple_try()
1864 - def control(self, me_dir):
1865          """ control the status of all submitted jobs """
1866          if not self.submitted_ids:
1867              logger.debug("self.submitted_ids is empty")
1868              return 0, 0, 0, 0
1869  
1870          ongoing = []
1871          idle, run, fail = 0, 0, 0
1872  
1873          start = self.submitted_ids[0]
1874          end = self.submitted_ids[-1]
1875  
1876          cmd = "htcaas-job-status -c " + str(start) + "-" + str(end)  # +" -ac"
1877          status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1878  
1879          for line in status.stdout:
1880              #ongoing.append(line.split()[0].strip())
1881              status2 = line.split()[-1]
1882              if status2 != 'null' or line.split()[0].strip() != '0':
1883                  ongoing.append(line.split()[0].strip())
1884                  logger.debug("[" + line.split()[0].strip() + "]" + status2)
1885              if status2 == 'null' or line.split()[0].strip() == '0':
1886                  idle += 1
1887              elif status2 in self.idle_tag:
1888                  idle += 1
1889              elif status2 in self.running_tag:
1890                  run += 1
1891              elif status2 in self.complete_tag:
1892                  if not self.check_termination(line.split()[0]):
1893                      idle += 1
1894              else:
1895                  fail += 1
1896  
1897          return idle, run, self.submitted - (idle+run+fail), fail
1898  
1899      @multiple_try()
1900 - def remove(self, *args, **opts):
1901          """Clean the jobs on the cluster"""
1902  
1903          if not self.submitted_ids:
1904              return
1905          for job_id in self.submitted_ids:
1906              cmd = "htcaas-job-cancel -m %s" % job_id
1907              status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))
1908
1909 -class HTCaaS2Cluster(Cluster):
1910      """Class for dealing with cluster submission on an HTCaaS cluster without GPFS """
1911  
1912      name = 'htcaas2'
1913      job_id = 'HTCAAS2_JOBID'
1914      idle_tag = ['waiting']
1915      running_tag = ['preparing', 'running']
1916      complete_tag = ['done']
1917  
1918      @store_input()
1919      @multiple_try()
1920 -    def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
1921                   log=None, input_files=[], output_files=[], required_output=[],
1922                   nb_submit=0):
1923  
1924          """Submit the HTCaaS job on the cluster with NO SHARED DISK
1925             input/output files should be given relative to CWD
1926          """
1927          if cwd is None:
1928              cwd = os.getcwd()
1929  
1930          if not os.path.exists(prog):
1931              prog = os.path.join(cwd, prog)
1932  
1933          if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog:
1934              if cwd or prog:
1935                  self.submitted_dirs.append(cwd)
1936                  self.submitted_exes.append(prog)
1937              else:
1938                  logger.debug("cwd and prog do not exist -> " + cwd + " / " + os.path.basename(prog))
1939  
1940              if argument:
1941                  self.submitted_args.append('='.join([str(a) for a in argument]))
1942  
1943              if cwd or prog:
1944                  self.submitted += 1
1945                  id = self.submitted
1946                  self.submitted_ids.append(id)
1947              else:
1948                  logger.debug("cwd and prog do not exist!")
1949                  id = 0
1950  
1951          else:
1952              temp_file_name = "sub." + os.path.basename(prog)
1953              text = """#!/bin/bash
1954  MYPWD=%(cwd)s
1955  cd $MYPWD
1956  input_files=(%(input_files)s )
1957  for i in ${input_files[@]}
1958  do
1959      chmod -f +x $i
1960  done
1961  /bin/bash %(prog)s %(arguments)s > %(stdout)s
1962  """
1963              dico = {'cwd': cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog': prog,
1964                      'arguments': ' '.join([str(a) for a in argument]),
1965                      'program': ' ' if '.py' in prog else 'bash'}
1966              # write a new wrapper script for the submission
1967              new_prog = pjoin(cwd, temp_file_name)
1968              open(new_prog, 'w').write(text % dico)
1969              misc.Popen(['chmod', '+x', new_prog], cwd=cwd)
1970              command = ['htcaas-mgjob-submit', '-d', cwd, '-e', new_prog]
1971              a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1972              id = a.stdout.read().strip()
1973              logger.debug("[mode2]-[" + str(id) + "]")
1974              if cwd and prog:
1975                  self.submitted += 1
1976                  self.submitted_ids.append(id)
1977              else:
1978                  logger.debug("cwd and prog do not exist!")
1979                  id = 0
1980  
1981          return id
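    # Note (behaviour summarised from the code above): in this no-shared-disk
    # mode most jobs are not sent to the scheduler immediately; their working
    # directory, executable and '='-joined arguments are only buffered in
    # self.submitted_dirs / self.submitted_exes / self.submitted_args, and the
    # returned id is just a local counter.  Only 'combine'/'pythia'/'shower'
    # jobs are submitted right away; everything else is packed into a single
    # htcaas-mgjob-submit call later, in metasubmit() below.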
1982  
1983      @multiple_try()
1984 - def metasubmit(self, me_dir=None):
1985          if self.submitted > 1100 and self.submitted == len(self.submitted_ids):
1986              tmp_leng = len(self.submitted_ids) / 2
1987              tmp_dirs1 = self.submitted_dirs[0:tmp_leng]
1988              tmp_dirs2 = self.submitted_dirs[tmp_leng:]
1989              tmp_exes1 = self.submitted_exes[0:tmp_leng]
1990              tmp_exes2 = self.submitted_exes[tmp_leng:]
1991              command1 = ['htcaas-mgjob-submit', '-d', ":".join([str(a) for a in tmp_dirs1 if a and a != ' ']),
1992                          '-e', ":".join([str(a) for a in tmp_exes1 if a and a != ' '])]
1993              command2 = ['htcaas-mgjob-submit', '-d', ":".join([str(a) for a in tmp_dirs2 if a and a != ' ']),
1994                          '-e', ":".join([str(a) for a in tmp_exes2 if a and a != ' '])]
1995              if len(self.submitted_args) > 0:
1996                  tmp_args1 = self.submitted_args[0:tmp_leng]
1997                  tmp_args2 = self.submitted_args[tmp_leng:]
1998                  command1.extend(['-a', ':'.join([str(a) for a in tmp_args1])])
1999                  command2.extend(['-a', ':'.join([str(a) for a in tmp_args2])])
2000              result1 = misc.Popen(command1, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
2001              result2 = misc.Popen(command2, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
2002              me_dir = str(result1.stdout.read().strip()) + "//" + str(result2.stdout.read().strip())
2003  
2004          elif self.submitted > 0 and self.submitted == self.submitted_ids[-1]:
2005              command = ['htcaas-mgjob-submit', '-d', ":".join([str(a) for a in self.submitted_dirs if a and a != ' ']),
2006                         '-e', ":".join([str(a) for a in self.submitted_exes if a and a != ' '])]
2007              if len(self.submitted_args) > 0:
2008                  command.extend(['-a', ':'.join([str(a) for a in self.submitted_args])])
2009              if self.submitted_dirs[0] or self.submitted_exes[0]:
2010                  result = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
2011                  me_dir = result.stdout.read().strip()
2012                  self.submitted_ids[0] = me_dir
2013              else:
2014                  me_dir = self.submitted_ids[-1]
2015          elif self.submitted > 0 and self.submitted != self.submitted_ids[-1]:
2016              me_dir = self.submitted_ids[0]
2017          else:
2018              me_dir = -1
2019  
2020          logger.debug("[" + str(me_dir) + "]")
2021  
2022          self.submitted_dirs = []
2023          self.submitted_exes = []
2024          self.submitted_args = []
2025  
2026          return me_dir
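    # Illustrative sketch of the packed submission built above (paths and ids
    # are invented, not output of a real scheduler): with two buffered jobs
    # the command would resemble
    #
    #     htcaas-mgjob-submit -d /tmp/run/P1:/tmp/run/P2 -e ajob1:ajob2 -a 0=1:0=2
    #
    # i.e. directories, executables and argument strings are colon-joined in
    # matching order, and the command prints a single meta-job id (e.g. 12345)
    # which control() later queries; above ~1100 jobs the list is split in two
    # and the two meta-ids are returned joined as "12345//12346".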
2027  
2028  
2029      @multiple_try(nb_try=10, sleep=5)
2030 - def control_one_job(self, id):
2031          """ control the status of a single job with its cluster id """
2032          #logger.debug("CONTROL ONE JOB MODE")
2033          if self.submitted == self.submitted_ids[-1]:
2034              id = self.metasubmit()
2035              tempid = self.submitted_ids[-1]
2036              self.submitted_ids.remove(self.submitted_ids[-1])
2037              self.submitted_ids.append(id)
2038              logger.debug(str(id) + " // " + str(self.submitted_ids[-1]))
2039  
2040          if id == 0:
2041              status_out = 'C'
2042          else:
2043              cmd = 'htcaas-job-status -m ' + str(id) + " -s | grep Status "
2044              status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
2045                                  stderr=subprocess.PIPE)
2046              error = status.stderr.read()
2047              if status.returncode or error:
2048                  raise ClusterManagmentError, 'htcaas-job-status returns error: %s' % error
2049              status_out = status.stdout.read().strip()
2050              status_out = status_out.split(":", 1)[1]
2051              logger.debug("[[" + str(id) + "]]" + status_out)
2052              if status_out == 'waiting':
2053                  status_out = 'I'
2054              elif status_out == 'preparing' or status_out == 'running':
2055                  status_out = 'R'
2056              elif status_out != 'done':
2057                  status_out = 'F'
2058              elif status_out == 'done':
2059                  status_out = 'C'
2060                  self.submitted -= 1
2061  
2062          return status_out
2063  
2064      @multiple_try()
2065 - def control(self, me_dir):
2066          """ control the status of all submitted jobs of this run """
2067          if not self.submitted_ids:
2068              logger.debug("self.submitted_ids is empty")
2069              return 0, 0, 0, 0
2070  
2071          if "//" in me_dir:
2072              if int(me_dir.split("//")[0]) < int(me_dir.split("//")[1]):
2073                  start = me_dir.split("//")[0]
2074                  end = me_dir.split("//")[1]
2075              else:
2076                  start = me_dir.split("//")[1]
2077                  end = me_dir.split("//")[0]
2078          elif "/" in me_dir:  # update
2079              start = 0
2080              end = 0
2081          elif me_dir.isdigit():
2082              start = me_dir
2083              end = me_dir
2084          elif not me_dir.isdigit():
2085              start = end = me_dir = self.submitted_ids[0]
2086              logger.debug("Meta_ID is not digit (control), self.submitted_ids[0]: " + str(me_dir))
2087  
2088          ongoing = []
2089          idle, run, fail, done = 0, 0, 0, 0
2090  
2091          cmd = "htcaas-job-status -c " + str(start) + "-" + str(end) + " -ac"
2092          status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
2093  
2094          for line in status.stdout:
2095              status2 = line.split()[-1]
2096              if status2 != 'null' or line.split()[0].strip() != '0':
2097                  ongoing.append(str(line.split()[0].strip()) + "-" + str(line.split()[1].strip()))
2098                  logger.debug("[" + line.split()[0].strip() + "-" + line.split()[1].strip() + "]" + status2)
2099  
2100              if status2 == 'null' or line.split()[0].strip() == '0':
2101                  idle += 1
2102              elif status2 in self.idle_tag:
2103                  idle += 1
2104              elif status2 in self.running_tag:
2105                  run += 1
2106              elif status2 in self.complete_tag:
2107                  done += 1
2108                  self.submitted -= 1
2109                  if not self.check_termination(line.split()[1]):
2110                      idle += 1
2111              else:
2112                  fail += 1
2113  
2114          return idle, run, self.submitted - (idle+run+fail), fail
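    # Hedged example of the me_dir values handled above (ids invented): a
    # single meta-job gives me_dir = "12345", so start = end = "12345"; a
    # split submission gives me_dir = "12345//12346", where the smaller id
    # becomes `start` and the larger `end` for the range query
    #     htcaas-job-status -c 12345-12346 -ac
    # whose per-job status lines are then counted as idle/running/done/failed.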
2115  
2116      @multiple_try()
2117 - def remove(self, *args, **opts):
2118          """Clean the jobs on the cluster"""
2119  
2120          if not self.submitted_ids:
2121              return
2122          id = self.submitted_ids[0]
2123          if id != 0:
2124              cmd = "htcaas-job-cancel -m %s" % str(id)
2125              status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w'))
2126  
2127  from_name = {'condor': CondorCluster, 'pbs': PBSCluster, 'sge': SGECluster,
2128               'lsf': LSFCluster, 'ge': GECluster, 'slurm': SLURMCluster,
2129               'htcaas': HTCaaSCluster, 'htcaas2': HTCaaS2Cluster}
2130  
2131  onecore = MultiCore(1)  # create a thread to run simple bash jobs without having to
2132                          # fork the main process
2133  
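# Illustrative usage sketch.  The option values, script name and paths below
# are assumptions made only for this example; real values come from the
# MadGraph run/cluster configuration.
if __name__ == '__main__':
    example_opts = {'cluster_queue': 'madgraph',
                    'cluster_nb_retry': 1,
                    'cluster_retry_wait': 60}
    # pick a backend class by its configuration name through from_name
    my_cluster = from_name['htcaas2'](**example_opts)
    # a real submission would then look like (kept commented out on purpose):
    # job_id = my_cluster.submit2('./run_job.sh', argument=[0, 1],
    #                             cwd='/tmp/example_run', stdout='log.txt')
    # idle, running, finished, failed = my_cluster.control(my_cluster.metasubmit())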