
Source Code for Module madgraph.various.cluster

   1  ################################################################################ 
   2  # Copyright (c) 2009 The MadGraph5_aMC@NLO Development team and Contributors              
   3  # 
   4  # This file is a part of the MadGraph5_aMC@NLO project, an application which            
   5  # automatically generates Feynman diagrams and matrix elements for arbitrary     
   6  # high-energy processes in the Standard Model and beyond.                        
   7  # 
   8  # It is subject to the MadGraph5_aMC@NLO license which should accompany this              
   9  # distribution.                                                                  
  10  #                                                                                
  11  # For more information, visit madgraph.phys.ucl.ac.be and amcatnlo.web.cern.ch             
  12  #                                                                                
  13  ################################################################################ 
  14  import subprocess 
  15  import logging 
  16  import os 
  17  import time 
  18  import re 
  19  import glob 
  20  import inspect 
  21  import sys 
  22   
  23  logger = logging.getLogger('madgraph.cluster')  
  24   
  25  try: 
  26      from madgraph import MadGraph5Error 
  27      import madgraph.various.misc as misc 
  28  except Exception, error: 
  29      if __debug__: 
  30          print  str(error) 
  31      from internal import MadGraph5Error 
  32      import internal.misc as misc 
  33   
  34  pjoin = os.path.join 
  35   
  36  class ClusterManagmentError(MadGraph5Error): 
  37      pass 
  38   
  39  class NotImplemented(MadGraph5Error): 
  40      pass 
  41   
  42   
  43  multiple_try = misc.multiple_try 
  44  pjoin = os.path.join 
  45   
  46   
  47  def check_interupt(error=KeyboardInterrupt): 
  48   
  49      def deco_interupt(f): 
  50          def deco_f_interupt(self, *args, **opt): 
  51              try: 
  52                  return f(self, *args, **opt) 
  53              except error: 
  54                  try: 
  55                      self.remove(*args, **opt) 
  56                  except Exception: 
  57                      pass 
  58                  raise error 
  59          return deco_f_interupt 
  60      return deco_interupt 
  61   
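The decorator above is applied to the waiting/launch methods of the Cluster classes further down; a minimal sketch of how it is meant to be used (MyCluster and its body are hypothetical):

    class MyCluster(Cluster):

        @check_interupt()
        def wait(self, me_dir, fct):
            # poll the scheduler here; on KeyboardInterrupt the decorator first
            # attempts self.remove(me_dir, fct) to clean up queued jobs, then re-raises
            pass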
  62  def store_input(arg=''): 
  63   
  64      def deco_store(f): 
  65          def deco_f_store(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 
  66                           input_files=[], output_files=[], required_output=[], nb_submit=0): 
  67              frame = inspect.currentframe() 
  68              args, _, _, values = inspect.getargvalues(frame) 
  69              args = dict([(i, values[i]) for i in args if i != 'self']) 
  70              id = f(self, **args) 
  71              if self.nb_retry > 0: 
  72                  self.retry_args[id] = args 
  73              return id 
  74          return deco_f_store 
  75      return deco_store 
  76   
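A hedged illustration of what the decorator records (the `cluster` instance and 'run.sh' are hypothetical); note that the arguments are only kept when the cluster was built with a positive cluster_nb_retry:

    job_id = cluster.submit2('run.sh', argument=['0'], cwd='/tmp/run')
    # the full call is stored so that a failed job can be resubmitted identically:
    cluster.retry_args[job_id]['prog']       # -> 'run.sh'
    cluster.retry_args[job_id]['nb_submit']  # -> 0 until the job has been resubmitted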
  77  def need_transfer(options): 
  78      """This function checks whether transfer of the input files is necessary 
  79      given the running options.""" 
  80   
  81      if options['run_mode'] != 1 and options['cluster_temp_path'] is None: 
  82          return False 
  83      else: 
  84          return True 
  85   
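An illustration of the rule encoded above, using the option keys as they appear in the run options (the dictionaries themselves are made up for the example):

    need_transfer({'run_mode': 2, 'cluster_temp_path': None})        # -> False: multicore run, shared disk
    need_transfer({'run_mode': 1, 'cluster_temp_path': None})        # -> True:  cluster run
    need_transfer({'run_mode': 2, 'cluster_temp_path': '/scratch'})  # -> True:  local scratch requested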
  86  class Cluster(object): 
  87      """Basic Class for all cluster type submission""" 
  88      name = 'mother class' 
  89      identifier_length = 14 
  90   
  91      def __init__(self, *args, **opts): 
  92          """Init the cluster""" 
  93   
  94          self.submitted = 0 
  95          self.submitted_ids = [] 
  96          self.finish = 0 
  97          self.submitted_dirs = []  #HTCaaS 
  98          self.submitted_exes = []  #HTCaaS 
  99          self.submitted_args = []  #HTCaaS 
 100   
 101          if 'cluster_queue' in opts: 
 102              self.cluster_queue = opts['cluster_queue'] 
 103          else: 
 104              self.cluster_queue = 'madgraph' 
 105          if 'cluster_temp_path' in opts: 
 106              self.temp_dir = opts['cluster_temp_path'] 
 107          else: 
 108              self.temp_dir = None 
 109          self.options = {'cluster_status_update': (600, 30)} 
 110          for key, value in opts.items(): 
 111              self.options[key] = value 
 112          self.nb_retry = opts['cluster_nb_retry'] if 'cluster_nb_retry' in opts else 0 
 113          self.cluster_retry_wait = float(opts['cluster_retry_wait']) if 'cluster_retry_wait' in opts else 300 
 114          self.options = dict(opts) 
 115          self.retry_args = {} 
 116          # controlling jobs in controlled type submission 
 117          self.packet = {} 
 118          self.id_to_packet = {} 
119
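A hedged construction sketch showing the keyword options read above (the values are illustrative; CondorCluster is defined further down in this module):

    cluster = CondorCluster(cluster_queue='madgraph',
                            cluster_temp_path=None,             # no worker-local scratch directory
                            cluster_nb_retry=2,                 # resubmit a failing job up to 2 times
                            cluster_retry_wait=300,             # grace period (s) for missing output
                            cluster_status_update=(600, 30))    # long/short polling intervals used by wait()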
 120      def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 
 121                 log=None, required_output=[], nb_submit=0): 
 122          """How to make one submission. Return status id on the cluster.""" 
 123          raise NotImplemented, 'No implementation of how to submit a job to cluster \'%s\'' % self.name 
124 125 126 @store_input()
127 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 128 log=None, input_files=[], output_files=[], required_output=[], 129 nb_submit=0):
130 """How to make one submission. Return status id on the cluster. 131 NO SHARE DISK""" 132 133 if cwd is None: 134 cwd = os.getcwd() 135 if not os.path.exists(prog): 136 prog = os.path.join(cwd, prog) 137 138 if not required_output and output_files: 139 required_output = output_files 140 141 if not hasattr(self, 'temp_dir') or not self.temp_dir or \ 142 (input_files == [] == output_files): 143 return self.submit(prog, argument, cwd, stdout, stderr, log, 144 required_output=required_output, nb_submit=nb_submit) 145 146 if not input_files and not output_files: 147 # not input/output so not using submit2 148 return self.submit(prog, argument, cwd, stdout, stderr, log, 149 required_output=required_output, nb_submit=nb_submit) 150 151 if cwd is None: 152 cwd = os.getcwd() 153 if not os.path.exists(prog): 154 prog = os.path.join(cwd, prog) 155 temp_file_name = "sub." + os.path.basename(prog) + '.'.join(argument) 156 157 text = """#!/bin/bash 158 MYTMP=%(tmpdir)s/run$%(job_id)s 159 MYPWD=%(cwd)s 160 mkdir -p $MYTMP 161 cd $MYPWD 162 input_files=( %(input_files)s ) 163 for i in ${input_files[@]} 164 do 165 cp -R -L $i $MYTMP 166 done 167 cd $MYTMP 168 echo '%(arguments)s' > arguments 169 chmod +x ./%(script)s 170 %(program)s ./%(script)s %(arguments)s 171 exit=$? 172 output_files=( %(output_files)s ) 173 for i in ${output_files[@]} 174 do 175 cp -r $MYTMP/$i $MYPWD 176 done 177 # if [ "$exit" -eq "0" ] 178 # then 179 rm -rf $MYTMP 180 # fi 181 """ 182 183 dico = {'tmpdir' : self.temp_dir, 'script': os.path.basename(prog), 184 'cwd': cwd, 'job_id': self.job_id, 185 'input_files': ' '.join(input_files + [prog]), 186 'output_files': ' '.join(output_files), 187 'arguments': ' '.join([str(a) for a in argument]), 188 'program': ' ' if '.py' in prog else 'bash'} 189 190 # writing a new script for the submission 191 new_prog = pjoin(cwd, temp_file_name) 192 open(new_prog, 'w').write(text % dico) 193 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 194 195 return self.submit(new_prog, argument, cwd, stdout, stderr, log, 196 required_output=required_output, nb_submit=nb_submit)
197 198
 199      def cluster_submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 
 200                         log=None, input_files=[], output_files=[], required_output=[], 
 201                         nb_submit=0, packet_member=None): 
 202          """This function wraps the cluster submission in a cluster-independent way. 
 203          It should not be overwritten (except for DAG-type submission).""" 
 204   
 205          id = self.submit2(prog, argument, cwd, stdout, stderr, log, input_files, 
 206                            output_files, required_output, nb_submit) 
 207   
 208   
 209          if not packet_member: 
 210              return id 
 211          else: 
 212              if isinstance(packet_member, Packet): 
 213                  self.id_to_packet[id] = packet_member 
 214                  packet_member.put(id) 
 215                  if packet_member.tag not in self.packet: 
 216                      self.packet[packet_member.tag] = packet_member 
 217              else: 
 218                  if packet_member in self.packet: 
 219                      packet = self.packet[packet_member] 
 220                      packet.put(id) 
 221                      self.id_to_packet[id] = packet 
 222              return id 
223
 224      def control(self, me_dir=None): 
 225          """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)""" 
 226          if not self.submitted_ids: 
 227              raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name 
 228          idle, run, fail = 0, 0, 0 
 229          for pid in self.submitted_ids[:]: 
 230              status = self.control_one_job(pid) 
 231              if status == 'I': 
 232                  idle += 1 
 233              elif status == 'R': 
 234                  run += 1 
 235              elif status == 'F': 
 236                  self.finish += 1 
 237                  self.submitted_ids.remove(pid) 
 238              else: 
 239                  fail += 1 
 240   
 241          return idle, run, self.finish, fail 
242
 243      def control_one_job(self, pid): 
 244          """ control the status of a single job with its cluster id """ 
 245          raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name 
246
247 - def get_jobs_identifier(self, path, second_path=None):
248 """get a unique run_name for all the jobs helps to identify the runs 249 in the controller for some cluster.""" 250 251 if second_path: 252 path = os.path.realpath(pjoin(path, second_path)) 253 elif not os.path.exists(path): 254 return path # job already done 255 256 if 'SubProcesses' in path: 257 target = path.rsplit('/SubProcesses',1)[0] 258 elif 'MCatNLO' in path: 259 target = path.rsplit('/MCatNLO',1)[0] 260 elif second_path: 261 target=path 262 logger.warning("cluster.get_job_identifier runs unexpectedly. This should be fine but report this message if you have problem.") 263 else: 264 target = path 265 266 if target.endswith('/'): 267 target = target[:-1] 268 269 target = misc.digest(target)[-self.identifier_length:] 270 if not target[0].isalpha(): 271 target = 'a' + target[1:] 272 273 return target
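A hedged illustration (the path is made up): the returned name is a truncated digest of the process directory, so every job of a given run shares one scheduler-side name that the control() methods can later look for:

    name = cluster.get_jobs_identifier('/home/user/PROC_sm_0/SubProcesses/P1_gg_ttx')
    # at most Cluster.identifier_length (14) characters, first character forced alphabetic;
    # it is passed as the job name (-N/-J) by the PBS/SGE/LSF/SLURM submit() methods below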
274 275 276 @check_interupt()
277 - def wait(self, me_dir, fct, minimal_job=0, update_first=None):
278 """Wait that all job are finish. 279 if minimal_job set, then return if idle + run is lower than that number""" 280 281 282 mode = 1 # 0 is long waiting/ 1 is short waiting 283 nb_iter = 0 284 nb_short = 0 285 change_at = 5 # number of iteration from which we wait longer between update. 286 287 if update_first: 288 idle, run, finish, fail = self.control(me_dir) 289 update_first(idle, run, finish) 290 291 #usefull shortcut for readibility 292 longtime, shorttime = self.options['cluster_status_update'] 293 294 nb_job = 0 295 296 if self.options['cluster_type'] == 'htcaas2': 297 me_dir = self.metasubmit(self) 298 299 while 1: 300 old_mode = mode 301 nb_iter += 1 302 idle, run, finish, fail = self.control(me_dir) 303 if nb_job: 304 if idle + run + finish + fail != nb_job: 305 nb_job = idle + run + finish + fail 306 nb_iter = 1 # since some packet finish prevent to pass in long waiting mode 307 else: 308 nb_job = idle + run + finish + fail 309 if fail: 310 raise ClusterManagmentError('Some Jobs are in a Hold/... state. Please try to investigate or contact the IT team') 311 if idle + run == 0: 312 #time.sleep(20) #security to ensure that the file are really written on the disk 313 logger.info('All jobs finished') 314 fct(idle, run, finish) 315 break 316 if idle + run < minimal_job: 317 return 318 fct(idle, run, finish) 319 #Determine how much we have to wait (mode=0->long time, mode=1->short time) 320 if nb_iter < change_at: 321 mode = 1 322 elif idle < run: 323 if old_mode == 0: 324 if nb_short: 325 mode = 0 #we already be back from short to long so stay in long 326 #check if we need to go back to short mode 327 elif idle: 328 if nb_iter > change_at + int(longtime)//shorttime: 329 mode = 0 #stay in long waiting mode 330 else: 331 mode = 1 # pass in short waiting mode 332 nb_short =0 333 else: 334 mode = 1 # pass in short waiting mode 335 nb_short = 0 336 elif old_mode == 1: 337 nb_short +=1 338 if nb_short > 3* max(change_at, int(longtime)//shorttime): 339 mode = 0 #go back in slow waiting 340 else: 341 mode = 0 342 343 #if pass from fast(mode=1) to slow(mode=0) make a print statement: 344 if old_mode > mode: 345 logger.info('''Start to wait %ss between checking status. 346 Note that you can change this time in the configuration file. 347 Press ctrl-C to force the update.''' % self.options['cluster_status_update'][0]) 348 349 #now Waiting! 350 if mode == 0: 351 try: 352 time.sleep(self.options['cluster_status_update'][0]) 353 except KeyboardInterrupt: 354 logger.info('start to update the status') 355 nb_iter = min(0, change_at -2) 356 nb_short = 0 357 else: 358 time.sleep(self.options['cluster_status_update'][1]) 359 360 361 self.submitted = 0 362 self.submitted_ids = []
363
364 - def check_termination(self, job_id):
365 """Check the termination of the jobs with job_id and relaunch it if needed.""" 366 367 368 if job_id not in self.retry_args: 369 return True 370 371 args = self.retry_args[job_id] 372 if 'time_check' in args: 373 time_check = args['time_check'] 374 else: 375 time_check = 0 376 377 for path in args['required_output']: 378 if args['cwd']: 379 path = pjoin(args['cwd'], path) 380 # check that file exists and is not empty. 381 if not (os.path.exists(path) and os.stat(path).st_size != 0) : 382 break 383 else: 384 # all requested output are present 385 if time_check > 0: 386 logger.info('Job %s Finally found the missing output.' % (job_id)) 387 del self.retry_args[job_id] 388 self.submitted_ids.remove(job_id) 389 # check if the job_id is in a packet 390 if job_id in self.id_to_packet: 391 nb_in_packet = self.id_to_packet[job_id].remove_one() 392 if nb_in_packet == 0: 393 # packet done run the associate function 394 packet = self.id_to_packet[job_id] 395 # fully ensure that the packet is finished (thread safe) 396 packet.queue.join() 397 #running the function 398 packet.fct(*packet.args) 399 del self.id_to_packet[job_id] 400 return 'resubmit' 401 402 return 'done' 403 404 if time_check == 0: 405 logger.debug('''Job %s: missing output:%s''' % (job_id,path)) 406 args['time_check'] = time.time() 407 return 'wait' 408 elif self.cluster_retry_wait > time.time() - time_check: 409 return 'wait' 410 411 #jobs failed to be completed even after waiting time!! 412 if self.nb_retry < 0: 413 logger.critical('''Fail to run correctly job %s. 414 with option: %s 415 file missing: %s''' % (job_id, args, path)) 416 raw_input('press enter to continue.') 417 elif self.nb_retry == 0: 418 logger.critical('''Fail to run correctly job %s. 419 with option: %s 420 file missing: %s. 421 Stopping all runs.''' % (job_id, args, path)) 422 self.remove() 423 elif args['nb_submit'] >= self.nb_retry: 424 logger.critical('''Fail to run correctly job %s. 425 with option: %s 426 file missing: %s 427 Fails %s times 428 No resubmition. ''' % (job_id, args, path, args['nb_submit'])) 429 self.remove() 430 else: 431 args['nb_submit'] += 1 432 logger.warning('resubmit job (for the %s times)' % args['nb_submit']) 433 del self.retry_args[job_id] 434 self.submitted_ids.remove(job_id) 435 if 'time_check' in args: 436 del args['time_check'] 437 if job_id in self.id_to_packet: 438 self.id_to_packet[job_id].remove_one() 439 args['packet_member'] = self.id_to_packet[job_id] 440 del self.id_to_packet[job_id] 441 self.cluster_submit(**args) 442 else: 443 self.submit2(**args) 444 return 'resubmit' 445 return 'done'
446 447 @check_interupt()
448 - def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None, 449 stderr=None, log=None, required_output=[], nb_submit=0, 450 input_files=[], output_files=[]):
451 """launch one job on the cluster and wait for it""" 452 453 special_output = False # tag for concatenate the error with the output. 454 if stderr == -2 and stdout: 455 #We are suppose to send the output to stdout 456 special_output = True 457 stderr = stdout + '.err' 458 459 id = self.submit2(prog, argument, cwd, stdout, stderr, log, 460 required_output=required_output, input_files=input_files, 461 output_files=output_files) 462 463 if self.options['cluster_type']=='htcaas2': 464 if self.submitted == self.submitted_ids[-1]: 465 id = self.metasubmit(self) 466 467 frame = inspect.currentframe() 468 args, _, _, values = inspect.getargvalues(frame) 469 args = dict([(i, values[i]) for i in args if i != 'self']) 470 self.retry_args[id] = args 471 472 nb_wait=0 473 while 1: 474 nb_wait+=1 475 status = self.control_one_job(id) 476 if not status in ['R','I']: 477 status = self.check_termination(id) 478 if status in ['wait']: 479 time.sleep(30) 480 continue 481 elif status in ['resubmit']: 482 id = self.submitted_ids[0] 483 time.sleep(30) 484 continue 485 #really stop! 486 time.sleep(30) #security to ensure that the file are really written on the disk 487 break 488 time.sleep(self.options['cluster_status_update'][1]) 489 490 if required_output: 491 status = self.check_termination(id) 492 if status == 'wait': 493 run += 1 494 elif status == 'resubmit': 495 idle += 1 496 497 498 if special_output: 499 # combine the stdout and the stderr 500 #wait up to 50 s to see if those files exists 501 for i in range(5): 502 if os.path.exists(stdout): 503 if not os.path.exists(stderr): 504 time.sleep(5) 505 if os.path.exists(stderr): 506 err_text = open(stderr).read() 507 if not err_text: 508 return 509 logger.warning(err_text) 510 text = open(stdout).read() 511 open(stdout,'w').write(text + err_text) 512 else: 513 return 514 time.sleep(10)
515
 516      def remove(self, *args, **opts): 
 517          """ """ 
 518          logger.warning("""This cluster type does not support job removal, 
 519              the jobs are still running on the cluster.""") 
 520   
 521      @store_input() 
 522      def metasubmit(self, me_dir): 
 523          logger.warning("""This cluster type does not support metajob submission.""") 
 524          return 0 
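Taken together, the base class only requires a new scheduler backend to provide submit() and control_one_job(); a hedged sketch of a minimal subclass (MyBatchCluster is hypothetical, and the real backends below also override control() for efficiency):

    class MyBatchCluster(Cluster):
        name = 'mybatch'
        job_id = 'MYBATCH_JOBID'

        def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
                   log=None, required_output=[], nb_submit=0):
            # call the scheduler, append the returned job id to self.submitted_ids,
            # increase self.submitted and return the id (as a string)
            raise NotImplemented('to be filled with the real submission command')

        def control_one_job(self, id):
            # query the scheduler and return 'I' (idle), 'R' (running) or 'F' (finished)
            raise NotImplemented('to be filled with the real status command')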
525
 526  class Packet(object): 
 527      """ an object for handling packet of job, it is designed to be thread safe 
 528      """ 
 529   
 530      def __init__(self, name, fct, args, opts={}): 
 531          import Queue 
 532          import threading 
 533          self.queue = Queue.Queue() 
 534          self.tag = name 
 535          self.fct = fct 
 536          self.args = args 
 537          self.opts = opts 
 538          self.done = threading.Event() 
 539   
 540      def put(self, *args, **opts): 
 541          self.queue.put(*args, **opts) 
 542   
 543      append = put 
 544   
 545      def remove_one(self): 
 546          self.queue.get(True) 
 547          self.queue.task_done() 
 548          return self.queue.qsize() 
 549   
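A hedged usage sketch of the packet mechanism together with Cluster.cluster_submit() above; combine_results, run_dir and the job script are hypothetical:

    packet = Packet('P1_gg_ttx', combine_results, ('P1_gg_ttx',))
    for channel in ['G1', 'G2', 'G3']:
        cluster.cluster_submit('./ajob', argument=[channel], cwd=run_dir,
                               packet_member=packet)
    # once check_termination()/wait() have seen the last member finish,
    # combine_results('P1_gg_ttx') is run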
 550  class MultiCore(Cluster): 
 551      """class for dealing with the submission in multiple node""" 
 552   
 553      job_id = "$" 
 554   
555 - def __init__(self, *args, **opt):
556 """Init the cluster """ 557 558 559 super(MultiCore, self).__init__(self, *args, **opt) 560 561 import Queue 562 import threading 563 import thread 564 self.queue = Queue.Queue() # list of job to do 565 self.done = Queue.Queue() # list of job finisned 566 self.submitted = Queue.Queue() # one entry by job submitted 567 self.stoprequest = threading.Event() #flag to ensure everything to close 568 self.demons = [] 569 self.nb_done =0 570 if 'nb_core' in opt: 571 self.nb_core = opt['nb_core'] 572 elif isinstance(args[0],int): 573 self.nb_core = args[0] 574 else: 575 self.nb_core = 1 576 self.update_fct = None 577 578 self.lock = threading.Event() # allow nice lock of the main thread 579 self.pids = Queue.Queue() # allow to clean jobs submit via subprocess 580 self.done_pid = [] # list of job finisned 581 self.done_pid_queue = Queue.Queue() 582 self.fail_msg = None 583 584 # starting the worker node 585 for _ in range(self.nb_core): 586 self.start_demon()
587 588
 589      def start_demon(self): 
 590          import threading 
 591          t = threading.Thread(target=self.worker) 
 592          t.daemon = True 
 593          t.start() 
 594          self.demons.append(t) 
595 596
597 - def worker(self):
598 import Queue 599 import thread 600 while not self.stoprequest.isSet(): 601 try: 602 args = self.queue.get() 603 tag, exe, arg, opt = args 604 try: 605 # check for executable case 606 if isinstance(exe,str): 607 if os.path.exists(exe) and not exe.startswith('/'): 608 exe = './' + exe 609 if opt['stderr'] == None: 610 opt['stderr'] = subprocess.STDOUT 611 proc = misc.Popen([exe] + arg, **opt) 612 pid = proc.pid 613 self.pids.put(pid) 614 proc.wait() 615 if proc.returncode not in [0, 143, -15] and not self.stoprequest.isSet(): 616 fail_msg = 'program %s launch ends with non zero status: %s. Stop all computation' % \ 617 (' '.join([exe]+arg), proc.returncode) 618 logger.warning(fail_msg) 619 self.stoprequest.set() 620 self.remove(fail_msg) 621 # handle the case when this is a python function. Note that 622 # this use Thread so they are NO built-in parralelization this is 623 # going to work on a single core! (but this is fine for IO intensive 624 # function. for CPU intensive fct this will slow down the computation 625 else: 626 pid = tag 627 self.pids.put(pid) 628 # the function should return 0 if everything is fine 629 # the error message otherwise 630 returncode = exe(*arg, **opt) 631 if returncode != 0: 632 logger.warning("fct %s does not return 0. Starts to stop the code in a clean way.", exe) 633 self.stoprequest.set() 634 self.remove("fct %s does not return 0:\n %s" % (exe, returncode)) 635 except Exception,error: 636 self.fail_msg = sys.exc_info() 637 logger.warning(str(error)) 638 self.stoprequest.set() 639 self.remove(error) 640 641 if __debug__: 642 raise self.fail_msg[0], self.fail_msg[1],self.fail_msg[2] 643 644 self.queue.task_done() 645 self.done.put(tag) 646 self.done_pid_queue.put(pid) 647 #release the mother to print the status on the screen 648 try: 649 self.lock.set() 650 except thread.error: 651 continue 652 except Queue.Empty: 653 continue
654 655 656 657
658 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 659 log=None, required_output=[], nb_submit=0):
660 """submit a job on multicore machine""" 661 662 tag = (prog, tuple(argument), cwd, nb_submit) 663 if isinstance(prog, str): 664 665 666 opt = {'cwd': cwd, 667 'stdout':stdout, 668 'stderr': stderr} 669 self.queue.put((tag, prog, argument, opt)) 670 self.submitted.put(1) 671 return tag 672 else: 673 # python function 674 self.queue.put((tag, prog, argument, {})) 675 self.submitted.put(1) 676 return tag
677
 678      def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None, 
 679                          stderr=None, log=None, **opts): 
 680          """launch one job and wait for it""" 
 681          if isinstance(stdout, str): 
 682              stdout = open(stdout, 'w') 
 683          if isinstance(stderr, str): 
 684              stderr = open(stderr, 'w') 
 685          return misc.call([prog] + argument, stdout=stdout, stderr=stderr, cwd=cwd) 
686
687 - def remove(self, error=None):
688 """Ensure that all thread are killed""" 689 690 # ensure the worker to stop 691 self.stoprequest.set() 692 if error and not self.fail_msg: 693 self.fail_msg = error 694 695 # cleaning the queue done_pid_queue and move them to done_pid 696 while not self.done_pid_queue.empty(): 697 pid = self.done_pid_queue.get() 698 self.done_pid.append(pid) 699 # self.done_pid_queue.task_done() 700 701 while not self.pids.empty(): 702 pid = self.pids.get() 703 self.pids.task_done() 704 if isinstance(pid, tuple): 705 continue 706 if pid in self.done_pid: 707 continue 708 out = os.system('CPIDS=$(pgrep -P %(pid)s); kill -15 $CPIDS > /dev/null 2>&1' \ 709 % {'pid':pid} ) 710 out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid':pid} )
711 712
713 - def wait(self, me_dir, update_status, update_first=None):
714 """Waiting that all the jobs are done. This function also control that 715 the submission by packet are handle correctly (i.e. submit the function)""" 716 717 import Queue 718 import threading 719 720 try: # to catch KeyBoardInterupt to see which kind of error to display 721 last_status = (0, 0, 0) 722 sleep_time = 1 723 use_lock = True 724 first = True 725 while True: 726 force_one_more_loop = False # some security 727 728 # Loop over the job tagged as done to check if some packet of jobs 729 # are finished in case, put the associate function in the queue 730 while self.done.qsize(): 731 try: 732 tag = self.done.get(True, 1) 733 except Queue.Empty: 734 pass 735 else: 736 if self.id_to_packet and tuple(tag) in self.id_to_packet: 737 packet = self.id_to_packet[tuple(tag)] 738 remaining = packet.remove_one() 739 if remaining == 0: 740 # fully ensure that the packet is finished (thread safe) 741 packet.queue.join() 742 self.submit(packet.fct, packet.args) 743 force_one_more_loop = True 744 self.nb_done += 1 745 self.done.task_done() 746 747 # Get from the various queue the Idle/Done/Running information 748 # Those variable should be thread safe but approximate. 749 Idle = self.queue.qsize() 750 Done = self.nb_done + self.done.qsize() 751 Running = max(0, self.submitted.qsize() - Idle - Done) 752 753 if Idle + Running <= 0 and not force_one_more_loop: 754 update_status(Idle, Running, Done) 755 # Going the quit since everything is done 756 # Fully Ensure that everything is indeed done. 757 self.queue.join() 758 break 759 760 if (Idle, Running, Done) != last_status: 761 if first and update_first: 762 update_first(Idle, Running, Done) 763 first = False 764 else: 765 update_status(Idle, Running, Done) 766 last_status = (Idle, Running, Done) 767 768 # cleaning the queue done_pid_queue and move them to done_pid 769 while not self.done_pid_queue.empty(): 770 pid = self.done_pid_queue.get() 771 self.done_pid.append(pid) 772 self.done_pid_queue.task_done() 773 774 775 # Define how to wait for the next iteration 776 if use_lock: 777 # simply wait that a worker release the lock 778 use_lock = self.lock.wait(300) 779 self.lock.clear() 780 if not use_lock and Idle > 0: 781 use_lock = True 782 else: 783 # to be sure that we will never fully lock at the end pass to 784 # a simple time.sleep() 785 time.sleep(sleep_time) 786 sleep_time = min(sleep_time + 2, 180) 787 if update_first: 788 update_first(Idle, Running, Done) 789 790 if self.stoprequest.isSet(): 791 if isinstance(self.fail_msg, Exception): 792 raise self.fail_msg 793 elif isinstance(self.fail_msg, str): 794 raise Exception, self.fail_msg 795 else: 796 raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2] 797 # reset variable for next submission 798 try: 799 self.lock.clear() 800 except Exception: 801 pass 802 self.done = Queue.Queue() 803 self.done_pid = [] 804 self.done_pid_queue = Queue.Queue() 805 self.nb_done = 0 806 self.submitted = Queue.Queue() 807 self.pids = Queue.Queue() 808 self.stoprequest.clear() 809 810 except KeyboardInterrupt: 811 # if one of the node fails -> return that error 812 if isinstance(self.fail_msg, Exception): 813 raise self.fail_msg 814 elif isinstance(self.fail_msg, str): 815 raise Exception, self.fail_msg 816 elif self.fail_msg: 817 raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2] 818 # else return orignal error 819 raise
820
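A hedged sketch of driving the MultiCore backend directly (run_dir and the script are made up); status reporting goes through the callback passed to wait():

    cluster = MultiCore(nb_core=4)
    for i in range(8):
        cluster.submit('./run_channel.sh', argument=[str(i)], cwd=run_dir)

    def update(idle, running, done):
        logger.info('idle: %s  running: %s  done: %s' % (idle, running, done))

    cluster.wait(run_dir, update)   # blocks until every submitted job has finished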
821 -class CondorCluster(Cluster):
822 """Basic class for dealing with cluster submission""" 823 824 name = 'condor' 825 job_id = 'CONDOR_ID' 826 827 828 829 @multiple_try()
830 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 831 required_output=[], nb_submit=0):
832 """Submit a job prog to a Condor cluster""" 833 834 text = """Executable = %(prog)s 835 output = %(stdout)s 836 error = %(stderr)s 837 log = %(log)s 838 %(argument)s 839 environment = CONDOR_ID=$(Cluster).$(Process) 840 Universe = vanilla 841 notification = Error 842 Initialdir = %(cwd)s 843 %(requirement)s 844 getenv=True 845 queue 1 846 """ 847 848 if self.cluster_queue not in ['None', None]: 849 requirement = 'Requirements = %s=?=True' % self.cluster_queue 850 else: 851 requirement = '' 852 853 if cwd is None: 854 cwd = os.getcwd() 855 if stdout is None: 856 stdout = '/dev/null' 857 if stderr is None: 858 stderr = '/dev/null' 859 if log is None: 860 log = '/dev/null' 861 if not os.path.exists(prog): 862 prog = os.path.join(cwd, prog) 863 if argument: 864 argument = 'Arguments = %s' % ' '.join(argument) 865 else: 866 argument = '' 867 868 869 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout, 870 'stderr': stderr,'log': log,'argument': argument, 871 'requirement': requirement} 872 873 #open('submit_condor','w').write(text % dico) 874 a = misc.Popen(['condor_submit'], stdout=subprocess.PIPE, 875 stdin=subprocess.PIPE) 876 output, _ = a.communicate(text % dico) 877 #output = a.stdout.read() 878 #Submitting job(s). 879 #Logging submit event(s). 880 #1 job(s) submitted to cluster 2253622. 881 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE) 882 try: 883 id = pat.search(output).groups()[0] 884 except: 885 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 886 % output 887 self.submitted += 1 888 self.submitted_ids.append(id) 889 return id
890 891 @store_input() 892 @multiple_try()
893 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 894 log=None, input_files=[], output_files=[], required_output=[], 895 nb_submit=0):
896 """Submit the job on the cluster NO SHARE DISK 897 input/output file should be give relative to cwd 898 """ 899 900 if not required_output and output_files: 901 required_output = output_files 902 903 if (input_files == [] == output_files): 904 return self.submit(prog, argument, cwd, stdout, stderr, log, 905 required_output=required_output, nb_submit=nb_submit) 906 907 text = """Executable = %(prog)s 908 output = %(stdout)s 909 error = %(stderr)s 910 log = %(log)s 911 %(argument)s 912 should_transfer_files = YES 913 when_to_transfer_output = ON_EXIT 914 transfer_input_files = %(input_files)s 915 %(output_files)s 916 Universe = vanilla 917 notification = Error 918 Initialdir = %(cwd)s 919 %(requirement)s 920 getenv=True 921 queue 1 922 """ 923 924 if self.cluster_queue not in ['None', None]: 925 requirement = 'Requirements = %s=?=True' % self.cluster_queue 926 else: 927 requirement = '' 928 929 if cwd is None: 930 cwd = os.getcwd() 931 if stdout is None: 932 stdout = '/dev/null' 933 if stderr is None: 934 stderr = '/dev/null' 935 if log is None: 936 log = '/dev/null' 937 if not os.path.exists(prog): 938 prog = os.path.join(cwd, prog) 939 if argument: 940 argument = 'Arguments = %s' % ' '.join([str(a) for a in argument]) 941 else: 942 argument = '' 943 # input/output file treatment 944 if input_files: 945 input_files = ','.join(input_files) 946 else: 947 input_files = '' 948 if output_files: 949 output_files = 'transfer_output_files = %s' % ','.join(output_files) 950 else: 951 output_files = '' 952 953 954 955 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout, 956 'stderr': stderr,'log': log,'argument': argument, 957 'requirement': requirement, 'input_files':input_files, 958 'output_files':output_files} 959 960 #open('submit_condor','w').write(text % dico) 961 a = subprocess.Popen(['condor_submit'], stdout=subprocess.PIPE, 962 stdin=subprocess.PIPE) 963 output, _ = a.communicate(text % dico) 964 #output = a.stdout.read() 965 #Submitting job(s). 966 #Logging submit event(s). 967 #1 job(s) submitted to cluster 2253622. 968 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE) 969 try: 970 id = pat.search(output).groups()[0] 971 except: 972 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 973 % output 974 self.submitted += 1 975 self.submitted_ids.append(id) 976 return id
977 978 979 980 981 982 @multiple_try(nb_try=10, sleep=10)
983 - def control_one_job(self, id):
984 """ control the status of a single job with it's cluster id """ 985 cmd = 'condor_q '+str(id)+" -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'" 986 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 987 stderr=subprocess.PIPE) 988 989 error = status.stderr.read() 990 if status.returncode or error: 991 raise ClusterManagmentError, 'condor_q returns error: %s' % error 992 993 return status.stdout.readline().strip()
994 995 @check_interupt() 996 @multiple_try(nb_try=10, sleep=10)
997 - def control(self, me_dir):
998 """ control the status of a single job with it's cluster id """ 999 1000 if not self.submitted_ids: 1001 return 0, 0, 0, 0 1002 1003 packet = 15000 1004 idle, run, fail = 0, 0, 0 1005 ongoing = [] 1006 for i in range(1+(len(self.submitted_ids)-1)//packet): 1007 start = i * packet 1008 stop = (i+1) * packet 1009 cmd = "condor_q " + ' '.join(self.submitted_ids[start:stop]) + \ 1010 " -format \'%-2s\ ' \'ClusterId\' " + \ 1011 " -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'" 1012 1013 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1014 stderr=subprocess.PIPE) 1015 error = status.stderr.read() 1016 if status.returncode or error: 1017 raise ClusterManagmentError, 'condor_q returns error: %s' % error 1018 1019 for line in status.stdout: 1020 id, status = line.strip().split() 1021 ongoing.append(int(id)) 1022 if status in ['I','U']: 1023 idle += 1 1024 elif status == 'R': 1025 run += 1 1026 elif status != 'C': 1027 fail += 1 1028 1029 for id in list(self.submitted_ids): 1030 if int(id) not in ongoing: 1031 status = self.check_termination(id) 1032 if status == 'wait': 1033 run += 1 1034 elif status == 'resubmit': 1035 idle += 1 1036 1037 return idle, run, self.submitted - (idle+run+fail), fail
1038 1039 @multiple_try()
1040 - def remove(self, *args, **opts):
1041 """Clean the jobson the cluster""" 1042 1043 if not self.submitted_ids: 1044 return 1045 cmd = "condor_rm %s" % ' '.join(self.submitted_ids) 1046 1047 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1048 self.submitted_ids = []
1049
1050 -class PBSCluster(Cluster):
1051 """Basic class for dealing with cluster submission""" 1052 1053 name = 'pbs' 1054 job_id = 'PBS_JOBID' 1055 idle_tag = ['Q'] 1056 running_tag = ['T','E','R'] 1057 complete_tag = ['C'] 1058 1059 maximum_submited_jobs = 2500 1060 1061 @multiple_try()
1062 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1063 required_output=[], nb_submit=0):
1064 """Submit a job prog to a PBS cluster""" 1065 1066 me_dir = self.get_jobs_identifier(cwd, prog) 1067 1068 if len(self.submitted_ids) > self.maximum_submited_jobs: 1069 fct = lambda idle, run, finish: logger.info('Waiting for free slot: %s %s %s' % (idle, run, finish)) 1070 self.wait(me_dir, fct, self.maximum_submited_jobs) 1071 1072 1073 text = "" 1074 if cwd is None: 1075 cwd = os.getcwd() 1076 else: 1077 text = " cd %s;" % cwd 1078 if stdout is None: 1079 stdout = '/dev/null' 1080 if stderr is None: 1081 stderr = '/dev/null' 1082 elif stderr == -2: # -2 is subprocess.STDOUT 1083 stderr = stdout 1084 if log is None: 1085 log = '/dev/null' 1086 1087 if not os.path.isabs(prog): 1088 text += "./%s" % prog 1089 else: 1090 text+= prog 1091 1092 if argument: 1093 text += ' ' + ' '.join(argument) 1094 1095 command = ['qsub','-o', stdout, 1096 '-N', me_dir, 1097 '-e', stderr, 1098 '-V'] 1099 1100 if self.cluster_queue and self.cluster_queue != 'None': 1101 command.extend(['-q', self.cluster_queue]) 1102 1103 a = misc.Popen(command, stdout=subprocess.PIPE, 1104 stderr=subprocess.STDOUT, 1105 stdin=subprocess.PIPE, cwd=cwd) 1106 1107 output = a.communicate(text)[0] 1108 id = output.split('.')[0] 1109 if not id.isdigit() or a.returncode !=0: 1110 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1111 % output 1112 1113 self.submitted += 1 1114 self.submitted_ids.append(id) 1115 return id
1116 1117 @multiple_try()
1118 - def control_one_job(self, id):
1119 """ control the status of a single job with it's cluster id """ 1120 cmd = 'qstat '+str(id) 1121 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1122 stderr=subprocess.STDOUT) 1123 1124 for line in status.stdout: 1125 line = line.strip() 1126 if 'cannot connect to server' in line or 'cannot read reply' in line: 1127 raise ClusterManagmentError, 'server disconnected' 1128 if 'Unknown' in line: 1129 return 'F' 1130 elif line.startswith(str(id)): 1131 jobstatus = line.split()[4] 1132 else: 1133 jobstatus="" 1134 1135 if status.returncode != 0 and status.returncode is not None: 1136 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode 1137 if jobstatus in self.idle_tag: 1138 return 'I' 1139 elif jobstatus in self.running_tag: 1140 return 'R' 1141 return 'F'
1142 1143 1144 @multiple_try()
1145 - def control(self, me_dir):
1146 """ control the status of a single job with it's cluster id """ 1147 cmd = "qstat" 1148 status = misc.Popen([cmd], stdout=subprocess.PIPE) 1149 1150 me_dir = self.get_jobs_identifier(me_dir) 1151 1152 ongoing = [] 1153 1154 idle, run, fail = 0, 0, 0 1155 for line in status.stdout: 1156 if 'cannot connect to server' in line or 'cannot read reply' in line: 1157 raise ClusterManagmentError, 'server disconnected' 1158 if me_dir in line: 1159 ongoing.append(line.split()[0].split('.')[0]) 1160 status2 = line.split()[4] 1161 if status2 in self.idle_tag: 1162 idle += 1 1163 elif status2 in self.running_tag: 1164 run += 1 1165 elif status2 in self.complete_tag: 1166 if not self.check_termination(line.split()[0].split('.')[0]): 1167 idle += 1 1168 else: 1169 fail += 1 1170 1171 if status.returncode != 0 and status.returncode is not None: 1172 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode 1173 1174 for id in list(self.submitted_ids): 1175 if id not in ongoing: 1176 status2 = self.check_termination(id) 1177 if status2 == 'wait': 1178 run += 1 1179 elif status2 == 'resubmit': 1180 idle += 1 1181 1182 return idle, run, self.submitted - (idle+run+fail), fail
1183 1184 @multiple_try()
1185 - def remove(self, *args, **opts):
1186 """Clean the jobs on the cluster""" 1187 1188 if not self.submitted_ids: 1189 return 1190 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1191 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1192 self.submitted_ids = []
1193
1194 1195 -class SGECluster(Cluster):
1196 """Basic class for dealing with cluster submission""" 1197 # Class written by Arian Abrahantes. 1198 1199 name = 'sge' 1200 job_id = 'JOB_ID' 1201 idle_tag = ['qw', 'hqw','hRqw','w'] 1202 running_tag = ['r','t','Rr','Rt'] 1203 identifier_length = 10 1204
1205 - def def_get_path(self,location):
1206 """replace string for path issues""" 1207 location = os.path.realpath(location) 1208 homePath = os.getenv("HOME") 1209 if homePath: 1210 location = location.replace(homePath,'$HOME') 1211 return location
1212 1213 @multiple_try()
1214 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1215 required_output=[], nb_submit=0):
1216 """Submit a job prog to an SGE cluster""" 1217 1218 me_dir = self.get_jobs_identifier(cwd, prog) 1219 1220 1221 if cwd is None: 1222 #cwd = os.getcwd() 1223 cwd = self.def_get_path(os.getcwd()) 1224 cwd1 = self.def_get_path(cwd) 1225 text = " cd %s;" % cwd1 1226 if stdout is None: 1227 stdout = '/dev/null' 1228 else: 1229 stdout = self.def_get_path(stdout) 1230 if stderr is None: 1231 stderr = '/dev/null' 1232 elif stderr == -2: # -2 is subprocess.STDOUT 1233 stderr = stdout 1234 else: 1235 stderr = self.def_get_path(stderr) 1236 1237 if log is None: 1238 log = '/dev/null' 1239 else: 1240 log = self.def_get_path(log) 1241 1242 text += prog 1243 if argument: 1244 text += ' ' + ' '.join(argument) 1245 1246 #if anything slips through argument 1247 #print "!=== inteded change ",text.replace('/srv/nfs','') 1248 #text = text.replace('/srv/nfs','') 1249 homePath = os.getenv("HOME") 1250 if homePath: 1251 text = text.replace(homePath,'$HOME') 1252 1253 logger.debug("!=== input %s" % text) 1254 logger.debug("!=== output %s" % stdout) 1255 logger.debug("!=== error %s" % stderr) 1256 logger.debug("!=== logs %s" % log) 1257 1258 command = ['qsub','-o', stdout, 1259 '-N', me_dir, 1260 '-e', stderr, 1261 '-V'] 1262 1263 if self.cluster_queue and self.cluster_queue != 'None': 1264 command.extend(['-q', self.cluster_queue]) 1265 1266 a = misc.Popen(command, stdout=subprocess.PIPE, 1267 stderr=subprocess.STDOUT, 1268 stdin=subprocess.PIPE, cwd=cwd) 1269 1270 output = a.communicate(text)[0] 1271 id = output.split(' ')[2] 1272 if not id.isdigit(): 1273 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1274 % output 1275 self.submitted += 1 1276 self.submitted_ids.append(id) 1277 logger.debug(output) 1278 1279 return id
1280 1281 @multiple_try()
1282 - def control_one_job(self, id):
1283 """ control the status of a single job with it's cluster id """ 1284 #cmd = 'qstat '+str(id) 1285 cmd = 'qstat ' 1286 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1287 for line in status.stdout: 1288 #print "!==",line 1289 #line = line.strip() 1290 #if 'Unknown' in line: 1291 # return 'F' 1292 #elif line.startswith(str(id)): 1293 # status = line.split()[4] 1294 if str(id) in line: 1295 status = line.split()[4] 1296 #print "!=status", status 1297 if status in self.idle_tag: 1298 return 'I' 1299 elif status in self.running_tag: 1300 return 'R' 1301 return 'F'
1302 1303 @multiple_try()
1304 - def control(self, me_dir):
1305 """ control the status of a single job with it's cluster id """ 1306 cmd = "qstat " 1307 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1308 1309 me_dir = self.get_jobs_identifier(me_dir) 1310 1311 finished = list(self.submitted_ids) 1312 1313 idle, run, fail = 0, 0, 0 1314 for line in status.stdout: 1315 if me_dir in line: 1316 id,_,_,_,status = line.split()[:5] 1317 if status in self.idle_tag: 1318 idle += 1 1319 finished.remove(id) 1320 elif status in self.running_tag: 1321 run += 1 1322 finished.remove(id) 1323 else: 1324 logger.debug(line) 1325 fail += 1 1326 finished.remove(id) 1327 1328 for id in finished: 1329 self.check_termination(id) 1330 1331 return idle, run, self.submitted - (idle+run+fail), fail
1332 1333 1334 1335 @multiple_try()
1336 - def remove(self, *args, **opts):
1337 """Clean the jobs on the cluster""" 1338 1339 if not self.submitted_ids: 1340 return 1341 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1342 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1343 self.submitted_ids = []
1344
1345 1346 -class LSFCluster(Cluster):
1347 """Basic class for dealing with cluster submission""" 1348 1349 name = 'lsf' 1350 job_id = 'LSB_JOBID' 1351 1352 @multiple_try()
1353 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1354 required_output=[], nb_submit=0):
1355 """Submit the job prog to an LSF cluster""" 1356 1357 1358 me_dir = self.get_jobs_identifier(cwd, prog) 1359 1360 text = "" 1361 command = ['bsub', '-C0', '-J', me_dir] 1362 if cwd is None: 1363 cwd = os.getcwd() 1364 else: 1365 text = " cd %s;" % cwd 1366 if stdout and isinstance(stdout, str): 1367 command.extend(['-o', stdout]) 1368 if stderr and isinstance(stdout, str): 1369 command.extend(['-e', stderr]) 1370 elif stderr == -2: # -2 is subprocess.STDOUT 1371 pass 1372 if log is None: 1373 log = '/dev/null' 1374 1375 text += prog 1376 if argument: 1377 text += ' ' + ' '.join(argument) 1378 1379 if self.cluster_queue and self.cluster_queue != 'None': 1380 command.extend(['-q', self.cluster_queue]) 1381 1382 a = misc.Popen(command, stdout=subprocess.PIPE, 1383 stderr=subprocess.STDOUT, 1384 stdin=subprocess.PIPE, cwd=cwd) 1385 1386 output = a.communicate(text)[0] 1387 #Job <nnnn> is submitted to default queue <normal>. 1388 try: 1389 id = output.split('>',1)[0].split('<')[1] 1390 except: 1391 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1392 % output 1393 if not id.isdigit(): 1394 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1395 % output 1396 self.submitted += 1 1397 self.submitted_ids.append(id) 1398 return id
1399 1400 1401 @multiple_try()
1402 - def control_one_job(self, id):
1403 """ control the status of a single job with it's cluster id """ 1404 1405 cmd = 'bjobs '+str(id) 1406 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1407 1408 for line in status.stdout: 1409 line = line.strip().upper() 1410 if 'JOBID' in line: 1411 continue 1412 elif str(id) not in line: 1413 continue 1414 status = line.split()[2] 1415 if status == 'RUN': 1416 return 'R' 1417 elif status == 'PEND': 1418 return 'I' 1419 elif status == 'DONE': 1420 return 'F' 1421 else: 1422 return 'H' 1423 return 'F'
1424 1425 @multiple_try()
1426 - def control(self, me_dir):
1427 """ control the status of a single job with it's cluster id """ 1428 1429 if not self.submitted_ids: 1430 return 0, 0, 0, 0 1431 1432 cmd = "bjobs " + ' '.join(self.submitted_ids) 1433 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1434 1435 jobstatus = {} 1436 for line in status.stdout: 1437 line = line.strip() 1438 if 'JOBID' in line: 1439 continue 1440 splitline = line.split() 1441 id = splitline[0] 1442 if id not in self.submitted_ids: 1443 continue 1444 jobstatus[id] = splitline[2] 1445 1446 idle, run, fail = 0, 0, 0 1447 for id in self.submitted_ids[:]: 1448 if id in jobstatus: 1449 status = jobstatus[id] 1450 else: 1451 status = 'MISSING' 1452 if status == 'RUN': 1453 run += 1 1454 elif status == 'PEND': 1455 idle += 1 1456 else: 1457 status = self.check_termination(id) 1458 if status == 'wait': 1459 run += 1 1460 elif status == 'resubmit': 1461 idle += 1 1462 1463 return idle, run, self.submitted - (idle+run+fail), fail
1464 1465 @multiple_try()
1466 - def remove(self, *args,**opts):
1467 """Clean the jobs on the cluster""" 1468 1469 if not self.submitted_ids: 1470 return 1471 cmd = "bkill %s" % ' '.join(self.submitted_ids) 1472 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1473 self.submitted_ids = []
1474
1475 -class GECluster(Cluster):
1476 """Class for dealing with cluster submission on a GE cluster""" 1477 1478 name = 'ge' 1479 job_id = 'JOB_ID' 1480 idle_tag = ['qw'] 1481 running_tag = ['r'] 1482 1483 @multiple_try()
1484 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1485 required_output=[], nb_submit=0):
1486 """Submit a job prog to a GE cluster""" 1487 1488 text = "" 1489 if cwd is None: 1490 cwd = os.getcwd() 1491 else: 1492 text = " cd %s; bash " % cwd 1493 if stdout is None: 1494 stdout = os.path.join(cwd, "log.%s" % prog.split('/')[-1]) 1495 if stderr is None: 1496 stderr = os.path.join(cwd, "err.%s" % prog.split('/')[-1]) 1497 elif stderr == -2: # -2 is subprocess.STDOUT 1498 stderr = stdout 1499 if log is None: 1500 log = '/dev/null' 1501 1502 text += prog 1503 if argument: 1504 text += ' ' + ' '.join(argument) 1505 text += '\n' 1506 tmp_submit = os.path.join(cwd, 'tmp_submit') 1507 open(tmp_submit,'w').write(text) 1508 1509 a = misc.Popen(['qsub','-o', stdout, 1510 '-e', stderr, 1511 tmp_submit], 1512 stdout=subprocess.PIPE, 1513 stderr=subprocess.STDOUT, 1514 stdin=subprocess.PIPE, cwd=cwd) 1515 1516 output = a.communicate()[0] 1517 #Your job 874511 ("test.sh") has been submitted 1518 pat = re.compile("Your job (\d*) \(",re.MULTILINE) 1519 try: 1520 id = pat.search(output).groups()[0] 1521 except: 1522 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1523 % output 1524 self.submitted += 1 1525 self.submitted_ids.append(id) 1526 return id
1527 1528 @multiple_try()
1529 - def control_one_job(self, id):
1530 """ control the status of a single job with it's cluster id """ 1531 cmd = 'qstat | grep '+str(id) 1532 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1533 if not status: 1534 return 'F' 1535 #874516 0.00000 test.sh alwall qw 03/04/2012 22:30:35 1 1536 pat = re.compile("^(\d+)\s+[\d\.]+\s+[\w\d\.]+\s+[\w\d\.]+\s+(\w+)\s") 1537 stat = '' 1538 for line in status.stdout.read().split('\n'): 1539 if not line: 1540 continue 1541 line = line.strip() 1542 try: 1543 groups = pat.search(line).groups() 1544 except: 1545 raise ClusterManagmentError, 'bad syntax for stat: \n\"%s\"' % line 1546 if groups[0] != id: continue 1547 stat = groups[1] 1548 if not stat: 1549 return 'F' 1550 if stat in self.idle_tag: 1551 return 'I' 1552 if stat in self.running_tag: 1553 return 'R'
1554 1555 @multiple_try()
1556 - def control(self, me_dir=None):
1557 """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)""" 1558 if not self.submitted_ids: 1559 return 0, 0, 0, 0 1560 idle, run, fail = 0, 0, 0 1561 ongoing = [] 1562 for statusflag in ['p', 'r', 'sh']: 1563 cmd = 'qstat -s %s' % statusflag 1564 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1565 #874516 0.00000 test.sh alwall qw 03/04/2012 22:30:35 1 1566 pat = re.compile("^(\d+)") 1567 for line in status.stdout.read().split('\n'): 1568 line = line.strip() 1569 try: 1570 id = pat.search(line).groups()[0] 1571 except Exception: 1572 pass 1573 else: 1574 if id not in self.submitted_ids: 1575 continue 1576 ongoing.append(id) 1577 if statusflag == 'p': 1578 idle += 1 1579 if statusflag == 'r': 1580 run += 1 1581 if statusflag == 'sh': 1582 fail += 1 1583 for id in list(self.submitted_ids): 1584 if id not in ongoing: 1585 self.check_termination(id) 1586 #self.submitted_ids = ongoing 1587 1588 return idle, run, self.submitted - idle - run - fail, fail
1589 1590 @multiple_try()
1591 - def remove(self, *args, **opts):
1592 """Clean the jobs on the cluster""" 1593 1594 if not self.submitted_ids: 1595 return 1596 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1597 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1598 self.submitted_ids = []
1599
1600  def asyncrone_launch(exe, cwd=None, stdout=None, argument=[], **opt): 
1601      """start a computation and do not wait for it to finish. 
1602      this function returns a lock which is locked as long as the job is 
1603      running.""" 
1604   
1605      mc = MultiCore(1) 
1606      mc.submit(exe, argument, cwd, stdout, **opt) 
1607      mc.need_waiting = True 
1608      return mc.lock 
1609
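A hedged usage sketch of the helper above (the executable and directory are made up); the returned object is the MultiCore threading.Event, which the worker thread sets once the process has ended:

    lock = asyncrone_launch('./make_html', cwd=run_dir, argument=['index'])
    # ... do other work while the job runs in the background ...
    lock.wait()   # returns immediately if the job has already finished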
1610 1611 -class SLURMCluster(Cluster):
1612 """Basic class for dealing with cluster submission""" 1613 1614 name = 'slurm' 1615 job_id = 'SLURM_JOBID' 1616 idle_tag = ['Q','PD','S','CF'] 1617 running_tag = ['R', 'CG'] 1618 complete_tag = ['C'] 1619 identification_length = 8 1620 1621 @multiple_try()
1622 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1623 required_output=[], nb_submit=0):
1624 """Submit a job prog to a SLURM cluster""" 1625 1626 me_dir = self.get_jobs_identifier(cwd, prog) 1627 1628 1629 if cwd is None: 1630 cwd = os.getcwd() 1631 if stdout is None: 1632 stdout = '/dev/null' 1633 if stderr is None: 1634 stderr = '/dev/null' 1635 elif stderr == -2: # -2 is subprocess.STDOUT 1636 stderr = stdout 1637 if log is None: 1638 log = '/dev/null' 1639 1640 command = ['sbatch', '-o', stdout, 1641 '-J', me_dir, 1642 '-e', stderr, prog] + argument 1643 1644 if self.cluster_queue and self.cluster_queue != 'None': 1645 command.insert(1, '-p') 1646 command.insert(2, self.cluster_queue) 1647 1648 a = misc.Popen(command, stdout=subprocess.PIPE, 1649 stderr=subprocess.STDOUT, 1650 stdin=subprocess.PIPE, cwd=cwd) 1651 1652 output = a.communicate() 1653 output_arr = output[0].split(' ') 1654 id = output_arr[3].rstrip() 1655 1656 if not id.isdigit(): 1657 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1658 1659 self.submitted += 1 1660 self.submitted_ids.append(id) 1661 return id
1662 1663 @multiple_try()
1664 - def control_one_job(self, id):
1665 """ control the status of a single job with it's cluster id """ 1666 cmd = 'squeue j'+str(id) 1667 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1668 stderr=open(os.devnull,'w')) 1669 1670 for line in status.stdout: 1671 line = line.strip() 1672 if 'Invalid' in line: 1673 return 'F' 1674 elif line.startswith(str(id)): 1675 status = line.split()[4] 1676 if status in self.idle_tag: 1677 return 'I' 1678 elif status in self.running_tag: 1679 return 'R' 1680 return 'F'
1681 1682 @multiple_try()
1683 - def control(self, me_dir):
1684 """ control the status of a single job with it's cluster id """ 1685 cmd = "squeue" 1686 status = misc.Popen([cmd], stdout=subprocess.PIPE) 1687 1688 me_dir = self.get_jobs_identifier(me_dir) 1689 1690 idle, run, fail = 0, 0, 0 1691 ongoing=[] 1692 for line in status.stdout: 1693 if me_dir in line: 1694 id, _, _,_ , status,_ = line.split(None,5) 1695 ongoing.append(id) 1696 if status in self.idle_tag: 1697 idle += 1 1698 elif status in self.running_tag: 1699 run += 1 1700 elif status in self.complete_tag: 1701 status = self.check_termination(id) 1702 if status == 'wait': 1703 run += 1 1704 elif status == 'resubmit': 1705 idle += 1 1706 else: 1707 fail += 1 1708 1709 #control other finished job 1710 for id in list(self.submitted_ids): 1711 if id not in ongoing: 1712 status = self.check_termination(id) 1713 if status == 'wait': 1714 run += 1 1715 elif status == 'resubmit': 1716 idle += 1 1717 1718 1719 return idle, run, self.submitted - (idle+run+fail), fail
1720 1721 @multiple_try()
1722 - def remove(self, *args, **opts):
1723 """Clean the jobs on the cluster""" 1724 1725 if not self.submitted_ids: 1726 return 1727 cmd = "scancel %s" % ' '.join(self.submitted_ids) 1728 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1729 self.submitted_ids = []
1730
1731 -class HTCaaSCluster(Cluster):
1732 """Class for dealing with cluster submission on a HTCaaS cluster using GPFS """ 1733 1734 name= 'htcaas' 1735 job_id = 'HTCAAS_JOBID' 1736 idle_tag = ['waiting'] 1737 running_tag = ['preparing','running'] 1738 complete_tag = ['done'] 1739 1740 @store_input() 1741 @multiple_try()
1742 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 1743 log=None, input_files=[], output_files=[], required_output=[], 1744 nb_submit=0):
1745 """Submit the HTCaaS job on the cluster with NO SHARE DISK 1746 input/output file should be given as relative to CWd 1747 """ 1748 # To make workspace name(temp) 1749 cur_usr = os.getenv('USER') 1750 1751 if cwd is None: 1752 cwd = os.getcwd() 1753 1754 cwd_cp = cwd.rsplit("/",2) 1755 1756 if not stdout is None: 1757 print "stdout: %s" % stdout 1758 1759 if not os.path.exists(prog): 1760 prog = os.path.join(cwd, prog) 1761 1762 if not required_output and output_files: 1763 required_output = output_files 1764 1765 logger.debug(prog) 1766 if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog : 1767 cwd_arg = cwd+"/arguments" 1768 temp = ' '.join([str(a) for a in argument]) 1769 arg_cmd="echo '"+temp+"' > " + cwd_arg 1770 command = ['htcaas-mgjob-submit','-d',cwd,'-e',os.path.basename(prog)] 1771 if argument : 1772 command.extend(['-a ', '='.join([str(a) for a in argument])]) 1773 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1774 id = a.stdout.read().strip() 1775 1776 else: 1777 cwd_arg = cwd+"/arguments" 1778 temp = ' '.join([str(a) for a in argument]) 1779 temp_file_name = "sub." + os.path.basename(prog) 1780 text = """#!/bin/bash 1781 MYPWD=%(cwd)s 1782 cd $MYPWD 1783 input_files=(%(input_files)s ) 1784 for i in ${input_files[@]} 1785 do 1786 chmod -f +x $i 1787 done 1788 /bin/bash %(prog)s %(arguments)s > %(stdout)s 1789 """ 1790 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog, 1791 'arguments': ' '.join([str(a) for a in argument]), 1792 'program': ' ' if '.py' in prog else 'bash'} 1793 1794 # writing a new script for the submission 1795 new_prog = pjoin(cwd, temp_file_name) 1796 open(new_prog, 'w').write(text % dico) 1797 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 1798 command = ['htcaas-mgjob-submit','-d',cwd,'-e',temp_file_name] 1799 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1800 id = a.stdout.read().strip() 1801 logger.debug(id) 1802 1803 nb_try=0 1804 nb_limit=5 1805 if not id.isdigit() : 1806 print "[ID is not digit]:" + id 1807 1808 while not id.isdigit() : 1809 nb_try+=1 1810 print "[fail_retry]:"+ nb_try 1811 a=misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1812 id = a.stdout.read().strip() 1813 if nb_try > nb_limit : 1814 raise ClusterManagementError, 'fail to submit to the HTCaaS cluster: \n %s' % id 1815 break 1816 1817 self.submitted += 1 1818 self.submitted_ids.append(id) 1819 1820 return id
1821 1822 @multiple_try(nb_try=10, sleep=5)
1823 - def control_one_job(self, id):
1824 """ control the status of a single job with it's cluster id """ 1825 1826 if id == 0 : 1827 status_out ='C' 1828 else : 1829 cmd = 'htcaas-job-status -m '+str(id)+ " -s | grep Status " 1830 status = misc.Popen([cmd], shell=True,stdout=subprocess.PIPE, 1831 stderr=subprocess.PIPE) 1832 error = status.stderr.read() 1833 if status.returncode or error: 1834 raise ClusterManagmentError, 'htcaas-job-submit returns error: %s' % error 1835 status_out= status.stdout.read().strip() 1836 status_out= status_out.split(":",1)[1] 1837 if status_out == 'waiting': 1838 status_out='I' 1839 elif status_out == 'preparing' or status_out == 'running': 1840 status_out = 'R' 1841 elif status_out != 'done': 1842 status_out = 'F' 1843 elif status_out == 'done': 1844 status_out = 'C' 1845 1846 return status_out
1847 1848 @multiple_try()
1849 - def control(self, me_dir):
1850 """ control the status of a single job with it's cluster id """ 1851 if not self.submitted_ids: 1852 logger.debug("self.submitted_ids not exists") 1853 return 0, 0, 0, 0 1854 1855 ongoing = [] 1856 idle, run, fail = 0, 0, 0 1857 1858 start = self.submitted_ids[0] 1859 end = self.submitted_ids[-1] 1860 1861 cmd = "htcaas-job-status -c "+str(start)+"-"+str(end)#+" -ac" 1862 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1863 1864 for line in status.stdout: 1865 #ongoing.append(line.split()[0].strip()) 1866 status2 = line.split()[-1] 1867 if status2 is not 'null' or line.split()[0].strip() is not '0': 1868 ongoing.append(line.split()[0].strip()) 1869 logger.debug("["+line.split()[0].strip()+"]"+status2) 1870 if status2 is 'null' or line.split()[0].strip() is '0': 1871 idle += 1 1872 elif status2 in self.idle_tag: 1873 idle += 1 1874 elif status2 in self.running_tag: 1875 run += 1 1876 elif status2 in self.complete_tag: 1877 if not self.check_termination(line.split()[0]): 1878 idle +=1 1879 else: 1880 fail += 1 1881 1882 return idle, run, self.submitted - (idle+run+fail), fail
1883 1884 @multiple_try()
1885 - def remove(self, *args, **opts):
1886 """Clean the jobson the cluster""" 1887 1888 if not self.submitted_ids: 1889 return 1890 for i in range(len(self.submitted_ids)): 1891 cmd = "htcaas-job-cancel -m %s" % self.submitted_ids[i] 1892 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1893
1894 -class HTCaaS2Cluster(Cluster):
1895 """Class for dealing with cluster submission on a HTCaaS cluster without GPFS """ 1896 1897 name= 'htcaas2' 1898 job_id = 'HTCAAS2_JOBID' 1899 idle_tag = ['waiting'] 1900 running_tag = ['preparing','running'] 1901 complete_tag = ['done'] 1902 1903 @store_input() 1904 @multiple_try()
1905 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 1906 log=None, input_files=[], output_files=[], required_output=[], 1907 nb_submit=0):
1908 
1909         """Submit the HTCaaS job on the cluster with NO SHARE DISK
1910            input/output file should be given as relative to CWD
1911         """
1912         if cwd is None:
1913             cwd = os.getcwd()
1914 
1915         if not os.path.exists(prog):
1916             prog = os.path.join(cwd, prog)
1917 
1918         if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog :
1919             if cwd or prog :
1920                 self.submitted_dirs.append(cwd)
1921                 self.submitted_exes.append(prog)
1922             else :
1923                 logger.debug("cwd and prog do not exist -> %s / %s" % (cwd, prog))
1924 
1925             if argument :
1926                 self.submitted_args.append('='.join([str(a) for a in argument]))
1927 
1928             if cwd or prog :
1929                 self.submitted += 1
1930                 id = self.submitted
1931                 self.submitted_ids.append(id)
1932             else:
1933                 logger.debug("cwd and prog do not exist!")
1934                 id = 0
1935 
1936         else:
1937             temp_file_name = "sub."+ os.path.basename(prog)
1938             text = """#!/bin/bash
1939 MYPWD=%(cwd)s
1940 cd $MYPWD
1941 input_files=(%(input_files)s )
1942 for i in ${input_files[@]}
1943 do
1944    chmod -f +x $i
1945 done
1946 /bin/bash %(prog)s %(arguments)s > %(stdout)s
1947 """
1948             dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog,
1949                     'arguments': ' '.join([str(a) for a in argument]),
1950                     'program': ' ' if '.py' in prog else 'bash'}
1951             # writing a new script for the submission
1952             new_prog = pjoin(cwd, temp_file_name)
1953             open(new_prog, 'w').write(text % dico)
1954             misc.Popen(['chmod','+x',new_prog],cwd=cwd)
1955             command = ['htcaas-mgjob-submit','-d',cwd,'-e',new_prog]
1956             a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1957             id = a.stdout.read().strip()
1958             logger.debug("[mode2]-["+str(id)+"]")
1959             if cwd and prog :
1960                 self.submitted += 1
1961                 self.submitted_ids.append(id)
1962             else:
1963                 logger.debug("cwd and prog do not exist!")
1964                 id = 0
1965 
1966         return id
1967 1968 @multiple_try()
1969 - def metasubmit(self, me_dir=None):
1970         if self.submitted > 1100 and self.submitted == len(self.submitted_ids):
1971             tmp_leng = len(self.submitted_ids)/2
1972             tmp_dirs1 = self.submitted_dirs[0:tmp_leng]
1973             tmp_dirs2 = self.submitted_dirs[tmp_leng:]
1974             tmp_exes1 = self.submitted_exes[0:tmp_leng]
1975             tmp_exes2 = self.submitted_exes[tmp_leng:]
1976             command1 = ['htcaas-mgjob-submit','-d',":".join([str(a) for a in tmp_dirs1 if a and a != ' ']),
1977                         '-e', ":".join([str(a) for a in tmp_exes1 if a and a != ' '])]
1978             command2 = ['htcaas-mgjob-submit','-d',":".join([str(a) for a in tmp_dirs2 if a and a != ' ']),
1979                         '-e', ":".join([str(a) for a in tmp_exes2 if a and a != ' '])]
1980             if len(self.submitted_args) > 0 :
1981                 tmp_args1 = self.submitted_args[0:tmp_leng]
1982                 tmp_args2 = self.submitted_args[tmp_leng:]
1983                 command1.extend(['-a', ':'.join([str(a) for a in tmp_args1])])
1984                 command2.extend(['-a', ':'.join([str(a) for a in tmp_args2])])
1985             result1 = misc.Popen(command1, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
1986             result2 = misc.Popen(command2, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
1987             me_dir = str(result1.stdout.read().strip())+ "//" + str(result2.stdout.read().strip())
1988 
1989         elif self.submitted > 0 and self.submitted == self.submitted_ids[-1]:
1990             command = ['htcaas-mgjob-submit','-d',":".join([str(a) for a in self.submitted_dirs if a and a != ' ']),
1991                        '-e', ":".join([str(a) for a in self.submitted_exes if a and a != ' '])]
1992             if len(self.submitted_args) > 0 :
1993                 command.extend(['-a', ':'.join([str(a) for a in self.submitted_args])])
1994             if self.submitted_dirs[0] or self.submitted_exes[0] :
1995                 result = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
1996                 me_dir = result.stdout.read().strip()
1997                 self.submitted_ids[0] = me_dir
1998             else:
1999                 me_dir = self.submitted_ids[-1]
2000         elif self.submitted > 0 and self.submitted != self.submitted_ids[-1]:
2001             me_dir = self.submitted_ids[0]
2002         else :
2003             me_dir = -1
2004 
2005         logger.debug("[" + str(me_dir) + "]")
2006 
2007         self.submitted_dirs = []
2008         self.submitted_exes = []
2009         self.submitted_args = []
2010 
2011         return me_dir
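# Sketch (with made-up paths) of the single meta-job command that metasubmit
# assembles once submit2 has collected the per-job directories and
# executables: the lists are joined with ':' and handed to htcaas-mgjob-submit
# in one call.
#
#     submitted_dirs = ['/work/P1_gg_ttx/G1', '/work/P1_gg_ttx/G2']
#     submitted_exes = ['/work/P1_gg_ttx/G1/ajob1', '/work/P1_gg_ttx/G2/ajob2']
#     command = ['htcaas-mgjob-submit',
#                '-d', ':'.join(submitted_dirs),
#                '-e', ':'.join(submitted_exes)]
#     # -> htcaas-mgjob-submit -d <dir1>:<dir2> -e <exe1>:<exe2>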
2012 2013 2014 @multiple_try(nb_try=10, sleep=5)
2015 - def control_one_job(self, id):
2016 """ control the status of a single job with it's cluster id """ 2017 #logger.debug("CONTROL ONE JOB MODE") 2018 if self.submitted == self.submitted_ids[-1] : 2019 id = self.metasubmit(self) 2020 tempid = self.submitted_ids[-1] 2021 self.submitted_ids.remove(self.submitted_ids[-1]) 2022 self.submitted_ids.append(id) 2023 logger.debug(str(id)+" // "+str(self.submitted_ids[-1])) 2024 2025 if id == 0 : 2026 status_out ='C' 2027 else : 2028 cmd = 'htcaas-job-status -m '+ str(id) + " -s | grep Status " 2029 status = misc.Popen([cmd],shell=True,stdout=subprocess.PIPE, 2030 stderr=subprocess.PIPE) 2031 error = status.stderr.read() 2032 if status.returncode or error: 2033 raise ClusterManagmentError, 'htcaas-job-status returns error: %s' % error 2034 status_out= status.stdout.read().strip() 2035 status_out= status_out.split(":",1)[1] 2036 logger.debug("[["+str(id)+"]]"+status_out) 2037 if status_out == 'waiting': 2038 status_out='I' 2039 elif status_out == 'preparing' or status_out == 'running': 2040 status_out = 'R' 2041 elif status_out != 'done': 2042 status_out = 'F' 2043 elif status_out == 'done': 2044 status_out = 'C' 2045 self.submitted -= 1 2046 2047 return status_out
2048 2049 @multiple_try()
2050 - def control(self, me_dir):
2051 """ control the status of a single job with it's cluster id """ 2052 if not self.submitted_ids: 2053 logger.debug("self.submitted_ids not exists") 2054 return 0, 0, 0, 0 2055 2056 if "//" in me_dir : 2057 if int(me_dir.split("//")[0]) < int(me_dir.split("//")[1]) : 2058 start = me_dir.split("//")[0] 2059 end = me_dir.split("//")[1] 2060 else : 2061 start = me_dir.split("//")[1] 2062 end = me_dir.split("//")[0] 2063 elif "/" in me_dir : # update 2064 start = 0 2065 end = 0 2066 elif me_dir.isdigit(): 2067 start = me_dir 2068 end = me_dir 2069 elif not me_dir.isdigit(): 2070 me_dir = self.submitted_ids[0] 2071 logger.debug("Meta_ID is not digit(control), self.submitted_ids[0]: "+str(me_dir) ) 2072 2073 ongoing = [] 2074 idle, run, fail, done = 0, 0, 0, 0 2075 2076 cmd = "htcaas-job-status -c "+str(start)+"-"+str(end) +" -ac" 2077 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 2078 2079 for line in status.stdout: 2080 status2 = line.split()[-1] 2081 if status2 is not 'null' or line.split()[0].strip() is not '0': 2082 ongoing.append(str(line.split()[0].strip())+"-"+str(line.split()[1].strip())) 2083 logger.debug("["+line.split()[0].strip()+"-"+line.split()[1].strip()+"]"+status2) 2084 2085 if status2 is 'null' or line.split()[0].strip() is '0': 2086 idle += 1 2087 elif status2 in self.idle_tag: 2088 idle += 1 2089 elif status2 in self.running_tag: 2090 run += 1 2091 elif status2 in self.complete_tag: 2092 done += 1 2093 self.submitted -= 1 2094 if not self.check_termination(line.split()[1]): 2095 idle +=1 2096 else: 2097 fail += 1 2098 2099 return idle, run, self.submitted - (idle+run+fail), fail
2100 2101 @multiple_try()
2102 - def remove(self, *args, **opts):
2103 """Clean the jobson the cluster""" 2104 2105 if not self.submitted_ids: 2106 return 2107 id = self.submitted_ids[0] 2108 if id is not 0 : 2109 cmd = "htcaas-job-cancel -m %s" % str(id) 2110 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
2111 2112 from_name = {'condor':CondorCluster, 'pbs': PBSCluster, 'sge': SGECluster, 2113 'lsf': LSFCluster, 'ge':GECluster, 'slurm': SLURMCluster, 2114 'htcaas':HTCaaSCluster, 'htcaas2':HTCaaS2Cluster} 2115
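# Usage sketch (option values are hypothetical): the scheduler name taken from
# the run options selects the cluster class through from_name.
#
#     cluster_cls = from_name['htcaas2']
#     mycluster = cluster_cls(cluster_queue='madgraph', cluster_nb_retry=1)
#     # then mycluster.submit2(...), mycluster.control(...), mycluster.remove()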