
Source Code for Module madgraph.various.cluster

   1  ################################################################################ 
   2  # Copyright (c) 2009 The MadGraph5_aMC@NLO Development team and Contributors              
   3  # 
   4  # This file is a part of the MadGraph5_aMC@NLO project, an application which            
   5  # automatically generates Feynman diagrams and matrix elements for arbitrary     
   6  # high-energy processes in the Standard Model and beyond.                        
   7  # 
   8  # It is subject to the MadGraph5_aMC@NLO license which should accompany this              
   9  # distribution.                                                                  
  10  #                                                                                
  11  # For more information, visit madgraph.phys.ucl.ac.be and amcatnlo.web.cern.ch             
  12  #                                                                                
  13  ################################################################################ 
  14  from __future__ import absolute_import 
  15  from __future__ import print_function 
  16  import subprocess 
  17  import logging 
  18  import os 
  19  import time 
  20  import re 
  21  import glob 
  22  import inspect 
  23  import sys 
  24  import six 
  25  from six.moves import range 
  26  from six.moves import input 
  27   
  28  logger = logging.getLogger('madgraph.cluster')  
  29   
  30  try: 
  31      from madgraph import MadGraph5Error 
  32      import madgraph.various.misc as misc 
  33  except Exception as error: 
  34      if __debug__: 
  35          print(str(error)) 
  36      from internal import MadGraph5Error 
  37      import internal.misc as misc 
  38   
  39  pjoin = os.path.join 
  40   
  41  class ClusterManagmentError(MadGraph5Error): 
  42      pass 
  43   
  44  class NotImplemented(MadGraph5Error): 
  45      pass 
  46   
  47   
  48  multiple_try = misc.multiple_try 
  49  pjoin = os.path.join 
  50   
  51   
  52  def check_interupt(error=KeyboardInterrupt): 
  53   
  54      def deco_interupt(f): 
  55          def deco_f_interupt(self, *args, **opt): 
  56              try: 
  57                  return f(self, *args, **opt) 
  58              except error: 
  59                  try: 
  60                      self.remove(*args, **opt) 
  61                  except Exception: 
  62                      pass 
  63                  raise error 
  64          return deco_f_interupt 
  65      return deco_interupt 
  66   
  67  def store_input(arg=''): 
  68   
  69      def deco_store(f): 
  70          def deco_f_store(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 
  71                           input_files=[], output_files=[], required_output=[], nb_submit=0): 
  72              frame = inspect.currentframe() 
  73              args, _, _, values = inspect.getargvalues(frame) 
  74              args = dict([(i, values[i]) for i in args if i != 'self']) 
  75              id = f(self, **args) 
  76              if self.nb_retry > 0: 
  77                  self.retry_args[id] = args 
  78              return id 
  79          return deco_f_store 
  80      return deco_store 
  81   
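
These two decorators drive the fault-tolerance machinery below: `check_interupt` removes the submitted jobs before re-raising on ctrl-C, and `store_input` records the full keyword form of each submission so that `check_termination` can replay it on failure. A minimal, standalone sketch of the same capture pattern (the `capture_args` name is hypothetical, not part of the module):

    import inspect

    def capture_args(f):
        """Record the keyword form of every call so it can be replayed later."""
        def wrapper(self, *args, **kwargs):
            bound = inspect.signature(f).bind(self, *args, **kwargs)
            bound.apply_defaults()
            call_args = {k: v for k, v in bound.arguments.items() if k != 'self'}
            job_id = f(self, *args, **kwargs)
            self.retry_args[job_id] = call_args  # replay later with f(self, **call_args)
            return job_id
        return wrapper
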
  82  def need_transfer(options): 
  83      """Check whether transfer (compression) of the input files is necessary 
  84      for the given run options.""" 
  85   
  86      if options['run_mode'] != 1 and options['cluster_temp_path'] is None: 
  87          return False 
  88      else: 
  89          return True 
90
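
In the MadGraph option files `run_mode` 1 denotes a cluster run (an assumption spelled out here for the sketch): files need to be transferred on a cluster, or whenever a private `cluster_temp_path` scratch directory is configured:

    assert need_transfer({'run_mode': 1, 'cluster_temp_path': None}) is True       # cluster run
    assert need_transfer({'run_mode': 2, 'cluster_temp_path': None}) is False      # shared disk
    assert need_transfer({'run_mode': 2, 'cluster_temp_path': '/scratch'}) is True
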
  91  class Cluster(object): 
  92      """Basic class for all cluster-type submission""" 
  93      name = 'mother class' 
  94      identifier_length = 14 
  95   
  96      def __init__(self, *args, **opts): 
  97          """Init the cluster""" 
  98   
  99          self.submitted = 0 
 100          self.submitted_ids = [] 
 101          self.finish = 0 
 102          self.submitted_dirs = []  #HTCaaS 
 103          self.submitted_exes = []  #HTCaaS 
 104          self.submitted_args = []  #HTCaaS 
 105   
 106          if 'cluster_queue' in opts: 
 107              self.cluster_queue = opts['cluster_queue'] 
 108          else: 
 109              self.cluster_queue = 'madgraph' 
 110          if 'cluster_temp_path' in opts: 
 111              self.temp_dir = opts['cluster_temp_path'] 
 112          else: 
 113              self.temp_dir = None 
 114          self.options = {'cluster_status_update': (600, 30)} 
 115          for key, value in opts.items(): 
 116              self.options[key] = value 
 117          self.nb_retry = opts['cluster_nb_retry'] if 'cluster_nb_retry' in opts else 0 
 118          self.cluster_retry_wait = float(opts['cluster_retry_wait']) if 'cluster_retry_wait' in opts else 300 
 119          self.options = dict(opts) 
 120          self.retry_args = {} 
 121          # controlling jobs in controlled-type submission 
 122          self.packet = {} 
 123          self.id_to_packet = {} 
124
 125      def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 
 126                 log=None, required_output=[], nb_submit=0): 
 127          """How to make one submission. Return the id of the job on the cluster.""" 
 128          raise NotImplemented('No implementation of how to submit a job to cluster \'%s\'' % self.name) 
 129   
 130   
 131      @store_input() 
 132      def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 
 133                  log=None, input_files=[], output_files=[], required_output=[], 
 134                  nb_submit=0): 
 135          """How to make one submission. Return the id of the job on the cluster. 
 136          NO SHARED DISK""" 
 137   
 138          if cwd is None: 
 139              cwd = os.getcwd() 
 140          if not os.path.exists(prog): 
 141              prog = os.path.join(cwd, prog) 
 142   
 143          if not required_output and output_files: 
 144              required_output = output_files 
 145   
 146          if not hasattr(self, 'temp_dir') or not self.temp_dir or \ 
 147             (input_files == [] == output_files): 
 148   
 149              return self.submit(prog, argument, cwd, stdout, stderr, log, 
 150                                 required_output=required_output, nb_submit=nb_submit) 
 151   
 152          if not input_files and not output_files: 
 153              # no input/output files, so no need for the transfer wrapper 
 154              return self.submit(prog, argument, cwd, stdout, stderr, log, 
 155                                 required_output=required_output, nb_submit=nb_submit) 
 156   
 157          if cwd is None: 
 158              cwd = os.getcwd() 
 159          if not os.path.exists(prog): 
 160              prog = os.path.join(cwd, prog) 
 161          temp_file_name = "sub." + os.path.basename(prog) + '.'.join(argument) 
 162   
 163          text = """#!/bin/bash 
 164          MYTMP=%(tmpdir)s/run$%(job_id)s 
 165          MYPWD=%(cwd)s 
 166          mkdir -p $MYTMP 
 167          cd $MYPWD 
 168          input_files=( %(input_files)s ) 
 169          for i in ${input_files[@]} 
 170          do 
 171              cp -R -L $i $MYTMP 
 172          done 
 173          cd $MYTMP 
 174          echo '%(arguments)s' > arguments 
 175          chmod +x ./%(script)s 
 176          %(program)s ./%(script)s %(arguments)s 
 177          exit=$? 
 178          output_files=( %(output_files)s ) 
 179          for i in ${output_files[@]} 
 180          do 
 181              cp -r $MYTMP/$i $MYPWD 
 182          done 
 183          # if [ "$exit" -eq "0" ] 
 184          # then 
 185          rm -rf $MYTMP 
 186          # fi 
 187          """ 
 188   
 189          dico = {'tmpdir': self.temp_dir, 'script': os.path.basename(prog), 
 190                  'cwd': cwd, 'job_id': self.job_id, 
 191                  'input_files': ' '.join(input_files + [prog]), 
 192                  'output_files': ' '.join(output_files), 
 193                  'arguments': ' '.join([str(a) for a in argument]), 
 194                  'program': ' ' if '.py' in prog else 'bash'} 
 195   
 196          # writing a new script for the submission 
 197          new_prog = pjoin(cwd, temp_file_name) 
 198          open(new_prog, 'w').write(text % dico) 
 199          misc.Popen(['chmod', '+x', new_prog], cwd=cwd) 
 200   
 201          return self.submit(new_prog, argument, cwd, stdout, stderr, log, 
 202                             required_output=required_output, nb_submit=nb_submit) 
203 204
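
To see what `submit2` actually hands to the scheduler, one can render the template above by hand. Taking `text` to be that template, every value below is invented for illustration; `job_id` is the name of the scheduler's job-id environment variable (`PBS_JOBID`, `SLURM_JOBID`, ... depending on the subclass):

    dico = {'tmpdir': '/scratch', 'job_id': 'PBS_JOBID',
            'cwd': '/home/user/PROC/SubProcesses/P1',
            'script': 'ajob1',
            'input_files': 'madevent.tar.gz ajob1',
            'output_files': 'results.dat',
            'arguments': '0', 'program': 'bash'}
    print(text % dico)  # a bash script that stages files in, runs ./ajob1 0, stages results.dat out
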
 205      def cluster_submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 
 206                         log=None, input_files=[], output_files=[], required_output=[], 
 207                         nb_submit=0, packet_member=None): 
 208          """This function wraps the cluster submission with cluster-independent 
 209          code; it should not be overwritten (except for DAG-type submission).""" 
 210   
 211          id = self.submit2(prog, argument, cwd, stdout, stderr, log, input_files, 
 212                            output_files, required_output, nb_submit) 
 213   
 214   
 215          if not packet_member: 
 216              return id 
 217          else: 
 218              if isinstance(packet_member, Packet): 
 219                  self.id_to_packet[id] = packet_member 
 220                  packet_member.put(id) 
 221                  if packet_member.tag not in self.packet: 
 222                      self.packet[packet_member.tag] = packet_member 
 223              else: 
 224                  if packet_member in self.packet: 
 225                      packet = self.packet[packet_member] 
 226                      packet.put(id) 
 227                      self.id_to_packet[id] = packet 
 228              return id 
229
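
`packet_member` is what lets a group of submissions share one `Packet`: the combining function then runs exactly once, when the last job of the group terminates. A hedged usage sketch (all names hypothetical; `cluster` is an instance of any concrete subclass):

    def combine(run_dir):
        print('packet complete, combining results in', run_dir)

    packet = Packet('P1_qq_ttx', combine, ('/tmp/PROC/P1',))
    for i in range(8):
        cluster.cluster_submit('ajob%i' % i, cwd='/tmp/PROC/P1',
                               packet_member=packet)
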
 230      def control(self, me_dir=None): 
 231          """Check the status of the jobs associated to the directory me_dir. Return (idle, run, finish, fail).""" 
 232          if not self.submitted_ids: 
 233              raise NotImplemented('No implementation of how to control the job status to cluster \'%s\'' % self.name) 
 234          idle, run, fail = 0, 0, 0 
 235          for pid in self.submitted_ids[:]: 
 236              status = self.control_one_job(pid) 
 237              if status == 'I': 
 238                  idle += 1 
 239              elif status == 'R': 
 240                  run += 1 
 241              elif status == 'F': 
 242                  self.finish += 1 
 243                  self.submitted_ids.remove(pid) 
 244              else: 
 245                  fail += 1 
 246   
 247          return idle, run, self.finish, fail 
248
 249      def control_one_job(self, pid): 
 250          """ control the status of a single job with its cluster id """ 
 251          raise NotImplemented('No implementation of how to control the job status to cluster \'%s\'' % self.name) 
252
 253      def get_jobs_identifier(self, path, second_path=None): 
 254          """get a unique run_name for all the jobs; this helps to identify the runs 
 255          in the controller for some clusters.""" 
 256   
 257          if second_path: 
 258              path = os.path.realpath(pjoin(path, second_path)) 
 259          elif not os.path.exists(path): 
 260              return path  # job already done 
 261   
 262          if 'SubProcesses' in path: 
 263              target = path.rsplit('/SubProcesses', 1)[0] 
 264          elif 'MCatNLO' in path: 
 265              target = path.rsplit('/MCatNLO', 1)[0] 
 266          elif 'PY8_parallelization' in path: 
 267              target = path.rsplit('/PY8_parallelization', 1)[0] 
 268          elif second_path: 
 269              target = path 
 270              logger.warning("cluster.get_jobs_identifier runs unexpectedly. This should be fine, but report this message if you have a problem.") 
 271          else: 
 272              target = path 
 273   
 274          if target.endswith('/'): 
 275              target = target[:-1] 
 276   
 277          target = misc.digest(target.encode())[-self.identifier_length:] 
 278          if not target[0].isalpha(): 
 279              target = 'a' + target[1:] 
 280   
 281          return target 
 282   
 283   
 284      @check_interupt() 
 285      def wait(self, me_dir, fct, minimal_job=0, update_first=None): 
 286          """Wait until all jobs are finished. 
 287          If minimal_job is set, return once idle + run drops below that number.""" 
 288   
 289   
 290          mode = 1  # 0 is long waiting / 1 is short waiting 
 291          nb_iter = 0 
 292          nb_short = 0 
 293          change_at = 5  # number of iterations after which we wait longer between updates 
 294   
 295          if update_first: 
 296              idle, run, finish, fail = self.control(me_dir) 
 297              update_first(idle, run, finish) 
 298   
 299          # useful shortcut for readability 
 300          longtime, shorttime = self.options['cluster_status_update'] 
 301   
 302          nb_job = 0 
 303   
 304          if self.options['cluster_type'] == 'htcaas2': 
 305              me_dir = self.metasubmit(self) 
 306   
 307          while 1: 
 308              old_mode = mode 
 309              nb_iter += 1 
 310              idle, run, finish, fail = self.control(me_dir) 
 311              if nb_job: 
 312                  if idle + run + finish + fail != nb_job: 
 313                      nb_job = idle + run + finish + fail 
 314                      nb_iter = 1  # some packets finished; do not switch to long waiting mode yet 
 315              else: 
 316                  nb_job = idle + run + finish + fail 
 317              if fail: 
 318                  raise ClusterManagmentError('Some jobs are in a Hold/... state. Please investigate or contact the IT team') 
 319              if idle + run == 0: 
 320                  #time.sleep(20) #security to ensure that the files are really written on the disk 
 321                  logger.info('All jobs finished') 
 322                  fct(idle, run, finish) 
 323                  break 
 324              if idle + run < minimal_job: 
 325                  return 
 326              fct(idle, run, finish) 
 327              # Determine how long we have to wait (mode=0 -> long time, mode=1 -> short time) 
 328              if nb_iter < change_at: 
 329                  mode = 1 
 330              elif idle < run: 
 331                  if old_mode == 0: 
 332                      if nb_short: 
 333                          mode = 0  # we already went back from short to long, so stay in long 
 334                      # check if we need to go back to short mode 
 335                      elif idle: 
 336                          if nb_iter > change_at + int(longtime)//shorttime: 
 337                              mode = 0  # stay in long waiting mode 
 338                          else: 
 339                              mode = 1  # pass to short waiting mode 
 340                              nb_short = 0 
 341                      else: 
 342                          mode = 1  # pass to short waiting mode 
 343                          nb_short = 0 
 344                  elif old_mode == 1: 
 345                      nb_short += 1 
 346                      if nb_short > 3 * max(change_at, int(longtime)//shorttime): 
 347                          mode = 0  # go back to slow waiting 
 348              else: 
 349                  mode = 0 
 350   
 351              # if we pass from fast (mode=1) to slow (mode=0), print a statement: 
 352              if old_mode > mode: 
 353                  logger.info('''Start to wait %ss between status checks. 
 354                  Note that you can change this time in the configuration file. 
 355                  Press ctrl-C to force the update.''' % self.options['cluster_status_update'][0]) 
 356   
 357              # now waiting! 
 358              if mode == 0: 
 359                  try: 
 360                      time.sleep(self.options['cluster_status_update'][0]) 
 361                  except KeyboardInterrupt: 
 362                      logger.info('start to update the status') 
 363                      nb_iter = min(0, change_at - 2) 
 364                      nb_short = 0 
 365              else: 
 366                  time.sleep(self.options['cluster_status_update'][1]) 
 367   
 368   
 369          self.submitted = 0 
 370          self.submitted_ids = [] 
 371          self.id_to_packet = {} 
372
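
The `fct` callback passed to `wait` receives the `(idle, run, finish)` counters at each poll; the interfaces use it to redraw the status line. A minimal sketch:

    def print_status(idle, run, finish):
        print('idle: %s  running: %s  finished: %s' % (idle, run, finish))

    cluster.wait('/tmp/PROC', print_status)  # blocks until every job is done
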
 373      def check_termination(self, job_id): 
 374          """Check the termination of the job with job_id and relaunch it if needed.""" 
 375   
 376   
 377          if job_id not in self.retry_args: 
 378              if job_id in self.id_to_packet: 
 379                  nb_in_packet = self.id_to_packet[job_id].remove_one() 
 380                  if nb_in_packet == 0: 
 381                      # packet done; run the associated function 
 382                      packet = self.id_to_packet[job_id] 
 383                      # fully ensure that the packet is finished (thread safe) 
 384                      packet.queue.join() 
 385                      # running the function 
 386                      packet.fct(*packet.args) 
 387                  del self.id_to_packet[job_id] 
 388                  return 'resubmit' 
 389              else: 
 390                  return True 
 391   
 392          args = self.retry_args[job_id] 
 393          if 'time_check' in args: 
 394              time_check = args['time_check'] 
 395          else: 
 396              time_check = 0 
 397   
 398          for path in args['required_output']: 
 399              if args['cwd']: 
 400                  path = pjoin(args['cwd'], path) 
 401              # check that the file exists and is not empty. 
 402              if not (os.path.exists(path) and os.stat(path).st_size != 0): 
 403                  break 
 404          else: 
 405              # all requested output files are present 
 406              if time_check > 0: 
 407                  logger.info('Job %s finally found the missing output.' % (job_id)) 
 408              del self.retry_args[job_id] 
 409              self.submitted_ids.remove(job_id) 
 410              # check if the job_id is in a packet 
 411              if job_id in self.id_to_packet: 
 412                  nb_in_packet = self.id_to_packet[job_id].remove_one() 
 413                  if nb_in_packet == 0: 
 414                      # packet done; run the associated function 
 415                      packet = self.id_to_packet[job_id] 
 416                      # fully ensure that the packet is finished (thread safe) 
 417                      packet.queue.join() 
 418                      # running the function 
 419                      packet.fct(*packet.args) 
 420                  del self.id_to_packet[job_id] 
 421                  return 'resubmit' 
 422   
 423              return 'done' 
 424   
 425          if time_check == 0: 
 426              logger.debug('''Job %s: missing output: %s''' % (job_id, path)) 
 427              args['time_check'] = time.time() 
 428              return 'wait' 
 429          elif self.cluster_retry_wait > time.time() - time_check: 
 430              return 'wait' 
 431   
 432          # the job failed to complete even after the waiting time! 
 433          if self.nb_retry < 0: 
 434              logger.critical('''Failed to run job %s correctly. 
 435              with option: %s 
 436              file missing: %s''' % (job_id, args, path)) 
 437              input('press enter to continue.') 
 438          elif self.nb_retry == 0: 
 439              logger.critical('''Failed to run job %s correctly. 
 440              with option: %s 
 441              file missing: %s. 
 442              Stopping all runs.''' % (job_id, args, path)) 
 443              self.remove() 
 444          elif args['nb_submit'] >= self.nb_retry: 
 445              logger.critical('''Failed to run job %s correctly. 
 446              with option: %s 
 447              file missing: %s 
 448              Failed %s times. 
 449              No resubmission.''' % (job_id, args, path, args['nb_submit'])) 
 450              self.remove() 
 451          else: 
 452              args['nb_submit'] += 1 
 453              logger.warning('resubmit job (for the %s time)' % args['nb_submit']) 
 454              del self.retry_args[job_id] 
 455              self.submitted_ids.remove(job_id) 
 456              if 'time_check' in args: 
 457                  del args['time_check'] 
 458              if job_id in self.id_to_packet: 
 459                  self.id_to_packet[job_id].remove_one() 
 460                  args['packet_member'] = self.id_to_packet[job_id] 
 461                  del self.id_to_packet[job_id] 
 462                  self.cluster_submit(**args) 
 463              else: 
 464                  self.submit2(**args) 
 465              return 'resubmit' 
 466          return 'done' 
 467   
 468      @check_interupt() 
 469      def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None, 
 470                          stderr=None, log=None, required_output=[], nb_submit=0, 
 471                          input_files=[], output_files=[]): 
 472          """launch one job on the cluster and wait for it""" 
 473   
 474          special_output = False  # tag to concatenate the error with the output 
 475          if stderr == -2 and stdout: 
 476              # we are supposed to send the error output to stdout 
 477              special_output = True 
 478              stderr = stdout + '.err' 
 479   
 480          id = self.submit2(prog, argument, cwd, stdout, stderr, log, 
 481                            required_output=required_output, input_files=input_files, 
 482                            output_files=output_files) 
 483   
 484          if self.options['cluster_type'] == 'htcaas2': 
 485              if self.submitted == self.submitted_ids[-1]: 
 486                  id = self.metasubmit(self) 
 487   
 488          frame = inspect.currentframe() 
 489          args, _, _, values = inspect.getargvalues(frame) 
 490          args = dict([(i, values[i]) for i in args if i != 'self']) 
 491          self.retry_args[id] = args 
 492   
 493          nb_wait = 0 
 494          while 1: 
 495              nb_wait += 1 
 496              status = self.control_one_job(id) 
 497              if not status in ['R', 'I']: 
 498                  status = self.check_termination(id) 
 499                  if status in ['wait']: 
 500                      time.sleep(30) 
 501                      continue 
 502                  elif status in ['resubmit']: 
 503                      id = self.submitted_ids[0] 
 504                      time.sleep(30) 
 505                      continue 
 506                  # really stop! 
 507                  time.sleep(30)  # security to ensure that the files are really written on the disk 
 508                  break 
 509              time.sleep(self.options['cluster_status_update'][1]) 
 510   
 511          if required_output: 
 512              status = self.check_termination(id) 
 513              if status in ('wait', 'resubmit'): 
 514                  # output not complete yet; give the filesystem time and check again 
 515                  time.sleep(30) 
 516                  self.check_termination(id) 
 517   
 518   
 519          if special_output: 
 520              # combine the stdout and the stderr 
 521              # wait up to 50 s for those files to exist 
 522              for i in range(5): 
 523                  if os.path.exists(stdout): 
 524                      if not os.path.exists(stderr): 
 525                          time.sleep(5) 
 526                      if os.path.exists(stderr): 
 527                          err_text = open(stderr).read() 
 528                          if not err_text: 
 529                              return 
 530                          logger.warning(err_text) 
 531                          text = open(stdout).read() 
 532                          open(stdout, 'w').write(text + err_text) 
 533                      else: 
 534                          return 
 535                  time.sleep(10) 
536
 537      def remove(self, *args, **opts): 
 538          """ """ 
 539          logger.warning("""This cluster does not support job removal; 
 540          the jobs are still running on the cluster.""") 
 541   
 542      @store_input() 
 543      def metasubmit(self, me_dir): 
 544          logger.warning("""This cluster does not support metajob submission.""") 
 545          return 0 
546
 547      def modify_interface(self, run_interface): 
 548          """routine which allows one to modify the run_card/mg5cmd object to change the 
 549          default behavior of the runs. 
 550          This is called at the time of the compilation of the run_card. 
 551          Note that this function can be called multiple times per run. 
 552          """ 
 553          #run_card = run_interface.run_card 
 554          return 
555
 556  class Packet(object): 
 557      """An object for handling a packet of jobs; it is designed to be thread safe. 
 558      """ 
 559   
 560      def __init__(self, name, fct, args, opts={}): 
 561          import six.moves.queue 
 562          import threading 
 563          self.queue = six.moves.queue.Queue() 
 564          self.tag = name 
 565          self.fct = fct 
 566          self.args = args 
 567          self.opts = opts 
 568          self.done = threading.Event() 
569
 570      def put(self, *args, **opts): 
 571          self.queue.put(*args, **opts) 
 572   
 573      append = put 
 574   
 575      def remove_one(self): 
 576          self.queue.get(True) 
 577          self.queue.task_done() 
 578          return self.queue.qsize() 
579
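
`remove_one` pairs every `get` with a `task_done`, so the `queue.join()` calls made by the consumers above return only once each id put into the packet has been accounted for; this is what makes the "last job triggers the function" logic thread safe. Life-cycle sketch (the ids are hypothetical):

    packet = Packet('tag', print, ('packet done',))
    packet.put(101)
    packet.put(102)                   # two jobs registered
    packet.remove_one()               # first job finished -> returns 1
    if packet.remove_one() == 0:      # last job finished -> returns 0
        packet.queue.join()           # returns at once: every entry is task_done
        packet.fct(*packet.args)      # prints 'packet done'
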
 580  class MultiCore(Cluster): 
 581      """class for dealing with submission on a multicore machine""" 
 582   
 583      job_id = "$" 
 584   
 585      def __init__(self, *args, **opt): 
 586          """Init the cluster""" 
 587   
 588   
 589          super(MultiCore, self).__init__(*args, **opt) 
 590   
 591          import six.moves.queue 
 592          import threading 
 593          import six.moves._thread 
 594          self.queue = six.moves.queue.Queue()      # list of jobs to do 
 595          self.done = six.moves.queue.Queue()       # list of jobs finished 
 596          self.submitted = six.moves.queue.Queue()  # one entry per job submitted 
 597          self.stoprequest = threading.Event()      # flag to ensure everything closes 
 598          self.demons = [] 
 599          self.nb_done = 0 
 600          if 'nb_core' in opt: 
 601              self.nb_core = opt['nb_core'] 
 602          elif isinstance(args[0], int): 
 603              self.nb_core = args[0] 
 604          else: 
 605              self.nb_core = 1 
 606          self.update_fct = None 
 607   
 608          self.lock = threading.Event()  # allow nice locking of the main thread 
 609          self.pids = six.moves.queue.Queue()  # allow cleaning of jobs submitted via subprocess 
 610          self.done_pid = []  # list of jobs finished 
 611          self.done_pid_queue = six.moves.queue.Queue() 
 612          self.fail_msg = None 
 613   
 614          # starting the worker threads 
 615          for _ in range(self.nb_core): 
 616              self.start_demon() 
 617   
 618   
 619      def start_demon(self): 
 620          import threading 
 621          t = threading.Thread(target=self.worker) 
 622          t.daemon = True 
 623          t.start() 
 624          self.demons.append(t) 
 625   
 626   
 627      def worker(self): 
 628          import six.moves.queue 
 629          import six.moves._thread 
 630          while not self.stoprequest.isSet(): 
 631              try: 
 632                  args = self.queue.get() 
 633                  tag, exe, arg, opt = args 
 634                  try: 
 635                      # check for the executable case 
 636                      if isinstance(exe, str): 
 637                          if os.path.exists(exe) and not exe.startswith('/'): 
 638                              exe = './' + exe 
 639                          if isinstance(opt['stdout'], str): 
 640                              opt['stdout'] = open(opt['stdout'], 'w') 
 641                          if opt['stderr'] is None: 
 642                              opt['stderr'] = subprocess.STDOUT 
 643                          if arg: 
 644                              proc = misc.Popen([exe] + arg, **opt) 
 645                          else: 
 646                              proc = misc.Popen(exe, **opt) 
 647                          pid = proc.pid 
 648                          self.pids.put(pid) 
 649                          proc.wait() 
 650                          if proc.returncode not in [0, 143, -15] and not self.stoprequest.isSet(): 
 651                              fail_msg = 'program %s launch ends with non zero status: %s. Stop all computation' % \ 
 652                                         (' '.join([exe] + arg), proc.returncode) 
 653                              logger.warning(fail_msg) 
 654                              self.stoprequest.set() 
 655                              self.remove(fail_msg) 
 656                      # handle the case when this is a python function. Note that 
 657                      # this uses threads, so there is NO built-in parallelization: this 
 658                      # runs on a single core! (fine for IO-intensive functions; 
 659                      # for CPU-intensive ones this will slow down the computation) 
 660                      else: 
 661                          pid = tag 
 662                          self.pids.put(pid) 
 663                          # the function should return 0 if everything is fine 
 664                          # and the error message otherwise 
 665                          returncode = exe(*arg, **opt) 
 666                          if returncode != 0: 
 667                              logger.warning("fct %s does not return 0. Stopping the code in a clean way. The error was:\n%s", exe, returncode) 
 668                              self.stoprequest.set() 
 669                              self.remove("fct %s does not return 0:\n %s" % (exe, returncode)) 
 670                  except Exception as error: 
 671                      self.fail_msg = sys.exc_info() 
 672                      logger.warning(str(error)) 
 673                      self.stoprequest.set() 
 674                      self.remove(error) 
 675   
 676                      if __debug__: 
 677                          six.reraise(self.fail_msg[0], self.fail_msg[1], self.fail_msg[2]) 
 678   
 679                  self.queue.task_done() 
 680                  self.done.put(tag) 
 681                  self.done_pid_queue.put(pid) 
 682                  # release the mother to print the status on the screen 
 683                  try: 
 684                      self.lock.set() 
 685                  except six.moves._thread.error: 
 686                      continue 
 687              except six.moves.queue.Empty: 
 688                  continue 
 689   
 690   
 691   
 692   
 693      def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 
 694                 log=None, required_output=[], nb_submit=0): 
 695          """submit a job on a multicore machine""" 
 696   
 697          tag = (prog, tuple(argument), cwd, nb_submit) 
 698          if isinstance(prog, str): 
 699   
 700              opt = {'cwd': cwd, 
 701                     'stdout': stdout, 
 702                     'stderr': stderr} 
 703   
 704              self.queue.put((tag, prog, argument, opt)) 
 705              self.submitted.put(1) 
 706              return tag 
 707          else: 
 708              # python function 
 709              self.queue.put((tag, prog, argument, {})) 
 710              self.submitted.put(1) 
 711              return tag 
712
 713      def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None, 
 714                          stderr=None, log=None, **opts): 
 715          """launch one job and wait for it""" 
 716          if isinstance(stdout, str): 
 717              stdout = open(stdout, 'w') 
 718          if isinstance(stderr, str): 
 719              stderr = open(stderr, 'w') 
 720          return misc.call([prog] + argument, stdout=stdout, stderr=stderr, cwd=cwd) 
721
 722      def remove(self, error=None): 
 723          """Ensure that all threads are killed""" 
 724   
 725          # ensure that the workers stop 
 726          self.stoprequest.set() 
 727          if error and not self.fail_msg: 
 728              self.fail_msg = error 
 729   
 730          # clean the done_pid_queue and move its entries to done_pid 
 731          while not self.done_pid_queue.empty(): 
 732              pid = self.done_pid_queue.get() 
 733              self.done_pid.append(pid) 
 734  #            self.done_pid_queue.task_done() 
 735   
 736          while not self.pids.empty(): 
 737              pid = self.pids.get() 
 738              self.pids.task_done() 
 739              if isinstance(pid, tuple): 
 740                  continue 
 741              if pid in self.done_pid: 
 742                  continue 
 743              out = os.system('CPIDS=$(pgrep -P %(pid)s); kill -15 $CPIDS > /dev/null 2>&1' \ 
 744                              % {'pid': pid}) 
 745              out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid': pid}) 
 746   
 747   
 748      def wait(self, me_dir, update_status, update_first=None): 
 749          """Wait until all the jobs are done. This function also ensures that 
 750          the submissions by packet are handled correctly (i.e. it submits the function).""" 
 751   
 752          import six.moves.queue 
 753          import threading 
 754   
 755          try:  # catch KeyboardInterrupt to see which kind of error to display 
 756              last_status = (0, 0, 0) 
 757              sleep_time = 1 
 758              use_lock = True 
 759              first = True 
 760              while True: 
 761                  force_one_more_loop = False  # some security 
 762   
 763                  # Loop over the jobs tagged as done to check if some packets of jobs 
 764                  # are finished; in that case, put the associated function in the queue 
 765                  while self.done.qsize(): 
 766                      try: 
 767                          tag = self.done.get(True, 1) 
 768                      except six.moves.queue.Empty: 
 769                          pass 
 770                      else: 
 771                          if self.id_to_packet and tuple(tag) in self.id_to_packet: 
 772                              packet = self.id_to_packet[tuple(tag)] 
 773                              remaining = packet.remove_one() 
 774                              if remaining == 0: 
 775                                  # fully ensure that the packet is finished (thread safe) 
 776                                  packet.queue.join() 
 777                                  self.submit(packet.fct, packet.args) 
 778                                  force_one_more_loop = True 
 779                          self.nb_done += 1 
 780                          self.done.task_done() 
 781   
 782                  # Get the Idle/Done/Running information from the various queues. 
 783                  # Those variables should be thread safe, but are approximate. 
 784                  Idle = self.queue.qsize() 
 785                  Done = self.nb_done + self.done.qsize() 
 786                  Running = max(0, self.submitted.qsize() - Idle - Done) 
 787   
 788                  if Idle + Running <= 0 and not force_one_more_loop: 
 789                      update_status(Idle, Running, Done) 
 790                      # Going to quit since everything is done. 
 791                      # Fully ensure that everything is indeed done. 
 792                      self.queue.join() 
 793                      break 
 794   
 795                  if (Idle, Running, Done) != last_status: 
 796                      if first and update_first: 
 797                          update_first(Idle, Running, Done) 
 798                          first = False 
 799                      else: 
 800                          update_status(Idle, Running, Done) 
 801                      last_status = (Idle, Running, Done) 
 802   
 803                  # clean the done_pid_queue and move its entries to done_pid 
 804                  while not self.done_pid_queue.empty(): 
 805                      pid = self.done_pid_queue.get() 
 806                      self.done_pid.append(pid) 
 807                      self.done_pid_queue.task_done() 
 808   
 809   
 810                  # Define how to wait for the next iteration 
 811                  if use_lock: 
 812                      # simply wait until a worker releases the lock 
 813                      use_lock = self.lock.wait(300) 
 814                      self.lock.clear() 
 815                      if not use_lock and Idle > 0: 
 816                          use_lock = True 
 817                  else: 
 818                      # to be sure that we will never fully lock at the end, pass to 
 819                      # a simple time.sleep() 
 820                      time.sleep(sleep_time) 
 821                      sleep_time = min(sleep_time + 2, 180) 
 822              if update_first: 
 823                  update_first(Idle, Running, Done) 
 824   
 825              if self.stoprequest.isSet(): 
 826                  if isinstance(self.fail_msg, Exception): 
 827                      raise self.fail_msg 
 828                  elif isinstance(self.fail_msg, str): 
 829                      raise Exception(self.fail_msg) 
 830                  else: 
 831                      misc.sprint(self.fail_msg) 
 832                      six.reraise(self.fail_msg[0], self.fail_msg[1], self.fail_msg[2]) 
 833              # reset variables for the next submission 
 834              try: 
 835                  self.lock.clear() 
 836              except Exception: 
 837                  pass 
 838              self.done = six.moves.queue.Queue() 
 839              self.done_pid = [] 
 840              self.done_pid_queue = six.moves.queue.Queue() 
 841              self.nb_done = 0 
 842              self.submitted = six.moves.queue.Queue() 
 843              self.pids = six.moves.queue.Queue() 
 844              self.stoprequest.clear() 
 845              self.id_to_packet = {} 
 846   
 847          except KeyboardInterrupt: 
 848              # if one of the nodes fails -> return that error 
 849              if isinstance(self.fail_msg, Exception): 
 850                  raise self.fail_msg 
 851              elif isinstance(self.fail_msg, str): 
 852                  raise Exception(self.fail_msg) 
 853              elif self.fail_msg: 
 854                  six.reraise(self.fail_msg[0], self.fail_msg[1], self.fail_msg[2]) 
 855              # else raise the original error 
 856              raise 
857
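
`MultiCore` mirrors the cluster API on a single machine: `submit` accepts either an executable path or a Python callable (the latter runs in a worker thread, so it only helps IO-bound work). A usage sketch with hypothetical job scripts:

    mc = MultiCore(4)  # four worker threads
    for i in range(10):
        mc.submit('./ajob%i' % i, cwd='/tmp/PROC/P1')
    mc.wait('/tmp/PROC', lambda idle, run, done: print(idle, run, done))
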
 858  class CondorCluster(Cluster): 
 859      """Basic class for dealing with cluster submission""" 
 860   
 861      name = 'condor' 
 862      job_id = 'CONDOR_ID' 
 863   
 864   
 865   
 866      @multiple_try() 
 867      def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 
 868                 required_output=[], nb_submit=0): 
 869          """Submit a job prog to a Condor cluster""" 
 870   
 871          text = """Executable = %(prog)s 
 872                    output = %(stdout)s 
 873                    error = %(stderr)s 
 874                    log = %(log)s 
 875                    %(argument)s 
 876                    environment = CONDOR_ID=$(Cluster).$(Process) 
 877                    Universe = vanilla 
 878                    notification = Error 
 879                    Initialdir = %(cwd)s 
 880                    %(requirement)s 
 881                    getenv=True 
 882                    queue 1 
 883                 """ 
 884   
 885          if self.cluster_queue not in ['None', None]: 
 886              requirement = 'Requirements = %s=?=True' % self.cluster_queue 
 887          else: 
 888              requirement = '' 
 889   
 890          if cwd is None: 
 891              cwd = os.getcwd() 
 892          if stdout is None: 
 893              stdout = '/dev/null' 
 894          if stderr is None: 
 895              stderr = '/dev/null' 
 896          if log is None: 
 897              log = '/dev/null' 
 898          if not os.path.exists(prog): 
 899              prog = os.path.join(cwd, prog) 
 900          if argument: 
 901              argument = 'Arguments = %s' % ' '.join(argument) 
 902          else: 
 903              argument = '' 
 904   
 905   
 906          dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout, 
 907                  'stderr': stderr, 'log': log, 'argument': argument, 
 908                  'requirement': requirement} 
 909   
 910          #open('submit_condor','w').write(text % dico) 
 911          a = misc.Popen(['condor_submit'], stdout=subprocess.PIPE, 
 912                         stdin=subprocess.PIPE) 
 913          output, _ = a.communicate((text % dico).encode()) 
 914          #output = a.stdout.read() 
 915          #Submitting job(s). 
 916          #Logging submit event(s). 
 917          #1 job(s) submitted to cluster 2253622. 
 918          pat = re.compile("submitted to cluster (\d*)", re.MULTILINE) 
 919          output = output.decode(errors='ignore') 
 920          try: 
 921              id = pat.search(output).groups()[0] 
 922          except: 
 923              raise ClusterManagmentError('fail to submit to the cluster: \n%s' \ 
 924                                          % output) 
 925          self.submitted += 1 
 926          self.submitted_ids.append(id) 
 927          return id 
 928   
 929      @store_input() 
 930      @multiple_try() 
 931      def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 
 932                  log=None, input_files=[], output_files=[], required_output=[], 
 933                  nb_submit=0): 
 934          """Submit the job on the cluster, NO SHARED DISK. 
 935          input/output files should be given relative to cwd 
 936          """ 
 937   
 938          if not required_output and output_files: 
 939              required_output = output_files 
 940   
 941          if (input_files == [] == output_files): 
 942              return self.submit(prog, argument, cwd, stdout, stderr, log, 
 943                                 required_output=required_output, nb_submit=nb_submit) 
 944   
 945          text = """Executable = %(prog)s 
 946                    output = %(stdout)s 
 947                    error = %(stderr)s 
 948                    log = %(log)s 
 949                    %(argument)s 
 950                    should_transfer_files = YES 
 951                    when_to_transfer_output = ON_EXIT 
 952                    transfer_input_files = %(input_files)s 
 953                    %(output_files)s 
 954                    Universe = vanilla 
 955                    notification = Error 
 956                    Initialdir = %(cwd)s 
 957                    %(requirement)s 
 958                    getenv=True 
 959                    queue 1 
 960                 """ 
 961   
 962          if self.cluster_queue not in ['None', None]: 
 963              requirement = 'Requirements = %s=?=True' % self.cluster_queue 
 964          else: 
 965              requirement = '' 
 966   
 967          if cwd is None: 
 968              cwd = os.getcwd() 
 969          if stdout is None: 
 970              stdout = '/dev/null' 
 971          if stderr is None: 
 972              stderr = '/dev/null' 
 973          if log is None: 
 974              log = '/dev/null' 
 975          if not os.path.exists(prog): 
 976              prog = os.path.join(cwd, prog) 
 977          if argument: 
 978              argument = 'Arguments = %s' % ' '.join([str(a) for a in argument]) 
 979          else: 
 980              argument = '' 
 981          # input/output file treatment 
 982          if input_files: 
 983              input_files = ','.join(input_files) 
 984          else: 
 985              input_files = '' 
 986          if output_files: 
 987              output_files = 'transfer_output_files = %s' % ','.join(output_files) 
 988          else: 
 989              output_files = '' 
 990   
 991   
 992   
 993          dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout, 
 994                  'stderr': stderr, 'log': log, 'argument': argument, 
 995                  'requirement': requirement, 'input_files': input_files, 
 996                  'output_files': output_files} 
 997   
 998          #open('submit_condor','w').write(text % dico) 
 999          a = subprocess.Popen(['condor_submit'], stdout=subprocess.PIPE, 
1000                               stdin=subprocess.PIPE) 
1001          output, _ = a.communicate((text % dico).encode()) 
1002          #output = a.stdout.read() 
1003          #Submitting job(s). 
1004          #Logging submit event(s). 
1005          #1 job(s) submitted to cluster 2253622. 
1006          output = output.decode(errors='ignore') 
1007          pat = re.compile("submitted to cluster (\d*)", re.MULTILINE) 
1008          try: 
1009              id = pat.search(output).groups()[0] 
1010          except: 
1011              raise ClusterManagmentError('fail to submit to the cluster: \n%s' \ 
1012                                          % output) 
1013          self.submitted += 1 
1014          self.submitted_ids.append(id) 
1015          return id 
1016   
1017   
1018   
1019   
1020   
1021      @multiple_try(nb_try=10, sleep=10) 
1022      def control_one_job(self, id): 
1023          """ control the status of a single job with its cluster id """ 
1024          cmd = 'condor_q ' + str(id) + " -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'" 
1025          status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 
1026                              stderr=subprocess.PIPE) 
1027   
1028          error = status.stderr.read().decode(errors='ignore') 
1029          if status.returncode or error: 
1030              raise ClusterManagmentError('condor_q returns error: %s' % error) 
1031   
1032          return status.stdout.readline().decode(errors='ignore').strip() 
1033   
1034      jobstatus = {'0':'U', '1':'I', '2':'R', '3':'X', '4':'C', '5':'H', '6':'E'} 
1035      @check_interupt() 
1036      @multiple_try(nb_try=10, sleep=10) 
1037      def control(self, me_dir): 
1038          """Check the status of all submitted jobs. Return (idle, run, finish, fail).""" 
1039   
1040          if not self.submitted_ids: 
1041              return 0, 0, 0, 0 
1042   
1043          packet = 15000 
1044          idle, run, fail = 0, 0, 0 
1045          ongoing = [] 
1046          for i in range(1 + (len(self.submitted_ids) - 1) // packet): 
1047              start = i * packet 
1048              stop = (i + 1) * packet 
1049              cmd = "condor_q " + ' '.join(self.submitted_ids[start:stop]) + \ 
1050                    " -format \"%d \" ClusterId " + \ 
1051                    " -format \"%d\\n\" JobStatus " 
1052   
1053              status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 
1054                                  stderr=subprocess.PIPE) 
1055              error = status.stderr.read().decode(errors='ignore') 
1056              if status.returncode or error: 
1057                  raise ClusterManagmentError('condor_q returns error: %s' % error) 
1058   
1059              for line in status.stdout: 
1060                  id, status = line.decode(errors='ignore').strip().split() 
1061                  status = self.jobstatus[status] 
1062                  ongoing.append(id) 
1063                  if status in ['I', 'U']: 
1064                      idle += 1 
1065                  elif status == 'R': 
1066                      run += 1 
1067                  elif status != 'C': 
1068                      fail += 1 
1069   
1070          for id in list(self.submitted_ids): 
1071              if id not in ongoing: 
1072                  status = self.check_termination(id) 
1073                  if status == 'wait': 
1074                      run += 1 
1075                  elif status == 'resubmit': 
1076                      idle += 1 
1077   
1078          return idle, run, self.submitted - (idle + run + fail), fail 
1079   
1080      @multiple_try() 
1081      def remove(self, *args, **opts): 
1082          """Clean the jobs on the cluster""" 
1083   
1084          if not self.submitted_ids: 
1085              return 
1086          cmd = "condor_rm %s" % ' '.join(self.submitted_ids) 
1087   
1088          status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w')) 
1089          self.submitted_ids = [] 
1090
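
For reference, the description piped to `condor_submit` by `submit2` renders along these lines (every value below is invented for illustration):

    Executable = /home/user/PROC/SubProcesses/P1/ajob1
    output = /dev/null
    error = /dev/null
    log = /dev/null
    Arguments = 0
    should_transfer_files = YES
    when_to_transfer_output = ON_EXIT
    transfer_input_files = madevent.tar.gz
    transfer_output_files = results.dat
    Universe = vanilla
    notification = Error
    Initialdir = /home/user/PROC/SubProcesses/P1
    getenv=True
    queue 1
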
1091  class PBSCluster(Cluster): 
1092      """Basic class for dealing with cluster submission""" 
1093   
1094      name = 'pbs' 
1095      job_id = 'PBS_JOBID' 
1096      idle_tag = ['Q'] 
1097      running_tag = ['T','E','R'] 
1098      complete_tag = ['C'] 
1099   
1100      maximum_submited_jobs = 2500 
1101   
1102      @multiple_try() 
1103      def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 
1104                 required_output=[], nb_submit=0): 
1105          """Submit a job prog to a PBS cluster""" 
1106   
1107          me_dir = self.get_jobs_identifier(cwd, prog) 
1108   
1109          if len(self.submitted_ids) > self.maximum_submited_jobs: 
1110              fct = lambda idle, run, finish: logger.info('Waiting for free slot: %s %s %s' % (idle, run, finish)) 
1111              self.wait(me_dir, fct, self.maximum_submited_jobs) 
1112   
1113   
1114          text = "" 
1115          if cwd is None: 
1116              cwd = os.getcwd() 
1117          else: 
1118              text = " cd %s;" % cwd 
1119          if stdout is None: 
1120              stdout = '/dev/null' 
1121          if stderr is None: 
1122              stderr = '/dev/null' 
1123          elif stderr == -2:  # -2 is subprocess.STDOUT 
1124              stderr = stdout 
1125          if log is None: 
1126              log = '/dev/null' 
1127   
1128          if not os.path.isabs(prog): 
1129              text += "./%s" % prog 
1130          else: 
1131              text += prog 
1132   
1133          if argument: 
1134              text += ' ' + ' '.join(argument) 
1135   
1136          command = ['qsub', '-o', stdout, 
1137                     '-N', me_dir, 
1138                     '-e', stderr, 
1139                     '-V'] 
1140   
1141          if self.cluster_queue and self.cluster_queue != 'None': 
1142              command.extend(['-q', self.cluster_queue]) 
1143   
1144          a = misc.Popen(command, stdout=subprocess.PIPE, 
1145                         stderr=subprocess.STDOUT, 
1146                         stdin=subprocess.PIPE, cwd=cwd) 
1147   
1148          output = a.communicate(text.encode())[0].decode(errors='ignore') 
1149          id = output.split('.')[0] 
1150          if not id.isdigit() or a.returncode != 0: 
1151              raise ClusterManagmentError('fail to submit to the cluster: \n%s' \ 
1152                                          % output) 
1153   
1154          self.submitted += 1 
1155          self.submitted_ids.append(id) 
1156          return id 
1157   
1158      @multiple_try() 
1159      def control_one_job(self, id): 
1160          """ control the status of a single job with its cluster id """ 
1161          cmd = 'qstat ' + str(id) 
1162          status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 
1163                              stderr=subprocess.STDOUT) 
1164   
1165          for line in status.stdout: 
1166              line = line.decode(errors='ignore').strip() 
1167              if 'cannot connect to server' in line or 'cannot read reply' in line: 
1168                  raise ClusterManagmentError('server disconnected') 
1169              if 'Unknown' in line: 
1170                  return 'F' 
1171              elif line.startswith(str(id)): 
1172                  jobstatus = line.split()[4] 
1173              else: 
1174                  jobstatus = "" 
1175   
1176          if status.returncode != 0 and status.returncode is not None: 
1177              raise ClusterManagmentError('server fails in some way (error code %s)' % status.returncode) 
1178          if jobstatus in self.idle_tag: 
1179              return 'I' 
1180          elif jobstatus in self.running_tag: 
1181              return 'R' 
1182          return 'F' 
1183   
1184   
1185      @multiple_try() 
1186      def control(self, me_dir): 
1187          """Check the status of the jobs associated to me_dir. Return (idle, run, finish, fail).""" 
1188          cmd = "qstat" 
1189          status = misc.Popen([cmd], stdout=subprocess.PIPE) 
1190   
1191          me_dir = self.get_jobs_identifier(me_dir) 
1192   
1193          ongoing = [] 
1194   
1195          idle, run, fail = 0, 0, 0 
1196          for line in status.stdout: 
1197              line = line.decode(errors='ignore') 
1198              if 'cannot connect to server' in line or 'cannot read reply' in line: 
1199                  raise ClusterManagmentError('server disconnected') 
1200              if me_dir in line: 
1201                  ongoing.append(line.split()[0].split('.')[0]) 
1202                  status2 = line.split()[4] 
1203                  if status2 in self.idle_tag: 
1204                      idle += 1 
1205                  elif status2 in self.running_tag: 
1206                      run += 1 
1207                  elif status2 in self.complete_tag: 
1208                      if not self.check_termination(line.split()[0].split('.')[0]): 
1209                          idle += 1 
1210                  else: 
1211                      fail += 1 
1212   
1213          if status.returncode != 0 and status.returncode is not None: 
1214              raise ClusterManagmentError('server fails in some way (error code %s)' % status.returncode) 
1215   
1216          for id in list(self.submitted_ids): 
1217              if id not in ongoing: 
1218                  status2 = self.check_termination(id) 
1219                  if status2 == 'wait': 
1220                      run += 1 
1221                  elif status2 == 'resubmit': 
1222                      idle += 1 
1223   
1224          return idle, run, self.submitted - (idle + run + fail), fail 
1225   
1226      @multiple_try() 
1227      def remove(self, *args, **opts): 
1228          """Clean the jobs on the cluster""" 
1229   
1230          if not self.submitted_ids: 
1231              return 
1232          cmd = "qdel %s" % ' '.join(self.submitted_ids) 
1233          status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w')) 
1234          self.submitted_ids = [] 
1235
1236   
1237  class SGECluster(Cluster): 
1238      """Basic class for dealing with cluster submission""" 
1239      # Class written by Arian Abrahantes. 
1240   
1241      name = 'sge' 
1242      job_id = 'JOB_ID' 
1243      idle_tag = ['qw', 'hqw', 'hRqw', 'w'] 
1244      running_tag = ['r', 't', 'Rr', 'Rt'] 
1245      identifier_length = 10 
1246   
1247      def def_get_path(self, location): 
1248          """replace strings for path issues""" 
1249          location = os.path.realpath(location) 
1250          homePath = os.getenv("HOME") 
1251          if homePath: 
1252              location = location.replace(homePath, '$HOME') 
1253          return location 
1254   
1255      @multiple_try() 
1256      def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 
1257                 required_output=[], nb_submit=0): 
1258          """Submit a job prog to an SGE cluster""" 
1259   
1260          me_dir = self.get_jobs_identifier(cwd, prog) 
1261   
1262   
1263          if cwd is None: 
1264              #cwd = os.getcwd() 
1265              cwd = self.def_get_path(os.getcwd()) 
1266          cwd1 = self.def_get_path(cwd) 
1267          text = " cd %s;" % cwd1 
1268          if stdout is None: 
1269              stdout = '/dev/null' 
1270          else: 
1271              stdout = self.def_get_path(stdout) 
1272          if stderr is None: 
1273              stderr = '/dev/null' 
1274          elif stderr == -2:  # -2 is subprocess.STDOUT 
1275              stderr = stdout 
1276          else: 
1277              stderr = self.def_get_path(stderr) 
1278   
1279          if log is None: 
1280              log = '/dev/null' 
1281          else: 
1282              log = self.def_get_path(log) 
1283   
1284          text += prog 
1285          if argument: 
1286              text += ' ' + ' '.join(argument) 
1287   
1288          # if anything slips through argument 
1289          #print "!=== intended change ", text.replace('/srv/nfs','') 
1290          #text = text.replace('/srv/nfs','') 
1291          homePath = os.getenv("HOME") 
1292          if homePath: 
1293              text = text.replace(homePath, '$HOME') 
1294   
1295          logger.debug("!=== input %s" % text) 
1296          logger.debug("!=== output %s" % stdout) 
1297          logger.debug("!=== error %s" % stderr) 
1298          logger.debug("!=== logs %s" % log) 
1299   
1300          command = ['qsub', '-o', stdout, 
1301                     '-N', me_dir, 
1302                     '-e', stderr, 
1303                     '-V'] 
1304   
1305          if self.cluster_queue and self.cluster_queue != 'None': 
1306              command.extend(['-q', self.cluster_queue]) 
1307   
1308          a = misc.Popen(command, stdout=subprocess.PIPE, 
1309                         stderr=subprocess.STDOUT, 
1310                         stdin=subprocess.PIPE, cwd=cwd) 
1311   
1312          output = a.communicate(text.encode())[0].decode(errors='ignore') 
1313          id = output.split(' ')[2] 
1314          if not id.isdigit(): 
1315              raise ClusterManagmentError('fail to submit to the cluster: \n%s' \ 
1316                                          % output) 
1317          self.submitted += 1 
1318          self.submitted_ids.append(id) 
1319          logger.debug(output) 
1320   
1321          return id 
1322   
1323      @multiple_try() 
1324      def control_one_job(self, id): 
1325          """ control the status of a single job with its cluster id """ 
1326          #cmd = 'qstat '+str(id) 
1327          cmd = 'qstat ' 
1328          status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 
1329          for line in status.stdout: 
1330              line = line.decode(errors='ignore') 
1331              #print "!==", line 
1332              #line = line.strip() 
1333              #if 'Unknown' in line: 
1334              #    return 'F' 
1335              #elif line.startswith(str(id)): 
1336              #    status = line.split()[4] 
1337              if str(id) in line: 
1338                  status = line.split()[4] 
1339                  #print "!=status", status 
1340                  if status in self.idle_tag: 
1341                      return 'I' 
1342                  elif status in self.running_tag: 
1343                      return 'R' 
1344          return 'F' 
1345   
1346      @multiple_try() 
1347      def control(self, me_dir): 
1348          """Check the status of the jobs associated to me_dir. Return (idle, run, finish, fail).""" 
1349          cmd = "qstat " 
1350          status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 
1351   
1352          me_dir = self.get_jobs_identifier(me_dir) 
1353   
1354          finished = list(self.submitted_ids) 
1355   
1356          idle, run, fail = 0, 0, 0 
1357          for line in status.stdout: 
1358              line = line.decode(errors='ignore') 
1359              if me_dir in line: 
1360                  id, _, _, _, status = line.split()[:5] 
1361                  if status in self.idle_tag: 
1362                      idle += 1 
1363                      finished.remove(id) 
1364                  elif status in self.running_tag: 
1365                      run += 1 
1366                      finished.remove(id) 
1367                  else: 
1368                      logger.debug(line) 
1369                      fail += 1 
1370                      finished.remove(id) 
1371   
1372          for id in finished: 
1373              self.check_termination(id) 
1374   
1375          return idle, run, self.submitted - (idle + run + fail), fail 
1376   
1377   
1378   
1379      @multiple_try() 
1380      def remove(self, *args, **opts): 
1381          """Clean the jobs on the cluster""" 
1382   
1383          if not self.submitted_ids: 
1384              return 
1385          cmd = "qdel %s" % ' '.join(self.submitted_ids) 
1386          status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w')) 
1387          self.submitted_ids = [] 
1388
1389   
1390  class LSFCluster(Cluster): 
1391      """Basic class for dealing with cluster submission""" 
1392   
1393      name = 'lsf' 
1394      job_id = 'LSB_JOBID' 
1395   
1396      @multiple_try() 
1397      def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 
1398                 required_output=[], nb_submit=0): 
1399          """Submit the job prog to an LSF cluster""" 
1400   
1401   
1402          me_dir = self.get_jobs_identifier(cwd, prog) 
1403   
1404          text = "" 
1405          command = ['bsub', '-C0', '-J', me_dir] 
1406          if cwd is None: 
1407              cwd = os.getcwd() 
1408          else: 
1409              text = " cd %s;" % cwd 
1410          if stdout and isinstance(stdout, str): 
1411              command.extend(['-o', stdout]) 
1412          if stderr and isinstance(stderr, str): 
1413              command.extend(['-e', stderr]) 
1414          elif stderr == -2:  # -2 is subprocess.STDOUT 
1415              pass 
1416          if log is None: 
1417              log = '/dev/null' 
1418   
1419          text += prog 
1420          if argument: 
1421              text += ' ' + ' '.join(argument) 
1422   
1423          if self.cluster_queue and self.cluster_queue != 'None': 
1424              command.extend(['-q', self.cluster_queue]) 
1425   
1426          a = misc.Popen(command, stdout=subprocess.PIPE, 
1427                         stderr=subprocess.STDOUT, 
1428                         stdin=subprocess.PIPE, cwd=cwd) 
1429   
1430          output = a.communicate(text.encode())[0].decode(errors='ignore') 
1431          #Job <nnnn> is submitted to default queue <normal>. 
1432          try: 
1433              id = output.split('>',1)[0].split('<')[1] 
1434          except: 
1435              raise ClusterManagmentError('fail to submit to the cluster: \n%s' \ 
1436                                          % output) 
1437          if not id.isdigit(): 
1438              raise ClusterManagmentError('fail to submit to the cluster: \n%s' \ 
1439                                          % output) 
1440          self.submitted += 1 
1441          self.submitted_ids.append(id) 
1442          return id 
1443   
1444   
1445      @multiple_try() 
1446      def control_one_job(self, id): 
1447          """ control the status of a single job with its cluster id """ 
1448   
1449          cmd = 'bjobs ' + str(id) 
1450          status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 
1451   
1452          for line in status.stdout: 
1453              line = line.decode(errors='ignore').strip().upper() 
1454              if 'JOBID' in line: 
1455                  continue 
1456              elif str(id) not in line: 
1457                  continue 
1458              status = line.split()[2] 
1459              if status == 'RUN': 
1460                  return 'R' 
1461              elif status == 'PEND': 
1462                  return 'I' 
1463              elif status == 'DONE': 
1464                  return 'F' 
1465              else: 
1466                  return 'H' 
1467          return 'F' 
1468   
1469      @multiple_try() 
1470      def control(self, me_dir): 
1471          """Check the status of all submitted jobs. Return (idle, run, finish, fail).""" 
1472   
1473          if not self.submitted_ids: 
1474              return 0, 0, 0, 0 
1475   
1476          cmd = "bjobs " + ' '.join(self.submitted_ids) 
1477          status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 
1478   
1479          jobstatus = {} 
1480          for line in status.stdout: 
1481              line = line.decode(errors='ignore').strip() 
1482              if 'JOBID' in line: 
1483                  continue 
1484              splitline = line.split() 
1485              id = splitline[0] 
1486              if id not in self.submitted_ids: 
1487                  continue 
1488              jobstatus[id] = splitline[2] 
1489   
1490          idle, run, fail = 0, 0, 0 
1491          for id in self.submitted_ids[:]: 
1492              if id in jobstatus: 
1493                  status = jobstatus[id] 
1494              else: 
1495                  status = 'MISSING' 
1496              if status == 'RUN': 
1497                  run += 1 
1498              elif status == 'PEND': 
1499                  idle += 1 
1500              else: 
1501                  status = self.check_termination(id) 
1502                  if status == 'wait': 
1503                      run += 1 
1504                  elif status == 'resubmit': 
1505                      idle += 1 
1506   
1507          return idle, run, self.submitted - (idle + run + fail), fail 
1508   
1509      @multiple_try() 
1510      def remove(self, *args, **opts): 
1511          """Clean the jobs on the cluster""" 
1512   
1513          if not self.submitted_ids: 
1514              return 
1515          cmd = "bkill %s" % ' '.join(self.submitted_ids) 
1516          status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w')) 
1517          self.submitted_ids = [] 
1518
1519  class GECluster(Cluster): 
1520      """Class for dealing with cluster submission on a GE cluster""" 
1521   
1522      name = 'ge' 
1523      job_id = 'JOB_ID' 
1524      idle_tag = ['qw'] 
1525      running_tag = ['r'] 
1526   
1527      @multiple_try() 
1528      def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 
1529                 required_output=[], nb_submit=0): 
1530          """Submit a job prog to a GE cluster""" 
1531   
1532          text = "" 
1533          if cwd is None: 
1534              cwd = os.getcwd() 
1535          else: 
1536              text = " cd %s; bash " % cwd 
1537          if stdout is None: 
1538              stdout = os.path.join(cwd, "log.%s" % prog.split('/')[-1]) 
1539          if stderr is None: 
1540              stderr = os.path.join(cwd, "err.%s" % prog.split('/')[-1]) 
1541          elif stderr == -2:  # -2 is subprocess.STDOUT 
1542              stderr = stdout 
1543          if log is None: 
1544              log = '/dev/null' 
1545   
1546          text += prog 
1547          if argument: 
1548              text += ' ' + ' '.join(argument) 
1549          text += '\n' 
1550          tmp_submit = os.path.join(cwd, 'tmp_submit') 
1551          open(tmp_submit, 'w').write(text) 
1552   
1553          a = misc.Popen(['qsub', '-o', stdout, 
1554                          '-e', stderr, 
1555                          tmp_submit], 
1556                         stdout=subprocess.PIPE, 
1557                         stderr=subprocess.STDOUT, 
1558                         stdin=subprocess.PIPE, cwd=cwd) 
1559   
1560          output = a.communicate()[0].decode(errors='ignore') 
1561          #Your job 874511 ("test.sh") has been submitted 
1562          pat = re.compile("Your job (\d*) \(", re.MULTILINE) 
1563          try: 
1564              id = pat.search(output).groups()[0] 
1565          except: 
1566              raise ClusterManagmentError('fail to submit to the cluster: \n%s' \ 
1567                                          % output) 
1568          self.submitted += 1 
1569          self.submitted_ids.append(id) 
1570          return id 
1571   
1572      @multiple_try() 
1573      def control_one_job(self, id): 
1574          """ control the status of a single job with its cluster id """ 
1575          cmd = 'qstat | grep ' + str(id) 
1576          status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 
1577          if not status: 
1578              return 'F' 
1579          #874516 0.00000 test.sh alwall qw 03/04/2012 22:30:35 1 
1580          pat = re.compile("^(\d+)\s+[\d\.]+\s+[\w\d\.]+\s+[\w\d\.]+\s+(\w+)\s") 
1581          stat = '' 
1582          for line in status.stdout.read().decode(errors='ignore').split('\n'): 
1583              if not line: 
1584                  continue 
1585              line = line.strip() 
1586              try: 
1587                  groups = pat.search(line).groups() 
1588              except: 
1589                  raise ClusterManagmentError('bad syntax for stat: \n\"%s\"' % line) 
1590              if groups[0] != id: continue 
1591              stat = groups[1] 
1592          if not stat: 
1593              return 'F' 
1594          if stat in self.idle_tag: 
1595              return 'I' 
1596          if stat in self.running_tag: 
1597              return 'R' 
1598   
1599      @multiple_try() 
1600      def control(self, me_dir=None): 
1601          """Check the status of the jobs associated to the directory me_dir. Return (idle, run, finish, fail).""" 
1602          if not self.submitted_ids: 
1603              return 0, 0, 0, 0 
1604          idle, run, fail = 0, 0, 0 
1605          ongoing = [] 
1606          for statusflag in ['p', 'r', 'sh']: 
1607              cmd = 'qstat -s %s' % statusflag 
1608              status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 
1609              #874516 0.00000 test.sh alwall qw 03/04/2012 22:30:35 1 
1610              pat = re.compile("^(\d+)") 
1611              for line in status.stdout.read().decode(errors='ignore').split('\n'): 
1612                  line = line.strip() 
1613                  try: 
1614                      id = pat.search(line).groups()[0] 
1615                  except Exception: 
1616                      pass 
1617                  else: 
1618                      if id not in self.submitted_ids: 
1619                          continue 
1620                      ongoing.append(id) 
1621                      if statusflag == 'p': 
1622                          idle += 1 
1623                      if statusflag == 'r': 
1624                          run += 1 
1625                      if statusflag == 'sh': 
1626                          fail += 1 
1627          for id in list(self.submitted_ids): 
1628              if id not in ongoing: 
1629                  self.check_termination(id) 
1630          #self.submitted_ids = ongoing 
1631   
1632          return idle, run, self.submitted - idle - run - fail, fail 
1633   
1634      @multiple_try() 
1635      def remove(self, *args, **opts): 
1636          """Clean the jobs on the cluster""" 
1637   
1638          if not self.submitted_ids: 
1639              return 
1640          cmd = "qdel %s" % ' '.join(self.submitted_ids) 
1641          status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w')) 
1642          self.submitted_ids = [] 
1643
1644  def asyncrone_launch(exe, cwd=None, stdout=None, argument=[], **opt): 
1645      """Start a computation without waiting for it to finish. 
1646      This function returns a lock which is locked as long as the job is 
1647      running.""" 
1648   
1649      mc = MultiCore(1) 
1650      mc.submit(exe, argument, cwd, stdout, **opt) 
1651      mc.need_waiting = True 
1652      return mc.lock 
1653
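
The returned lock is the `threading.Event` of the single-core `MultiCore` instance: the worker sets it when the job finishes, so callers can poll or block on it. Usage sketch (the script name is hypothetical):

    lock = asyncrone_launch('./make_html', cwd='/tmp/PROC')
    # ... do other work ...
    lock.wait()  # block until the asynchronous job has completed
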
1654   
1655  class SLURMCluster(Cluster): 
1656      """Basic class for dealing with cluster submission""" 
1657   
1658      name = 'slurm' 
1659      job_id = 'SLURM_JOBID' 
1660      idle_tag = ['Q', 'PD', 'S', 'CF'] 
1661      running_tag = ['R', 'CG'] 
1662      complete_tag = ['C'] 
1663      identifier_length = 8 
1664   
1665      @multiple_try() 
1666      def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 
1667                 required_output=[], nb_submit=0): 
1668          """Submit a job prog to a SLURM cluster""" 
1669   
1670          me_dir = self.get_jobs_identifier(cwd, prog) 
1671   
1672   
1673          if cwd is None: 
1674              cwd = os.getcwd() 
1675          if stdout is None: 
1676              stdout = '/dev/null' 
1677          if stderr is None: 
1678              stderr = '/dev/null' 
1679          elif stderr == -2:  # -2 is subprocess.STDOUT 
1680              stderr = stdout 
1681          if log is None: 
1682              log = '/dev/null' 
1683   
1684          command = ['sbatch', '-o', stdout, 
1685                     '-J', me_dir, 
1686                     '-e', stderr, prog] + argument 
1687   
1688          if self.cluster_queue and self.cluster_queue != 'None': 
1689              command.insert(1, '-p') 
1690              command.insert(2, self.cluster_queue) 
1691   
1692          a = misc.Popen(command, stdout=subprocess.PIPE, 
1693                         stderr=subprocess.STDOUT, 
1694                         stdin=subprocess.PIPE, cwd=cwd) 
1695   
1696          output = a.communicate() 
1697          output_arr = output[0].decode(errors='ignore').split(' ') 
1698          id = output_arr[3].rstrip() 
1699   
1700          if not id.isdigit(): 
1701              id = re.findall('Submitted batch job ([\d\.]+)', ' '.join(output_arr)) 
1702   
1703              if not id or len(id) > 1: 
1704                  raise ClusterManagmentError('fail to submit to the cluster: \n%s' \ 
1705                                              % ('stdout: %s\nstderr %s' % (output[0], output[1]))) 
1706              id = id[0] 
1707   
1708   
1709          self.submitted += 1 
1710          self.submitted_ids.append(id) 
1711          return id 
1712   
1713      @multiple_try() 
1714      def control_one_job(self, id): 
1715          """ control the status of a single job with its cluster id """ 
1716          cmd = 'squeue -j ' + str(id) 
1717          status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 
1718                              stderr=open(os.devnull, 'w')) 
1719   
1720          for line in status.stdout: 
1721              line = line.decode(errors='ignore').strip() 
1722              if 'Invalid' in line: 
1723                  return 'F' 
1724              elif line.startswith(str(id)): 
1725                  status = line.split()[4] 
1726                  if status in self.idle_tag: 
1727                      return 'I' 
1728                  elif status in self.running_tag: 
1729                      return 'R' 
1730          return 'F' 
1731   
1732      @multiple_try() 
1733      def control(self, me_dir): 
1734          """Check the status of the jobs associated to me_dir. Return (idle, run, finish, fail).""" 
1735          cmd = "squeue" 
1736          pstatus = misc.Popen([cmd], stdout=subprocess.PIPE) 
1737   
1738          me_dir = self.get_jobs_identifier(me_dir) 
1739   
1740          idle, run, fail = 0, 0, 0 
1741          ongoing = [] 
1742          for line in pstatus.stdout: 
1743              line = line.decode(errors='ignore') 
1744              if me_dir in line: 
1745                  id, _, _, _, status, _ = line.split(None, 5) 
1746                  ongoing.append(id) 
1747                  if status in self.idle_tag: 
1748                      idle += 1 
1749                  elif status in self.running_tag: 
1750                      run += 1 
1751                  elif status in self.complete_tag: 
1752                      status = self.check_termination(id) 
1753                      if status == 'wait': 
1754                          run += 1 
1755                      elif status == 'resubmit': 
1756                          idle += 1 
1757                  else: 
1758                      fail += 1 
1759   
1760          # check the other finished jobs 
1761          for id in list(self.submitted_ids): 
1762              if id not in ongoing: 
1763                  status = self.check_termination(id) 
1764                  if status == 'wait': 
1765                      run += 1 
1766                  elif status == 'resubmit': 
1767                      idle += 1 
1768   
1769   
1770          return idle, run, self.submitted - (idle + run + fail), fail 
1771   
1772      @multiple_try() 
1773      def remove(self, *args, **opts): 
1774          """Clean the jobs on the cluster""" 
1775   
1776          if not self.submitted_ids: 
1777              return 
1778          cmd = "scancel %s" % ' '.join(self.submitted_ids) 
1779          status = misc.Popen([cmd], shell=True, stdout=open(os.devnull, 'w')) 
1780          self.submitted_ids = [] 
1781
1782  class HTCaaSCluster(Cluster): 
1783      """Class for dealing with cluster submission on a HTCaaS cluster using GPFS""" 
1784   
1785      name = 'htcaas' 
1786      job_id = 'HTCAAS_JOBID' 
1787      idle_tag = ['waiting'] 
1788      running_tag = ['preparing', 'running'] 
1789      complete_tag = ['done'] 
1790   
1791      @store_input() 
1792      @multiple_try() 
1793      def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 
1794                  log=None, input_files=[], output_files=[], required_output=[], 
1795                  nb_submit=0): 
1796          """Submit the HTCaaS job on the cluster with NO SHARED DISK. 
1797          input/output files should be given relative to cwd 
1798          """ 
1799          # to make the workspace name (temp) 
1800          cur_usr = os.getenv('USER') 
1801   
1802          if cwd is None: 
1803              cwd = os.getcwd() 
1804   
1805          cwd_cp = cwd.rsplit("/", 2) 
1806   
1807          if not stdout is None: 
1808              print("stdout: %s" % stdout) 
1809   
1810          if not os.path.exists(prog): 
1811              prog = os.path.join(cwd, prog) 
1812   
1813          if not required_output and output_files: 
1814              required_output = output_files 
1815   
1816          logger.debug(prog) 
1817          if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog: 
1818              cwd_arg = cwd + "/arguments" 
1819              temp = ' '.join([str(a) for a in argument]) 
1820              arg_cmd = "echo '" + temp + "' > " + cwd_arg 
1821              command = ['htcaas-mgjob-submit', '-d', cwd, '-e', os.path.basename(prog)] 
1822              if argument: 
1823                  command.extend(['-a ', '='.join([str(a) for a in argument])]) 
1824              a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 
1825              id = a.stdout.read().decode(errors='ignore').strip() 
1826   
1827          else: 
1828              cwd_arg = cwd + "/arguments" 
1829              temp = ' '.join([str(a) for a in argument]) 
1830              temp_file_name = "sub." + os.path.basename(prog) 
1831              text = """#!/bin/bash 
1832              MYPWD=%(cwd)s 
1833              cd $MYPWD 
1834              input_files=( %(input_files)s ) 
1835              for i in ${input_files[@]} 
1836              do 
1837                  chmod -f +x $i 
1838              done 
1839              /bin/bash %(prog)s %(arguments)s > %(stdout)s 
1840              """ 
1841              dico = {'cwd': cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog': prog, 
1842                      'arguments': ' '.join([str(a) for a in argument]), 
1843                      'program': ' ' if '.py' in prog else 'bash'} 
1844   
1845              # writing a new script for the submission 
1846              new_prog = pjoin(cwd, temp_file_name) 
1847              open(new_prog, 'w').write(text % dico) 
1848              misc.Popen(['chmod', '+x', new_prog], cwd=cwd) 
1849              command = ['htcaas-mgjob-submit', '-d', cwd, '-e', temp_file_name] 
1850              a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 
1851              id = a.stdout.read().decode(errors='ignore').strip() 
1852              logger.debug(id) 
1853   
1854          nb_try = 0 
1855          nb_limit = 5 
1856          if not id.isdigit(): 
1857              print("[ID is not digit]:" + id) 
1858   
1859          while not id.isdigit(): 
1860              nb_try += 1 
1861              print("[fail_retry]:%s" % nb_try) 
1862              a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 
1863              id = a.stdout.read().decode(errors='ignore').strip() 
1864              if nb_try > nb_limit: 
1865                  raise ClusterManagmentError('fail to submit to the HTCaaS cluster: \n %s' % id) 
1866                  break 
1867   
1868          self.submitted += 1 
1869          self.submitted_ids.append(id) 
1870   
1871          return id 
1872 1873 @multiple_try(nb_try=10, sleep=5)
1874 - def control_one_job(self, id):
1875 """ control the status of a single job with it's cluster id """ 1876 1877 if id == 0 : 1878 status_out ='C' 1879 else : 1880 cmd = 'htcaas-job-status -m '+str(id)+ " -s | grep Status " 1881 status = misc.Popen([cmd], shell=True,stdout=subprocess.PIPE, 1882 stderr=subprocess.PIPE) 1883 error = status.stderr.read().decode(errors='ignore') 1884 if status.returncode or error: 1885 raise ClusterManagmentError('htcaas-job-submit returns error: %s' % error) 1886 status_out= status.stdout.read().decode(errors='ignore').strip() 1887 status_out= status_out.split(":",1)[1] 1888 if status_out == 'waiting': 1889 status_out='I' 1890 elif status_out == 'preparing' or status_out == 'running': 1891 status_out = 'R' 1892 elif status_out != 'done': 1893 status_out = 'F' 1894 elif status_out == 'done': 1895 status_out = 'C' 1896 1897 return status_out
1898 1899 @multiple_try()
1900 - def control(self, me_dir):
1901 """ control the status of a single job with it's cluster id """ 1902 if not self.submitted_ids: 1903 logger.debug("self.submitted_ids not exists") 1904 return 0, 0, 0, 0 1905 1906 ongoing = [] 1907 idle, run, fail = 0, 0, 0 1908 1909 start = self.submitted_ids[0] 1910 end = self.submitted_ids[-1] 1911 1912 cmd = "htcaas-job-status -c "+str(start)+"-"+str(end)#+" -ac" 1913 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1914 1915 for line in status.stdout: 1916 #ongoing.append(line.split()[0].strip()) 1917 status2 = line.decode(errors='ignore').split()[-1] 1918 if status2 != 'null' or line.split()[0].strip() != '0': 1919 ongoing.append(line.split()[0].strip()) 1920 logger.debug("["+line.split()[0].strip()+"]"+status2) 1921 if status2 != 'null' or line.split()[0].strip() != '0': 1922 idle += 1 1923 elif status2 in self.idle_tag: 1924 idle += 1 1925 elif status2 in self.running_tag: 1926 run += 1 1927 elif status2 in self.complete_tag: 1928 if not self.check_termination(line.split()[0]): 1929 idle +=1 1930 else: 1931 fail += 1 1932 1933 return idle, run, self.submitted - (idle+run+fail), fail
1934 1935 @multiple_try()
1936 - def remove(self, *args, **opts):
1937 """Clean the jobson the cluster""" 1938 1939 if not self.submitted_ids: 1940 return 1941 for i in range(len(self.submitted_ids)): 1942 cmd = "htcaas-job-cancel -m %s" % self.submitted_ids[i] 1943 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1944
1945 -class HTCaaS2Cluster(Cluster):
1946 """Class for dealing with cluster submission on a HTCaaS cluster without GPFS """ 1947 1948 name= 'htcaas2' 1949 job_id = 'HTCAAS2_JOBID' 1950 idle_tag = ['waiting'] 1951 running_tag = ['preparing','running'] 1952 complete_tag = ['done'] 1953 1954 @store_input() 1955 @multiple_try()
1956 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 1957 log=None, input_files=[], output_files=[], required_output=[], 1958 nb_submit=0):
1959 1960 """Submit the HTCaaS job on the cluster with NO SHARE DISK 1961 input/output file should be given as relative to CWD 1962 """ 1963 if cwd is None: 1964 cwd = os.getcwd() 1965 1966 if not os.path.exists(prog): 1967 prog = os.path.join(cwd, prog) 1968 1969 if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog : 1970 if cwd or prog : 1971 self.submitted_dirs.append(cwd) 1972 self.submitted_exes.append(prog) 1973 else: 1974 logger.debug("cwd and prog not exist->"+cwd+" / "+ os.path.basename(prog)) 1975 1976 if argument : 1977 self.submitted_args.append('='.join([str(a) for a in argument])) 1978 1979 if cwd or prog : 1980 self.submitted += 1 1981 id = self.submitted 1982 self.submitted_ids.append(id) 1983 else: 1984 logger.debug("cwd and prog are not exist! ") 1985 id = 0 1986 1987 else: 1988 temp_file_name = "sub."+ os.path.basename(prog) 1989 text = """#!/bin/bash 1990 MYPWD=%(cwd)s 1991 cd $MYPWD 1992 input_files=(%(input_files)s ) 1993 for i in ${input_files[@]} 1994 do 1995 chmod -f +x $i 1996 done 1997 /bin/bash %(prog)s %(arguments)s > %(stdout)s 1998 """ 1999 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog, 2000 'arguments': ' '.join([str(a) for a in argument]), 2001 'program': ' ' if '.py' in prog else 'bash'} 2002 # writing a new script for the submission 2003 new_prog = pjoin(cwd, temp_file_name) 2004 open(new_prog, 'w').write(text % dico) 2005 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 2006 command = ['htcaas-mgjob-submit','-d',cwd,'-e',new_prog] 2007 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 2008 id = a.stdout.read().strip() 2009 logger.debug("[mode2]-["+str(id)+"]") 2010 if cwd and prog : 2011 self.submitted += 1 2012 self.submitted_ids.append(id) 2013 else: 2014 logger.debug("cwd and prog are not exist! ") 2015 id = 0 2016 2017 return id
2018 2019 @multiple_try()
2020 - def metasubmit(self, me_dir=None):
2021          if self.submitted > 1100 and self.submitted == len(self.submitted_ids):
2022              tmp_leng = len(self.submitted_ids) // 2  # integer division: the result is a slice index
2023              tmp_dirs1 = self.submitted_dirs[0:tmp_leng]
2024              tmp_dirs2 = self.submitted_dirs[tmp_leng:]
2025              tmp_exes1 = self.submitted_exes[0:tmp_leng]
2026              tmp_exes2 = self.submitted_exes[tmp_leng:]
2027              command1 = ['htcaas-mgjob-submit', '-d', ":".join([str(a) for a in tmp_dirs1 if a and a != ' ']),
2028                          '-e', ":".join([str(a) for a in tmp_exes1 if a and a != ' '])]
2029              command2 = ['htcaas-mgjob-submit', '-d', ":".join([str(a) for a in tmp_dirs2 if a and a != ' ']),
2030                          '-e', ":".join([str(a) for a in tmp_exes2 if a and a != ' '])]
2031              if len(self.submitted_args) > 0:
2032                  tmp_args1 = self.submitted_args[0:tmp_leng]
2033                  tmp_args2 = self.submitted_args[tmp_leng:]
2034                  command1.extend(['-a', ':'.join([str(a) for a in tmp_args1])])
2035                  command2.extend(['-a', ':'.join([str(a) for a in tmp_args2])])
2036              result1 = misc.Popen(command1, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
2037              result2 = misc.Popen(command2, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
2038              me_dir = result1.stdout.read().decode(errors='ignore').strip() + "//" + result2.stdout.read().decode(errors='ignore').strip()
2039
2040          elif self.submitted > 0 and self.submitted == self.submitted_ids[-1]:
2041              command = ['htcaas-mgjob-submit', '-d', ":".join([str(a) for a in self.submitted_dirs if a and a != ' ']),
2042                         '-e', ":".join([str(a) for a in self.submitted_exes if a and a != ' '])]
2043              if len(self.submitted_args) > 0:
2044                  command.extend(['-a', ':'.join([str(a) for a in self.submitted_args])])
2045              if self.submitted_dirs[0] or self.submitted_exes[0]:
2046                  result = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
2047                  me_dir = result.stdout.read().decode(errors='ignore').strip()
2048                  self.submitted_ids[0] = me_dir
2049              else:
2050                  me_dir = self.submitted_ids[-1]
2051          elif self.submitted > 0 and self.submitted != self.submitted_ids[-1]:
2052              me_dir = self.submitted_ids[0]
2053          else:
2054              me_dir = -1
2055
2056          logger.debug("[" + str(me_dir) + "]")
2057
2058          self.submitted_dirs = []
2059          self.submitted_exes = []
2060          self.submitted_args = []
2061
2062          return me_dir
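For batches above 1100 jobs, the accumulated lists are halved and submitted as two meta-jobs whose ids are joined with '//'. A self-contained sketch of that halving, assuming plain Python lists (function name and paths are illustrative):

    def halve(seq):
        """Split a list into two halves, as in the >1100-job branch above."""
        mid = len(seq) // 2   # integer division, so the result can index a slice
        return seq[:mid], seq[mid:]

    dirs1, dirs2 = halve(['/work/a', '/work/b', '/work/c'])   # illustrative paths
    # dirs1 == ['/work/a'], dirs2 == ['/work/b', '/work/c']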
2063 2064 2065 @multiple_try(nb_try=10, sleep=5)
2066 - def control_one_job(self, id):
2067 """ control the status of a single job with it's cluster id """ 2068 #logger.debug("CONTROL ONE JOB MODE") 2069 if self.submitted == self.submitted_ids[-1] : 2070 id = self.metasubmit(self) 2071 tempid = self.submitted_ids[-1] 2072 self.submitted_ids.remove(self.submitted_ids[-1]) 2073 self.submitted_ids.append(id) 2074 logger.debug(str(id)+" // "+str(self.submitted_ids[-1])) 2075 2076 if id == 0 : 2077 status_out ='C' 2078 else: 2079 cmd = 'htcaas-job-status -m '+ str(id) + " -s | grep Status " 2080 status = misc.Popen([cmd],shell=True,stdout=subprocess.PIPE, 2081 stderr=subprocess.PIPE) 2082 error = status.stderr.read().decode(errors='ignore') 2083 if status.returncode or error: 2084 raise ClusterManagmentError('htcaas-job-status returns error: %s' % error) 2085 status_out= status.stdout.read().decode(errors='ignore').strip() 2086 status_out= status_out.split(":",1)[1] 2087 logger.debug("[["+str(id)+"]]"+status_out) 2088 if status_out == 'waiting': 2089 status_out='I' 2090 elif status_out == 'preparing' or status_out == 'running': 2091 status_out = 'R' 2092 elif status_out != 'done': 2093 status_out = 'F' 2094 elif status_out == 'done': 2095 status_out = 'C' 2096 self.submitted -= 1 2097 2098 return status_out
2099 2100 @multiple_try()
2101 - def control(self, me_dir):
2102 """ control the status of a single job with it's cluster id """ 2103 if not self.submitted_ids: 2104 logger.debug("self.submitted_ids not exists") 2105 return 0, 0, 0, 0 2106 2107 if "//" in me_dir : 2108 if int(me_dir.split("//")[0]) < int(me_dir.split("//")[1]) : 2109 start = me_dir.split("//")[0] 2110 end = me_dir.split("//")[1] 2111 else : 2112 start = me_dir.split("//")[1] 2113 end = me_dir.split("//")[0] 2114 elif "/" in me_dir : # update 2115 start = 0 2116 end = 0 2117 elif me_dir.isdigit(): 2118 start = me_dir 2119 end = me_dir 2120 elif not me_dir.isdigit(): 2121 me_dir = self.submitted_ids[0] 2122 logger.debug("Meta_ID is not digit(control), self.submitted_ids[0]: "+str(me_dir) ) 2123 2124 ongoing = [] 2125 idle, run, fail, done = 0, 0, 0, 0 2126 2127 cmd = "htcaas-job-status -c "+str(start)+"-"+str(end) +" -ac" 2128 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 2129 2130 for line in status.stdout: 2131 line = line.decode(errors='ignore') 2132 status2 = line.split()[-1] 2133 if status2 != 'null' or line.split()[0].strip() != '0': 2134 ongoing.append(str(line.split()[0].strip())+"-"+str(line.split()[1].strip())) 2135 logger.debug("["+line.split()[0].strip()+"-"+line.split()[1].strip()+"]"+status2) 2136 2137 if status2 == 'null' or line.split()[0].strip() == '0': 2138 idle += 1 2139 elif status2 in self.idle_tag: 2140 idle += 1 2141 elif status2 in self.running_tag: 2142 run += 1 2143 elif status2 in self.complete_tag: 2144 done += 1 2145 self.submitted -= 1 2146 if not self.check_termination(line.split()[1]): 2147 idle +=1 2148 else: 2149 fail += 1 2150 2151 return idle, run, self.submitted - (idle+run+fail), fail
2152 2153 @multiple_try()
2154 - def remove(self, *args, **opts):
2155 """Clean the jobson the cluster""" 2156 2157 if not self.submitted_ids: 2158 return 2159 id = self.submitted_ids[0] 2160 if id: 2161 cmd = "htcaas-job-cancel -m %s" % str(id) 2162 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
2163
2164  from_name = {'condor': CondorCluster, 'pbs': PBSCluster, 'sge': SGECluster,
2165               'lsf': LSFCluster, 'ge': GECluster, 'slurm': SLURMCluster,
2166               'htcaas': HTCaaSCluster, 'htcaas2': HTCaaS2Cluster}
2167
2168  onecore = MultiCore(1)  # create a thread to run simple bash jobs without having to
2169                          # fork the main process
2170
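A hedged usage sketch of the `from_name` dispatch table; the option values shown are illustrative, mirroring keyword options the cluster classes accept:

    cluster_name = 'slurm'   # e.g. taken from the run options
    cluster = from_name[cluster_name](cluster_queue='madgraph',
                                      cluster_nb_retry=1,
                                      cluster_retry_wait=300)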