Source code for anadama2.grid.grid

# -*- coding: utf-8 -*-

import os
import sys
import threading
try:
    import Queue
except ImportError:
    import queue as Queue
import time
import tempfile
import string
import logging
import itertools
import re

import six

from .. import runners
from .. import picklerunner

from ..helpers import format_command
from ..helpers import file_size

if os.name == 'posix' and sys.version_info[0] < 3:
    import subprocess32 as subprocess
else:
    import subprocess

class GridJobRequires(object):
    """Defines the resources required for a task on the grid.

    :param time: Wall clock time in minutes.
    :type time: int

    :param mem: RAM Usage in MB (8*1024*1024 bits).
    :type mem: int

    :param cores: CPU cores.
    :type cores: int

    :param partition: grid partition.
    :type partition: string
    """

    def __init__(self, time, mem, cores, partition, depends=None):
        # if time is not an int, try to format the equation
        if not str(time).isdigit():
            self.time = format_command(time, depends=depends, cores=cores)
        else:
            self.time = int(time)

        # if memory is not an int, try to format the equation
        if not str(mem).isdigit():
            self.mem = format_command(mem, depends=depends, cores=cores)
        else:
            self.mem = int(mem)

        self.cores = int(cores)
        self.partition = partition
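# Illustrative sketch (not part of the module): a hypothetical resource request
# for a 60-minute, 4096 MB, 2-core job on an assumed "serial" partition.
# Integer values are stored as-is; string values are treated as equations and
# formatted with format_command().
#
# >>> requires = GridJobRequires(time=60, mem=4096, cores=2, partition="serial")
# >>> (requires.time, requires.mem, requires.cores, requires.partition)
# (60, 4096, 2, 'serial')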
class Grid(object):
    """ Base Grid Workflow manager class """

    def __init__(self, name, worker, queue, tmpdir, benchmark_on=None):
        self.name = name
        self.worker = worker
        self.queue = queue
        self.tmpdir = tmpdir

        # create the folder if it does not already exist for temp directory
        if not os.path.isdir(self.tmpdir):
            os.makedirs(self.tmpdir)

        self.task_data = dict()

    def _get_grid_task_settings(self, kwargs, depends):
        """ Get the resources required to run this task on the grid """

        # check for the required keywords
        requires = []
        for key in ["time", "mem", "cores"]:
            try:
                requires.append(kwargs[key])
            except KeyError:
                raise KeyError(key + " is a required keyword argument for a grid task")

        # check for optional keyword
        try:
            requires.append(kwargs["partition"])
        except KeyError:
            requires.append(None)

        requires += [depends]

        return (GridJobRequires(*requires), self.tmpdir)
    def do(self, task, **kwargs):
        """Accepts the following extra arguments:

        :param time: The maximum time in minutes allotted to run the command
        :type time: int

        :param mem: The maximum memory in megabytes allocated to run the command
        :type mem: int

        :param cores: The number of CPU cores allocated to the job
        :type cores: int

        :param partition: The grid partition to send this job to
        :type partition: str
        """

        self.add_task(task, **kwargs)
    def add_task(self, task, **kwargs):
        """Accepts the following extra arguments:

        :keyword time: The maximum time in minutes allotted to run the command
        :type time: int

        :keyword mem: The maximum memory in megabytes allocated to run the command
        :type mem: int

        :keyword cores: The number of CPU cores allocated to the job
        :type cores: int

        :keyword partition: The grid partition to send this job to
        :type partition: str
        """

        self.task_data[task.task_no] = self._get_grid_task_settings(kwargs, task.depends)
    def runner(self, workflow, jobs=1, grid_jobs=1):
        runner = runners.GridRunner(workflow)
        runner.add_worker(runners.ParallelLocalWorker, name="local",
                          rate=jobs, default=True)
        runner.add_worker(self.worker, name=self.name, rate=grid_jobs)
        runner.routes.update((
            (task_no, (self.name, list(extra) + [self.queue, workflow._reporter]))
            for task_no, extra in six.iteritems(self.task_data)
        ))
        return runner
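# Illustrative sketch (not part of the module): assuming "grid" is an instance
# of a Grid subclass attached to an anadama2 workflow and "task" is a task the
# workflow created, per-task grid resources are supplied through the keywords
# checked in _get_grid_task_settings() above.
#
# >>> grid.add_task(task, time=120, mem=8192, cores=4, partition="general")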
class GridQueue(object):

    def __init__(self, partition, benchmark_on=None):
        # check for short/long partitions
        if not isinstance(partition, list):
            partition = [x.strip() for x in partition.split(",")]

        try:
            self.partition_short, self.partition_long, self.partition_cutoff = partition
            self.partition_cutoff = int(self.partition_cutoff)
        except ValueError:
            self.partition_short = partition[0]
            self.partition_long = partition[0]
            self.partition_cutoff = 0

        # this is the refresh rate for checking the queue, in seconds
        self.refresh_rate = 10*60

        # this is the rate for checking the job status, in seconds
        self.check_job_rate = 60

        # this is the amount of time to wait, in seconds, if there is a time out
        # socket error returned from the scheduler when running a command
        self.timeout_sleep = 5*60

        # this is the number of times to retry after a timeout error
        self.timeout_retry_max = 3

        # this is the number of seconds to wait after job submission
        self.submit_sleep = 5

        # this is the last time the queue was checked
        self.last_check = time.time()
        self.sacct = None

        # create a lock for jobs in queue
        self.lock_status = threading.Lock()
        self.lock_submit = threading.Lock()

        # set if benchmarking should be run
        self.benchmark_on = benchmark_on

    @staticmethod
    def submit_command(grid_script):
        raise NotImplementedError

    def submit_template(self):
        raise NotImplementedError

    def job_failed(self, status):
        raise NotImplementedError

    def job_stopped(self, status):
        raise NotImplementedError

    def refresh_queue_status(self):
        raise NotImplementedError

    def job_memkill(self, status, jobid, memory):
        return False

    def job_timeout(self, status, jobid, time):
        return False

    def get_job_status_from_stderr(self, error_file, grid_job_status, grid_jobid):
        return grid_job_status

    def get_partition(self, time, partition):
        """ Get the partition for the task based on the time requested """

        # if a partition is already set for the task, use that partition
        if partition is not None:
            return partition

        if time > self.partition_cutoff:
            return self.partition_long
        else:
            return self.partition_short

    def get_queue_status(self, refresh=None):
        """ Get the queue accounting stats """

        # lock to prevent race conditions with status update
        self.lock_status.acquire()

        # check the last time the queue was captured and refresh if set
        current_time = time.time()
        if (current_time - self.last_check > self.refresh_rate) or refresh or self.sacct is None:
            self.last_check = current_time
            logging.info("Getting latest queue info to refresh job status")
            self.sacct = self.refresh_queue_status()

        self.lock_status.release()

        return self.sacct

    def get_all_stats_for_jobid(self, jobid):
        """ Get all the stats for a specific job id """

        # use the existing stats, to get the information for the jobid
        try:
            job_stats = list(filter(lambda x: x[0].startswith(jobid), self.get_queue_status()))
        except IndexError:
            job_stats = []

        # if the job stats are not found for the job, return an NA state
        if not job_stats:
            job_stats = [[jobid, "Pending", "NA", "NA", "NA"]]

        return job_stats

    def get_job_status(self, jobid):
        """ Check the status of the job """

        info = self.get_all_stats_for_jobid(jobid)

        return info[0][1]

    def record_benchmark(self, jobid, task_number, reporter):
        """ Check the benchmarking stats of the grid id """

        # check if benchmarking is set to off
        if not self.benchmark_on:
            logging.info("Benchmarking is set to off")
            return

        reporter.task_grid_status(task_number, jobid, "Getting benchmarking data")
        status, cpus, elapsed, memory = self.get_benchmark(jobid)
        logging.info("Benchmark information for job id %s:\nElapsed Time: %s \nCores: %s\nMemory: %s MB",
            task_number, elapsed, cpus, memory)
        reporter.task_grid_status(task_number, jobid, "Final status of " + status)

    def get_benchmark(self, jobid, wait=None):
        """ Get the benchmarking data for the jobid """

        # if the job is not shown to have finished running then
        # wait for the next queue refresh
        status = self.get_job_status(jobid)
        if wait or not (self.job_stopped(status) or self.job_failed(status)):
            wait_time = abs(self.refresh_rate - (time.time() - self.last_check)) + 10
            time.sleep(wait_time)

        info = self.get_all_stats_for_jobid(jobid)

        try:
            status = info[0][1]
        except IndexError:
            status = "Unknown"

        try:
            cpus = info[0][2]
        except IndexError:
            cpus = "NA"

        try:
            elapsed = info[0][3]
        except IndexError:
            elapsed = "NA"

        # get the memory max from the batch line which is the second line of output
        try:
            memory = info[0][4]
        except IndexError:
            memory = "NA"

        if "K" in memory:
            # if memory is in KB, convert to MB
            memory = "{:.1f}".format(float(memory.replace("K", "")) / 1024.0)
        elif "M" in memory:
            memory = "{:.1f}".format(float(memory.replace("M", "")))
        elif "G" in memory:
            # if memory is in GB, convert to MB
            memory = "{:.1f}".format(float(memory.replace("G", "")) * 1024.0)

        return status, cpus, elapsed, memory

    def run_grid_command(self, command):
        """ Run the grid command and check for errors """

        error = None
        try:
            logging.debug("Running grid command: %s", " ".join(command))
            stdout = subprocess.check_output(command, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as err:
            error = err.output
            stdout = error or "error"

        timeout_error = False
        if error and "error" in error and "Socket timed out on send/recv operation" in error:
            # check for a socket timeout error
            timeout_error = True

        return stdout, timeout_error

    def run_grid_command_resubmit(self, command):
        """ Run this grid command, check for error, resubmit if needed """

        # run the grid command
        stdout, timeout_error = self.run_grid_command(command)

        # retry if timeout error present after wait
        resubmissions = 0
        if timeout_error and resubmissions < self.timeout_retry_max:
            resubmissions += 1
            # wait before retrying
            logging.warning("Unable to run grid command, waiting and retrying")
            time.sleep(self.timeout_sleep)

            stdout, timeout_error = self.run_grid_command(command)

        return stdout

    @staticmethod
    def job_submission_failed(jobid):
        """ Check if the job failed in submission and did not get an id """
        return True if not jobid.isdigit() else False

    def submit_job(self, grid_script):
        """ Submit the grid jobs and return the grid job id """

        # lock so only one task submits jobs to the queue at a time
        self.lock_submit.acquire()

        # submit the job and get the grid id
        logging.debug("Submitting job to grid")
        stdout = self.run_grid_command_resubmit(self.submit_command(grid_script))

        try:
            # search for the decimal job id at any location in stdout
            jobid = re.findall(r'\d+', stdout)[0]
        except IndexError:
            jobid = "error"

        # check the jobid for a submission failure
        if self.job_submission_failed(jobid):
            logging.error("Unable to submit job to queue: " + stdout)

        # pause for the scheduler
        time.sleep(self.submit_sleep)

        self.lock_submit.release()

        return jobid

    def create_grid_script(self, partition, cpus, minutes, memory, command, taskid, dir):
        """ Create a grid script from the template, also creating temp stdout and stderr files """

        # create temp files for stdout, stderr, and return code
        handle_out, out_file = tempfile.mkstemp(suffix=".out", prefix="task_" + str(taskid) + "_", dir=dir)
        os.close(handle_out)
        handle_err, error_file = tempfile.mkstemp(suffix=".err", prefix="task_" + str(taskid) + "_", dir=dir)
        os.close(handle_err)
        handle_rc, rc_file = tempfile.mkstemp(suffix=".rc", prefix="task_" + str(taskid) + "_", dir=dir)
        os.close(handle_rc)

        # add the remaining sections to the bash template
        bash_template = string.Template("\n".join(["#!/bin/bash "] + self.submit_template() +
            ["", "${command}", "${rc_command}"]))

        # convert the minutes to the time string "HH:MM:00"
        hours, remaining_minutes = divmod(minutes, 60)
        time = "{:02d}:{:02d}:00".format(hours, remaining_minutes)

        bash = bash_template.substitute(partition=partition, cpus=cpus, time=time,
            memory=memory, command=command, output=out_file, error=error_file,
            rc_command="export RC=$? ; echo $RC > " + rc_file + " ; bash -c 'exit $RC'")
        file_handle, new_file = tempfile.mkstemp(suffix=".bash", prefix="task_" + str(taskid) + "_", dir=dir)
        os.write(file_handle, bash)
        os.close(file_handle)

        return new_file, out_file, error_file, rc_file
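# Illustrative sketch (not part of the module): partition strings of the form
# "short,long,cutoff" are split in __init__, and get_partition() routes a task
# by its requested time in minutes; the partition names and cutoff below are
# hypothetical. The walltime written into the grid script is derived from the
# requested minutes, e.g. divmod(150, 60) -> (2, 30) -> "02:30:00".
#
# >>> queue = GridQueue("serial_requeue,general,720")
# >>> queue.get_partition(60, None)    # 60 minutes is under the 720-minute cutoff
# 'serial_requeue'
# >>> queue.get_partition(1440, None)  # over the cutoff, use the long partition
# 'general'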
class GridWorker(threading.Thread):
    """ Base Grid Worker class """

    def __init__(self, work_q, result_q, lock, reporter):
        super(GridWorker, self).__init__()
        self.daemon = True
        self.logger = runners.logger
        self.work_q = work_q
        self.result_q = result_q
        self.lock = lock
        self.reporter = reporter

    @staticmethod
    def appropriate_q_class(*args, **kwargs):
        return six.moves.queue.Queue(*args, **kwargs)

    @staticmethod
    def appropriate_lock():
        return threading.Lock()
    def run(self):
        return runners.worker_run_loop(self.work_q, self.result_q,
                                       self.run_task_by_type, self.reporter,
                                       self.lock)
    @classmethod
    def run_task_by_type(cls, task, extra):
        # if any of the tasks are a function, then use pickle interface
        if list(filter(six.callable, task.actions)):
            return cls.run_task_function(task, extra)
        else:
            return cls.run_task_command(task, extra)

    @classmethod
    def run_task_function(cls, task, extra):
        (perf, tmpdir, grid_queue, reporter) = extra

        # create a script to run the python function
        pickle_script = picklerunner.PickleScript(task, tmpdir, "task_" + str(task.task_no))
        pickle_task = pickle_script.create_task()

        # run the task as a command
        result = cls.run_task_command(pickle_task, extra)

        # decode the result
        result = pickle_script.result(result)

        return result

    @classmethod
    def run_task_command(cls, task, extra):
        (perf, tmpdir, grid_queue, reporter) = extra

        # report the task has started
        reporter.task_running(task.task_no)

        # create a script and stdout/stderr files for this task
        commands = "\n".join(task.actions)
        logging.info("Running commands for task id %s:\n%s", task.task_no, commands)

        resubmission = 0
        cores, time, memory, partition = perf.cores, perf.time, perf.mem, perf.partition

        jobid, out_file, error_file, rc_file = cls.submit_grid_job(cores, time, memory,
            partition, tmpdir, commands, task, grid_queue, reporter)

        # monitor job if submission was successful
        result, job_final_status = cls.check_submission_then_monitor_grid_job(grid_queue,
            task, jobid, out_file, error_file, rc_file, reporter)

        # if a timeout or memory max, resubmit at most three times
        while (grid_queue.job_timeout(job_final_status, jobid, time) or
               grid_queue.job_memkill(job_final_status, jobid, memory)) and resubmission < 3:
            resubmission += 1
            # increase the memory or the time
            if grid_queue.job_timeout(job_final_status, jobid, time):
                time = "({})*2".format(time) if isinstance(time, str) else time * 2
                logging.info("Resubmission number %s of grid job for task id %s with 2x more time: %s minutes",
                    resubmission, task.task_no, time)
                reporter.task_grid_status(task.task_no, jobid, "Resubmitting due to time out")
            elif grid_queue.job_memkill(job_final_status, jobid, memory):
                memory = "({})*2".format(memory) if isinstance(memory, str) else memory * 2
                logging.info("Resubmission number %s of grid job for task id %s with 2x more memory: %s MB",
                    resubmission, task.task_no, memory)
                reporter.task_grid_status(task.task_no, jobid, "Resubmitting due to max memory")

            jobid, out_file, error_file, rc_file = cls.submit_grid_job(cores, time, memory,
                partition, tmpdir, commands, task, grid_queue, reporter)

            # monitor job if submission was successful
            result, job_final_status = cls.check_submission_then_monitor_grid_job(grid_queue,
                task, jobid, out_file, error_file, rc_file, reporter)

        # get the benchmarking data if the job was submitted
        if not grid_queue.job_submission_failed(jobid):
            grid_queue.record_benchmark(jobid, task.task_no, reporter)

        return result

    @classmethod
    def submit_grid_job(cls, cores, time, memory, partition, tmpdir, commands, task, grid_queue, reporter):

        # evaluate the time/memory requests for the job
        time, memory = cls.evaluate_resource_requests(time, memory)

        # get the partition for the task
        current_partition = grid_queue.get_partition(time, partition)

        # create the grid bash script
        grid_script, out_file, error_file, rc_file = grid_queue.create_grid_script(current_partition,
            cores, time, memory, commands, task.task_no, tmpdir)

        logging.info("Created grid files for task id %s: %s, %s, %s, %s",
            task.task_no, grid_script, out_file, error_file, rc_file)

        # submit the job
        jobid = grid_queue.submit_job(grid_script)

        logging.info("Submitted job for task id %s: grid id %s", task.task_no, jobid)

        if not grid_queue.job_submission_failed(jobid):
            reporter.task_grid_status(task.task_no, jobid, "Submitted")

        return jobid, out_file, error_file, rc_file
    @staticmethod
    def log_grid_output(taskid, file, file_type):
        """ Write the grid stdout/stderr files to the log """

        try:
            lines = open(file).readlines()
        except EnvironmentError:
            lines = []

        logging.info("Grid %s from task id %s:\n%s", file_type, taskid, "".join(lines))
    @staticmethod
    def get_return_code(file):
        """ Read the return code from the file """

        try:
            line = open(file).readline().rstrip()
        except EnvironmentError:
            line = ""

        return line
    @staticmethod
    def evaluate_resource_requests(time, mem):
        """ Evaluate the time/memory requests for the grid job,
            allowing for ints or formulas """

        try:
            time = eval(str(time))
        except TypeError:
            raise TypeError("Unable to evaluate time request for task: " + time)

        try:
            mem = eval(str(mem))
        except TypeError:
            raise TypeError("Unable to evaluate memory request for task: " + mem)

        return time, mem
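    # Illustrative sketch (not part of the module): evaluate_resource_requests()
    # accepts plain integers or python expressions, so a hypothetical memory
    # request scaled by a 4-core allocation can be passed straight through.
    #
    # >>> GridWorker.evaluate_resource_requests(120, "4*2048")
    # (120, 8192)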
    @classmethod
    def check_submission_then_monitor_grid_job(cls, grid_queue, task, grid_jobid,
            out_file, error_file, rc_file, reporter):

        # monitor job if submission was successful
        if not grid_queue.job_submission_failed(grid_jobid):
            result, job_final_status = cls.monitor_grid_job(grid_queue, task, grid_jobid,
                out_file, error_file, rc_file, reporter)
        else:
            job_final_status = "SUBMIT FAILED"
            # get the anadama task result
            result = runners._get_task_result(task)
            # add the extra error
            result = result._replace(error=str(result.error) + "Unable to submit job to queue.")

        return result, job_final_status

    @classmethod
    def monitor_grid_job(cls, grid_queue, task, grid_jobid, out_file, error_file, rc_file, reporter):

        # poll to check for status
        grid_job_status = None
        for tries in itertools.count(1):
            # only check status at intervals
            time.sleep(grid_queue.check_job_rate)

            # check the queue stats
            grid_job_status = grid_queue.get_job_status(grid_jobid)
            reporter.task_grid_status_polling(task.task_no, grid_jobid, grid_job_status)

            logging.info("Status for job id %s with grid id %s is %s", task.task_no,
                grid_jobid, grid_job_status)

            if grid_queue.job_stopped(grid_job_status):
                logging.info("Grid status for job id %s shows it has stopped", task.task_no)
                break

            # check if the return code file is written
            if os.path.getsize(rc_file) > 0:
                logging.info("Return code file for job id %s shows it has stopped", task.task_no)
                break

        # check if a grid error is written to the output file
        grid_job_status = grid_queue.get_job_status_from_stderr(error_file, grid_job_status, grid_jobid)

        # write the stdout and stderr to the log
        cls.log_grid_output(task.task_no, out_file, "standard output")
        cls.log_grid_output(task.task_no, error_file, "standard error")
        cls.log_grid_output(task.task_no, rc_file, "return code")

        # check the return code
        extra_error = ""
        return_code = cls.get_return_code(rc_file)
        if return_code and not return_code == "0":
            extra_error = "\nReturn Code Error: " + return_code

        # check the queue status
        if grid_queue.job_failed(grid_job_status):
            extra_error += "\nGrid Status Error: " + grid_job_status

        # get the anadama task result
        result = runners._get_task_result(task)

        # add the extra error if found
        if extra_error:
            result = result._replace(error=str(result.error) + extra_error)

        return result, grid_job_status