/* -------------------------------------------------------------------------- */
/* Copyright 2002-2011, GridWay Project Leads (GridWay.org)                   */
/*                                                                            */
/* Licensed under the Apache License, Version 2.0 (the "License"); you may    */
/* not use this file except in compliance with the License. You may obtain    */
/* a copy of the License at                                                   */
/*                                                                            */
/* http://www.apache.org/licenses/LICENSE-2.0                                 */
/*                                                                            */
/* Unless required by applicable law or agreed to in writing, software        */
/* distributed under the License is distributed on an "AS IS" BASIS,          */
/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.   */
/* See the License for the specific language governing permissions and        */
/* limitations under the License.                                             */
/* -------------------------------------------------------------------------- */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <pwd.h>
#include <time.h>
#include <unistd.h>
#include <stdarg.h>
#include <errno.h>
#include <sys/stat.h>

#include "gw_job.h"
#include "gw_dm.h"
#include "gw_em.h"
#include "gw_em_mad.h"
#include "gw_common.h"
#include "gw_log.h"
#include "gw_user_pool.h"
#include "gw_host_pool.h"
#include "gw_file_parser.h"

/* Forward declaration of a file-local helper defined at the end of this file.
 * gw_job_recover() calls it when it replays a ZOMBIE state, to reload
 * job->exit_code from the wrapper's stdout file. Returns 0 on success and
 * -1 when no exit status could be obtained. */
static int gw_job_recover_exit_code(gw_job_t *job);

/* -------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------- */

/*
 * Rebuild the in-memory description of a job from the files kept in its
 * directory: job.template, job.conf, job.state and (optionally) job.history.
 *
 * The state file is replayed line by line so that states, history records
 * and statistics are reconstructed; the last transition read is then
 * re-triggered through gw_job_recover_last_state_transition().
 *
 * job: job structure to fill in; job->directory and job->id must already
 *      be set by the caller.
 *
 * Returns:
 *    1  job->directory does not exist or is not a directory
 *   -1  a recovery file is missing, empty or malformed, the owner could not
 *       be registered/resolved, or a state transition could not be replayed
 *    0  the job was recovered
 */
int gw_job_recover(gw_job_t *job)
{
    time_t timestamp;
    long   raw_time;   /* "%ld" target: time_t is not guaranteed to be long */

    int    user_id;
    int    rc;
    int    result;
    int    pinc, pstart;

    char   job_state_name[GW_MSG_STRING_SHORT];
    char   user_name[GW_MSG_STRING_SHORT];
    char   proxy_path[GW_MSG_STRING_LONG];
    char   job_home[GW_MSG_STRING_LONG];
    char   history_filename[GW_MSG_STRING_LONG];
    char   state_filename[GW_MSG_STRING_LONG];
    char   template_filename[GW_MSG_STRING_LONG];
    char   conf_filename[GW_MSG_STRING_LONG];

    FILE * file;
    FILE * state_file;
    FILE * history_file;

    gw_template_t  template;
    gw_job_state_t job_state;
    gw_job_state_t previous_job_state;

    struct passwd * pw_ent;
    struct stat     dir_stat;

    /*------------------------------------------------------------------------*/

    rc = stat(job->directory, &dir_stat);

    if ( rc || (!S_ISDIR(dir_stat.st_mode)) )
        return 1;

    /*------------------------------------------------------------------------*/

    gw_log_print("DM",'I', "Recovering job %d.\n", job->id);

    /* snprintf: job->directory has no known length bound, never overflow */
    snprintf(template_filename, sizeof(template_filename),
             "%s/job.template", job->directory);

    rc = gw_template_init(&template, template_filename);

    if (rc != 0)
    {
        gw_log_print("DM",'E',"Parse error, template file of job %d: %s.\n",
                job->id, template_filename);
        return -1;
    }

    gw_job_template_init(&(job->template), &template);

    /*------------------------------------------------------------------------*/

    snprintf(conf_filename, sizeof(conf_filename),
             "%s/job.conf", job->directory);

    file = fopen(conf_filename, "r");

    if (file == NULL)
    {
        gw_log_print("DM",'E',"Could not open configuration file of job %d: %s\n",
                job->id, conf_filename);
        return -1;
    }

    /* NOTE(review): the %s conversions are still unbounded because the
       buffer sizes are project macros; confirm the writer of job.conf
       cannot produce longer fields. */
    rc = fscanf(file, "%ld %s %s %s %i %i %i %i", &raw_time, user_name,
            proxy_path, job_home, &pstart, &pinc, &(job->fixed_priority),
            &(job->type_dep));

    /* Nothing else is read from job.conf: close it before any return */
    fclose(file);

    if (rc != 8)
    {
        gw_log_print("DM",'E',"Bad filed number (%d) in job %d configuration file.\n",
                     rc,
                     job->id);
        return -1;
    }

    timestamp = (time_t) raw_time;

    /* A lone "-" in the file stands for "no proxy path" */
    if (proxy_path[0] == '-' && proxy_path[1] == '\0')
        proxy_path[0] = '\0';

    /*------------------------------------------------------------------------*/

    if (gw_user_pool_exists(user_name, proxy_path, &user_id) == GW_FALSE)
    {
#ifdef GWJOBDEBUG
        gw_log_print("DM",'D',"Registering user %s with proxy path %s.\n", user_name, proxy_path);
#endif
        rc = gw_user_pool_user_allocate(user_name, proxy_path, &user_id);

        if ( rc != 0 )
        {
            gw_log_print("DM",'E',"Could not register user %s.\n",
                    user_name);
            return -1;
        }
    }

#ifdef GWJOBDEBUG
    gw_log_print("DM",'D',"User %s registered with UID %d.\n", user_name,
            user_id);
#endif

    /*------------------------------------------------------------------------*/

    job->start_time = timestamp;
    job->owner      = strdup(user_name);
    job->user_id    = user_id;
    job->pstart     = pstart;
    job->pinc       = pinc;

    pw_ent = getpwnam(user_name);

    if (pw_ent == NULL)
    {
        gw_log_print("DM",'E',"Could not get home for user %s.\n", user_name);
        return -1;
    }

    job->template.user_home = strdup(pw_ent->pw_dir);
    job->template.job_home  = strdup(job_home);

    /*------------------------------------------------------------------------*/

#ifdef GWJOBDEBUG
    gw_log_print("DM",'D',"Recovering state transitions of job %d.\n", job->id);
#endif

    snprintf(state_filename, sizeof(state_filename),
             "%s/job.state", job->directory);
    snprintf(history_filename, sizeof(history_filename),
             "%s/job.history", job->directory);

    state_file   = fopen(state_filename, "r");
    history_file = fopen(history_filename, "r");

    if (state_file == NULL)
    {
        gw_log_print("DM",'E',"Could not open state file of job %d: %s\n",
                     job->id,
                     state_filename);

        if (history_file != NULL)
            fclose(history_file);

        return -1;
    }

    /* If the history file does not exist, an error is generated only when a
       history record actually has to be read */

    /* Perform the state transitions again. GW_JOB_STATE_LIMIT marks
       "nothing read yet" for both variables. */
    previous_job_state = GW_JOB_STATE_LIMIT;
    job_state          = GW_JOB_STATE_LIMIT;
    result             = -1;

    while (fscanf(state_file, "%ld %s", &raw_time, job_state_name) == 2)
    {
        timestamp = (time_t) raw_time;

        if (previous_job_state == GW_JOB_STATE_LIMIT)
        {
            /* First record: the job is leaving its initial state */
            previous_job_state = GW_JOB_STATE_INIT;
        }
        else
        {
            previous_job_state = job_state;
        }

        job_state = gw_job_get_state_code(job_state_name);

        if (job_state == GW_JOB_STATE_ZOMBIE)
        {
            rc = gw_job_recover_exit_code(job);
            if (rc == 0)
                gw_log_print("DM",'I',"Exit code of job %d is %d.\n",job->id,job->exit_code);
            else
                gw_log_print("DM",'E',"Unable to find exit code.\n");
        }

        /* Re-construct job lifecycle (states, history & statistics) */

        rc = gw_job_recover_state_transition(job,
                                             previous_job_state,
                                             job_state,
                                             timestamp,
                                             history_file);
        if (rc == -1)
        {
            gw_log_print("DM",'E',
                    "Recovering state transition (%s->%s) of job %d. "
                    "Will not recover job.\n",
                    gw_job_get_state_name(previous_job_state),
                    gw_job_get_state_name(job_state), job->id);

            goto close_files;
        }
    }

    if (!feof(state_file))
    {
        gw_log_print("DM",'E',"Bad number of fields in job state file of job %d.\n",
                     job->id);

        goto close_files;
    }

    if (previous_job_state == GW_JOB_STATE_LIMIT)
    {
        /* No record at all: there is no last transition to re-trigger */
        gw_log_print("DM",'E',"State file of job %d is empty.\n", job->id);

        goto close_files;
    }

    result = 0;

close_files:

    fclose(state_file);

    if (history_file != NULL)
        fclose(history_file);

    if (result != 0)
        return -1;

    /*------------------------------------------------------------------------*/

#ifdef GWJOBDEBUG
    gw_log_print("DM",'D',"Recovering last state of job %d\n", job->id);
#endif

    gw_job_set_state(job, previous_job_state, GW_TRUE);

    rc = gw_job_recover_last_state_transition(job,
                                              previous_job_state,
                                              job_state,
                                              timestamp);
    if (rc == -1)
    {
        /* Logged only: the job is still counted and reported as recovered */
        gw_log_print("DM",'E',
                "Could not recover last state transition (%s->%s) of job %d.\n",
                gw_job_get_state_name(previous_job_state),
                gw_job_get_state_name(job_state), job->id);
    }

    /* ----- Update user stats ------- */

    gw_user_pool_inc_jobs(user_id,1);

    return 0;
}

/* -------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------- */

/*
 * Replay one state transition read from the job state file, updating the
 * statistics of the job's history records.
 *
 * The exit time of the phase being left (previous_job_state) is recorded
 * first, then the start time of the phase being entered (job_state). When
 * entering PROLOG or MIGR_CANCEL a new history record is read from
 * history_file and pushed on job->history.
 *
 * job:                job being recovered
 * previous_job_state: state the job leaves
 * job_state:          state the job enters
 * timestamp:          time of the transition, as stored in the state file
 * history_file:       open job.history stream, or NULL if it does not exist
 *
 * Returns 0 on success and -1 when the transition is invalid, a needed
 * history record cannot be loaded, or the transition refers to a history
 * record that was never created (out-of-order or corrupted state file).
 */
int gw_job_recover_state_transition(gw_job_t *job,
        gw_job_state_t previous_job_state, gw_job_state_t job_state,
        time_t timestamp, FILE *history_file)
{
    int rc;

    /* ----- Close the phase the job is leaving ----- */

    switch (previous_job_state)
    {
    case GW_JOB_STATE_PROLOG:
    case GW_JOB_STATE_MIGR_PROLOG:
        if (job->history == NULL)
            goto no_history;

        job->history->stats[PROLOG_EXIT_TIME] = timestamp;
        break;

    case GW_JOB_STATE_WRAPPER:
        if (job->history == NULL)
            goto no_history;

        job->history->stats[WRAPPER_EXIT_TIME] = timestamp;
        break;

    case GW_JOB_STATE_EPILOG:
    case GW_JOB_STATE_EPILOG_FAIL:
    case GW_JOB_STATE_EPILOG_RESTART:
    case GW_JOB_STATE_STOP_EPILOG:
    case GW_JOB_STATE_KILL_EPILOG:
        if (job->history == NULL)
            goto no_history;

        job->history->stats[EPILOG_EXIT_TIME] = timestamp;
        break;

    case GW_JOB_STATE_MIGR_EPILOG:
        /* The migration epilog is accounted on the record that follows the
           current one in the history list */
        if (job->history == NULL || job->history->next == NULL)
            goto no_history;

        job->history->next->stats[EPILOG_EXIT_TIME] = timestamp;
        job->history->next->stats[MIGRATION_EXIT_TIME] = timestamp;
        break;

    case GW_JOB_STATE_INIT:
    case GW_JOB_STATE_PENDING:
    case GW_JOB_STATE_HOLD:
    case GW_JOB_STATE_PRE_WRAPPER:
    case GW_JOB_STATE_STOP_CANCEL:
    case GW_JOB_STATE_STOPPED:
    case GW_JOB_STATE_KILL_CANCEL:
    case GW_JOB_STATE_MIGR_CANCEL:
    case GW_JOB_STATE_EPILOG_STD:
    case GW_JOB_STATE_FAILED:
    case GW_JOB_STATE_ZOMBIE:
        break;

    case GW_JOB_STATE_LIMIT:
        return -1;
    }

    /* ----- Open the phase the job is entering ----- */

    rc = 0;

    switch (job_state)
    {
    case GW_JOB_STATE_PROLOG:
#ifdef GWJOBDEBUG
        gw_log_print("DM",'D',"Recovering history record of job %d\n",
                job->id);
#endif
        rc = gw_job_recover_history_record(history_file, job);

        if (rc == 0)
        {
            job->history->stats[START_TIME]        = timestamp;
            job->history->stats[PROLOG_START_TIME] = timestamp;
        }
        break;

    case GW_JOB_STATE_MIGR_CANCEL:
#ifdef GWJOBDEBUG
        gw_log_print("DM",'D',"Recovering history record of job %d.\n",
                     job->id);
#endif
        rc = gw_job_recover_history_record(history_file, job);
        break;

    case GW_JOB_STATE_MIGR_PROLOG:
        if (job->history == NULL)
            goto no_history;

        job->history->stats[START_TIME] = timestamp;
        job->history->stats[PROLOG_START_TIME] = timestamp;
        break;

    case GW_JOB_STATE_MIGR_EPILOG:
        if (job->history == NULL || job->history->next == NULL)
            goto no_history;

        job->history->next->stats[EPILOG_START_TIME] = timestamp;
        break;

    case GW_JOB_STATE_WRAPPER:
        if (job->history == NULL)
            goto no_history;

        job->history->stats[WRAPPER_START_TIME] = timestamp;
        break;

    case GW_JOB_STATE_EPILOG_STD:
    case GW_JOB_STATE_KILL_EPILOG:
    case GW_JOB_STATE_STOP_EPILOG:
    case GW_JOB_STATE_EPILOG_RESTART:
    case GW_JOB_STATE_EPILOG_FAIL:
        if (job->history == NULL)
            goto no_history;

        job->history->stats[EPILOG_START_TIME] = timestamp;
        break;

    case GW_JOB_STATE_EPILOG:
        /* The epilog start is stamped only when coming from WRAPPER */
        if ( previous_job_state == GW_JOB_STATE_WRAPPER )
        {
            if (job->history == NULL)
                goto no_history;

            job->history->stats[EPILOG_START_TIME] = timestamp;
        }
        break;

    case GW_JOB_STATE_PENDING:
    case GW_JOB_STATE_HOLD:
    case GW_JOB_STATE_PRE_WRAPPER:
    case GW_JOB_STATE_STOP_CANCEL:
    case GW_JOB_STATE_KILL_CANCEL:
        break;

    case GW_JOB_STATE_STOPPED:
    case GW_JOB_STATE_FAILED:
        /* NOTE(review): assumes a job may reach a final state without ever
           having been dispatched (no history record) - confirm. In that
           case only the job-level exit time is recorded. */
        if (job->history != NULL)
            job->history->stats[EXIT_TIME] = timestamp;

        job->exit_time = timestamp;
        break;

    case GW_JOB_STATE_ZOMBIE:
        if (job->history != NULL)
        {
            job->history->stats[EXIT_TIME] = timestamp;
            job->history->reason = GW_REASON_NONE;
        }

        job->exit_time = timestamp;
        break;

    case GW_JOB_STATE_INIT:
    case GW_JOB_STATE_LIMIT:
        /* Neither is a state a job can transition into */
        rc = -1;
        break;
    }

    return rc;

no_history:

    gw_log_print("DM",'E',"Missing history record for job %d.\n", job->id);

    return -1;
}

/* -------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------- */

/*
 * Re-trigger the last state transition of a recovered job so that the
 * dispatch manager resumes it where it was left.
 *
 * Depending on job_state this sets the job state directly, re-submits the
 * job to the scheduler, re-attaches to the execution MAD, or fires the
 * matching "GW_DM_STATE_*" action. User and host usage counters are
 * incremented for states in which the job holds resources.
 *
 * A heap-allocated copy of the job id is passed to gw_am_trigger() as the
 * action argument; it is freed here on every path that does not hand it over.
 *
 * job:                job being recovered
 * previous_job_state: state before the last transition (currently unused)
 * job_state:          last state recorded for the job
 * timestamp:          time of the last transition (currently unused)
 *
 * Returns 0 on success, -1 if job_state is INIT/LIMIT or memory for the
 * action argument could not be allocated.
 */
int gw_job_recover_last_state_transition(gw_job_t *job,
        gw_job_state_t previous_job_state, gw_job_state_t job_state,
        time_t timestamp)
{
    int *id;
    int rc;

    id = malloc(sizeof *id);

    if (id == NULL)
    {
        gw_log_print("DM",'E',"Could not allocate memory to recover job %d.\n",
                     job->id);
        return -1;
    }

    *id = job->id;

    switch(job_state)
    {
    case GW_JOB_STATE_PENDING:
        gw_dm_mad_job_schedule(&gw_dm.dm_mad[0],
                               *id,
                               -1,
                               job->user_id,
                               GW_REASON_NONE);
        /* fallthrough: after notifying the scheduler, PENDING is handled
           like the other states that only need their state restored */

    case GW_JOB_STATE_HOLD:
    case GW_JOB_STATE_STOPPED:
    case GW_JOB_STATE_FAILED:
    case GW_JOB_STATE_ZOMBIE:
        free(id);
        gw_job_set_state(job, job_state, GW_TRUE);
        break;

    case GW_JOB_STATE_PROLOG:
        gw_user_pool_inc_running_jobs(job->user_id, 1);

        gw_host_inc_slots_nb(job->history->host, job->template.np, job->history->queue);

        gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_PROLOG", (void *)id);
        break;

    case GW_JOB_STATE_WRAPPER:
        gw_user_pool_inc_running_jobs(job->user_id, 1);

        gw_host_inc_slots_nb(job->history->host, job->template.np,job->history->queue);

        gw_job_set_state(job, GW_JOB_STATE_WRAPPER, GW_TRUE);

        gw_log_print("DM",'I',"Recovering GRAM contact for job %d.\n", job->id);

        rc = gw_job_recover_job_contact(job);

        if (rc == -1)
        {
            /* The contact is lost: send the job back to PENDING.
               NOTE(review): the user/host counters incremented above are
               not reverted here - confirm the PENDING action accounts
               for that. */
            gw_job_set_state(job, GW_JOB_STATE_PENDING, GW_TRUE);

            gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_PENDING", (void *)id);
        }
        else
        {
            /* No action triggered: the id copy is not handed over */
            free(id);
        }
        break;

    case GW_JOB_STATE_EPILOG:
        gw_user_pool_inc_running_jobs(job->user_id, 1);

        gw_host_inc_rjobs_nb(job->history->host,job->history->queue);

        gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_EPILOG", (void *)id);
        break;

    case GW_JOB_STATE_EPILOG_FAIL:
        gw_user_pool_inc_running_jobs(job->user_id, 1);

        gw_host_inc_rjobs_nb(job->history->host,job->history->queue);

        gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_EPILOG_FAIL", (void *)id);
        break;

    case GW_JOB_STATE_EPILOG_RESTART:
        gw_user_pool_inc_running_jobs(job->user_id, 1);

        gw_host_inc_rjobs_nb(job->history->host,job->history->queue);

        gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_EPILOG_RESTART", (void *)id);
        break;

    case GW_JOB_STATE_EPILOG_STD:
        gw_user_pool_inc_running_jobs(job->user_id, 1);

        gw_host_inc_rjobs_nb(job->history->host,job->history->queue);

        gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_EPILOG_STD", (void *)id);
        break;

    case GW_JOB_STATE_KILL_CANCEL:
        gw_job_set_state(job, job_state, GW_TRUE);
        /* fallthrough: continues to KILL_EPILOG */

    case GW_JOB_STATE_KILL_EPILOG:
        gw_user_pool_inc_running_jobs(job->user_id, 1);

        gw_host_inc_rjobs_nb(job->history->host,job->history->queue);

        gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_KILL_EPILOG", (void *)id);
        break;

    case GW_JOB_STATE_MIGR_CANCEL:
        gw_job_set_state(job, job_state, GW_TRUE);
        /* fallthrough: continues to MIGR_PROLOG */

    case GW_JOB_STATE_MIGR_PROLOG:
        gw_user_pool_inc_running_jobs(job->user_id, 1);

        gw_host_inc_slots_nb(job->history->host, job->template.np,job->history->queue);

        gw_host_inc_rjobs_nb(job->history->next->host,job->history->next->queue);

        gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_MIGR_PROLOG", (void *)id);
        break;

    case GW_JOB_STATE_MIGR_EPILOG:
        gw_user_pool_inc_running_jobs(job->user_id, 1);

        gw_host_inc_slots_nb(job->history->host, job->template.np,job->history->queue);

        gw_host_inc_rjobs_nb(job->history->next->host,job->history->next->queue);

        gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_MIGR_EPILOG", (void *)id);
        break;

    case GW_JOB_STATE_PRE_WRAPPER:
        gw_user_pool_inc_running_jobs(job->user_id, 1);

        gw_host_inc_slots_nb(job->history->host, job->template.np,job->history->queue);

        gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_PRE_WRAPPER", (void *)id);
        break;

    case GW_JOB_STATE_STOP_CANCEL:
        gw_job_set_state(job, job_state, GW_TRUE);
        /* fallthrough: continues to STOP_EPILOG */

    case GW_JOB_STATE_STOP_EPILOG:
        gw_user_pool_inc_running_jobs(job->user_id, 1);

        gw_host_inc_rjobs_nb(job->history->host,job->history->queue);

        gw_am_trigger(&(gw_dm.am), "GW_DM_STATE_STOP_EPILOG", (void *)id);
        break;

    case GW_JOB_STATE_INIT:
    case GW_JOB_STATE_LIMIT:
        /* Not recoverable states; the id copy was never handed over */
        free(id);
        return -1;
    }

    return 0;
}

/* -------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------- */

/*
 * Read the next record from the job history file and push it on the job's
 * history list.
 *
 * A record is nine whitespace-separated fields: host name, rank, queue,
 * fork name, LRMS name, LRMS type, and the EM, TM and IM MAD names. If the
 * host is not yet in the host pool it is registered first.
 *
 * history_file: open job.history stream positioned at the next record, or
 *               NULL when the file does not exist
 * job:          job being recovered
 *
 * Returns 0 on success, -1 if history_file is NULL, the record is
 * malformed, the host cannot be registered, or the record cannot be added.
 */
int gw_job_recover_history_record(FILE *history_file, gw_job_t *job)
{
    int rc;
    char hostname[GW_MSG_STRING_HOST], queue_name[GW_MSG_STRING_SHORT];
    char fork_name[GW_MSG_STRING_SHORT], lrms_name[GW_MSG_STRING_SHORT], lrms_type[GW_MSG_STRING_SHORT];
    int rank, priority;
    char em_mad_name[GW_MSG_STRING_SHORT], tm_mad_name[GW_MSG_STRING_SHORT], im_mad_name[GW_MSG_STRING_SHORT];
    int host_id;
    gw_host_t *host;

    if (history_file == NULL)
        return -1;

    /* NOTE(review): the %s conversions are unbounded because the buffer
       sizes are project macros; confirm the writer of job.history cannot
       produce longer fields. */
    rc = fscanf(history_file, "%s %d %s %s %s %s %s %s %s", hostname,
            &rank, queue_name, fork_name, lrms_name, lrms_type,
            em_mad_name, tm_mad_name, im_mad_name);

    if (rc != 9)
    {
        gw_log_print("DM",'E',"Wrong field number (%d) in history record of job %d.\n",
                     rc,
                     job->id);
        return -1;
    }

    host = gw_host_pool_search(hostname, GW_FALSE);

    if (host == NULL)
    {
#ifdef GWJOBDEBUG
        gw_log_print("DM",'D',"Registering host %s.\n", hostname);
#endif

        priority = gw_sch_get_host_priority(&(gw_conf.sch_conf),
                                            hostname,
                                            im_mad_name);

        host_id = gw_host_pool_host_allocate(hostname,
                                             priority,
                                             em_mad_name,
                                             tm_mad_name,
                                             im_mad_name);

        /* Presumably a negative id signals an allocation failure - confirm
           against gw_host_pool_host_allocate(); either way it is not looked
           up in the pool. */
        if (host_id >= 0)
            host = gw_host_pool_get_host(host_id, GW_FALSE);

        if (host == NULL)
        {
            gw_log_print("DM",'E',"Could not register host %s for job %d.\n",
                         hostname,
                         job->id);
            return -1;
        }
    }

#ifdef GWJOBDEBUG
    gw_log_print("DM",'D',"Host %s registered with HID %d.\n", hostname,
            host->host_id);
#endif

    /* Needed to generate the RSL.
       NOTE(review): a previous host->lrms_type value is overwritten without
       being released; confirm who owns it before adding a free(). */
    host->lrms_type = strdup(lrms_type);

    /*------------------------------------------------------------------------*/

    if (job->history != NULL) /* Not the first record */
        job->restarted++;

    rc = gw_job_history_add(&(job->history),
                            host,
                            rank,
                            queue_name,
                            fork_name,
                            lrms_name,
                            lrms_type,
                            job->owner,
                            job->template.user_home,
                            job->id,
                            job->user_id,
                            GW_TRUE);

    if (rc == -1)
    {
        gw_log_print("DM",'E',"Could not add history record.\n");
        return -1;
    }

    return 0;
}

/* -------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------- */

/*
 * Read the job contact saved in <job directory>/job.contact and hand it to
 * the execution MAD of the job's current host so it can re-attach to the
 * running job.
 *
 * job: job being recovered; job->history must point to the current record.
 *
 * Returns 0 when the contact was read and passed to gw_em_mad_recover(),
 * -1 when the contact file cannot be opened or holds no contact.
 */
int gw_job_recover_job_contact(gw_job_t *job)
{
    char filename[GW_MSG_STRING_LONG];
    char job_contact[GW_MSG_STRING_LONG];
    char format[32];
    FILE *file;
    int rc;
    gw_em_mad_t *em_mad;

    snprintf(filename, sizeof(filename), "%s/job.contact", job->directory);

    file = fopen(filename, "r");

    if (file == NULL)
    {
        gw_log_print("DM",'E',"Could not open GRAM contact file of job %d.\n",
                     job->id);
        return -1;
    }

    /* Build "%<N>s" with N = buffer size - 1 so the read cannot overflow */
    snprintf(format, sizeof(format), "%%%lus",
             (unsigned long)(sizeof(job_contact) - 1));

    rc = fscanf(file, format, job_contact);

    /* Only the first token is needed: close before any return */
    fclose(file);

    if (rc != 1)
    {
        gw_log_print("DM",'E',"Could not read GRAM contact of job %d.\n",
                     job->id);
        return -1;
    }

#ifdef GWJOBDEBUG
    gw_log_print("DM",'D',"Job contact for job %d is %s.\n",
                 job->id,
                 job_contact);
#endif
    em_mad = gw_user_pool_get_em_mad(job->user_id,
                                     job->history->host->em_mad);

    gw_em_mad_recover(em_mad, job->id, job_contact);

    return 0;
}

/* -------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------- */

/*
 * Return the job contact stored in <job directory>/job.contact.
 *
 * job: job whose contact file is read.
 *
 * Returns a newly allocated copy of the first whitespace-delimited token
 * of the file, which the caller must free(), or NULL when the file cannot
 * be opened, holds no token, or the copy cannot be allocated.
 */
char * gw_job_recover_get_contact(gw_job_t *job)
{
    char filename[GW_MSG_STRING_LONG];
    char job_contact[GW_MSG_STRING_LONG];
    char format[32];
    FILE *file;
    char *jc = NULL;

    snprintf(filename, sizeof(filename), "%s/job.contact", job->directory);

    file = fopen(filename, "r");

    if (file == NULL)
        return NULL;

    /* Build "%<N>s" with N = buffer size - 1 so the read cannot overflow */
    snprintf(format, sizeof(format), "%%%lus",
             (unsigned long)(sizeof(job_contact) - 1));

    if (fscanf(file, format, job_contact) == 1)
        jc = strdup(job_contact);

    fclose(file);

    return jc;
}
 

/* -------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------- */

/*
 * Reload job->exit_code from the wrapper output of the job's latest run,
 * i.e. the file <job directory>/stdout.wrapper.<job->restarted>.
 *
 * The EXIT_STATUS entry is extracted with gw_parse_file(). When its value
 * starts with 'S' or 'P', job->exit_code is left untouched; otherwise the
 * value is converted with atoi() and stored.
 *
 * Returns 0 if an exit status entry was found, -1 otherwise.
 */
int gw_job_recover_exit_code(gw_job_t *job)
{
    char   wrapper_out[512];
    char * status;
    int    parse_rc;

    snprintf(wrapper_out, sizeof(wrapper_out), "%s/stdout.wrapper.%i",
             job->directory, job->restarted);

    parse_rc = gw_parse_file(wrapper_out, EXIT_STATUS, &status);

    /* status is only inspected once the parser has reported success */
    if ( ( parse_rc == -1 ) || ( status == NULL ) )
        return -1;

    if ( ( status[0] != 'S' ) && ( status[0] != 'P' ) )
        job->exit_code = atoi(status);

    free(status);

    return 0;
}
