"""
    This Module accepts the absolute file name as input argument.
    File Format is supposed to be CSV with the predefined set of columns

        1. Performs invalid data filtration

        2. Cleans the data where ever required

        3. Load Provider and Patient related dimension tables followed by Encounter information

"""
__authour__ = 'Manjunath Karanam'
import pandas as pd
from dateutil.parser import parse
from sqlalchemy import create_engine
import os
import sys
import logging
import mysql.connector

# logging.basicConfig opens the log file immediately and fails when the parent
# directory is missing, so make sure ./Logs exists before configuring logging
# (create_folder_structure is only invoked later, from the __main__ section).
os.makedirs('./Logs', exist_ok=True)
logging.basicConfig(filename='./Logs/apixio_encounter.log', level=logging.INFO, filemode='a',
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logging.info('*' * 20 + " Program Begins !!! " + '*' * 20)
logging.info("Imported Required Libraries and initialized logging system ")

"""
    Prepares the DB required prelims

        1. Prepare the connection string to the mysql database

        2. Define queries to fetch data from the DB
            
"""

# SQLAlchemy engine for the MySQL database on localhost, reached through the pymysql driver.
# NOTE(review): the credentials are hard-coded here - consider moving them out of the source.
_db_settings = {"user": "root", "pw": "apixio", "db": "apixio"}
engine = create_engine("mysql+pymysql://{user}:{pw}@localhost/{db}".format(**_db_settings))

# Provider lookup: Provider_IK, Provider_Npi and the name aliased as Provider_Name_Tgt
prvdr_query = """
select Provider_IK, Provider_Name as Provider_Name_Tgt,Provider_Npi from Provider_Dim order by Provider_Npi
"""

# Patient lookup: patient_IK plus name and dob aliased as Patient_Name_Tgt / dob_Tgt
ptnt_query = """
select patient_IK, Patient_Name as Patient_Name_Tgt, dob as dob_Tgt from Patient_Dim order by 1, 2
"""


def datefmtx(x):
    """
        Normalise a date-like value to a '%Y-%m-%d' string

            1. The value is converted to text and the misspelling 'Apirl' is corrected to 'April'

            2. The corrected text is handed to dateutil's parse

            3. The parsed date is returned formatted as '%Y-%m-%d' E.g., 1900-12-31

    """
    # Fix the known misspelling before parsing
    corrected_text = str(x).replace('Apirl', 'April')

    # Let dateutil work out the input layout, then emit the fixed output layout
    parsed_date = parse(corrected_text)
    return parsed_date.strftime('%Y-%m-%d')


def extract_n_transform(in_filename):
    """ Reads the File and Transform the Data. Accepts File name as argument and returns a cleansed DataFrame

            1. Drop the bad data (records with a null dob, encounter date, provider npi or patient)

            2. Bad records are written to ./Error/<file name up to the first dot>_encntr.err

            3. Rename the column names in DataFrame as per the Naming convention declared and designed in DB

            4. Format dob and Encounter_Date with datefmtx to '%Y-%m-%d' E.g., 1900-12-31

            5. Add a File_Name column holding the base name of the input file

            6. Change the Patient Name to Title case

        The error file name and the File_Name value are derived from in_filename itself, so the
        function does not depend on names that are only defined when the module runs as a script.

        :param in_filename: path of the CSV file to process
        :return: the cleansed DataFrame
        :raises SystemExit: when in_filename does not point to an existing file
        :raises Exception: any processing failure is re-raised after the input file has been
                           moved to the ./Error folder

    """

    logging.info("-" * 20 + ' Extract_n_Transform : in Progress ' + "-" * 20)

    # Base name and stem of the input file, used for File_Name and the error file name
    src_file = os.path.basename(in_filename)
    src_stem = src_file.partition('.')[0]
    err_path = './Error/' + src_stem + '_encntr.err'

    # A missing input is a usage problem: report it and stop before doing any work
    if not os.path.isfile(in_filename):
        logging.error(" Given File doesnt exists please check the path and rerun the script. ")
        print(" Extract_n_Transform : Given File doesnt exists please check the path and rerun the script. ")
        print(" Extract_n_Transform : Please provide absolute path : Keep it in the Designated project Data folder")
        sys.exit(1)

    # Read and clean CSV
    try:
        df = pd.read_csv(in_filename)
        logging.info(" File read successfully to dataframe")

        # Rows with a null in any of the logical Not Null columns are treated as errors
        not_null_cols = ['dob', 'encounter date', 'provider npi', 'patient']
        bad_rows = df[not_null_cols].isnull().any(axis=1)
        df_err = df[bad_rows]
        # Copy so the column assignments below act on an independent frame, not a slice of the raw data
        df = df[~bad_rows].copy()
        logging.info(" Null value check for not null columns are evaluated")

        # Write data issues to a file in Error directory
        df_err.to_csv(err_path, index=False)
        logging.info(" Error records in file are stored as another csv file and placed in Error folder ")
        logging.info(" File location : " + err_path)

        # Rename Columns to match our Naming Convention
        df = df.rename(columns={'provider': 'Provider_Name', 'provider npi': 'Provider_Npi', "patient": "Patient_Name",
                                'encounter date': 'Encounter_Date', 'encounter note': 'Encounter_Note',
                                'chief complaint': 'Chief_Complaint'})
        logging.info(" Column names renamed as per internal Naming convention")

        # Apply Date Format on dob and encounter date columns
        df['dob'] = df['dob'].apply(datefmtx)
        df['Encounter_Date'] = df['Encounter_Date'].apply(datefmtx)
        df['File_Name'] = src_file

        # Apply Title case to Patient name (Provider_Name is left as is; it may still hold nulls here)
        df['Patient_Name'] = df['Patient_Name'].apply(str.title)
        logging.info(" Applied minimum required transformations to the Data ")
        logging.info("-" * 20 + ' Extract_n_Transform : Completed ' + "-" * 20)

    except Exception:
        # Move the failing input out of the way, then let the caller see the original failure
        logging.exception(" Issues while processing the file ")
        try:
            os.rename(in_filename, "./Error/" + src_file)
            logging.info(" File is moved to the Error folder: " + "./Error/" + src_file)
        except OSError:
            # Do not let a failed move hide the processing error that is re-raised below
            logging.exception(" File could not be moved to the Error folder ")
        raise

    return df


def prvdr_dim_load(df):
    """
    Load providers from the cleansed DataFrame that are not yet present in the Provider dimension table

        1. Take the distinct (Provider_Npi, Provider_Name) pairs, ordered by Provider_Npi

        2. Pairs with a null Provider_Name are written to ./Error/<file_nm>_prvdr.err and left out of the load

        3. Left join the remaining pairs to the rows returned by prvdr_query on Provider_Npi

        4. Append the pairs without a match (null Provider_Name_Tgt) to the Provider_DIM table in schema apixio

    Relies on the module level names engine, prvdr_query and file_nm.

    """
    logging.info("-" * 20 + ' Prvdr_Dim_Load : in Progress ' + "-" * 20)

    # Distinct provider pairs, ordered by NPI
    provider_cols = df[["Provider_Npi", "Provider_Name"]]
    distinct_providers = provider_cols.drop_duplicates()
    distinct_providers = distinct_providers.sort_values(['Provider_Npi'])
    logging.info(" Provider subset DataFrame is created ")

    # Split the pairs into rejected (no provider name) and load candidates
    name_missing = distinct_providers['Provider_Name'].isnull()
    rejected = distinct_providers[name_missing]
    candidates = distinct_providers[~name_missing]
    logging.info(" Provider Error records are collected to Dataframe ")

    # Rejected pairs go to the Error directory
    rejected.to_csv('./Error/' + file_nm + '_prvdr.err', index=False)
    logging.info(" Error Records are stored to a csv file in Error Directory ")
    logging.info(" File Name: ./Error/" + file_nm + '_prvdr.err')

    # Existence check: a candidate without a Provider_Name_Tgt after the left join found no match
    known_providers = pd.read_sql_query(prvdr_query, con=engine)
    joined = candidates.merge(known_providers, on="Provider_Npi", how="left")
    unmatched = joined['Provider_Name_Tgt'].isnull()
    # Keep only the first two columns (Provider_Npi, Provider_Name) for the insert
    to_insert = joined[unmatched].iloc[:, :2]
    logging.info(" Non Existing Records are ready to load to Provider_Dim table ")

    # Append the new providers to the dimension table
    to_insert.to_sql('Provider_DIM', engine, if_exists='append', schema='apixio', index=False)
    logging.info(" Provider_Dim table populated successfully ")
    logging.info("-" * 20 + ' Prvdr_Dim_Load : Completed  ' + "-" * 20)


def ptnt_dim_load(df):
    """
    Load patients from the cleansed DataFrame that are not yet present in the Patient dimension table

        1. Take the distinct (Patient_Name, dob) pairs, ordered by Patient_Name and dob

        2. Pairs whose dob equals '1900-12-31' are written to ./Error/<file_nm>_ptnt.err and left out of the load

        3. Left join the remaining pairs to the rows returned by ptnt_query on name and dob
           (the dob values read from the DB are passed through datefmtx first so both sides share one format)

        4. Append the pairs without a match (null Patient_Name_Tgt) to the Patient_DIM table in schema apixio

    Relies on the module level names engine, ptnt_query and file_nm.

    """
    logging.info("-" * 20 + ' Ptnt_Dim_Load : in Progress ' + "-" * 20)

    # Distinct patient pairs, ordered by name then dob
    patient_cols = df[["Patient_Name", "dob"]]
    distinct_patients = patient_cols.drop_duplicates()
    distinct_patients = distinct_patients.sort_values(by=["Patient_Name", 'dob'])
    logging.info(" Patient subset DataFrame is created ")

    # Split the pairs into rejected (dob of '1900-12-31') and load candidates
    rejected_dob = distinct_patients['dob'] == '1900-12-31'
    rejected = distinct_patients.loc[rejected_dob]
    candidates = distinct_patients.loc[~rejected_dob]
    logging.info(" Patient Error records are collected to DataFrame ")

    # Rejected pairs go to the Error directory
    rejected.to_csv('./Error/' + file_nm + '_ptnt.err', index=False)
    logging.info(" Error Records are stored to a csv file in Error Directory ")
    logging.info(" File Name: ./Error/" + file_nm + '_ptnt.err')

    # Existence check against the patients already stored in the DB
    known_patients = pd.read_sql_query(ptnt_query, con=engine)
    known_patients['dob_Tgt'] = known_patients['dob_Tgt'].apply(datefmtx)
    joined = candidates.merge(known_patients,
                              how="left",
                              left_on=["Patient_Name", "dob"],
                              right_on=["Patient_Name_Tgt", "dob_Tgt"])

    # A candidate without a Patient_Name_Tgt after the left join found no match
    unmatched = joined['Patient_Name_Tgt'].isnull()
    # Keep only the first two columns (Patient_Name, dob) for the insert
    to_insert = joined[unmatched].iloc[:, :2]
    logging.info(" Non Existing Records are ready to load to Patient_Dim table ")

    # Append the new patients to the dimension table
    to_insert.to_sql("Patient_DIM", engine, if_exists='append', schema='apixio', index=False)
    logging.info(" Patient_Dim table populated successfully ")
    logging.info("-" * 20 + ' Ptnt_Dim_Load : Completed ' + "-" * 20)


def encntr_info_load(df):

    """
    Load the encounter rows of the current file into apixio.Encounter_Info

        1. Delete the rows previously stored for the same File_name, so a rerun does not duplicate data

        2. Derive the Dim keys (patient_IK, Provider_IK) by joining df to the rows returned by
           ptnt_query and prvdr_query; encounters without a match in both lookups are dropped by the join

        3. Append the resulting rows to the Encounter_Info table in schema apixio

        4. Move the input file to the ./Processed directory

    Relies on the module level names engine, ptnt_query, prvdr_query, file and filename.

    :param df: cleansed DataFrame with Patient_Name, dob, Provider_Npi, Encounter_Date,
               Chief_Complaint, Encounter_Note and File_Name columns
    :raises mysql.connector.Error: when the delete of earlier rows fails (after a rollback)

    """

    logging.info("-" * 20 + ' Encntr_Info_Load : in Progress ' + "-" * 20)
    # establishing the connection
    conn = mysql.connector.connect(
        user='root', password='apixio', host='127.0.0.1', database='mysql')

    # try/finally guarantees the connection is closed even when a statement fails
    try:
        cursor = conn.cursor()

        # The file name is passed as a bound parameter so quotes in it cannot break or alter the SQL
        count_sql = "select count(*) from apixio.Encounter_Info where File_name = %s"
        delete_sql = "DELETE FROM apixio.Encounter_Info WHERE File_name = %s"
        sql_params = (file,)

        logging.info(" Check the existing count records for the same file in the table: ")
        cursor.execute(count_sql, sql_params)
        logging.info(cursor.fetchall())

        logging.info(" Prepare Delete Statements to delete the records if exists ")
        try:
            cursor.execute(delete_sql, sql_params)
            conn.commit()
        except mysql.connector.Error:
            # Undo the partial delete and stop: loading on top of stale rows would duplicate data
            conn.rollback()
            logging.exception(" Delete of existing records failed, changes rolled back ")
            raise

        logging.info(" Contents of the table after delete operation ")
        cursor.execute(count_sql, sql_params)
        logging.info(cursor.fetchall())
    finally:
        # Closing the connection
        conn.close()

    # Prepare Dim lookup dfs
    # Patient lookup; dob values from the DB go through datefmtx so they compare equal to df['dob']
    ext_ptnt_info = pd.read_sql_query(ptnt_query, con=engine)
    ext_ptnt_info['dob_Tgt'] = ext_ptnt_info['dob_Tgt'].apply(datefmtx)
    logging.info(" Get the patient table IKs to form one of the FKs of the row ")

    # Provider lookup
    ext_prvdr_info = pd.read_sql_query(prvdr_query, con=engine)
    logging.info(" Get the provider table IKs to form one of the FKs of the row ")

    # Merge all dimension tables (default inner joins: rows without a patient or provider match drop out)
    with_patient = df.merge(ext_ptnt_info, left_on=["Patient_Name", "dob"],
                            right_on=["Patient_Name_Tgt", "dob_Tgt"])
    enc_pt_prv_df = with_patient.merge(ext_prvdr_info, on="Provider_Npi")
    encntr_info = enc_pt_prv_df[
        ['patient_IK', 'Provider_IK', 'Encounter_Date', 'Chief_Complaint', 'Encounter_Note', 'File_Name']]
    logging.info(" Derive FKs ")

    # Insert the encounter rows together with their dimension keys
    encntr_info.to_sql("Encounter_Info", engine, schema='apixio', index=False, if_exists='append')
    logging.info(" Encounter Info table populated successfully ")

    os.rename(filename, "./Processed/" + file)
    logging.info(" File moved to Processed Directory: ./Processed/" + file)

    logging.info("-" * 20 + ' encntr_info_load : Completed ' + "-" * 20)


def create_folder_structure(directory):
    """ Create the project required folders if doesnt exists. Accepts the main project folder

        (Assuming Script is in project folder. Considering it as project home folder)

            1. Creates each of Data, Error, Processed, Hold and Logs under directory when it is missing

            2. A folder that already exists is skipped, so the remaining folders are still created

            3. Logs whether any folder had to be created

        :param directory: path of the project home folder

    """

    logging.info("-" * 20 + ' Create_Folder_Structure : In Progress ' + "-" * 20)

    created = []
    for folder in ('Data', 'Error', 'Processed', 'Hold', 'Logs'):
        # Handle each folder on its own: one existing folder must not stop the others being created
        try:
            os.mkdir(os.path.join(directory, folder))
        except FileExistsError:
            continue
        created.append(folder)

    if created:
        logging.info(" Project Folders Created!!")
    else:
        logging.info(" Project Folders already Exists!!")

    logging.info("-" * 20 + ' Create_Folder_Structure : Completed ' + "-" * 20)


def main(in_filename):
    """ Drive the whole load for one input file.

        Accepts the file name to be processed and loaded to the Data Warehouse or ODS

            1. Extract and cleanse the file with extract_n_transform

            2. Pass the cleansed DataFrame to prvdr_dim_load, ptnt_dim_load and encntr_info_load, in that order

    """

    logging.info("-" * 20 + ' Main Program : Started ' + "-" * 20)
    cleansed = extract_n_transform(in_filename)

    # Dimension loads run first, the encounter load last
    for load_step in (prvdr_dim_load, ptnt_dim_load, encntr_info_load):
        load_step(cleansed)

    logging.info("-" * 20 + ' Main Program : Completed ' + "-" * 20)


if __name__ == '__main__':

    # Stop with a usage hint instead of an IndexError when the file argument is missing
    if len(sys.argv) < 2:
        logging.error(" No input file name was given on the command line ")
        sys.exit("Usage: python " + os.path.basename(sys.argv[0]) + " <absolute path of the CSV file>")

    filename = sys.argv[1]

    # filename, file and file_nm are read as module level names by the load functions,
    # so they have to keep these exact names
    file = os.path.basename(filename)
    file_nm, _, file_ext = file.partition('.')

    # The current directory is treated as the project home folder
    cdw = os.path.curdir
    create_folder_structure(cdw)

    main(filename)
    logging.info('*' * 20 + " Program Ends !!! " + '*' * 20)
