# -*- coding: utf-8 -*-
"""
Created on 4th March 2022

@author: Team B IE University 2022
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
import holoviews as hv; hv.extension('bokeh', 'matplotlib')
import pandas as pd
from datetime import date
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
pd.options.mode.chained_assignment = None

from woe import WOE

pd.options.plotting.backend = 'holoviews'

class BrushingDataframe(pd.DataFrame):
    """
    The class is used to extend the properties of Dataframes to a particular
    type of Dataframes in the Risk Industry.
    It provides the end user with both general and specific cleaning functions, 
    though they never reference a specific VARIABLE NAME.
    
    It facilitates the End User to perform some Date Feature Engineering,
    Scaling, Encoding, etc. to avoid code repetition.
    """

    #Initializing the inherited pd.DataFrame
    def __init__(self, *args, **kwargs):
        """Create the frame by forwarding every argument to ``pd.DataFrame``."""
        super(BrushingDataframe, self).__init__(*args, **kwargs)
    
    @property
    def _constructor(self):
        """
        Callable used to build new frames of this subclass.

        Returns a small factory function (not the class object itself) that
        forwards all positional and keyword arguments to
        ``BrushingDataframe``.
        """
        def _make_brushing_frame(*args, **kwargs):
            return BrushingDataframe(*args, **kwargs)

        return _make_brushing_frame
    
#-----------------------------------------------------------------------------
                        # DATA HANDLING
#-----------------------------------------------------------------------------

    def SetAttributes(self, kwargs):
        """
        Cast columns of the dataframe to new types, in place.

        Every key of ``kwargs`` is looked up among the dataframe columns.
        When the column exists it is converted with ``astype``; if the
        conversion fails a message is printed and processing continues with
        the next entry. Keys that are not columns are reported and skipped.
        Nothing is done (other than printing a message) on an empty dataframe.

        Parameters
        ----------
        kwargs : dict
            Mapping of column name -> target type (anything accepted by
            ``Series.astype``). Note that this is one dictionary argument,
            not ``**kwargs``.

        Returns
        -------
        None.
        """
        if self.shape[0] == 0:
            print("The dataframe has not yet been initialized")
            return

        for key, vartype in kwargs.items():
            if key not in self.columns:
                print("The dataframe does not contain variable {}.".format(str(key)))
                continue
            try:
                self[key] = self[key].astype(vartype)
            except Exception:
                # Best-effort conversion: report and move on. ``Exception``
                # (rather than a bare ``except``) lets KeyboardInterrupt and
                # SystemExit propagate instead of being swallowed.
                print("Undefined type {}".format(str(vartype)))

#-----------------------------------------------------------------------------
                        # SUPERVISED - BINARY CLASSIFICATION - DATA CLEANING
#-----------------------------------------------------------------------------    
    def cleaning_missing(self, input_vars=None):
        """
        Impute missing values column by column.

        For each column the median is tried first; columns for which
        ``median()`` raises ``TypeError`` are treated as text: they are
        lower-cased and their missing values are replaced by the most
        frequent value. If such a column has no mode at all, a backward fill
        followed by a forward fill is applied instead.

        Parameters
        ----------
        input_vars : list, optional
            Columns to clean. When given, a copy restricted to these columns
            is cleaned and returned and the calling dataframe is left
            untouched. When omitted/empty, the calling dataframe itself is
            modified in place and returned.

        Returns
        -------
        The cleaned dataframe.
        """
        frame = self[input_vars].copy() if input_vars else self

        for column in frame.columns.values:
            try:
                fill_value = frame[column].median()
            except TypeError:
                # No median available: handle the column as text.
                lowered = frame[column].str.lower()
                most_frequent = lowered.mode()
                if len(most_frequent) > 0:
                    frame[column] = lowered.fillna(most_frequent[0])
                else:
                    frame[column] = lowered.bfill().ffill()
            else:
                # Assign the result back instead of a chained
                # ``fillna(inplace=True)``, which does not reach the parent
                # frame when pandas Copy-on-Write is active.
                frame[column] = frame[column].fillna(fill_value)

        return frame
    def cleaning_missing_display(self, input_vars=None):
        """
        Build a visual data-quality report for the dataframe.

        The report stacks, in a single column:
          * a table of the columns containing nulls and their null counts,
          * a table of column data types, flagging object columns whose name
            contains "date" as candidates for a date/time conversion,
          * one bar chart per object column (the 100 most frequent values),
          * a histogram and a box plot per numeric column,
          * a heat map of the correlation matrix of the numeric columns.

        The ``.plot`` calls are followed by ``.opts(...)``, so the pandas
        plotting backend is expected to be holoviews.

        Parameters
        ----------
        input_vars : list, optional
            Columns to include in the report. All columns when omitted/empty.

        Returns
        -------
        The combined holoviews layout.
        """
        frame = self[input_vars] if input_vars else self

        ## Check Null Values
        null_df = frame.isna().sum().to_frame("Null Counts")
        null_df = null_df.loc[null_df['Null Counts'] > 0].rename_axis('Features').reset_index()
        null_table = hv.Table(null_df, label='Features with Null Values')

        ## Check data types
        types_df = frame.dtypes.to_frame("Data Types").rename_axis('Features').reset_index()
        types_df["Data Types"] = types_df["Data Types"].astype(str)
        is_object = types_df['Data Types'].str.contains("object", case=False)
        named_like_date = types_df['Features'].str.contains("date", case=False)
        types_df.loc[is_object & named_like_date, "Recomendation"] = "Consider convert to Date or Time type"
        types_df = types_df.fillna("")
        typ_table = hv.Table(types_df, label='Features data types')

        ## Plot Histogram for numericals
        numeric = frame._get_numeric_data()
        hist_plots = []
        for column in numeric.columns:
            hist_plots.append(frame[column].plot.hist(bins=100, bin_range=(0, frame[column].max()), title='Histogram and Box Plot ' + column).opts(width=800))
            hist_plots.append(frame[column].plot.box(invert=True).opts(xrotation=90, width=800))
        histplots = hv.Layout(hist_plots)

        ## Plot for categorical
        cat_plots = []
        for column in frame.select_dtypes(include=['object']).columns:
            # Name the index and the counts explicitly: the column labels
            # produced by ``value_counts().reset_index()`` differ between
            # pandas 1.x and 2.x, which broke the previous rename mapping.
            counts = frame[column].value_counts().sort_index().rename_axis(column).reset_index(name="Counts")
            counts = counts.sort_values(by='Counts', ascending=False).head(100)
            cat_plots.append(hv.Bars(counts.groupby(column).sum()[["Counts"]]).opts(invert_axes=True, width=800, height=500, xrotation=45, title=column))
        catplots = hv.Layout(cat_plots)

        ## Correlation matrix
        # Restrict to numeric columns explicitly; recent pandas versions no
        # longer drop non-numeric columns silently inside ``corr()``.
        corr = numeric.corr()
        corrplots = hv.HeatMap((corr.index, corr.columns, corr.values)).opts(title='Correlation Matrix').opts(width=800, height=500, xrotation=45, tools=['hover'])
        p = (null_table.opts(height=100) + typ_table.opts(width=800, height=400) + catplots + histplots + corrplots).cols(1)

        return p
    
    

    def recommended_transformation(self, input_vars=None, WOE_tresh=10, target='', reference_date='', test_size_in=0.3, random_state=None):
        """
        Engineer date features, split into train/test and WOE-encode
        high-cardinality categorical columns.

        Steps:
          1. Every object column whose name contains "date" is converted to
             datetime. A ``<NAME without DATE>_MONTH`` column with the month
             name is added and, when ``reference_date`` is given, a
             ``<reference_date>___<column>`` column with the absolute
             difference in days to the reference column.
          2. The index is reset, datetime columns are dropped and the data is
             split with a split stratified on ``target``.
          3. Object columns with more than ``WOE_tresh`` distinct values are
             replaced by the output of ``WOE.transform``, with the encoder
             fitted on the training part only.

        NOTE: when ``input_vars`` is omitted/empty, step 1 writes the
        converted and new columns into the calling dataframe itself; with
        ``input_vars`` a copy of the subset is used.

        Parameters
        ----------
        input_vars : list, optional
            Columns to use (must include ``target``). All columns when
            omitted/empty.
        WOE_tresh : int
            Object columns with more distinct values than this are
            WOE-encoded.
        target : str
            Name of the target column.
        reference_date : str
            Name of the column used as reference for the day differences;
            '' disables that feature.
        test_size_in : float
            Passed to ``train_test_split`` as ``test_size``.
        random_state : optional
            Passed to ``train_test_split`` to make the split reproducible.

        Returns
        -------
        X_train, X_test, y_train, y_test

        Raises
        ------
        KeyError
            If ``target`` is not a column of the (sub)dataframe.
        """
        frame = self[input_vars].copy() if input_vars else self

        if target not in frame.columns:
            raise KeyError("target column {!r} not found in the dataframe".format(target))

        ### Convert features with a date-like name to datetime
        # Iterate over a snapshot of the columns: new ones are added inside the loop.
        for column in list(frame.columns):
            if 'date' in str(column).lower() and frame[column].dtype == 'object':
                frame[column] = pd.to_datetime(frame[column])
                month = str(column).upper().replace('DATE', "") + "_MONTH"
                # Default locale already yields English month names; passing
                # 'English' is not a valid locale name on Linux/macOS.
                frame[month] = frame[column].dt.month_name()
                if reference_date != '' and column != reference_date:
                    new_column = str(reference_date) + "___" + str(column)
                    delta = frame[column] - pd.to_datetime(frame[reference_date])
                    frame[new_column] = delta.abs().dt.days

        ### Split
        # drop=True avoids creating (and then having to delete) an 'index'
        # column, which failed for named indexes or an existing 'index' column.
        frame = frame.reset_index(drop=True)
        frame = frame.select_dtypes(exclude=['datetime64[ns]'])
        y = frame[target]
        X = frame.drop([target], axis=1)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, stratify=y, test_size=test_size_in, random_state=random_state)

        ### WOE on categorical features above the cardinality threshold
        my_WOE = WOE()
        # The encoder receives the target as a column of the frame it is given.
        X_train.loc[:, target] = y_train
        X_test.loc[:, target] = y_test

        # Only feature columns are candidates; the target itself is never encoded.
        for column in X.columns:
            if X[column].dtype == 'object' and X[column].nunique() > WOE_tresh:
                my_WOE.fit(X_train, column, target)
                X_train.loc[:, column] = my_WOE.transform(X_train, column, target)
                X_test.loc[:, column] = my_WOE.transform(X_test, column, target)

        X_train.drop([target], axis=1, inplace=True)
        X_test.drop([target], axis=1, inplace=True)
        return X_train, X_test, y_train, y_test