# This source code file is a part of SigProfilerTopography
# SigProfilerTopography is a tool included as part of the SigProfiler
# computational framework for comprehensive analysis of mutational
# signatures from next-generation sequencing of cancer genomes.
# SigProfilerTopography provides the downstream data analysis of
# mutations and extracted mutational signatures w.r.t.
# nucleosome occupancy, replication time, strand bias and processivity.
# Copyright (C) 2018-2020 Burcak Otlu


# Version2
# This version uses np.arrays
# Right now replication strand bias analysis works for single point mutations and signatures.
# This python code analyses the Replication Strand Bias

import multiprocessing
import numpy as np
import pandas as pd
import os
import math

from SigProfilerTopography.source.commons.TopographyCommons import CHROM
from SigProfilerTopography.source.commons.TopographyCommons import START
from SigProfilerTopography.source.commons.TopographyCommons import END
from SigProfilerTopography.source.commons.TopographyCommons import SIGNAL

from SigProfilerTopography.source.commons.TopographyCommons import PYRAMIDINESTRAND
from SigProfilerTopography.source.commons.TopographyCommons import SAMPLE

from SigProfilerTopography.source.commons.TopographyCommons import TYPE

from SigProfilerTopography.source.commons.TopographyCommons import SUBS
from SigProfilerTopography.source.commons.TopographyCommons import INDELS
from SigProfilerTopography.source.commons.TopographyCommons import DINUCS

from SigProfilerTopography.source.commons.TopographyCommons import MUTATION
from SigProfilerTopography.source.commons.TopographyCommons import LENGTH

from SigProfilerTopography.source.commons.TopographyCommons import LEADING
from SigProfilerTopography.source.commons.TopographyCommons import LAGGING
from SigProfilerTopography.source.commons.TopographyCommons import REPLICATIONSTRANDBIAS

from SigProfilerTopography.source.commons.TopographyCommons import DATA
from SigProfilerTopography.source.commons.TopographyCommons import LIB
from SigProfilerTopography.source.commons.TopographyCommons import CHRBASED

from SigProfilerTopography.source.commons.TopographyCommons import updateDictionaries_simulations_integrated
from SigProfilerTopography.source.commons.TopographyCommons import updateDictionaries_simulations_integrated_for_list_comprehension

from SigProfilerTopography.source.commons.TopographyCommons import readWig_with_fixedStep_variableStep

from SigProfilerTopography.source.nucleosomeoccupancy.ChrBasedSignalArrays import readFileInBEDFormat
from SigProfilerTopography.source.commons.TopographyCommons import memory_usage


from SigProfilerTopography.source.commons.TopographyCommons import getDictionary
from SigProfilerTopography.source.commons.TopographyCommons import readChrBasedMutationsDF
from SigProfilerTopography.source.commons.TopographyCommons import accumulate_simulations_integrated_for_each_tuple

from SigProfilerTopography.source.commons.TopographyCommons import writeDictionary
from SigProfilerTopography.source.commons.TopographyCommons import write_type_strand_bias_dictionary_as_dataframe
from SigProfilerTopography.source.commons.TopographyCommons import write_signature_mutation_type_strand_bias_dictionary_as_dataframe


from SigProfilerTopography.source.commons.TopographyCommons import Type2ReplicationStrand2CountDict_Filename
from SigProfilerTopography.source.commons.TopographyCommons import Signature2MutationType2ReplicationStrand2CountDict_Filename

from SigProfilerTopography.source.commons.TopographyCommons import Type2Sample2ReplicationStrand2CountDict_Filename
from SigProfilerTopography.source.commons.TopographyCommons import Sample2Type2ReplicationStrand2CountDict_Filename

from SigProfilerTopography.source.commons.TopographyCommons import USING_IMAP_UNORDERED
from SigProfilerTopography.source.commons.TopographyCommons import USING_APPLY_ASYNC_FOR_EACH_CHROM_AND_SIM
from SigProfilerTopography.source.commons.TopographyCommons import USING_APPLY_ASYNC_FOR_EACH_CHROM_AND_SIM_SPLIT
from SigProfilerTopography.source.commons.TopographyCommons import USING_APPLY_ASYNC_FOR_EACH_CHROM_AND_SIM_SPLIT_USING_POOL_INPUT_LIST

from SigProfilerTopography.source.commons.TopographyCommons import NUMBER_OF_MUTATIONS_IN_EACH_SPLIT
from SigProfilerTopography.source.commons.TopographyCommons import MAXIMUM_NUMBER_JOBS_IN_THE_POOL_AT_ONCE

from SigProfilerTopography.source.commons.TopographyCommons import get_chrBased_simBased_combined_df_split
from SigProfilerTopography.source.commons.TopographyCommons import get_chrBased_simBased_combined_chunks_df
from SigProfilerTopography.source.commons.TopographyCommons import get_chrBased_simBased_combined_df
from SigProfilerTopography.source.commons.TopographyCommons import decideFileType

#For Supp Fig2B
# NOTE(review): presumably the genomic start/end coordinates of a chr10 region used for that figure;
# these two constants are not referenced in the part of the file visible here -- confirm usage.
CHR10_THRESHOLD_START = 16400000
CHR10_THRESHOLD_END = 26400000

#For Supp Fig2A
# NOTE(review): presumably the genomic start/end coordinates of a chr20 region used for that figure;
# these two constants are not referenced in the part of the file visible here -- confirm usage.
CHR20_START = 36260000
CHR20_END = 36830000

# FOR FINDING TRANSITION ZONES (LEADING or LAGGING)
# Minimum accumulated length that a run of same-signed slopes must reach in
# checkForSameSignedSlopeBetweenConsecutivePeakandValley before it is recorded as a transition zone.
# THRESHOLD_CONSECUTIVE_LONG_STRETCH_LENGTH= 250000 #used in Supp Fig2B
# THRESHOLD_CONSECUTIVE_LONG_STRETCH_LENGTH= 150000
THRESHOLD_CONSECUTIVE_LONG_STRETCH_LENGTH= 10000

# Distance subtracted from and added to every valley midpoint in
# findLongStretchesofConsistentTransitionZones; the region in between is not scanned for transition zones.
# THRESHOLD_DISCARD_LATEST_TRANSITION_ZONE = 100000 #used in Supp Fig2B
THRESHOLD_DISCARD_LATEST_TRANSITION_ZONE = 25000
# THRESHOLD_LATEST_TRANSITION_ZONE = 0


########################################################################
def checkForSameSignedSlopeBetweenConsecutivePeakandValley(chrLong,peakorValleyStart, peakorValleyEnd, chrBasedSmoothedWaveletReplicationTimeSignalDF):
    """
    Collect the long runs of same-signed signal slope inside one genomic interval.

    The rows of chrBasedSmoothedWaveletReplicationTimeSignalDF whose START is >= peakorValleyStart
    and whose END is <= peakorValleyEnd are visited in order. The sign of the SIGNAL difference
    between consecutive rows is the slope direction. Every visited row adds 1000 to the length of
    the current run (NOTE(review): this assumes the signal is binned in 1000-base rows -- confirm).
    When the direction changes, or when the rows are exhausted, the run is reported only if its
    length is at least THRESHOLD_CONSECUTIVE_LONG_STRETCH_LENGTH.

    Returns a list of tuples (chrLong, runStart, runEnd, slopeDirection, runLength), where runEnd
    is the midpoint of the row at which the run was closed.
    """
    signal_df = chrBasedSmoothedWaveletReplicationTimeSignalDF
    inside_interval = (signal_df[START]>=peakorValleyStart) & (signal_df[END]<=peakorValleyEnd)

    zones = []
    run_length = 0
    run_start = peakorValleyStart
    run_direction = None
    previous_row = None

    for _, row in signal_df[inside_interval].iterrows():
        if previous_row is None:
            # The very first row only opens the run; there is no slope to compute yet.
            previous_row = row
            run_length += 1000
            continue

        direction = np.sign((row.get(SIGNAL) - previous_row.get(SIGNAL)) / 1000)
        previous_row = row

        if (run_direction is None) or (run_direction == direction):
            # First slope seen, or the slope keeps its sign: the run grows.
            run_length += 1000
        else:
            # The slope sign flipped: close the current run at this row's midpoint
            # and open a new one from the same position.
            row_midpoint = (row.get(START) + row.get(END))//2
            if run_length >= THRESHOLD_CONSECUTIVE_LONG_STRETCH_LENGTH:
                zones.append((chrLong, run_start, row_midpoint, run_direction, run_length))
            run_length = 1000
            run_start = row_midpoint

        run_direction = direction

    # The run that is still open after the last row may also be long enough.
    if run_length >= THRESHOLD_CONSECUTIVE_LONG_STRETCH_LENGTH:
        zones.append((chrLong, run_start, (row.get(START) + row.get(END))//2, run_direction, run_length))

    return zones
########################################################################


########################################################################
# chr10_subset_wavelet_processed_df
#           chr     start       end   signal
# 265577  chr10  16400500  16401499  24.9438
# 265578  chr10  16401500  16402499  24.9585

# valleys_peaks_df
#         chr     start       end    type
# 415     chr10  16454500  16455500    Peak
# 415  chr10  16528500  16529500  Valley

def findLongStretchesofConsistentTransitionZones(chrLong,fromStart,toEnd,chrBasedSmoothedWaveletReplicationTimeSignalDF,valleys_peaks_df):
    """
    Scan the signal between consecutive peaks and valleys for long same-signed slope runs.

    valleys_peaks_df is walked row by row (the caller is expected to pass it sorted by START).
    For every row the interval from the current scan position up to the peak/valley is handed to
    checkForSameSignedSlopeBetweenConsecutivePeakandValley:
      - 'Peak'   : scan up to the peak midpoint, then continue from that midpoint.
      - 'Valley' : scan up to (midpoint - THRESHOLD_DISCARD_LATEST_TRANSITION_ZONE), then continue
                   from (midpoint + THRESHOLD_DISCARD_LATEST_TRANSITION_ZONE); the region around
                   the valley is bypassed.
    Rows with any other 'type' value are ignored. Finally the remaining interval up to toEnd is
    scanned.

    Returns the concatenated list of transition zone tuples found in all scanned intervals.
    """
    signal_df = chrBasedSmoothedWaveletReplicationTimeSignalDF
    zones = []
    scan_from = fromStart

    for _, extremum in valleys_peaks_df.iterrows():
        midpoint = (extremum[START] + extremum[END])//2
        kind = extremum['type']

        if kind == 'Peak':
            scan_to = midpoint
            resume_at = midpoint
        elif kind == 'Valley':
            # This is something special to valley: skip the region around its midpoint.
            scan_to = midpoint - THRESHOLD_DISCARD_LATEST_TRANSITION_ZONE
            resume_at = midpoint + THRESHOLD_DISCARD_LATEST_TRANSITION_ZONE
        else:
            continue

        if scan_to > scan_from:
            zones.extend(checkForSameSignedSlopeBetweenConsecutivePeakandValley(chrLong, scan_from, scan_to, signal_df))
        scan_from = resume_at

    # For the last interval
    if toEnd > scan_from:
        zones.extend(checkForSameSignedSlopeBetweenConsecutivePeakandValley(chrLong, scan_from, toEnd, signal_df))

    return zones
########################################################################


########################################################################
#TODO Is (replicationStrand_row['end']+1) okay?
#We assume that there are no overlapping intervals with positive and negative slopes.
#To test it, have one array for positive slopes filled with 1s
#            and one array for negative slopes filled with -2s;
#            add them: if you have any -1, that means that this assumption is violated.
def fillReplicationStrandArray(replicationStrand_row,chrBased_replication_array):
    """
    Write one transition zone into the chromosome-wide replication strand array (in place).

    replicationStrand_row is indexed by the labels 'start', 'end' and 'slopeDirection', e.g.:
        chr             chrX
        start           154861998
        end             155096999
        slopeDirection  1 (1 means leading strand, -1 means lagging strand on positive strand)
        length          235000

    Positions start..end (end included) of chrBased_replication_array are set to slopeDirection.
    Nothing is returned.
    """
    zone_first = replicationStrand_row['start']
    zone_stop = replicationStrand_row['end']+1  # +1 makes the 'end' position part of the zone
    chrBased_replication_array[zone_first:zone_stop] = replicationStrand_row['slopeDirection']
########################################################################

########################################################################
# April 23, 2020
# Summary:
#   if mutationPyramidineStrand and slope have the same sign increase LEADING STRAND count
#   else mutationPyramidineStrand and slope have the opposite sign increase LAGGING STRAND count
# This is for list comprehension
def searchAllMutationOnReplicationStrandArray_using_list_comprehension(
        mutation_row,
        chrBasedReplicationArray,
        simNum2Type2ReplicationStrand2CountDict,
        simNum2Sample2Type2ReplicationStrand2CountDict,
        simNum2Type2Sample2ReplicationStrand2CountDict,
        simNum2Signature2MutationType2ReplicationStrand2CountDict,
        subsSignature_cutoff_numberofmutations_averageprobability_df,
        indelsSignature_cutoff_numberofmutations_averageprobability_df,
        dinucsSignature_cutoff_numberofmutations_averageprobability_df,
        sample_based,
        df_columns):
    """
    Decide on which replication strand(s) one mutation falls and update the count dictionaries.

    mutation_row is accessed by position; df_columns gives the column names, so the position of
    a column is df_columns.index(<column name>).

    The mutation covers [start, end) on chrBasedReplicationArray, where end is start+1 for SUBS,
    start+LENGTH for INDELS and start+2 for DINUCS. From the non-zero values in that slice:
      - one distinct value  : LEADING is counted if value*pyrimidineStrand > 0,
                              LAGGING if value*pyrimidineStrand < 0, nothing if the product is 0.
      - two distinct values : LAGGING and then LEADING are both counted, provided that the
                              pyrimidine strand is not 0.
      - anything else       : 'There is a situation!!!' is printed and nothing is counted.
    If the slice contains only zeros nothing happens.

    Counting is delegated to updateDictionaries_simulations_integrated_for_list_comprehension,
    which receives the four simNum2... dictionaries and the signature cutoff dataframe that
    matches the mutation type. Nothing is returned.
    """
    start = mutation_row[df_columns.index(START)]
    mutationType = None
    pyramidineStrand = mutation_row[df_columns.index(PYRAMIDINESTRAND)]
    sample = mutation_row[df_columns.index(SAMPLE)]
    my_type = mutation_row[df_columns.index(TYPE)]

    # Mutation end and the signature cutoff dataframe depend on the mutation type.
    if my_type == SUBS:
        end = start+1
        # e.g.: C>A
        mutationType = mutation_row[df_columns.index(MUTATION)]
        signature_cutoff_df = subsSignature_cutoff_numberofmutations_averageprobability_df
    elif my_type == INDELS:
        end = start+int(mutation_row[df_columns.index(LENGTH)])
        signature_cutoff_df = indelsSignature_cutoff_numberofmutations_averageprobability_df
    elif my_type == DINUCS:
        end = start+2
        signature_cutoff_df = dinucsSignature_cutoff_numberofmutations_averageprobability_df

    # No overlap with any transition zone: nothing to count.
    overlap = chrBasedReplicationArray[int(start):int(end)]
    if not np.any(overlap):
        return

    # It must be full with at most -1 and +1
    distinct_slopes = np.unique(overlap[np.nonzero(overlap)])

    if distinct_slopes.size == 1:
        # 1 means LEADING on the positive strand, -1 means LAGGING on the positive strand.
        # Same sign as the pyrimidine strand -> LEADING, opposite sign -> LAGGING.
        product = int(distinct_slopes[0])*pyramidineStrand
        if product > 0:
            strands_to_count = (LEADING,)
        elif product < 0:
            strands_to_count = (LAGGING,)
        else:
            strands_to_count = ()
    elif (distinct_slopes.size == 2) and (pyramidineStrand != 0):
        # The mutation spans both slope directions: increment both LAGGING and LEADING.
        strands_to_count = (LAGGING, LEADING)
    else:
        print('There is a situation!!!')
        strands_to_count = ()

    for replication_strand in strands_to_count:
        updateDictionaries_simulations_integrated_for_list_comprehension(mutation_row,
                                    mutationType,
                                    sample,
                                    sample_based,
                                    simNum2Type2ReplicationStrand2CountDict,
                                    simNum2Sample2Type2ReplicationStrand2CountDict,
                                    simNum2Type2Sample2ReplicationStrand2CountDict,
                                    simNum2Signature2MutationType2ReplicationStrand2CountDict,
                                    replication_strand,
                                    signature_cutoff_df,
                                    df_columns)
    #############################################################################################################

########################################################################


########################################################################
#April 5, 2020
# Summary:
#   if mutationPyramidineStrand and slope have the same sign increase LEADING STRAND count
#   else mutationPyramidineStrand and slope have the opposite sign increase LAGGING STRAND count
# This is for apply
def searchAllMutationOnReplicationStrandArray_using_apply(
        mutation_row,
        chrBasedReplicationArray,
        simNum2Type2ReplicationStrand2CountDict,
        simNum2Sample2Type2ReplicationStrand2CountDict,
        simNum2Type2Sample2ReplicationStrand2CountDict,
        simNum2Signature2MutationType2ReplicationStrand2CountDict,
        subsSignature_cutoff_numberofmutations_averageprobability_df,
        indelsSignature_cutoff_numberofmutations_averageprobability_df,
        dinucsSignature_cutoff_numberofmutations_averageprobability_df,
        sample_based):
    """
    Decide on which replication strand(s) one mutation falls and update the count dictionaries.

    mutation_row is accessed by column label (START, PYRAMIDINESTRAND, SAMPLE, TYPE and,
    depending on the type, MUTATION or LENGTH).

    The mutation covers [start, end) on chrBasedReplicationArray, where end is start+1 for SUBS,
    start+LENGTH for INDELS and start+2 for DINUCS. From the non-zero values in that slice:
      - one distinct value  : LEADING is counted if value*pyrimidineStrand > 0,
                              LAGGING if value*pyrimidineStrand < 0, nothing if the product is 0.
      - two distinct values : LAGGING and then LEADING are both counted, provided that the
                              pyrimidine strand is not 0.
      - anything else       : 'There is a situation!!!' is printed and nothing is counted.
    If the slice contains only zeros nothing happens.

    Counting is delegated to updateDictionaries_simulations_integrated, which receives the four
    simNum2... dictionaries and the signature cutoff dataframe that matches the mutation type.
    Nothing is returned.
    """
    start = mutation_row[START]
    mutationType = None
    pyramidineStrand = mutation_row[PYRAMIDINESTRAND]
    sample = mutation_row[SAMPLE]
    my_type = mutation_row[TYPE]

    # Mutation end and the signature cutoff dataframe depend on the mutation type.
    if my_type == SUBS:
        end = start+1
        # e.g.: C>A
        mutationType = mutation_row[MUTATION]
        signature_cutoff_df = subsSignature_cutoff_numberofmutations_averageprobability_df
    elif my_type == INDELS:
        end = start+int(mutation_row[LENGTH])
        signature_cutoff_df = indelsSignature_cutoff_numberofmutations_averageprobability_df
    elif my_type == DINUCS:
        end = start+2
        signature_cutoff_df = dinucsSignature_cutoff_numberofmutations_averageprobability_df

    # No overlap with any transition zone: nothing to count.
    overlap = chrBasedReplicationArray[int(start):int(end)]
    if not np.any(overlap):
        return

    # It must be full with at most -1 and +1
    distinct_slopes = np.unique(overlap[np.nonzero(overlap)])

    if distinct_slopes.size == 1:
        # 1 means LEADING on the positive strand, -1 means LAGGING on the positive strand.
        # Same sign as the pyrimidine strand -> LEADING, opposite sign -> LAGGING.
        product = int(distinct_slopes[0])*pyramidineStrand
        if product > 0:
            strands_to_count = (LEADING,)
        elif product < 0:
            strands_to_count = (LAGGING,)
        else:
            strands_to_count = ()
    elif (distinct_slopes.size == 2) and (pyramidineStrand != 0):
        # The mutation spans both slope directions: increment both LAGGING and LEADING.
        strands_to_count = (LAGGING, LEADING)
    else:
        print('There is a situation!!!')
        strands_to_count = ()

    for replication_strand in strands_to_count:
        updateDictionaries_simulations_integrated(mutation_row,
                                    mutationType,
                                    sample,
                                    sample_based,
                                    simNum2Type2ReplicationStrand2CountDict,
                                    simNum2Sample2Type2ReplicationStrand2CountDict,
                                    simNum2Type2Sample2ReplicationStrand2CountDict,
                                    simNum2Signature2MutationType2ReplicationStrand2CountDict,
                                    replication_strand,
                                    signature_cutoff_df)
    #############################################################################################################

########################################################################



########################################################################
#This code checks that valleys and peaks alternate, i.e. no two consecutive elements have the same type (both valleys or both peaks).
def checkforValidness(chrBased_valleys_peaks_df):
    """
    Tell whether the 'type' values of consecutive rows always differ.

    Returns False as soon as a row has the same 'type' as the row before it, and True otherwise
    (including for a dataframe without rows or with a single row).
    """
    last_seen_type = None

    for _, record in chrBased_valleys_peaks_df.iterrows():
        current_type = record['type']
        # Nothing to compare against until one type has been seen.
        if (last_seen_type is not None) and (current_type == last_seen_type):
            return False
        last_seen_type = current_type

    return True
########################################################################


########################################################################
def get_chr_based_replication_strand_array_for_callback(chrLong,chromSize,repliseq_signal_df,valleys_df,peaks_df):
    """
    Build the replication strand array of one chromosome and return it together with its name.

    Thin wrapper around get_chr_based_replication_strand_array that returns the tuple
    (chrLong, chrBased_replication_array), so that the receiver of the result can tell which
    chromosome the array belongs to. The array part is whatever
    get_chr_based_replication_strand_array returned (it may be None).
    """
    return (
        chrLong,
        get_chr_based_replication_strand_array(chrLong, chromSize, repliseq_signal_df, valleys_df, peaks_df),
    )
########################################################################


########################################################################
def get_chr_based_replication_strand_array(chrLong,chromSize,repliseq_signal_df,valleys_df,peaks_df):
    """
    Build the replication strand array for one chromosome.

    The replication time signal, the valleys and the peaks are restricted to the rows whose CHROM
    equals chrLong. Valleys and peaks are labelled with a 'type' column ('Valley' / 'Peak'), their
    START and END columns are cast to int, and both are concatenated and sorted by START.

    Parameters:
        chrLong            : chromosome name as it appears in the CHROM column
        chromSize          : length of the array to be created for this chromosome
        repliseq_signal_df : replication time signal with CHROM, START, END, SIGNAL columns
        valleys_df         : valleys with CHROM, START, END columns
        peaks_df           : peaks with CHROM, START, END columns

    Returns:
        The array produced by fill_chr_based_replication_strand_array, or None when there is no
        signal for this chromosome or when valleys and peaks do not strictly alternate
        (see checkforValidness).
    """
    # Read chrBasedSmoothedWaveletReplicationTimeSignalDF
    chrBased_SmoothedWaveletReplicationTimeSignal_df = repliseq_signal_df[repliseq_signal_df[CHROM] == chrLong]

    # DataFrame.astype returns a new dataframe instead of converting in place,
    # so its result has to be assigned back for the int cast to take effect.
    chrBasedValleysDF = valleys_df[valleys_df[CHROM] == chrLong].copy()
    chrBasedValleysDF['type'] = 'Valley'
    chrBasedValleysDF = chrBasedValleysDF.astype(dtype={START: int, END: int})

    chrBasedPeaksDF = peaks_df[peaks_df[CHROM] == chrLong].copy()
    chrBasedPeaksDF['type'] = 'Peak'
    chrBasedPeaksDF = chrBasedPeaksDF.astype(dtype={START: int, END: int})

    # Concat Peaks and Valleys
    chrBased_valleys_peaks_df = pd.concat([chrBasedValleysDF, chrBasedPeaksDF], axis=0)

    # Sort Valleys and peaks
    chrBased_valleys_peaks_df.sort_values(START, inplace=True)

    # A boolean-mask selection always yields a dataframe, so only emptiness needs to be checked.
    if chrBased_SmoothedWaveletReplicationTimeSignal_df.empty:
        return None
    if not checkforValidness(chrBased_valleys_peaks_df):
        return None

    return fill_chr_based_replication_strand_array(chrLong,
                                                   chromSize,
                                                   chrBased_SmoothedWaveletReplicationTimeSignal_df,
                                                   chrBased_valleys_peaks_df)
########################################################################


########################################################################
def fill_chr_based_replication_strand_array(chrLong,
                                            chromSize,
                                            chrBasedSmoothedWaveletReplicationTimeSignalDF,
                                            chrBased_valleys_peaks_df):
    """
    Create the replication strand array of one chromosome from its signal, peaks and valleys.

    An int8 array of length chromSize is created filled with zeros. Transition zones are searched
    between the START of the first signal row and the END of the last signal row, and every zone
    found is written into the array with its slope direction by fillReplicationStrandArray.
    In the filled array +1 means leading strand, -1 means lagging strand and 0 means that the
    position is not covered by any transition zone.

    Returns the filled numpy array.
    """
    signal_df = chrBasedSmoothedWaveletReplicationTimeSignalDF
    strand_array = np.zeros(chromSize, dtype=np.int8)

    # The scanned region goes from the first row's start to the last row's end.
    region_start = signal_df.iloc[0, signal_df.columns.get_loc(START)]
    region_end = signal_df.iloc[-1, signal_df.columns.get_loc(END)]

    # Step1 Find the transition zones
    transition_zones = findLongStretchesofConsistentTransitionZones(chrLong,
                                                                    region_start,
                                                                    region_end,
                                                                    signal_df,
                                                                    chrBased_valleys_peaks_df)
    transition_zones_df = pd.DataFrame.from_records(
        transition_zones,
        columns=['chr', 'start', 'end', 'slopeDirection', 'length'])

    # Step2 Fill the replication array using transition zones (the array is modified in place)
    transition_zones_df.apply(fillReplicationStrandArray, axis=1, chrBased_replication_array=strand_array)

    return strand_array
########################################################################


########################################################################
def read_repliseq_dataframes(smoothedWaveletRepliseqDataFilename,valleysBEDFilename,peaksBEDFilename):
    """
    Read the replication time signal, the valleys and the peaks into dataframes.

    Parameters:
        smoothedWaveletRepliseqDataFilename : path of the smoothed wavelet replication time signal;
            the extension (case-insensitive) must be '.wig' or '.bedgraph'
        valleysBEDFilename : path of the valleys (local minima) file in BED format,
            e.g. GSM923442_hg19_wgEncodeUwRepliSeqMcf7ValleysRep1.bed
        peaksBEDFilename : path of the peaks (local maxima) file in BED format,
            e.g. GSM923442_hg19_wgEncodeUwRepliSeqMcf7PkRep1.bed

    Returns:
        (repliseq_wavelet_signal_df, valleys_df, peaks_df)
        The END column of valleys_df and peaks_df is decremented by 1 after reading.

    Raises:
        ValueError if the signal file has an extension other than '.wig' or '.bedgraph'.

    The chromosome names found in each of the three inputs are printed.
    """
    ################### Read the Smoothed Wavelet Replication Time Signal starts ###########################
    signal_columns = [CHROM, START, END, SIGNAL]
    file_extension = os.path.splitext(os.path.basename(smoothedWaveletRepliseqDataFilename))[1].lower()

    if file_extension == '.wig':
        # NOTE(review): judging by how its result is used, decideFileType is truthy when the
        # content of a .wig file is in fact in bedgraph layout -- confirm against its definition.
        isFileTypeBEDGRAPH = decideFileType(smoothedWaveletRepliseqDataFilename)
        if isFileTypeBEDGRAPH:
            repliseq_wavelet_signal_df = pd.read_csv(smoothedWaveletRepliseqDataFilename, sep='\t', comment='#', header=None, names=signal_columns)
        else:
            repliseq_wavelet_signal_df = readWig_with_fixedStep_variableStep(smoothedWaveletRepliseqDataFilename)
    elif file_extension == '.bedgraph':
        repliseq_wavelet_signal_df = pd.read_csv(smoothedWaveletRepliseqDataFilename, sep='\t', comment='#', header=None, names=signal_columns)
    else:
        # Without this branch the dataframe would stay unassigned and the print below would
        # fail with an UnboundLocalError that does not mention the actual cause.
        raise ValueError('Unsupported replication time signal file extension "%s" for file %s; expected .wig or .bedgraph'
                         % (file_extension, smoothedWaveletRepliseqDataFilename))

    print('Chromosome names in replication time signal data: %s' % (repliseq_wavelet_signal_df[CHROM].unique()))
    ################### Read the Smoothed Wavelet Replication Time Signal ends #############################

    ############## Read the Valleys and Peaks starts #######################################
    # read Valleys (local minima) bed file and read Peaks (local maxima) bed file
    discard_signal = True

    valleys_df = readFileInBEDFormat(valleysBEDFilename, discard_signal)
    # NOTE(review): presumably converts the exclusive BED end into an inclusive end -- confirm.
    valleys_df[END] = valleys_df[END] - 1
    print('Chromosome names in replication time valleys data: %s' % (valleys_df[CHROM].unique()))

    peaks_df = readFileInBEDFormat(peaksBEDFilename, discard_signal)
    peaks_df[END] = peaks_df[END] - 1
    print('Chromosome names in replication time peaks data: %s' % (peaks_df[CHROM].unique()))
    ############## Read the Valleys and Peaks ends ########################################

    return repliseq_wavelet_signal_df, valleys_df, peaks_df
########################################################################

########################################################################
# April 28, 2020
# Search for all mutations
def searchAllMutationsOnReplicationStrandArray(chrBased_simBased_combined_df_split,chrBased_replication_array,subsSignature_cutoff_numberofmutations_averageprobability_df,indelsSignature_cutoff_numberofmutations_averageprobability_df,dinucsSignature_cutoff_numberofmutations_averageprobability_df,sample_based,verbose):
    """Search every mutation of one dataframe on the chromosome based replication array.

    Each row of ``chrBased_simBased_combined_df_split`` is passed, together with
    ``chrBased_replication_array`` and the four count dictionaries created here,
    to ``searchAllMutationOnReplicationStrandArray_using_list_comprehension``,
    which is expected to update those dictionaries in place.

    Parameters
    ----------
    chrBased_simBased_combined_df_split : pandas.DataFrame or None
        Mutations to search. ``None`` or an empty dataframe skips the search,
        so four empty dictionaries are returned.
    chrBased_replication_array
        Replication array of the chromosome; forwarded unchanged to the row
        level search.
    subsSignature_cutoff_numberofmutations_averageprobability_df
    indelsSignature_cutoff_numberofmutations_averageprobability_df
    dinucsSignature_cutoff_numberofmutations_averageprobability_df
        Signature cutoff dataframes; forwarded unchanged to the row level search.
    sample_based
        Forwarded unchanged to the row level search.
    verbose
        When truthy, print the worker pid and memory usage before and after
        the search.

    Returns
    -------
    tuple
        ``(simNum2Type2ReplicationStrand2CountDict,
        simNum2Sample2Type2ReplicationStrand2CountDict,
        simNum2Type2Sample2ReplicationStrand2CountDict,
        simNum2Signature2MutationType2ReplicationStrand2CountDict)``
    """

    ################################################################################
    #Fill these dictionaries for the (chrLong,simNum,splitIndex) tuple
    simNum2Type2ReplicationStrand2CountDict = {}
    simNum2Sample2Type2ReplicationStrand2CountDict = {}
    simNum2Type2Sample2ReplicationStrand2CountDict = {}
    simNum2Signature2MutationType2ReplicationStrand2CountDict = {}
    ################################################################################

    ################################################################################
    if ((chrBased_simBased_combined_df_split is not None) and (not chrBased_simBased_combined_df_split.empty)):
        if verbose: print('\tVerbose Worker pid %s SBS searchMutationd_comOnReplicationStrandArray_simulations_integrated starts %s MB' % (str(os.getpid()), memory_usage()))

        ##############################################################################################
        # Rows are visited through the raw ``.values`` array instead of
        # ``DataFrame.apply(axis=1)``: for 1 simulation the measured run times
        # were 153.900515 minutes (row iteration) versus 165.512188 minutes (apply).
        # A plain loop is used because the call is made only for its side effects
        # on the dictionaries; collecting the return values in a list would
        # allocate one useless entry per mutation.
        df_columns = list(chrBased_simBased_combined_df_split.columns.values)
        for mutation_row in chrBased_simBased_combined_df_split.values:
            searchAllMutationOnReplicationStrandArray_using_list_comprehension(mutation_row,
                                                                                chrBased_replication_array,
                                                                                simNum2Type2ReplicationStrand2CountDict,
                                                                                simNum2Sample2Type2ReplicationStrand2CountDict,
                                                                                simNum2Type2Sample2ReplicationStrand2CountDict,
                                                                                simNum2Signature2MutationType2ReplicationStrand2CountDict,
                                                                                subsSignature_cutoff_numberofmutations_averageprobability_df,
                                                                                indelsSignature_cutoff_numberofmutations_averageprobability_df,
                                                                                dinucsSignature_cutoff_numberofmutations_averageprobability_df,
                                                                                sample_based,
                                                                                df_columns)
        ##############################################################################################

        if verbose: print('\tVerbose Worker pid %s SBS searchMutationOnReplicationStrandArray_simulations_integrated ends %s MB' % (str(os.getpid()), memory_usage()))
    ################################################################################

    return (simNum2Type2ReplicationStrand2CountDict,
            simNum2Sample2Type2ReplicationStrand2CountDict,
            simNum2Type2Sample2ReplicationStrand2CountDict,
            simNum2Signature2MutationType2ReplicationStrand2CountDict)
########################################################################


########################################################################
# April 30, 2020
# Read chromBased and simBased combined (SBS, DBS and ID) dataframe in the process
def searchAllMutationsOnReplicationStrandArray_for_apply_async_read_data_in_the_process(outputDir,jobname,chrLong,simNum,subsSignature_cutoff_numberofmutations_averageprobability_df,indelsSignature_cutoff_numberofmutations_averageprobability_df,dinucsSignature_cutoff_numberofmutations_averageprobability_df,sample_based,verbose):
    """Load the data of one (chrLong, simNum) pair and search its mutations.

    The saved replication array ``<chrLong>_replication_time.npy`` is looked up
    under ``outputDir/jobname/DATA/REPLICATIONSTRANDBIAS/LIB/CHRBASED``. When
    that file is missing, four empty dictionaries are returned. Otherwise the
    array is loaded, the combined dataframe is fetched with
    ``get_chrBased_simBased_combined_df`` and both are handed to
    ``searchAllMutationsOnReplicationStrandArray``, whose result is returned.

    The signature cutoff dataframes, ``sample_based`` and ``verbose`` are
    forwarded unchanged.
    """
    array_file_name = '%s_replication_time.npy' % (chrLong)
    array_file_path = os.path.join(outputDir, jobname, DATA, REPLICATIONSTRANDBIAS, LIB, CHRBASED, array_file_name)

    # No saved replication array for this chromosome: there is nothing to search.
    if not os.path.exists(array_file_path):
        return ({},{},{},{})

    replication_array = np.load(array_file_path)
    combined_df = get_chrBased_simBased_combined_df(outputDir,jobname,chrLong,simNum)

    return searchAllMutationsOnReplicationStrandArray(combined_df,
                                                      replication_array,
                                                      subsSignature_cutoff_numberofmutations_averageprobability_df,
                                                      indelsSignature_cutoff_numberofmutations_averageprobability_df,
                                                      dinucsSignature_cutoff_numberofmutations_averageprobability_df,
                                                      sample_based,
                                                      verbose)


########################################################################
def read_create_write_replication_time_array_in_parallel(outputDir,jobname,chromNamesList,chromSizesDict,smoothedWaveletRepliseqDataFilename,valleysBEDFilename,peaksBEDFilename,verbose):
    """Create the replication array of every chromosome in parallel and save it.

    The repli-seq signal, valleys and peaks dataframes are read once with
    ``read_repliseq_dataframes``. For each name in ``chromNamesList`` a pool job
    runs ``get_chr_based_replication_strand_array_for_callback``; every result
    whose array is not ``None`` is saved by the job callback as
    ``<chrLong>_replication_time.npy`` under
    ``outputDir/jobname/DATA/REPLICATIONSTRANDBIAS/LIB/CHRBASED``.

    Parameters
    ----------
    outputDir, jobname
        Leading components of the output directory.
    chromNamesList
        Chromosome names to process.
    chromSizesDict
        Maps each chromosome name to its size; a missing name raises ``KeyError``.
    smoothedWaveletRepliseqDataFilename, valleysBEDFilename, peaksBEDFilename
        Input files, forwarded to ``read_repliseq_dataframes``.
    verbose
        When truthy, print the result of each job. Fetching the result re-raises
        any exception raised in the worker.

    Returns
    -------
    None
    """

    repliseq_signal_df, valleys_df, peaks_df = read_repliseq_dataframes(smoothedWaveletRepliseqDataFilename,valleysBEDFilename,peaksBEDFilename)

    ################################
    def write_chrom_based_replication_array(result_tuple):
        chrLong=result_tuple[0]
        chrBased_replication_array=result_tuple[1]
        if (chrBased_replication_array is not None):
            chr_based_dir = os.path.join(outputDir, jobname, DATA, REPLICATIONSTRANDBIAS, LIB, CHRBASED)
            os.makedirs(chr_based_dir, exist_ok=True)
            #File name without extension, np.save appends '.npy'
            chr_based_replication_time_file_name='%s_replication_time' %(chrLong)
            np.save(os.path.join(chr_based_dir, chr_based_replication_time_file_name), chrBased_replication_array)
    ################################

    ################################
    def report_worker_failure(worker_exception):
        # Without this report a failed job leaves no trace unless verbose is set,
        # and the missing .npy file makes the chromosome silently contribute nothing.
        print('Chrom Based Replication Time Array worker failed: %r' % (worker_exception,), flush=True)
    ################################

    ################################
    numofProcesses = multiprocessing.cpu_count()
    ################################

    # The context manager terminates the pool if anything below raises,
    # so the worker processes are not left behind.
    with multiprocessing.Pool(processes=numofProcesses) as pool:

        ################################
        jobs = []
        ################################

        for chrLong in chromNamesList:
            chromSize = chromSizesDict[chrLong]
            jobs.append(pool.apply_async(get_chr_based_replication_strand_array_for_callback,
                                     args=(chrLong, chromSize, repliseq_signal_df,valleys_df, peaks_df,),
                                     callback=write_chrom_based_replication_array,
                                     error_callback=report_worker_failure))

        ##############################################################################
        # NOTE(review): results are fetched only when verbose is set, so a worker
        # exception is re-raised here only in verbose mode.
        for job in jobs:
            if verbose: print('\tVerbose Write Chrom Based Replication Time Array for Replicatio Strand Bias Analysis Worker pid %s job.get():%s ' % (str(os.getpid()), job.get()))
        ##############################################################################

        ################################
        # wait for all jobs (and their callbacks) to finish
        pool.close()
        pool.join()
        ################################
########################################################################


########################################################################
# pool.imap_unordered : Fills poolInputList (jobs), sends poolInputList and accumulates results one by one.
# Slowest one but completes without memory problem.
# Can be updated and tested to read chrom based sim based mutations data and chrom based replication array in the worker process.
#
# pool.apply_async:  USING_APPLY_ASYNC_FOR_EACH_CHROM_AND_SIM_SPLIT
#
# pool.apply_async:  USING_APPLY_ASYNC_FOR_EACH_CHROM_AND_SIM_SPLIT_USING_POOL_INPUT_LIST
# You fill your pool input list therefore you decide how many jobs  to send at once.
# Faster than imap_unordered. Low memory usage.
# Can be updated to read chrom based sim based mutations data and chrom based replication array in the worker process.
# If USING_APPLY_ASYNC_FOR_EACH_CHROM_AND_SIM does not end because of memory error, this can be used.
# All 28/28 processes are running. When the jobs in the pool input list are finishing, some processes wait for others to finish.
#
# pool.apply_async:  USING_APPLY_ASYNC_FOR_EACH_CHROM_AND_SIM
# For each possible (chrLong,simNum) couple read the data and array on the worker process
# Fastest, consumes more memory than others. 22/28 processes are running. For Combined_PACWG_nonPCAWG Skin_Melanoma after 1 hour all 28/28 running.
def replicationStrandBiasAnalysis(computationType,sample_based,chromSizesDict,chromNamesList,outputDir,jobname,numofSimulations,smoothedWaveletRepliseqDataFilename,valleysBEDFilename, peaksBEDFilename,subsSignature_cutoff_numberofmutations_averageprobability_df,indelsSignature_cutoff_numberofmutations_averageprobability_df,dinucsSignature_cutoff_numberofmutations_averageprobability_df,verbose):
    """Run the replication strand bias analysis and write the accumulated counts.

    Steps:
      1. Create and save the chromosome based replication arrays with
         ``read_create_write_replication_time_array_in_parallel``.
      2. When ``computationType`` is ``USING_APPLY_ASYNC_FOR_EACH_CHROM_AND_SIM``,
         submit one pool job per (simNum, chrLong) pair, with simNum running from
         0 to ``numofSimulations`` inclusive, and accumulate every job result
         into four dictionaries through the job callback.
      3. Write the accumulated dictionaries; the two sample based dictionaries
         are written only when ``sample_based`` is truthy.

    NOTE(review): for any other ``computationType`` step 2 is skipped and the
    output is written from empty dictionaries.

    Parameters
    ----------
    computationType
        Selects the parallelisation strategy (see step 2).
    sample_based
        Forwarded to the workers; also enables the sample based output.
    chromSizesDict, chromNamesList
        Chromosome sizes and names.
    outputDir, jobname
        Leading components of the output location.
    numofSimulations
        Highest simulation number to process.
    smoothedWaveletRepliseqDataFilename, valleysBEDFilename, peaksBEDFilename
        Replication timing input files, forwarded to step 1.
    subsSignature_cutoff_numberofmutations_averageprobability_df
    indelsSignature_cutoff_numberofmutations_averageprobability_df
    dinucsSignature_cutoff_numberofmutations_averageprobability_df
        Signature cutoff dataframes, forwarded to the workers.
    verbose
        When truthy, print the result of each job. Fetching the result re-raises
        any exception raised in the worker.

    Returns
    -------
    None
    """

    print('\n#################################################################################')
    print('--- ReplicationStrandBias Analysis starts')

    ###############################################
    #April 30, 2020
    read_create_write_replication_time_array_in_parallel(outputDir,jobname,chromNamesList,chromSizesDict,smoothedWaveletRepliseqDataFilename,valleysBEDFilename,peaksBEDFilename,verbose)
    ###############################################

    strandBias = REPLICATIONSTRANDBIAS

    ###############################################################################
    #Accumulate results
    simNum2Type2ReplicationStrand2AccumulatedCountDict = {}
    simNum2Sample2Type2ReplicationStrand2AccumulatedCountDict = {}
    simNum2Type2Sample2ReplicationStrand2AccumulatedCountDict = {}
    simNum2Signature2MutationType2ReplicationStrand2AccumulatedCountDict = {}
    ###############################################################################

    #########################################################################################
    def accumulate_apply_async_result(result_tuple):
        # Accumulate the result coming from (chr,sim,split) tuple
        simNum2Type2Strand2CountDict = result_tuple[0]
        simNum2Sample2Type2Strand2CountDict = result_tuple[1]
        simNum2Type2Sample2Strand2CountDict = result_tuple[2]
        simNum2Signature2MutationType2Strand2CountDict = result_tuple[3]

        print('MONITOR ACCUMULATE', flush=True)

        accumulate_simulations_integrated_for_each_tuple(
            simNum2Type2Strand2CountDict,
            simNum2Sample2Type2Strand2CountDict,
            simNum2Type2Sample2Strand2CountDict,
            simNum2Signature2MutationType2Strand2CountDict,
            simNum2Type2ReplicationStrand2AccumulatedCountDict,
            simNum2Sample2Type2ReplicationStrand2AccumulatedCountDict,
            simNum2Type2Sample2ReplicationStrand2AccumulatedCountDict,
            simNum2Signature2MutationType2ReplicationStrand2AccumulatedCountDict)
    #########################################################################################

    #########################################################################################
    def report_worker_failure(worker_exception):
        # Without this report a failed job leaves no trace unless verbose is set,
        # and its counts are silently missing from the accumulated output.
        print('--- ReplicationStrandBias Analysis worker failed: %r' % (worker_exception,), flush=True)
    #########################################################################################

    ###############################################################################
    #April 30, 2020
    #read chrom based sim based mutations data and chrom based replication time data in each worker process
    if (computationType==USING_APPLY_ASYNC_FOR_EACH_CHROM_AND_SIM):

        ################################
        numofProcesses = multiprocessing.cpu_count()
        ################################

        # The context manager terminates the pool if anything below raises,
        # so the worker processes are not left behind.
        with multiprocessing.Pool(processes=numofProcesses) as pool:

            ################################
            jobs = []
            ################################

            # Simulation number is the outer loop, chromosome the inner one.
            for simNum in range(0, numofSimulations + 1):
                for chrLong in chromNamesList:
                    jobs.append(pool.apply_async(searchAllMutationsOnReplicationStrandArray_for_apply_async_read_data_in_the_process,
                                            args=(outputDir,jobname,chrLong,simNum,subsSignature_cutoff_numberofmutations_averageprobability_df,indelsSignature_cutoff_numberofmutations_averageprobability_df,dinucsSignature_cutoff_numberofmutations_averageprobability_df,sample_based,verbose,),
                                            callback=accumulate_apply_async_result,
                                            error_callback=report_worker_failure))
                    print('MONITOR %s simNum:%d len(jobs):%d' % (chrLong, simNum, len(jobs)), flush=True)
            ################################################################################

            ##############################################################################
            # NOTE(review): results are fetched only when verbose is set, so a worker
            # exception is re-raised here only in verbose mode.
            for job in jobs:
                if verbose: print('\tVerbose Replication Strand Bias Worker pid %s job.get():%s ' % (str(os.getpid()), job.get()))
            ##############################################################################

            ################################
            # wait for all jobs (and their callbacks) to finish
            pool.close()
            pool.join()
            ################################
    ###############################################################################


    ############################################################################################################
    #####################################       Output starts      #############################################
    ############################################################################################################
    replication_strands = [LAGGING, LEADING]

    write_signature_mutation_type_strand_bias_dictionary_as_dataframe(simNum2Signature2MutationType2ReplicationStrand2AccumulatedCountDict,
                                                                      strandBias,
                                                                      replication_strands,
                                                                      outputDir,
                                                                      jobname)


    write_type_strand_bias_dictionary_as_dataframe(simNum2Type2ReplicationStrand2AccumulatedCountDict,
                                                   strandBias,
                                                   replication_strands,
                                                   outputDir,
                                                   jobname)

    if sample_based:
        writeDictionary(simNum2Sample2Type2ReplicationStrand2AccumulatedCountDict,outputDir,jobname,Sample2Type2ReplicationStrand2CountDict_Filename,strandBias,None)
        writeDictionary(simNum2Type2Sample2ReplicationStrand2AccumulatedCountDict,outputDir,jobname,Type2Sample2ReplicationStrand2CountDict_Filename,strandBias,None)
        #TODO write as dataframe if sample_based will be maintained
    ############################################################################################################
    #####################################       Output ends      ###############################################
    ############################################################################################################

    print('--- ReplicationStrandBias Analysis ends')
    print('#################################################################################\n')
########################################################################