import pandas as pd
import glob,os,time,IPython,random
import numpy as np

class shuffle:
    '''Shuffle the data rows of a large text/CSV file on disk.

    The first line of the file is treated as a header and is always kept as
    the first line of the output. The remaining lines are permuted with a
    two-stage divide-and-conquer scheme (see ``dc``) so that only one bucket
    of lines has to be held in memory at a time.
    '''

    @staticmethod
    def demo():
        '''demo is used to demonstrate typical examples about this class.'''

        greeting='''
import clubear as cb
#clubear.csv is generated by cb.manager.demo()
sf=cb.shuffle('clubear.csv')
sf.dc() #shuffle by dc method
'''
        print(greeting)
        print('')

    @staticmethod
    def _clear_output():
        '''Clear the notebook cell output when IPython is available; otherwise do nothing.'''
        try:
            # Imported here (and by submodule) so progress reporting never
            # depends on ``IPython.display`` being reachable as an attribute.
            from IPython.display import clear_output
        except ImportError:
            return
        clear_output(wait=True)

    def __init__(self,pathfile,seed=0):
        '''Initialization: check pathfile exists and the seed is an int.

        pathfile: path of the file to be shuffled.
        seed: int used to seed the random generators that drive the shuffle.

        When an argument is rejected a message is printed and the instance is
        left with ``pathfile`` set to None, which ``dc`` reports instead of
        failing with an AttributeError.
        '''
        # Set the attributes first so that the early returns below still
        # leave a well-defined (but unusable) instance behind.
        self.pathfile=None
        self.seed=0

        if not os.path.exists(pathfile):
            print('shuffle: The pathfile dose not exists.')
            return
        if not isinstance(seed,int):
            print('shuffle: The random seed must be an int.')
            return

        self.pathfile=pathfile
        self.seed=seed
        # Kept for backward compatibility: constructing a shuffle object has
        # always seeded the global ``random`` generator.
        random.seed(seed)

    def dc(self,chunksize=1.0e+8):
        '''dc is used to shuffle data by a divide-and-conquer method.

        Stage 1 streams the input once and appends every data line to one of
        ``nfiles`` temporary bucket files chosen at random. Stage 2 loads one
        bucket at a time, shuffles it in memory and appends it to the output.

        chunksize: approximate number of input bytes per bucket; the number
            of buckets is int(filesize/chunksize)+1. Must be positive.

        The result is written to the current working directory as
        <basename of pathfile>.shuffle. Bucket files live in a '_shuffle_'
        directory in the current working directory; any files already in it
        are deleted first, and it is removed again when the method finishes.

        Both stages draw from one generator seeded with the seed given to
        the constructor, so the same input, seed and chunksize give the same
        output on every call.
        '''
        pathfile=getattr(self,'pathfile',None)
        if pathfile is None:
            print('shuffle: No valid pathfile to shuffle.')
            return
        if not chunksize>0:
            print('shuffle: The chunksize must be positive.')
            return

        newfile=os.path.basename(pathfile)+'.shuffle'
        nfiles=int(os.path.getsize(pathfile)/chunksize)+1
        # A private generator keeps the result independent of any other use
        # of the global ``random`` state between construction and this call.
        rng=random.Random(getattr(self,'seed',0))

        tmpdir='_shuffle_'
        os.makedirs(tmpdir,exist_ok=True)
        for stale in os.listdir(tmpdir):
            os.remove(os.path.join(tmpdir,stale))
        datafiles=[os.path.join(tmpdir,'tmp'+str(k)+'.csv') for k in range(nfiles)]

        try:
            # Stage 1: Random Splitting
            writers=[]
            try:
                for name in datafiles:
                    writers.append(open(name,'w',encoding='iso8859-1'))
                with open(pathfile,encoding='iso8859-1') as reader:
                    header=reader.readline()
                    okline=0
                    start_time=time.time()
                    for eachline in reader:
                        # The last line of a file may lack its newline; once
                        # rows are reordered it would otherwise be glued to
                        # the row that follows it.
                        if not eachline.endswith('\n'):
                            eachline=eachline+'\n'
                        writers[rng.randrange(nfiles)].write(eachline)
                        okline=okline+1

                        if okline%1000000==0:
                            elapsed_time=time.time()-start_time
                            self._clear_output()
                            # NOTE: global pandas display option, preserved
                            # from the original progress report.
                            pd.set_option('display.float_format', lambda x: '%.2f' % x)
                            print('File shuffling: ',pathfile)
                            print('Stage 1 accomplished',end=' ')
                            print(('%.0f'%(okline/1.0e+6)),'*10**6 lines processed with',end=' ')
                            print(('%.1f'%elapsed_time),'seconds elapsed.')
            finally:
                for each in writers:
                    each.close()

            # Stage 2: Shuffling and Merge
            start_time=time.time()
            with open(newfile,'w',encoding='iso8859-1') as writer:
                writer.write(header)
                for k,name in enumerate(datafiles):
                    with open(name,encoding='iso8859-1') as bucket:
                        datalist=bucket.readlines()
                    rng.shuffle(datalist)
                    writer.writelines(datalist)

                    elapsed_time=time.time()-start_time
                    percent=(k+1)/nfiles*100
                    self._clear_output()
                    print('File shuffling: ',pathfile)
                    print('Stage 2 accomplished ',('%.1f'%percent),'% with',end=' ')
                    print(('%.1f'%elapsed_time),'seconds elapsed.')
        finally:
            # Always drop the bucket files, even when a stage failed midway.
            for name in datafiles:
                if os.path.exists(name):
                    os.remove(name)
            if os.path.isdir(tmpdir) and not os.listdir(tmpdir):
                os.rmdir(tmpdir)