#!/usr/bin/env python3
"""
This code generates a bacteriophage identification profile for each sample.
"""
## useful imports
import time
import io
import os
import re
import sys
from sys import argv
from io import open
from termcolor import colored
from Bio import SeqIO
from functools import reduce
from termcolor import colored
import argparse
import pandas
import shutil

## import my modules
from BacterialTyper.scripts import functions
from BacterialTyper.config import set_config

## import phispy modules
import PhiSpyModules
import pkg_resources

######
def results_PhiSpy(folder, name):
	"""Parse results generated by PhiSpy
	
	This function renames and updates the files generated by PhiSpy for a better 
	interpretation of results. Redundant or intermediate files are moved to a 
	subfolder named ``sup_files``, the main tables are renamed using the sample name 
	and a summary Excel file is generated with the tables that were found.
	
	:param folder: Absolute path to output directory containing PhiSpy results.
	:param name: Sample name id
	
	:type folder: string
	:type name: string
	
	:returns: Empty tuple.
	
	.. note:: See PhiSpy results entry available in :ref:`PhiSpy-results`

	.. seealso:: This function depends on other BacterialTyper functions called:
	
		- :func:`BacterialTyper.scripts.functions.get_fullpath_list`

		- :func:`BacterialTyper.scripts.functions.create_subfolder`

		- :func:`BacterialTyper.scripts.functions.read_time_stamp`
		
		- :func:`BacterialTyper.scripts.functions.print_time_stamp`
				
	"""
	
	## filename stamp of the process
	filename_stamp = os.path.join(folder, '.PhiSpy_results')

	# check if previously done
	if os.path.isfile(filename_stamp):
		stamp =	functions.read_time_stamp(filename_stamp)
		print (colored("\tA previous command generated results on: %s [%s]" %(stamp, name), 'yellow'))
		return ()

	## get files
	PhiSpy_files = functions.get_fullpath_list(folder)
	
	## create sup_folder
	sup_folder = functions.create_subfolder("sup_files", folder)
	
	## remove any previous files
	for f in (functions.get_fullpath_list(sup_folder) or []):
		os.remove(f)
		
	print ("+ Parse results and sort files generated by PhiSpy")
	
	## common prefix for every renamed file: folder/name
	prefix = os.path.join(folder, name)
	
	## files that are redundant or intermediate: just move them to sup_folder
	sup_names = ("classify.tsv", "prophage.tbl", "testSet.txt")
	
	## header for the coordinates table
	columns = ("#prophage_ID", "Contig","Start","End","attL_Start","attL_End","attR_Start","attR_End","attL_Seq","attR_Seq","Longest_Repeat_flanking_phage")
	
	## tables to add to excel: they stay as None if PhiSpy did not generate the file
	prophage_tsv = None
	coordinates_pd = None
	
	## parse files and rename
	for f in PhiSpy_files:
		## skip entries that are no longer on disk (e.g. a stale file just removed 
		## from sup_folder, should the file listing include subfolders)
		if not os.path.isfile(f):
			continue
			
		baseName = os.path.basename(f)
		
		if baseName in sup_names:
			# move to sup_folder
			shutil.move(f, sup_folder)
			
		elif baseName == "prophage_tbl.tsv":
			# read to add to excel & rename
			prophage_tsv = pandas.read_csv(f, sep="\t", header=0)
			shutil.move(f, prefix + '_PhiSpy-classification_genes.tsv')
			
		elif baseName == "prophage.gff3":
			# rename
			shutil.move(f, prefix + '_PhiSpy-prophage.gff3')
			
		elif baseName == "prophage_coordinates.tsv":
			# original file has no header: add it, print in new file & remove original
			coordinates_pd = pandas.read_csv(f, sep="\t", header=None, names=columns)
			coordinates_pd.to_csv(prefix + '_PhiSpy-prophage-coordinates.tsv', sep="\t")
			os.remove(f)
		
		## any other file (e.g. the .PhiSpy time stamp) is left untouched
			
	## create excel only with the tables that were found
	sheets = [("Gene classification", prophage_tsv), ("Coordinates", coordinates_pd)]
	sheets = [(sheet_name, table) for sheet_name, table in sheets if table is not None]
	
	if not sheets:
		## nothing to summarize: do not print the time stamp so results can be parsed later
		print (colored("\tNo PhiSpy result tables were found in folder: %s [%s]" %(folder, name), 'yellow'))
		return ()
	
	## the context manager saves and closes the file even if writing a sheet fails
	name_excel = prefix + '_bacteriophage_summary.xlsx'
	with pandas.ExcelWriter(name_excel, engine='xlsxwriter') as writer:
		for sheet_name, table in sheets:
			table.to_excel(writer, sheet_name=sheet_name)
	
	## print stamp time file
	functions.print_time_stamp(filename_stamp)
		
	return ()

######
def ident_bacteriophage(gbk_file, name, output_dir, training_set, Debug=False, 
					min_contig_size=5000, window_size=30, nonprophage_genegaps=10, 
					number_phage_genes=5,randomforest_trees=500, expand_slope=False, 
					kmers_type="all", keep=False):

	"""Identify putative bacteriophages inserted.
	
	Function to call PhiSpy_ functions to generate the identification and annotation 
	of bacteriophages inserted in bacterial genomes.
	
	:param gbk_file: GenBank format file
	:param name: Sample name id
	:param output_dir: Absolute path to output directory
	:param training_set: trainingSet number id. [Default: 0 (Generic)]
	:param Debug: True/False for debugging messages [Default: False]
	:param min_contig_size: Minimum contig size (in bp) to be included in the analysis. Smaller contigs will be dropped. [Default: 5000 ]
	:param window_size: Window size of consecutive genes to look through to find phages. [Default: 30]
	:param nonprophage_genegaps: The number of non phage genes between prophages. [Default: 10]
	:param number_phage_genes: Number of consecutive genes in a region of window size that must be prophage genes to be called. [Default: 5]
	:param randomforest_trees: Number of trees generated by Random Forest classifier. [Default: 500]
	:param expand_slope: Use the product of the slope of the Shannon scores in making test sets [Default: False]
	:param kmers_type: Type of kmers used for calculating Shannon scores. [all, codon, simple] [Default: all]
	:param keep: Do not delete temporary files [Default: False]

	:type gbk_file: string
	:type name: string
	:type output_dir: string
	:type training_set: integer
	:type Debug: Boolean 
	:type min_contig_size: integer
	:type window_size: integer
	:type nonprophage_genegaps: integer
	:type number_phage_genes: integer 
	:type randomforest_trees: integer
	:type expand_slope: Boolean
	:type kmers_type: string
	:type keep: Boolean

	:returns: Path to the output directory containing results, or 0 if no contig 
		in the GenBank file is longer than `min_contig_size`.

	The parameter `training_set` is the id for the training set of interest among the different available provided by PhiSpy within the file 
	:file:`data/trainingGenome_list.txt` and summarized here by :func:`BacterialTyper.scripts.bacteriophage.get_list_PhiSpy_trainingSets`.
	
	.. note:: See available training sets in :ref:`PhiSpy-training-sets`

	.. attention:: Be aware of Copyright
	
		The code implemented here is very similar to the main function under PhiSpy.py 
		main script: https://github.com/linsalrob/PhiSpy/blob/master/PhiSpy.py
		
		We basically implemented it here again to modulate and control accordingly to 
		our project and needs. Give credit to the authors accordingly.
		
		Copyright: 2008-2018 Sajia Akhter, Katelyn McNair, Rob Edwards, 
		
		San Diego State University, San Diego, CA

		
	.. note:: This function relies on several PhiSpy functions such as:
	
		- :func:`PhiSpy.PhiSpyModules.SeqioFilter`
		
		- :func:`PhiSpy.PhiSpyModules.make_test_set`
		
		- :func:`PhiSpy.PhiSpyModules.call_randomforest`
		
		- :func:`PhiSpy.PhiSpyModules.make_initial_tbl`
		
		- :func:`PhiSpy.PhiSpyModules.consider_unknown`
		
		- :func:`PhiSpy.PhiSpyModules.fixing_start_end`		
			
	"""

	## filename stamp of the process
	filename_stamp = output_dir + '/.PhiSpy'

	# check if previously done
	if os.path.isfile(filename_stamp):
		stamp =	functions.read_time_stamp(filename_stamp)
		print (colored("\tA previous command generated results on: %s [%s -- PhiSpy]" %(stamp, name), 'yellow'))
		return (output_dir) ## contains results

	## debug message
	if (Debug):
		print (colored("**DEBUG: Call PhiSpy for sample %s " %gbk_file + "**", 'yellow'))
		print('+ Generate PhiSpy call for file %s...' %gbk_file)

	### create folder
	functions.create_folder(output_dir)
	
	## Filter to remove short contigs
	gbk_filtered = PhiSpyModules.SeqioFilter(filter(lambda x: len(x.seq) > min_contig_size, SeqIO.parse(gbk_file, "genbank")))
	
	## count the records by iterating over them, as the original PhiSpy script does
	ncontigs = sum(1 for _ in gbk_filtered)
	if ncontigs == 0:
		## debug message
		if (Debug):
			## literal percent sign needs to be escaped (%%) and gbk_file is a string (%s)
			print (colored("**DEBUG: 100%% contigs in %s are less than %i bp" 
						%(gbk_file, min_contig_size), 'yellow'))
		
		return(0)

	## get available trainingSets within PhiSpy
	sets_Available = get_list_PhiSpy_trainingSets()

	## the table is indexed by the numeric training set id
	training_id = int(training_set)
	set_name = sets_Available.loc[training_id, "Set_Name"]
	training_set_formated = "data/" + set_name + ".txt"

	if (Debug):
		print ("\t+ Training Set selected: ", training_set)
		print ("\t+ Training Set name: ", set_name)
		print ("\t+ Training Set Genus: ", sets_Available.loc[training_id, "genus"])
		print ("\t+ Training Set Species: ", sets_Available.loc[training_id, "species"])
		print ("\t+ Training Set Tag: ", sets_Available.loc[training_id, "Tag"])
	
	## create argparse with arguments provided to call PhiSpy
	arg_parser = argparse.Namespace(infile=gbk_file, output_dir=output_dir,
							training_set = training_set_formated, min_contig_size = min_contig_size,
							window_size= window_size, nonprophage_genegaps=nonprophage_genegaps,
							number=number_phage_genes, randomforest_trees=randomforest_trees, quiet=True,
							expand_slope=expand_slope, kmers_type=kmers_type, keep=keep, record=gbk_filtered)
	phispy_args = vars(arg_parser)
	
	## debug message
	if (Debug):
		print (colored("**DEBUG: Arguments for PhiSpy", 'yellow'))
		print (phispy_args)	
	
	# make testing set: calculate Shannon slope
	PhiSpyModules.make_test_set(**phispy_args)

	## Classification
	PhiSpyModules.call_randomforest(**phispy_args)
	PhiSpyModules.make_initial_tbl(**phispy_args)
	
	## Consider unknown functions
	## Compare the formatted file name given to PhiSpy: training_set itself is the 
	## numeric id and would never match the file name of the generic set.
	## TODO: Debug test with genericAll (Default: 0)
	if (arg_parser.training_set == 'data/trainSet_genericAll.txt'):
		PhiSpyModules.consider_unknown(arg_parser.output_dir)

	## PhiSpy Evaluation
	PhiSpyModules.fixing_start_end(**phispy_args)
	
	## when finished print time stamp in  output_dir + '/.PhiSpy'
	functions.print_time_stamp(filename_stamp)

	## final results
	return (output_dir) ## contains results
######

def print_list_modified():
	"""Show the PhiSpy training sets table on screen
	
	The table is obtained from 
	:func:`BacterialTyper.scripts.bacteriophage.get_list_PhiSpy_trainingSets` and 
	printed surrounded by blank lines. The pandas option ``display.max_rows`` is 
	set to the number of rows of the table plus one before printing.
			
	.. note:: See available training sets in :ref:`PhiSpy-training-sets`

	"""
	
	## table with the training sets
	training_sets = get_list_PhiSpy_trainingSets()
	
	## raise the pandas row limit above the size of the table
	n_rows = training_sets.shape[0]
	pandas.set_option('display.max_rows', n_rows + 1)
	
	## blank lines, table, blank lines
	for chunk in ("\n\n", training_sets, "\n\n"):
		print(chunk)


######
def get_list_PhiSpy_trainingSets():
	"""Get training Sets for PhiSpy calculations
	
	This function returns a modification of the information, for better clarification to users, of the different training 
	sets available within PhiSpy python package.
	
	It uses the pkg_resources resource_filename to retrieve the PhiSpy file :file:`data/trainingGenome_list.txt` 
	(within PhiSpy package) that contains all the available training sets pre-calculated for different 
	bacterial species/strains. It then manipulates the dataframe for better understanding using pandas.
	
	Each training set is tagged as:
	
		- ``Generic``: the set generated from the largest number of genomes.
		
		- ``Single_species``: sets generated from a single genome.
		
		- ``Multi_species``: any other set generated from more than one genome.
	
	:returns: Pandas dataframe, indexed by training set id, containing the columns 
		trainingSet, Tag, Set_Name, genus, species and other.
	
	.. note:: See available training sets in :ref:`PhiSpy-training-sets`

	"""
	## read into pandas dataframe the trainingGenome list dataset
	list_file = pkg_resources.resource_filename('PhiSpyModules', "data/trainingGenome_list.txt")
	data = pandas.read_csv(list_file, sep='\t', header=None, 
						names=["trainingSet_ID", "Set_name", "Species", "Number_genomes"]).set_index("trainingSet_ID")
	
	## reformat table:
	## dataframe string split: patterns longer than one character are used as regular 
	## expressions, so the dot is escaped to split only at a literal ".txt"
	data[["set", "ext1"]] = data["Set_name"].str.split(pat=r"\.txt", n=1, expand=True)

	## Information regarding species specific training sets
	## Genomes == 1
	data_single = data.loc[data["Number_genomes"]==1]
	species_info = data_single["Species"].str.split(pat="_", n=2, expand=True)
	
	data1 = pandas.DataFrame()
	data1["Set_Name"] = data_single["set"]
	data1["Tag"] = "Single_species"
	data1["genus"] = species_info[0]
	data1["species"] = species_info[1]
	data1["other1"] = species_info[2]
	
	## remove genbank extension (.gb or .gbk): escaped dot for the same reason as above
	data1[["other", "ext2"]] = data1["other1"].str.split(pat=r"\.gbk?", n=1, expand=True)
	data1.drop(columns=["other1", "ext2"], inplace = True)
	
	## Information regarding non-species_specific training sets
	## Genomes > 1
	data_more1 = data.loc[data["Number_genomes"]>1]
	
	## Generic Test set
	## It is identified as the set generated from the largest number of genomes 
	## (49 when this code was written) instead of using a fixed number or index.
	## NOTE: assumes no other set is generated from as many genomes as the generic one.
	generic_mask = data_more1["Number_genomes"] == data_more1["Number_genomes"].max()
	
	data0 = pandas.DataFrame()
	data0["Set_Name"] = data_more1.loc[generic_mask, "set"]
	data0["Tag"] = "Generic"
	data0["genus"] = "All available"
	data0["species"] = "All available"
	data0["other"] = "All available"
	
	## discard the generic set using the same mask used to select it
	data_more1 = data_more1.loc[~generic_mask]
	
	### More >1 & not generic
	data2_3 = pandas.DataFrame()
		
	for index, row in data_more1.iterrows():
		data2_3.at[index, "Set_Name"] = row["set"]
		data2_3.at[index, "Tag"] = "Multi_species"

		## extract elements of the list
		## split species info: genomes are separated by ';', fields by '_'
		species_info = [i.split('_', 2) for i in row["Species"].split(';')]
		
		## unique values are sorted so the output is the same in every run
		data2_3.at[index, "genus"] = "/".join( sorted(set(functions.parse_sublist(species_info, 0))) )
		data2_3.at[index, "species"] = "/".join( sorted(set(functions.parse_sublist(species_info, 1))) )
		
		## remove genbank extension
		other_info = [x.replace('.gbk','').replace('.gb','') for x in sorted(set(functions.parse_sublist(species_info, 2)))]
		data2_3.at[index, "other"] = "/".join(other_info)

	## merge all and set the order of the columns
	data_final = pandas.concat([data0, data1, data2_3], join='outer', sort=True)
	data_final["trainingSet"] = data_final.index
	order_list = ["trainingSet", "Tag", "Set_Name", "genus", "species", "other"]
	data_final = data_final[order_list]
	
	return (data_final)
	
######

######
def help_PhiSpy():
	"""Print additional information for PhiSpy
	
	PhiSpy_ is a program for identifying prophages from among microbial genome sequences. 
	
	Copyright: 2008-2018 
	
	Sajia Akhter, Katelyn McNair, Rob Edwards, San Diego State University, San Diego, CA
	
	This function prints a short description of PhiSpy, the list of training sets 
	available and the advice from PhiSpy developers on how to choose among them.
	
	.. note:: The list of training sets is printed using the function:
	
		- :func:`BacterialTyper.scripts.bacteriophage.print_list_modified`
	
	.. include:: ../../links.inc	 	
	"""
	## text shown before the table of training sets
	header_lines = (
		"\n** PhiSpy additional information **",
		"PhiSpy is a program for identifying prophages from among microbial genome sequences\n",
		"(c) 2008-2018 Sajia Akhter, Katelyn McNair, Rob Edwards, San Diego State University, San Diego, CA",
		"https://github.com/linsalrob/PhiSpy\n",
		"Choose among different training sets for a better phage identification:\n",
	)
	
	## text shown after the table of training sets (first entry is an empty line)
	footer_lines = (
		"",
		"As stated by PhiSpy developers:",
		"This is training data that is used in the prediction of the prophages. If possible, you should use the",
		"closest relative to your genome, however if that is not possible, you can also use the Generic Training set.",
		"\n",
	)
	
	for line in header_lines:
		print (line)
	
	print_list_modified()	
	
	for line in footer_lines:
		print (line)

######
def help_options():
	"""Print the command-line usage of this script
	
	It shows the expected positional arguments (gbk_file, outdir, name and trainingSet) 
	followed by the table of training sets printed by 
	:func:`BacterialTyper.scripts.bacteriophage.print_list_modified`.
	"""
	## absolute path of this script
	script_path = os.path.realpath(__file__)
	usage = "\nUSAGE: python %s gbk_file outdir name trainingSet\n" %script_path
	print (usage)
	
	print("Provide a single trainingSet number")
	print_list_modified()	


######
def main():
	"""Run the bacteriophage identification from the command line
	
	It expects four positional arguments: gbk_file, outdir, name and trainingSet. 
	If a different number of arguments is provided, it prints the usage and exits.
	
	It calls :func:`ident_bacteriophage` and, if it generated results, it parses 
	them using :func:`results_PhiSpy`.
	"""
	## control if options provided or help
	if len(sys.argv) != 5:
		help_options()
		## sys.exit is always available, the exit() builtin is only added by the site module
		sys.exit()
	
	## argv
	gbk_file, outdir, name, training_set = sys.argv[1:5]
	gbk_file = os.path.abspath(gbk_file)
	outdir = os.path.abspath(outdir)
	
	## call
	print ("\n+ Generate annotation of putative phages for sample %s in folder: %s" %(name, outdir))
	results_dir = ident_bacteriophage(gbk_file=gbk_file, name=name, output_dir=outdir, training_set=training_set, Debug=False)
	
	## ident_bacteriophage returns 0 when no contig is long enough to be analyzed:
	## there are no results to parse
	if not results_dir:
		sys.exit("No contigs in %s were long enough to be analyzed: no results generated for sample %s" %(gbk_file, name))
	
	results_PhiSpy(outdir, name)
	
	
	
######
'''******************************************'''
## call main() only when this file is executed as a script, not when it is imported
if __name__== "__main__":
	main()
