"""
This module contains the parsers for reading in PSMs
"""
import logging

import pandas as pd

from . import utils
from .dataset import LinearPsmDataset

LOGGER = logging.getLogger(__name__)

# Functions -------------------------------------------------------------------
def read_pin(pin_files, to_df=False):
    """
    Read Percolator input (PIN) tab-delimited files.

    Read PSMs from one or more Percolator input (PIN) tab-delimited
    files, aggregating them into a single
    :py:class:`~mokapot.dataset.LinearPsmDataset`. For
    more details about the PIN file format, see the
    `Percolator documentation <https://github.com/percolator/percolator/
    wiki/Interface#tab-delimited-file-format>`_.

    Specifically, mokapot requires specific columns in the
    tab-delimited files: `specid`, `peptide`, `proteins`, `label`, and
    at least one of `scannr` or `expmass`. Note that these column names
    are case-insensitive. The `scannr` and `expmass` columns are used
    together to define a spectrum. A `calcmass` column, if present, is
    excluded from the features. Every remaining column is treated as a
    feature.

    Additionally, mokapot does not currently support specifying a
    default direction or feature weights in the PIN file itself.

    Parameters
    ----------
    pin_files : str or tuple of str
        One or more PIN files to read.
    to_df : bool
        Return a :py:class:`pandas.DataFrame` instead of a
        :py:class:`~mokapot.dataset.LinearPsmDataset`.

    Returns
    -------
    LinearPsmDataset or pandas.DataFrame
        A :py:class:`~mokapot.dataset.LinearPsmDataset` object
        containing the PSMs from all of the PIN files. If `to_df` is
        True, the concatenated :py:class:`pandas.DataFrame` (with the
        label column already rescaled) is returned instead.

    Raises
    ------
    ValueError
        If more than one label or protein column is found, or if any
        of the required columns is missing.
    """
    # Use the module logger so that per-module logging configuration
    # applies here as it does in `read_percolator()`.
    LOGGER.info("Parsing PSMs...")
    pin_df = pd.concat([read_percolator(f)
                        for f in utils.tuplize(pin_files)])

    def _find(*names):
        """Return the columns whose lowercased name is one of `names`."""
        return tuple(c for c in pin_df.columns if c.lower() in names)

    # Find all of the necessary columns, case-insensitive:
    specid = _find("specid")
    peptides = _find("peptide")
    proteins = _find("proteins")
    labels = _find("label")
    other = _find("calcmass")
    spectra = _find("scannr", "expmass")

    # Everything that is not one of the columns above is a feature.
    nonfeat = specid + spectra + peptides + proteins + labels + other
    features = tuple(c for c in pin_df.columns if c not in nonfeat)

    # Check for errors:
    if len(labels) > 1:
        raise ValueError("More than one label column found in pin file.")

    if len(proteins) > 1:
        raise ValueError("More than one protein column found in pin file.")

    # An empty tuple here means a required column was not found.
    if not all([specid, peptides, proteins, labels, spectra]):
        raise ValueError("This PIN format is incompatible with mokapot. Please"
                         " verify that the required columns are present.")

    # Convert labels to the correct format: this maps -1 to 0 and 1 to 1.
    label_col = labels[0]
    pin_df[label_col] = (pin_df[label_col] + 1) / 2

    if to_df:
        return pin_df

    return LinearPsmDataset(psms=pin_df,
                            target_column=label_col,
                            spectrum_columns=spectra,
                            peptide_columns=peptides,
                            protein_column=proteins[0],
                            feature_columns=features)


# Utility Functions -----------------------------------------------------------
def read_percolator(perc_file):
    """
    Read a Percolator tab-delimited file.

    Percolator input format (PIN) files and the Percolator result files
    are tab-delimited, but also have a tab-delimited protein list as the
    final column. This function parses the file and returns a DataFrame.

    The first row of the file is used as the column names. Each column
    that can be parsed entirely as numbers is converted to a numeric
    dtype; all other columns are left as strings.

    Parameters
    ----------
    perc_file : str
        The file to parse.

    Returns
    -------
    pandas.DataFrame
        A DataFrame of the parsed data, indexed from 0.
    """
    LOGGER.info("Reading %s...", perc_file)
    # Everything is read as strings with no header, so the header row
    # arrives as the first data row and is promoted to column names below.
    # NOTE(review): the always-true `usecols` callable presumably exists so
    # that pandas tolerates rows with extra tab-delimited protein fields --
    # confirm before changing it.
    pin_df = pd.read_csv(perc_file,
                         sep="\t",
                         usecols=lambda x: True,
                         header=None,
                         dtype=str,
                         low_memory=True)

    pin_df.columns = pin_df.loc[0, :].values
    pin_df.drop(index=0, inplace=True)

    def _maybe_numeric(column):
        """Convert `column` to numeric, or return it unchanged on failure."""
        # Explicit equivalent of the deprecated
        # `pd.to_numeric(..., errors="ignore")`.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    return pin_df.apply(_maybe_numeric).reset_index(drop=True)
