#!/usr/bin/env python
"""return URLs to sequence files at NCBI, Ensembl, LRG

Intended to be used with ./stream-download as:
$ namespace=Ensembl-85   # or refseq, LRG, test
$ ./get-sequence-urls $namespace | ./stream-download | seqrepo load -n $namespace

Also useful:
$ ./sbin/get-sequence-urls test | ./sbin/stream-download | gzip -c >|/tmp/load.fa.gz

"""

import itertools
import logging
import re
import sys
from urllib.request import urljoin, urlopen
from requests_html import HTMLSession, HTML

import coloredlogs

_logger = logging.getLogger(__name__)


def get_ensembl_urls(release):
    """Return the Ensembl GRCh38 FASTA URLs for the given release.

    Args:
        release: Ensembl release identifier (e.g. ``85`` or ``"85"``); it is
            substituted into the ``release-{}`` component of each URL.

    Returns:
        A list of four URLs, in the order dna, cdna, pep, ncrna.
    """
    templates = (
        "http://ftp.ensembl.org/pub/release-{}/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz",
        "http://ftp.ensembl.org/pub/release-{}/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz",
        "http://ftp.ensembl.org/pub/release-{}/fasta/homo_sapiens/pep/Homo_sapiens.GRCh38.pep.all.fa.gz",
        "http://ftp.ensembl.org/pub/release-{}/fasta/homo_sapiens/ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz",
    )
    return [template.format(release) for template in templates]


def get_lrg_urls():
    """Return the URL(s) for the public LRG FASTA archive.

    Returns:
        A single-element list holding the FTP URL of the LRG zip archive.
    """
    # Author's note: fetching the zip archive over FTP is faster than the
    # same archive over HTTP,
    #   http://ftp.ebi.ac.uk/pub/databases/lrgex/fasta/LRG_public_fasta_files.zip
    # and faster than pulling the individual files from either of
    #   http://ftp.ebi.ac.uk/pub/databases/lrgex/fasta/
    #   ftp://ftp.ebi.ac.uk/pub/databases/lrgex/fasta/
    zip_url = "ftp://ftp.ebi.ac.uk/pub/databases/lrgex/fasta/LRG_public_fasta_files.zip"
    return [zip_url]


def get_test_urls():
    """Return the URLs of the test FASTA files hosted on dl.biocommons.org.

    Returns:
        A new list of four URLs, in the order dna, cdna, pep, ncrna.
    """
    return [
        "http://dl.biocommons.org/.test/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz",
        "http://dl.biocommons.org/.test/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz",
        "http://dl.biocommons.org/.test/pep/Homo_sapiens.GRCh38.pep.all.fa.gz",
        "http://dl.biocommons.org/.test/ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz",
    ]
    
def get_refseq_urls():
    """Return URLs of sequence files linked from the NCBI RefSeq H_sapiens pages.

    Fetches the ``mRNA_Prot`` and ``RefSeqGene`` index pages and collects
    every link whose URL ends with one of the recognized sequence-file
    suffixes.

    Returns:
        A list of absolute URLs: those found on the mRNA_Prot page followed
        by those found on the RefSeqGene page. The order within a page is
        whatever order requests_html yields the links in (presumably
        unordered; callers should sort if they need a stable order).

    Raises:
        requests.HTTPError: if an index page responds with an HTTP error
            status.
    """
    sequence_suffixes = (".faa.gz", ".fna.gz", ".fa.gz", ".fasta", ".fa")

    index_urls = (
        # eg https://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/human.1.protein.faa.gz
        "https://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/",
        # eg https://ftp.ncbi.nih.gov/refseq/H_sapiens/RefSeqGene/refseqgene.5.genomic.fna.gz
        "https://ftp.ncbi.nih.gov/refseq/H_sapiens/RefSeqGene/",
    )

    def _is_sequence_file(fn):
        # str.endswith accepts a tuple of candidate suffixes
        return fn.endswith(sequence_suffixes)

    def extract_sequence_urls_from_page(session, url, regexps=None):
        """Return sequence-file links on `url`, optionally filtered by regexps."""
        r = session.get(url)
        # Fail loudly instead of silently yielding no URLs from an error page.
        r.raise_for_status()
        # N.B. requests_html absolute_links is broken on 3.5
        abs_links = [urljoin(r.html.url, rl) for rl in r.html.links]
        urls = [l for l in abs_links if _is_sequence_file(l)]
        if regexps:
            urls = [l for l in urls if any(regexp.search(l) for regexp in regexps)]
        return urls

    urls = []
    # One session is shared by both requests and is closed when done, rather
    # than leaking a fresh session per page.
    with HTMLSession() as session:
        for u in index_urls:
            new_urls = extract_sequence_urls_from_page(session, u)
            _logger.debug("{} urls from {}".format(len(new_urls), u))
            urls += new_urls

    return urls



if __name__ == "__main__":
    # Script entry point: print the sorted URLs for the namespace given as
    # the first command-line argument, one per line, on stdout.
    coloredlogs.install(level="INFO", logger=_logger)

    # Diagnostics go to stderr (via sys.exit) so that stdout, which is piped
    # into stream-download, only ever carries URLs.
    if len(sys.argv) < 2:
        sys.exit(
            "usage: {} <refseq|Ensembl-<release>|LRG|test>".format(sys.argv[0])
        )
    namespace = sys.argv[1]

    if namespace == "NCBI":
        raise RuntimeError("NCBI no longer supported; use 'refseq'")
    elif namespace == "refseq":
        urls = get_refseq_urls()
    elif namespace.startswith("Ensembl-"):
        release_match = re.match(r"Ensembl-(\d+)", namespace)
        if release_match is None:
            # e.g. "Ensembl-" or "Ensembl-latest": no numeric release given
            sys.exit(
                "invalid Ensembl namespace {!r}; expected Ensembl-<release number>".format(
                    namespace
                )
            )
        urls = get_ensembl_urls(release_match.group(1))
    elif namespace == "LRG":
        urls = get_lrg_urls()
    elif namespace == "test":
        urls = get_test_urls()
    else:
        # Previously fell through and failed with a NameError on `urls`.
        sys.exit(
            "unknown namespace {!r}; expected refseq, Ensembl-<release>, LRG, or test".format(
                namespace
            )
        )

    print("\n".join(sorted(urls)))
    
