#!/usr/bin/env python3

from argparse import ArgumentParser
from urllib.parse import urlparse
from bs4 import BeautifulSoup

import time
import requests
import xml.etree.ElementTree as ET
import sys
import re

# Severity labels that stderr() prefixes to each diagnostic line.
WARNING = "Warning"
ERROR = "Error"
NOTICE = "Notice"

# Namespace a sitemap document is expected to declare on its root element.
XML_NAMESPACE = "http://www.sitemaps.org/schemas/sitemap/0.9"
# ElementTree spells namespaced tags as "{namespace}local-name"; this is the
# "{namespace}" prefix, stripped from tags before comparing element names.
XML_ETREE_NAMESPACE = "{" + XML_NAMESPACE + "}"

# Values accepted inside a <changefreq> element.
VALID_CHANGE_FREQS = [
    "always", "hourly", "daily", "weekly",
    "monthly", "yearly", "never",
]


def stderr(what: str, type_err: str = WARNING):
    """Write one diagnostic line to standard error.

    The line is formatted as ``<type_err>: <what>``; ``type_err`` is the
    severity label and defaults to ``WARNING``.
    """
    line = "{}: {}".format(type_err, what)
    print(line, file=sys.stderr)


def main(url: str) -> int:
    """Validate the sitemap located at *url* and print a summary.

    ``url`` is handed to ``get_sitemap``, which accepts either an HTTP(S)
    address or a path to a local file. Diagnostics are written to standard
    error; the final "N of M URLs passed" line goes to standard output.

    Returns the process exit status: 0 when the sitemap is valid, 1 when it
    is invalid or could not be retrieved or parsed.
    """
    try:
        sitemap = get_sitemap(url)
    except requests.ConnectionError:
        stderr("The given URL was unreachable. Please double-check it.", ERROR)
        return 1
    except requests.exceptions.HTTPError:
        stderr("could not find the sitemap.", ERROR)
        return 1
    except OSError as e:
        # Covers an unreadable or missing local file. requests' remaining
        # exceptions derive from IOError, so they end up here as well.
        stderr("could not read the sitemap: %s" % e, ERROR)
        return 1
    except ET.ParseError as e:
        stderr("the sitemap is not well-formed XML: %s" % e, ERROR)
        return 1

    sitemap_valid, n_urls, n_passed = check_sitemap(sitemap)

    if not sitemap_valid:
        # Blank line separating the diagnostics from the summary.
        print()

    # Integer arithmetic: no ZeroDivisionError on an empty sitemap and no
    # float truncation (e.g. 29 / 100 * 100 == 28.999...).
    percentage = n_passed * 100 // n_urls if n_urls else 0
    print("%d of %d URLs (%d%%) passed." % (n_passed, n_urls, percentage))

    return 0 if sitemap_valid else 1


def get_sitemap(place: str) -> ET.Element:
    """Load a sitemap and return the root element of the parsed XML.

    ``place`` is either an ``http://`` / ``https://`` URL or a path to a
    local file.

    For URLs:
      * an ``http://`` address is first tried over HTTPS; if that fails with
        a TLS error a warning is printed and plain HTTP is used instead;
      * if the response is not served as XML, ``/sitemap.xml`` at the root of
        the same host is fetched instead.

    Raises:
        requests.exceptions.HTTPError: the final response has an error status.
        requests.exceptions.RequestException: other request failures.
        OSError: the local file cannot be read.
        xml.etree.ElementTree.ParseError: the document is not well-formed XML.
    """
    web_protocol = re.match(r"^https?://", place)

    if not web_protocol:
        # User provided a file, let's open it directly. Reading bytes lets
        # the XML parser honour the encoding declared in the document instead
        # of decoding with the locale's default encoding.
        with open(place, "rb") as file:
            return ET.fromstring(file.read())

    response = None
    if web_protocol[0] == "http://":
        # Rewrite only the scheme prefix; str.replace would also touch any
        # "http://" appearing later in the URL (e.g. in a query string).
        secure_place = "https://" + place[len("http://"):]
        try:
            response = requests.get(secure_place, allow_redirects=False)
            stderr("upgraded to HTTPS automatically.", NOTICE)
            # Keep using HTTPS for the /sitemap.xml fallback below.
            place = secure_place
        except requests.exceptions.SSLError:
            stderr(
                "website not secure with TLS. You should consider upgrading to improve the security of your "
                "visitors."
            )
    if response is None:
        response = requests.get(place, allow_redirects=False)

    # The header may be absent, carry parameters, or use different casing.
    content_type = response.headers.get("content-type", "")
    media_type = content_type.split(";")[0].strip().lower()
    if media_type not in ("text/xml", "application/xml"):
        # Not an XML document: fall back to the conventional location.
        parsed = urlparse(place)
        place = "%s://%s/sitemap.xml" % (parsed.scheme, parsed.netloc)
        response = requests.get(place)

    response.raise_for_status()
    # Parse the raw bytes: response.text would decode a charset-less text/*
    # body as ISO-8859-1 and ignore the encoding declared in the XML itself.
    return ET.fromstring(response.content)


def check_sitemap(sitemap) -> (bool, int, int):
    """Validate every child of the sitemap root element.

    ``sitemap`` is the parsed root element (as returned by ``get_sitemap``).
    Problems are reported on standard error as they are found.

    Returns a ``(sitemap_valid, n_urls, n_passed)`` tuple:
      * ``sitemap_valid`` - False if the root lacks the sitemap namespace, if
        any child is not a ``<url>`` element, or if any ``<url>`` fails
        ``is_url_correct``;
      * ``n_urls`` - number of children examined (invalid tags included);
      * ``n_passed`` - number of ``<url>`` entries that passed.
    """
    sitemap_valid = True
    n_urls = 0
    n_passed = 0

    if not sitemap.tag.startswith(XML_ETREE_NAMESPACE):
        stderr(
            'missing XML namespace on <urlset> tag: please add xmlns="%s"'
            % XML_NAMESPACE
        )
        sitemap_valid = False

    for n_urls, item in enumerate(sitemap, start=1):
        tag_name = item.tag.replace(XML_ETREE_NAMESPACE, "")

        if tag_name != "url":
            stderr("invalid tag <%s>, expected <url>" % tag_name, ERROR)
            # An unexpected element is reported as an error, so the sitemap
            # must not be declared valid (same rule as for unknown tags
            # inside a <url> entry).
            sitemap_valid = False
            continue

        if is_url_correct(item, n_urls):
            n_passed += 1
        else:
            sitemap_valid = False

    return sitemap_valid, n_urls, n_passed


def is_url_correct(url, n_url: int) -> bool:
    """Validate a single ``<url>`` entry of the sitemap.

    ``url`` is the ``<url>`` element and ``n_url`` its 1-based position,
    used only in diagnostics.

    Checks performed on the children:
      * only ``loc``, ``lastmod``, ``changefreq`` and ``priority`` are
        allowed; any other tag fails the entry immediately;
      * ``loc`` is mandatory, must be non-empty, must be reachable
        (``is_url_working``) and indexable (``is_indexable``);
      * ``priority`` must be a number between 0 and 1;
      * ``changefreq`` must be one of ``VALID_CHANGE_FREQS``.

    Returns True when every check passes; problems are reported on stderr.
    """
    valid = True
    has_loc = False

    for prop in url:
        tag_name = prop.tag.replace(XML_ETREE_NAMESPACE, "")

        if tag_name not in ("loc", "lastmod", "changefreq", "priority"):
            stderr("invalid <%s> tag for URL n°%d!" % (tag_name, n_url), ERROR)

            return False

        if tag_name == "loc":
            has_loc = True
            # prop.text is None for an empty element, and pretty-printed
            # sitemaps may surround the address with whitespace.
            loc = (prop.text or "").strip()

            if not loc:
                stderr("URL n°%d has an empty <loc> tag!" % n_url, ERROR)
                valid = False
            else:
                response, is_loc_valid = is_url_working(loc)
                # Short-circuit: the page is inspected for indexability only
                # when everything so far is valid.
                valid = valid and is_loc_valid and is_indexable(response)

        elif tag_name == "priority":
            try:
                priority = float(prop.text)
            except (TypeError, ValueError):
                # Empty element (None) or non-numeric text.
                priority = None

            if priority is None or not 0 <= priority <= 1:
                stderr(
                    'invalid value "%s" for <%s> tag, must be a number between 0 and 1'
                    % (prop.text, tag_name),
                )
                valid = False

        elif tag_name == "changefreq" and prop.text not in VALID_CHANGE_FREQS:
            stderr(
                "invalid value for <%s> tag, must be one of the following values: %s"
                % (tag_name, ", ".join(VALID_CHANGE_FREQS))
            )
            valid = False

    if not has_loc:
        stderr("URL n°%d has no mandatory <loc> tag!" % n_url, ERROR)
        return False

    return valid


def is_url_working(url: str, retries: int = 0) -> (requests.Response, bool):
    """Request *url* once (without following redirects) and judge the result.

    ``retries`` counts the attempts already made; it is used internally to
    back off exponentially (1, 2, 4, ... seconds) after a
    ``ChunkedEncodingError`` and to give up after five retries.

    Returns a ``(response, ok)`` tuple:
      * ``ok`` is False for HTTP error statuses and for network failures,
        True otherwise (redirects only produce a notice);
      * ``response`` is the received response, or None when no response
        could be obtained at all.
    """
    try:
        # The request itself must be inside the try block: it is the call
        # that can raise network errors such as ChunkedEncodingError.
        r = requests.get(url, allow_redirects=False)
        r.raise_for_status()

    except requests.HTTPError as e:
        stderr(
            'location "%s" returns an HTTP %d status code.'
            % (url, e.response.status_code),
            ERROR,
        )
        return e.response, False

    except requests.exceptions.ChunkedEncodingError:
        if retries >= 5:
            stderr(
                'could not reach "%s" because of a network error.' % url, ERROR
            )
            return None, False

        time_wait = 2 ** retries
        stderr(
            "network error while checking %s, waiting for %d seconds."
            % (url, time_wait),
        )
        time.sleep(time_wait)

        return is_url_working(url, retries + 1)

    except requests.RequestException as e:
        # Any other failure (unreachable host, invalid URL, ...) only
        # invalidates this entry instead of aborting the whole run.
        stderr('could not reach "%s": %s' % (url, e), ERROR)
        return None, False

    # raise_for_status() only raises for 4xx/5xx, so redirects end up here.
    if r.status_code >= 300:
        stderr(
            'location "%s" redirects to "%s" with %d status code. You may want to remove it from your sitemap.'
            % (url, r.headers.get("Location"), r.status_code),
            NOTICE,
        )

    return r, True


def _has_noindex(directives: str) -> bool:
    """Tell whether a robots directive list contains ``noindex``."""
    # Directives are normally comma-separated with optional spaces
    # ("noindex, nofollow"); splitting on commas and whitespace also keeps
    # space-separated values matching. Comparison is case-insensitive.
    return "noindex" in re.split(r"[,\s]+", directives.lower())


def is_indexable(response: requests.Response) -> bool:
    """Check that a fetched page does not forbid indexing.

    A page is considered non-indexable when ``noindex`` appears either in
    the ``X-Robots-Tag`` response header or in the ``content`` of a
    ``<meta name="robots">`` tag inside ``<head>``. A warning naming the
    URL is written to stderr in that case.

    Returns True when the page may be indexed, False otherwise.
    """
    url = response.url

    if _has_noindex(response.headers.get("X-Robots-Tag", "")):
        stderr('location "%s" is not indexable based on X-Robots-Tag HTTP header' % url)
        return False

    html = BeautifulSoup(response.text, "html.parser")
    head = html.head

    if head is None:
        # this may happen sometimes, e.g. some redirection pages.
        # even though this is quite ugly, it remains valid HTML
        return True

    for meta in head.find_all("meta"):
        # Attribute values may be missing; the name is matched
        # case-insensitively.
        name = (meta.get("name") or "").lower()
        if name == "robots" and _has_noindex(meta.get("content") or ""):
            stderr(
                'location "%s" is not indexable based on <meta name="robots" /> HTML tag'
                % url
            )
            return False

    return True


if __name__ == "__main__":
    # Command-line entry point: parse the single positional argument and use
    # main()'s return value as the process exit status.
    try:
        parser = ArgumentParser(description="Validate a sitemap.")

        parser.add_argument("sitemap_location", type=str, help="the URL to the sitemap")

        args = parser.parse_args()

        # sys.exit rather than the bare exit() helper, which is injected by
        # the site module and is not guaranteed to exist (e.g. python -S).
        sys.exit(main(args.sitemap_location))
    except KeyboardInterrupt:
        # Ctrl-C: leave quietly without a traceback.
        sys.exit(-1)
