#!/usr/bin/env python3

#
# newslinkrss - RSS feed generator for generic sites
# Copyright (C) 2020  Alexandre Erwin Ittner <alexandre@ittner.com.br>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#

import sys
import re
import datetime
import argparse
import logging
import traceback
from html.parser import HTMLParser
import http.cookiejar
import urllib
import urllib3

import dateutil.parser
import PyRSS2Gen
import requests

import lxml.html
import lxml.html.clean
import lxml.etree
import lxml.cssselect
import cssselect


# Browser-like User-Agent string.
# NOTE(review): presumably the default for the user agent option declared in
# main() (not fully visible here) -- confirm there.
DEFAULT_USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0"
)

# Log level names accepted by set_log_level().  The user-supplied value is
# upper-cased before it is checked against this list.
USER_LOG_LEVELS = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL", "FATAL"]
# Configure the root logger so that only warnings and above show by default.
logging.basicConfig(level=logging.WARNING)
# Module-level logger used throughout this file.
logger = logging.getLogger(__name__)


def set_log_level(args):
    """Apply the log level requested in ``args.log`` to the module logger.

    Nothing happens when ``args.log`` is empty.  Otherwise the value is
    upper-cased and must be one of USER_LOG_LEVELS and resolve to an integer
    constant of the ``logging`` module.

    Raises ValueError if the name is not accepted or not defined.
    """
    requested = args.log
    if requested:
        name = requested.upper()
        if name not in USER_LOG_LEVELS:
            raise ValueError("Bad log level string %s" % requested)
        value = getattr(logging, name, None)
        if not isinstance(value, int):
            raise ValueError("Log level %s not defined" % requested)
        logger.setLevel(value)
        logger.info("Log level set to %d (%s)", value, name)


def _first_valid_attr_in_list(attrs, name):
    """Return the value of the first attribute called 'name' in 'attrs'.

    'attrs' is an attribute list as given to HTMLParser.handle_starttag,
    i.e. a sequence of (name, value) pairs.  Entries with fewer than two
    elements are skipped.  Only the first match counts, as broken HTML may
    repeat an attribute.  Returns None if there is no such attribute.
    """
    return next(
        (pair[1] for pair in attrs if len(pair) > 1 and pair[0] == name),
        None,
    )


def clean_url_query_string(rx_list, url):
    """Remove unwanted parameters from the URL query string.

    If the URL has a query string, remove all name/value pairs which
    names matches any of the regular expressions given in list 'rx_list'
    (matched with re.match, i.e. anchored at the start of the name).

    Return the URL, possibly modified.  The URL is returned exactly as
    given when 'rx_list' is empty or when no parameter was removed, so
    unrelated URLs are never re-encoded as a side effect.
    """

    if not rx_list:
        return url

    parts = urllib.parse.urlparse(url)
    pairs = urllib.parse.parse_qsl(parts.query, keep_blank_values=True)
    kept = [
        pair for pair in pairs if not any(re.match(rx, pair[0]) for rx in rx_list)
    ]
    if len(kept) == len(pairs):
        # Nothing to strip.  Rebuilding the URL anyway would change its
        # spelling ("?page" -> "?page=", "%20" -> "+"), so leave it alone.
        return url

    query_str = urllib.parse.urlencode(kept) if kept else None
    new_url = urllib.parse.urlunparse(
        (
            parts.scheme,
            parts.netloc,
            parts.path,
            parts.params,
            query_str,
            parts.fragment,
        )
    )
    if new_url != url:
        logger.debug("query string cleanup: URL %s rewritten to %s", url, new_url)
    return new_url


class CollectLinksParser(HTMLParser):
    """HTML parser that collects the links found in <a href="..."> elements.

    Accepted links are appended to 'links' as (url, text) tuples, where text
    is made of the stripped text pieces found inside the anchor, joined by
    single spaces.  A link is accepted if its URL matches 'url_patt' (when
    given), does not match 'ignore_patt' (when given) and was not collected
    before.  Both patterns are applied with re.match, i.e. anchored at the
    start of the URL.  Once 'max_items' links were collected, all further
    start tags are ignored and 'limit_reached' is set.
    """

    def __init__(self, url_patt=None, ignore_patt=None, max_items=None, base_url=None):
        super().__init__()
        self.url_patt = url_patt
        self.ignore_patt = ignore_patt
        self.max_items = max_items
        self.base_url = base_url
        self.links = []
        self.limit_reached = False

        # List of regexes with parameters to strip from URL query strings.
        self.qs_cleanup_rx_list = []

        # URLs already stored in 'links'.
        self._found_links = set()
        # Text pieces collected for the anchor currently open (list or None).
        self._last_link_text = None
        # True while text inside an accepted anchor is being collected.
        self._grab_link_text = False
        # URL of the accepted anchor currently open, or None.
        self._last_link = None

    def reset_parser(self):
        """Resets the parser state, but still keeps found links, etc."""
        self._last_link_text = None
        self._grab_link_text = False
        self._last_link = None

    def handle_starttag(self, tag, attrs):
        """Start tracking an anchor if its URL passes all the filters."""
        if (self.max_items is not None) and (len(self.links) >= self.max_items):
            if not self.limit_reached:
                logger.warning("limit of %d links reached", self.max_items)
            self.limit_reached = True
            return

        if tag != "a":
            return

        href = _first_valid_attr_in_list(attrs, "href")
        if not href:
            return

        href = href.split("#", 1)[0]  # Strip URL fragment.
        if self.base_url:
            href = requests.compat.urljoin(self.base_url, href)
        href = clean_url_query_string(self.qs_cleanup_rx_list, href)

        # Try to not follow the same link more than once. We need to
        # repeat this check later due to redirects.
        if (
            href not in self._found_links
            and (not self.url_patt or re.match(self.url_patt, href))
            and (not self.ignore_patt or not re.match(self.ignore_patt, href))
        ):
            self._last_link_text = []
            self._grab_link_text = True
            self._last_link = href

    def handle_data(self, data):
        """Collect non-blank text found inside an accepted anchor."""
        if self._grab_link_text:
            text = data.strip()
            if text != "":
                self._last_link_text.append(text)

    def handle_endtag(self, tag):
        """Store the pending link, if any, when its anchor is closed."""
        if tag != "a":
            return

        link_text = ""
        if self._grab_link_text:
            self._grab_link_text = False
            link_text = " ".join(self._last_link_text)
        if self._last_link and self._last_link not in self._found_links:
            self._found_links.add(self._last_link)
            self.links.append((self._last_link, link_text))
            logger.info("New link added: %s %s", self._last_link, link_text)
        # Same "no pending link" marker used by __init__ and reset_parser.
        self._last_link = None


def normalize_rfc1766_lang_tag(loc):
    """Normalize a language tag to the RFC 1766 style used by RSS2 and HTML.

    Open Graph separates language and country with "_" while RSS2 and HTML
    use "-".  The tag is stripped, lower-cased and "_" is replaced by "-".
    RFC 1766 is not case-sensitive, but a two-letter part following the
    first "-" (a country code) is upper-cased for easy reading.

    Returns None for (some) nonsensical values, namely tags longer than
    32 characters.
    """
    tag = loc.strip().lower().replace("_", "-")
    if len(tag) > 32:
        return None
    language, dash, remainder = tag.partition("-")
    if dash and len(remainder) == 2:
        return language + "-" + remainder.upper()
    return tag


class CollectAttributesParser(HTMLParser):
    """A state machine that parses HTML from a web page and extracts some
    useful attributes.

    The following properties are set with useful information:
    title     - String with the page title or None
    base      - String with the base URL or None
    canonical - String with the Canonical URL for the page or None
    description - String with a best-guess for a description or None
    changed   - Datetime with a best-guess for the modification time or None
    author    - String with a best-guess for the author name or None
    section   - Section where article was published or None
    tags      - Tags attached to the article
    language  - Language code (e.g. en-US) or None

    Except for the language in element <html>, only elements found inside
    <head> are considered.
    """

    # Names of the <meta> entries (either 'name' or 'property') carrying
    # publication or modification dates.
    _DATE_META_KEYS = (
        "article:published_time",
        "article:modified_time",
        "og:updated_time",
    )

    def __init__(self):
        super().__init__()
        self._title_lst = None
        self._in_head = False
        self.title = None
        self.base = None
        self.description = None
        self.canonical = None
        self.changed = None
        self.author = None
        self.section = None
        self.tags = []
        self.language = None

        # True if a temporary language was found in element <html>. It will
        # be used only until another one is found because too many sites
        # have nonsensical values in it.
        self._html_locale = False

    def reset_parser(self):
        """Reset current parser state, but keep collected data."""
        self._title_lst = None
        self._in_head = False
        self._html_locale = False

    def handle_starttag(self, tag, attrs):
        """Collect information from <html> and from elements inside <head>."""
        if tag == "html":
            lang = _first_valid_attr_in_list(attrs, "lang")
            if lang and not self.language:
                self.language = normalize_rfc1766_lang_tag(lang)
                self._html_locale = True

        if tag == "head":
            # Will fail on nested heads, but who is insane enough to do this?!
            self._in_head = True

        if not self._in_head:
            return

        if tag == "base":
            self.base = _first_valid_attr_in_list(attrs, "href")
        elif tag == "title":
            if not self.title:
                self._title_lst = []
        elif tag == "link":
            # <link rel="xxxx" href="yyyy" />
            rel = _first_valid_attr_in_list(attrs, "rel")
            href = _first_valid_attr_in_list(attrs, "href")
            if rel == "canonical" and not self.canonical:
                self.canonical = href
        elif tag == "meta":
            self._handle_meta(attrs)

    def _handle_meta(self, attrs):
        """Extract information from the attributes of one <meta> element."""
        # <meta name="xxxx" content="yyyy" />
        # <meta property="xxxx" content="yyyy" />
        name = _first_valid_attr_in_list(attrs, "name")
        prop = _first_valid_attr_in_list(attrs, "property")
        # May be None when the attribute is missing or has no value.
        content = _first_valid_attr_in_list(attrs, "content")
        if name:
            name = name.lower()
        if prop:
            prop = prop.lower()

        # Attributes defined by the Open Graph Protocol: A lot of sites
        # which refuse to provide feeds have this nice attributes so their
        # contents appear nicely when linked on Facebook, Twitter and so.
        # These can provide a lot of useful information.

        if prop in self._DATE_META_KEYS or name in self._DATE_META_KEYS:
            self._update_changed(content)

        if prop == "og:url" and not self.canonical:
            # <meta property="og:url" content="xxxxx">
            self.canonical = content

        if prop in ("og:description", "twitter:description") or (
            name == "description"
        ):
            # Keep the longest description seen so far.  Very short ones
            # are ignored; an element without content must not crash the
            # parser.
            if (
                content
                and len(content) > 8
                and ((not self.description) or (len(content) > len(self.description)))
            ):
                self.description = content

        if not self.author and (name == "author" or prop == "article:author"):
            self.author = content

        if name == "article:tag" or prop == "article:tag":
            if content and content not in self.tags:
                self.tags.append(content)

        if (name == "article:section" or prop == "article:section") and content:
            self.section = content

        if (name == "og:locale" or prop == "og:locale") and content:
            lang = normalize_rfc1766_lang_tag(content)
            if self._html_locale and self.language:
                # Replace the less reliable language taken from <html>.
                self.language = lang
                self._html_locale = False
            elif not self.language:
                self.language = lang

    def _update_changed(self, content):
        """Parse a date from a <meta> content and keep the most recent one."""
        # Content is a date in ISO format.
        # <meta property="article:published_time" content="2020-09-13T20:00:00+00:00" />
        # <meta property="article:modified_time" content="2020-09-13T20:01:42+00:00" />
        try:
            dt = dateutil.parser.parse(content)
            if (not self.changed) or (self.changed < dt):
                self.changed = dt
                logger.debug("Found new changed date %s", dt)
        except (TypeError, ValueError, OverflowError):
            # TypeError also covers missing content and comparisons between
            # dates with and without time zone information.
            logger.exception("When parsing changed date")

    def handle_data(self, data):
        """Accumulate text while inside the <title> element."""
        if self._title_lst is not None:
            self._title_lst.append(data.strip())

    def handle_endtag(self, tag):
        """Track the end of <head> and finish the title collection."""
        if tag == "head":
            self._in_head = False

        if tag == "title" and self._title_lst is not None:
            self.title = "".join(self._title_lst)
            self._title_lst = None


def try_date_from_str(src, date_rx, date_fmt):
    """Try to extract a date from string 'src'.

    Regular expression 'date_rx' is matched against the start of 'src'
    (re.match) and must capture the date text in its first group.  The
    captured text is parsed with datetime.strptime if a format 'date_fmt'
    is given, or by dateutil's best guess otherwise.

    Returns a datetime, or None when 'src' is not a string, there is no
    regex, the regex does not match or the text can not be parsed.
    """
    if not isinstance(src, str) or not date_rx:
        # Nothing to match against, e.g. the text of an empty element.
        return None

    rdate = None
    try:
        m = re.match(date_rx, src)
        if not m:
            return None
        date_txt = m.group(1)
        logger.debug(
            "date regex matched: src=%s, rx=%s, result=%s", src, date_rx, date_txt
        )
        if date_fmt:
            rdate = datetime.datetime.strptime(date_txt, date_fmt)
        else:
            # No date format, use dateutil's best guess.
            rdate = dateutil.parser.parse(date_txt)
    except (AttributeError, IndexError, TypeError, ValueError, OverflowError):
        # ValueError also covers dateutil's ParserError, which is one of
        # its subclasses; TypeError covers a first group that did not
        # take part in the match.
        logger.exception(
            "when parsing date with src=%s, fmt=%s, rx=%s", src, date_fmt, date_rx
        )

    return rdate


def make_clean_title(args, title):
    """Return the title cleaned by the user-given regex and truncated.

    If 'args.title_regex' is set and matches the start of 'title', the
    title is replaced by the text captured by the first group.  The title
    is kept as is when the regex does not match, has no groups, or when
    its first group did not take part in the match.  The result is
    truncated to 'args.max_title_length' characters.
    """
    if args.title_regex:
        m = re.match(args.title_regex, title)
        if m and m.re.groups >= 1:
            captured = m.group(1)
            # An optional group that did not participate yields None,
            # which must not replace the title.
            if captured is not None:
                title = captured
    return title[: args.max_title_length]


def _drop_body_element(option, expr, elem):
    """Detach one node matched by a body-remove expression from its tree."""
    if not lxml.etree.iselement(elem):
        # XPath expressions may also select strings, numbers, etc.
        logger.debug(
            "%s %s matched something that is not an element, ignoring: %r",
            option,
            expr,
            elem,
        )
        return
    parent = elem.getparent()
    if parent is None:
        logger.debug(
            "%s %s matched an element without parent, ignoring: %s",
            option,
            expr,
            elem,
        )
        return
    logger.debug("%s %s matched: deleting element %s", option, expr, elem)
    drop_tree = getattr(elem, "drop_tree", None)
    if drop_tree is not None:
        # Unlike parent.remove(), drop_tree() keeps the text that follows
        # the element (its "tail"), which belongs to the surrounding
        # content and not to the element being removed.
        drop_tree()
    else:
        parent.remove(elem)


def remove_unwanted_body_elements(args, body):
    """Remove from 'body', in place, everything the user asked to drop.

    args.body_remove_tag   - tag names stripped with lxml.etree.strip_tags
    args.body_remove_xpath - XPath expressions selecting elements to delete
    args.body_remove_csss  - CSS selectors selecting elements to delete

    Deleted elements are removed together with their children, while the
    text following them is preserved.
    """
    if args.body_remove_tag:
        lxml.etree.strip_tags(body, *args.body_remove_tag)
    if args.body_remove_xpath:
        for expr in args.body_remove_xpath:
            for elem in body.xpath(expr) or ():
                _drop_body_element("body-remove-xpath", expr, elem)
    if args.body_remove_csss:
        for expr in args.body_remove_csss:
            for elem in body.cssselect(expr) or ():
                _drop_body_element("body-remove-csss", expr, elem)


def make_item_body(args, page_text, tree):
    """Build the HTML to be used as the body of a feed item.

    The body is selected from 'tree' with the XPath expression in
    args.body_xpath or, if that gives nothing, with the CSS selector in
    args.body_csss.  When none of these options is given, all children of
    /html/body are used.  Several selected nodes are wrapped in a new
    <div>.  Unwanted elements are removed and the result goes through
    lxml's HTML cleaner.  Argument 'page_text' is not used.

    Returns the HTML as a string, or None if nothing was selected or a
    parser, selector or XPath error happened (the error is logged).
    """
    try:
        selected = tree.xpath(args.body_xpath) if args.body_xpath else None
        if args.body_csss and not selected:
            selected = tree.cssselect(args.body_csss)
        if not (args.body_xpath or args.body_csss):
            selected = tree.xpath("/html/body/*")
        if not selected:
            return None

        if len(selected) > 1:
            root = lxml.html.Element("div")
            root.extend(selected)
        else:
            root = selected[0]
        remove_unwanted_body_elements(args, root)
        cleaned = lxml.html.clean.Cleaner().clean_html(root)
        if isinstance(cleaned, str):
            return cleaned
        return lxml.html.tostring(cleaned, pretty_print=False, encoding="unicode")
    except (
        lxml.etree.ParserError,
        cssselect.parser.SelectorSyntaxError,
        lxml.etree.XPathEvalError,
    ):
        logger.exception("When trying to get document body")

    return None


def find_item_date(args, attr_parser, request, tree, anchor_text, orig_url):
    """Try to get a meaningful last modification date for an item.
    Only argument 'args' is required, everything else can be set to None and
    will be tried according to availability.

    Sources are tried in this order and the first date found is returned:
    the XPath expression in args.date_from_xpath, the CSS selector in
    args.date_from_csss, the link text, the URL, the date collected by
    'attr_parser' and, at last, HTTP header "Last-Modified" of 'request'.
    Returns None if no date was found.
    """
    date = None
    if args.date_from_xpath and tree is not None:
        try:
            for res in tree.xpath(args.date_from_xpath):
                logger.debug("date-from-xpath found candidate text: '%s'", res)
                if not isinstance(res, str):
                    # The expression selected a node instead of text.
                    logger.debug("date-from-xpath result is not text, ignoring")
                    continue
                date = try_date_from_str(
                    res, args.xpath_date_regex, args.xpath_date_fmt
                )
                if date:
                    logger.debug("Found date from XPath %s", date)
                    break
        except lxml.etree.XPathEvalError:
            logger.exception("When handling an XPath expression")
    if not date and args.date_from_csss and tree is not None:
        try:
            for res in tree.cssselect(args.date_from_csss):
                etext = res.text
                if etext is None:
                    # Element has no text of its own, nothing to parse.
                    continue
                logger.debug("date-from-csss found candidate text: '%s'", etext)
                date = try_date_from_str(
                    etext, args.csss_date_regex, args.csss_date_fmt
                )
                if date:
                    logger.debug("Found date from CSS Selector %s", date)
                    break
        except (cssselect.parser.SelectorSyntaxError, lxml.etree.XPathEvalError):
            logger.exception("When handling a CSS selector")
    if not date and args.date_from_text and anchor_text:
        date = try_date_from_str(anchor_text, args.date_from_text, args.text_date_fmt)
    if not date and args.date_from_url and orig_url:
        date = try_date_from_str(orig_url, args.date_from_url, args.url_date_fmt)
    if not date and attr_parser and attr_parser.changed:
        date = attr_parser.changed
    # NOTE(review): this relies on the truth value of 'request'; for a
    # requests.Response that is presumably false on error statuses -- confirm.
    if not date and request and ("Last-Modified" in request.headers):
        last_mod = request.headers["Last-Modified"]
        try:
            date = dateutil.parser.parse(last_mod)
            logger.debug(
                "No date was found but an HTTP header 'Last-Modified' was. "
                "Assuming its value %s as the date %s",
                last_mod,
                date,
            )
        except (ValueError, OverflowError):
            # ValueError also covers dateutil's ParserError.
            logger.exception('Invalid date in HTTP header "Last-modified"')
    return date


def make_feed_item_follow(session, url, used_urls, args, link_text, base_attrs):
    """Download the page at 'url' and make a feed item from its contents.

    session    - requests session used for the download
    url        - address of the page to follow
    used_urls  - set of final URLs (after redirects) already processed;
                 updated in place
    args       - parsed command line options
    link_text  - text of the link that pointed to this page
    base_attrs - attributes collected from the start page(s)

    At most args.max_page_length * 1024 characters (rounded up to a whole
    chunk) of the page are read.  Returns a PyRSS2Gen.RSSItem, or None if
    the final URL was already processed, the download timed out, or a date
    is required but none was found.
    """
    attr_parser = CollectAttributesParser()
    description = ""
    req = None
    chunks = []
    try:
        logger.info("Following URL %s", url)
        req = session.get(url, timeout=args.http_timeout, stream=True)
        if req.url in used_urls:
            return None
        if args.encoding:
            req.encoding = args.encoding
        elif not req.encoding:
            # Without a known encoding iter_content() yields bytes even
            # with decode_unicode=True, which the HTML parser can not take.
            req.encoding = "utf-8"
        used_urls.add(req.url)
        max_length = 1024 * args.max_page_length
        chunk_size = 1024 * min(100, args.max_page_length)
        if req.status_code == 200:
            consumed_size = 0
            for chunk in req.iter_content(chunk_size=chunk_size, decode_unicode=True):
                if consumed_size >= max_length:
                    break
                consumed_size += len(chunk)
                attr_parser.feed(chunk)
                chunks.append(chunk)
        else:
            description += "Page returned status code %d<br/>" % req.status_code
    except (
        urllib3.exceptions.ReadTimeoutError,
        requests.exceptions.Timeout,
    ):
        logger.exception("When downloading %s", url)
        # We should handle this somehow.
        return None
    finally:
        # Compare with None: a response is falsy for error status codes
        # and would never be closed by a plain truth test.
        if req is not None:
            req.close()

    page_text = "".join(chunks)

    if attr_parser.description:
        description = attr_parser.description

    # Give a meaningful title for this entry.
    orig_title = attr_parser.title or link_text or attr_parser.canonical or req.url
    clean_title = make_clean_title(args, orig_title)
    if clean_title == base_attrs.title:
        # The title is the same as the one from base url, a typical thing
        # from horribly-designed news pages (and also happens on a
        # government site I need to consult from time to time), so
        # replace it with the contents of the link text, which should
        # give a bit more useful information for the reader.
        description += "Original title: %s<br/>" % orig_title
        clean_title = link_text[: args.max_title_length]

    description += "Link text: %s" % link_text
    item_url = attr_parser.canonical or req.url

    tree = None
    try:
        tree = lxml.html.document_fromstring(page_text)
    except lxml.etree.ParserError:
        logger.exception(
            "Failed to parse document, some information won't be available"
        )

    date = find_item_date(args, attr_parser, req, tree, link_text, item_url)
    if args.require_dates and not date:
        # We need a date but the page have none. Skip this entry.
        logger.info("Ignoring feed entry without date %s", url)
        return None
    if args.with_body and tree is not None:
        bodyhtml = make_item_body(args, page_text, tree)
        if bodyhtml:
            description = bodyhtml
    if attr_parser.tags:
        categories = attr_parser.tags
    elif attr_parser.section:
        categories = [attr_parser.section]
    else:
        categories = None
    if date:
        # PyRSS2Gen ignores tzinfos and requires the date to be explicitly in UTC.
        date = datetime.datetime.fromtimestamp(date.timestamp(), datetime.timezone.utc)
    return PyRSS2Gen.RSSItem(
        title=clean_title,
        link=item_url,
        author=attr_parser.author,
        description=description,
        guid=PyRSS2Gen.Guid(req.url),
        categories=categories,
        pubDate=date,
    )


def make_feed_item_nofollow(url, used_urls, args, link_text, base_attrs):
    """Make a feed item for 'url' without downloading the target page.

    Title and description come from the link text and the date, if any,
    from the link text or the URL.  'used_urls' is the set of URLs already
    processed and is updated in place; 'base_attrs' is not used.

    Returns a PyRSS2Gen.RSSItem, or None if the URL was already processed
    or a date is required but none was found.
    """
    if url in used_urls:
        return None
    used_urls.add(url)

    item_title = make_clean_title(args, link_text)
    pub_date = find_item_date(args, None, None, None, link_text, url)
    if not pub_date:
        if args.require_dates:
            # We need a date but there is none. Skip this entry.
            logger.info("Ignoring feed entry without date %s", url)
            return None
    else:
        # PyRSS2Gen ignores tzinfos and requires the date to be explicitly in UTC.
        pub_date = datetime.datetime.fromtimestamp(
            pub_date.timestamp(), datetime.timezone.utc
        )

    return PyRSS2Gen.RSSItem(
        title=item_title,
        link=url,
        description=link_text,
        guid=PyRSS2Gen.Guid(url),
        pubDate=pub_date,
    )


def write_feed(rss, args):
    """Write feed 'rss' as XML to file 'args.output' or to stdout.

    Standard output is used when no output file was given, including the
    case where 'args' itself is None (as may happen when an error feed is
    generated before the command line is parsed).
    """
    output = getattr(args, "output", None)
    if output:
        logger.debug("Writing feed to %s", output)
        # The XML declares itself as utf-8, so the file must be encoded
        # accordingly instead of using the locale's default encoding.
        with open(output, "w", encoding="utf-8") as fp:
            rss.write_xml(fp, encoding="utf-8")
    else:
        logger.debug("Writing feed to stdout")
        rss.write_xml(sys.stdout, encoding="utf-8")


def make_exception_feed(exc, args=None):
    """Write a feed with a single item describing exception 'exc'.

    The item has the command line, the exception message and the stack
    trace of the exception currently being handled, so this function is
    expected to be called from an 'except' block.  'args' are the parsed
    command line options, or None if they are not available, in which
    case the feed has no link and is written to stdout.
    """
    from html import escape

    logger.warning("Writing exception information to an exception feed.")
    cmdline = " ".join(sys.argv)
    stack_trace = traceback.format_exc()
    # All variable parts are escaped: they are plain text put into HTML and
    # things like "<module>" in stack traces would otherwise be taken as
    # (invisible) tags by feed readers.
    msg = (
        "An error occurred when generating this feed."
        + "<br/> <br/>"
        + "<strong>Command line:</strong> <code>"
        + escape(cmdline)
        + "</code>"
        + "<br /><br />"
        + "<strong>Exception:</strong> "
        + escape(str(exc))
        + "<br /><br />"
        + "<strong>Stack trace:</strong> <pre>"
        + escape(stack_trace)
        + "\n</pre>"
    )

    itm = PyRSS2Gen.RSSItem(
        title="newslinkrss error: " + str(exc)[:64],
        link="data:" + cmdline,
        description=msg,
    )

    rss = PyRSS2Gen.RSS2(
        title="Error: " + cmdline,
        link=args.urls[0] if args else None,
        description="Failed to generate feed.",
        # PyRSS2Gen ignores tzinfos and requires the date to be explicitly in UTC.
        lastBuildDate=datetime.datetime.now(datetime.timezone.utc),
        items=[itm],
    )
    # Without options there is no output file: fall back to stdout.
    write_feed(rss, args if args is not None else argparse.Namespace(output=None))


def test_links(link_grabber, args):
    """Print the links collected by 'link_grabber' for a test run.

    For every link the URL is printed, followed by the link text and the
    dates that could be extracted from the URL and from the text, when
    available.  Also sets 'args.no_exception_feed' to True.
    """
    args.no_exception_feed = True
    if link_grabber.limit_reached:
        print("# Limit of %d links was reached." % (link_grabber.max_items))
    for link in link_grabber.links:
        url, text = link[0], link[1]
        print("- " + url)
        if text:
            print("    text: " + text)
        if args.date_from_url:
            url_date = try_date_from_str(url, args.date_from_url, args.url_date_fmt)
            if url_date:
                print("    url-date:  " + str(url_date))
        if text and args.date_from_text:
            text_date = try_date_from_str(
                text, args.date_from_text, args.text_date_fmt
            )
            if text_date:
                print("    text-date: " + str(text_date))
        print("")


class EmptyCookieJar(http.cookiejar.CookieJar):
    """Cookie jar that never stores anything."""

    def set_cookie(self, cookie, *args, **kwargs):
        """Pretend to store 'cookie', but discard it instead."""
        return None


def get_start_page(args, session, base_attrs, link_grabber, base_url):
    """Download a start page and feed its contents to both parsers.

    args         - parsed command line options
    session      - requests session used for the download
    base_attrs   - CollectAttributesParser receiving the page attributes
    link_grabber - CollectLinksParser receiving the links of the page
    base_url     - address of the start page

    At most args.max_first_page_length * 1024 characters (rounded up to a
    whole chunk) are read.  Links are resolved against the <base> of the
    page, if any, or against its final URL.  Returns the (already closed)
    response object.
    """
    logger.info("Downloading start URL %s", base_url)
    chunks = []
    req = None
    try:
        req = session.get(base_url, timeout=args.http_timeout, stream=True)
        if args.encoding:
            req.encoding = args.encoding
        elif not req.encoding:
            # Without a known encoding iter_content() yields bytes even
            # with decode_unicode=True, which the HTML parsers can not take.
            req.encoding = "utf-8"
        max_length = 1024 * args.max_first_page_length
        chunk_size = 1024 * min(100, args.max_first_page_length)
        consumed_size = 0
        for chunk in req.iter_content(chunk_size=chunk_size, decode_unicode=True):
            if consumed_size >= max_length:
                break
            consumed_size += len(chunk)
            chunks.append(chunk)
    finally:
        # Compare with None: a response is falsy for error status codes
        # and would never be closed by a plain truth test.
        if req is not None:
            req.close()

    page_content = "".join(chunks)

    base_attrs.reset_parser()
    base_attrs.feed(page_content)

    link_grabber.reset_parser()
    link_grabber.base_url = base_attrs.base or req.url
    link_grabber.feed(page_content)

    return req


def make_feed(args):
    """Generate the feed described by the command line options in 'args'.

    Every start URL is downloaded and its links are collected, stopping
    early if the link limit is reached.  In test mode the links are just
    printed.  Otherwise one feed item is made for each link, following it
    or not according to args.follow, and the resulting feed is written
    with write_feed().
    """
    session = requests.Session()
    session.headers = {"User-Agent": args.user_agent}
    if args.no_cookies:
        session.cookies = EmptyCookieJar()

    base_attrs = CollectAttributesParser()
    link_grabber = CollectLinksParser(
        args.link_pattern, args.ignore_pattern, args.max_links, None
    )
    link_grabber.qs_cleanup_rx_list = args.qs_remove_param

    for start_url in args.urls:
        response = get_start_page(args, session, base_attrs, link_grabber, start_url)
        if link_grabber.limit_reached:
            break
        # The first start page becomes the referer of later requests.
        if "Referer" not in session.headers:
            session.headers["Referer"] = response.url

    if args.test:
        test_links(link_grabber, args)
        return

    # URLs that were already processed (considering redirects).
    used_urls = set()

    rss_items = []
    for link in link_grabber.links:
        link_url, link_text = link[0], link[1]
        if args.follow:
            item = make_feed_item_follow(
                session, link_url, used_urls, args, link_text, base_attrs
            )
        else:
            item = make_feed_item_nofollow(
                link_url, used_urls, args, link_text, base_attrs
            )
        if item:
            rss_items.append(item)

    all_urls = ", ".join(args.urls)
    default_title = (base_attrs.title or all_urls)[: args.max_title_length]
    feed_description = (
        base_attrs.description or base_attrs.title or base_attrs.canonical or all_urls
    )

    rss = PyRSS2Gen.RSS2(
        title=args.title or default_title,
        link=args.urls[0],
        description=feed_description,
        # PyRSS2Gen ignores tzinfos and requires the date to be explicitly in UTC.
        lastBuildDate=datetime.datetime.now(datetime.timezone.utc),
        language=base_attrs.language,
        items=rss_items,
    )
    write_feed(rss, args)


def _build_arg_parser():
    """Create the argparse parser with all command line options."""
    parser = argparse.ArgumentParser(
        description=(
            "newslinkrss generates RSS feeds from websites that do not "
            "provide their own. This is done by loading a given URL and "
            "collecting links that match a pattern, given as a regular "
            "expression, to gather the relevant information, optionally "
            "visiting them to get more details and even processing the "
            "target pages with XPath and CSS Selectors if required. "
            "It basically works as a purpose specific crawler or scraper."
        ),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    parser.add_argument(
        "-n",
        "--max-links",
        action="store",
        default=50,
        metavar="NUMBER",
        type=int,
        help="Maximum number of links to follow.",
    )

    parser.add_argument(
        "-l",
        "--max-title-length",
        action="store",
        default=150,
        metavar="NUMBER",
        type=int,
        help="Maximum length of a feed title, in characters.",
    )

    parser.add_argument(
        "-p",
        "--link-pattern",
        action="store",
        default=".+",
        metavar="REGEX",
        help=(
            "A regular expression to filter the URLs of links that the "
            "script will follow or capture to generate every feed item."
        ),
    )

    parser.add_argument(
        "-i",
        "--ignore-pattern",
        action="store",
        default=None,
        metavar="REGEX",
        help=(
            "A regular expression used to ignore URLs even if they match "
            "--link-pattern. This may be used to prevent unwanted items "
            "from appearing in the feed while keeping the link pattern "
            "simpler (i.e. no need to make that regex excessively complex "
            "by embedding the ignored patterns in it)."
        ),
    )

    parser.add_argument(
        "-T",
        "--title",
        action="store",
        metavar="ALTERNATE_TITLE",
        default=None,
        help=(
            "Use this alternate title for the feed instead of the one "
            "discovered from the URL."
        ),
    )

    parser.add_argument(
        "--title-regex",
        action="store",
        default=None,
        metavar="REGEX",
        help=(
            "Use this regular expression to select only part of the original "
            "title as the item title. This can be used to remove redundant "
            "or irrelevant parts from the title. Regex must have at least one "
            "group and, if it matches, the content of the first group will "
            "be used as the title. If it does not match, the original title "
            "will be used."
        ),
    )

    parser.add_argument(
        "-A",
        "--date-from-url",
        action="store",
        default=None,
        metavar="REGEX",
        help=(
            "Interpret date and time of the feed item from the URL. This can "
            "spare us from downloading the entire target page in some sites "
            "and blogs that put a parseable date in its URL (e.g.: "
            '"https://example.org/posts/2020/09/22/happy-hobbit-day/") or '
            "be our only option if the website provides no date at all. The "
            "argument is a regular expression containing a single capture "
            "group that extracts the date (or date and time) from the URL. "
            "For the previous example, this regex would be "
            '"/posts/(\\d{4}/\\d{2}/\\d{2})/", which will return "2020/09/22" '
            "to be interpreted as year/month/day. For other formats, see "
            "option --url-date-fmt. The date/time will be used only if it "
            "returns at least the year, month and day (this script will use "
            "hours and minutes if they are available, but it is very rare "
            "for sites to put this information in URLs). "
            "When using --follow, the date detected from this option will "
            "only be used if the page provides no date of its own. "
            "KNOWN BUG: Currently the code assumes that the date is in the "
            "same timezone as the system running this script."
        ),
    )

    parser.add_argument(
        "--url-date-fmt",
        action="store",
        default="%Y/%m/%d",
        metavar="DATE_FORMAT",
        help=(
            "The date format to be used with option --date-from-url. This "
            "value is a format string as specified by strftime(), and *must* "
            "contain at least the formats for year, month and day. This "
            "script can use hours and minutes if they are available, but "
            "it is very rare for sites to put this information in URLs. "
            "If this format is empty, the code will try to interpret it as "
            "some common date/time formats."
        ),
    )

    parser.add_argument(
        "-a",
        "--date-from-text",
        action="store",
        default=None,
        metavar="REGEX",
        help=(
            "Interpret date and time of the feed item from the link text. "
            "This can spare us from downloading the entire target page in "
            "some sites and blogs that put a parseable date in its links or "
            "be our only option if the website provides no date at all. The "
            "argument is a regular expression containing a single capture "
            "group that extracts the date (or date and time) from the text, "
            "and the resulting capture will be interpreted according to the "
            "format given in option --text-date-fmt (unlike --url-date-fmt, "
            "there is no commonly used format, so the default will probably "
            "not work for you). The date/time will be used only if it "
            "returns at least the year, month and day. "
            "When using --follow, the date detected from this option will "
            "only be used if the page provides no date of its own. "
            "KNOWN BUG: Currently the code assumes that the date is in the "
            "same timezone as the system running this script."
        ),
    )

    parser.add_argument(
        "--text-date-fmt",
        action="store",
        default=None,
        metavar="DATE_FORMAT",
        help=(
            "The date format to be used with option --date-from-text. This "
            "value is a format string as specified by strftime(), and *must* "
            "contain at least the formats for year, month and day. "
            "If this format is not given or empty, the code will try to "
            "interpret it as some common date/time formats."
        ),
    )

    parser.add_argument(
        "--date-from-xpath",
        action="store",
        default=None,
        metavar="XPATH_EXPRESSION",
        help=(
            "Use a XPath expression to get the text containing the date "
            "and time the page was published. This allows picking the "
            "date from any element present in the page, but at the cost "
            "of some complexity and *requires* downloading the candidate "
            "pages by passing option --follow (--max-page-length can "
            "also be used to limit the amount of downloaded data, but "
            "if the required element is not in it, the date won't be "
            "available). "
            "Notice that options --date-from-xpath, --xpath-date-regex, "
            "and --xpath-date-fmt work together as a pipeline, first "
            "getting the text from elements in the page, then optionally "
            "selecting a substring from it, and then parsing it as date "
            "and time. "
            "Example: the XPath expression '//span[@class=\"published-date\"]/@datetime' "
            'will pick the date from attribute "datetime" from the first '
            '"span" tag that has an attribute named "class" with value '
            'equal to "published-date".'
        ),
    )

    parser.add_argument(
        "--xpath-date-regex",
        action="store",
        default="(.+)",
        metavar="REGEX",
        help=(
            "A regular expression containing a single group that is used "
            "to select the part of the text returned by --date-from-xpath, "
            "that will then be parsed as a date using the format given "
            "by option --xpath-date-fmt. This option may be useful when "
            "the XPath expression required to select the exact text just "
            "becomes too complicated or verbose and doing it in two steps "
            "just becomes easier. This option can be safely omitted if "
            "this step is not necessary, as the default regex selects the "
            "complete input. "
            "Notice that options --date-from-xpath, --xpath-date-regex, "
            "and --xpath-date-fmt work together as a pipeline, first "
            "getting the text from elements in the page, then optionally "
            "selecting a substring from it, and then parsing it as date "
            "and time. "
        ),
    )

    parser.add_argument(
        "--xpath-date-fmt",
        action="store",
        default=None,
        metavar="DATE_FORMAT",
        help=(
            "The date format to be used with option --date-from-xpath. "
            "This value is a format string as specified by strftime(), and "
            "*must* contain at least the formats for year, month and day. "
            "Notice that options --date-from-xpath, --xpath-date-regex, "
            "and --xpath-date-fmt work together as a pipeline, first "
            "getting the text from elements in the page, then optionally "
            "selecting a substring from it, and then parsing it as date "
            "and time. If this format is not given or empty, the code "
            "will try to interpret it as some common date/time formats."
        ),
    )

    parser.add_argument(
        "--date-from-csss",
        action="store",
        default=None,
        metavar="CSS_SELECTOR",
        help=(
            "Use a CSS Selector to get the text containing the date and "
            "time the page was published. This allows picking the date "
            "from any element present in the page, but at the cost of some "
            "complexity and *requires* downloading the candidate pages by "
            "passing option --follow (--max-page-length can also be used "
            "to limit the amount of downloaded data, but if the required "
            "element is not in it, the date won't be available). "
            "Notice that options --date-from-csss, --csss-date-regex, and "
            "--csss-date-fmt work together as a pipeline, first getting the "
            "text from elements in the page, then optionally selecting a "
            "substring from it, and then parsing it as date and time. "
            "Example: the CSS Selector 'span.published-date' will pick the "
            "date from the inner text from the first 'span' tag with class "
            "'published-date' that generates a valid date according to the "
            "regular expression given in option --csss-date-regex and date "
            "format from option --csss-date-fmt."
        ),
    )

    parser.add_argument(
        "--csss-date-regex",
        action="store",
        default="(.+)",
        metavar="REGEX",
        help=(
            "A regular expression containing a single group that is used "
            "to select the part of the text returned by --date-from-csss, "
            "that will then be parsed as a date using the format given "
            "by option --csss-date-fmt. This option may be useful when "
            "the CSS Selector can not select the exact text with the date. "
            "This option can be safely omitted if this step is not "
            "necessary, as the default regex selects the complete input. "
            "Notice that options --date-from-csss, --csss-date-regex, "
            "and --csss-date-fmt work together as a pipeline, first "
            "getting the text from elements in the page, then optionally "
            "selecting a substring from it, and then parsing it as date "
            "and time. "
        ),
    )

    parser.add_argument(
        "--csss-date-fmt",
        action="store",
        default=None,
        metavar="DATE_FORMAT",
        help=(
            "The date format to be used with option --date-from-csss. "
            "This value is a format string as specified by strftime(), and "
            "*must* contain at least the formats for year, month and day. "
            "Notice that options --date-from-csss, --csss-date-regex, and "
            "--csss-date-fmt work together as a pipeline, first getting "
            "the text from elements in the page, then optionally selecting "
            "a substring from it, and then parsing it as date and time. If "
            "this format is not given or empty, the code will try to "
            "interpret it as some common date/time formats."
        ),
    )

    parser.add_argument(
        "--log",
        action="store",
        type=str,
        default="WARNING",
        metavar="LOG_LEVEL",
        help=("Define a log level. Valid values are " + ", ".join(USER_LOG_LEVELS)),
    )

    parser.add_argument(
        "--test",
        action="store_true",
        default=False,
        help=(
            "Do not generate the feed, but just print to stdout the "
            "information that was discovered and would be used to generate "
            "the feed. Useful for debugging link and date patterns."
        ),
    )

    parser.add_argument(
        "-f",
        "--follow",
        action="store_true",
        default=False,
        help=(
            "Follow every link matching the pattern and download the page to "
            "gather more information. It is slower, sends extra requests to "
            "the site and transfers more data (sometimes a lot more!), but "
            "allows higher quality feeds."
        ),
    )

    parser.add_argument(
        "-B",
        "--with-body",
        action="store_true",
        default=False,
        help=(
            "Include the page body in the feed, i.e., build a complete "
            "feed. This option requires --follow to work and some "
            "caution is required as it is pretty easy to generate "
            "gigantic feeds by following too many links or reading too "
            "much data from them. Careful usage of options --max-links "
            "and --max-page-length is required! The program will only "
            "pick the contents of the <body> element of the pages, up to "
            "the point that --max-page-length allowed it to be loaded. "
            "SECURITY: this program makes some effort to remove malicious "
            "content (e.g. scripts) inserted by the page, but the output "
            "is still considered unsafe and we understand that feed "
            "readers *must* handle it in the same way they deal with "
            "potentially malicious feeds loaded from the network. If "
            "your feed reader treats feeds generated by local commands "
            "more liberally, please do not use this option."
        ),
    )

    parser.add_argument(
        "--body-xpath",
        action="store",
        type=str,
        default=None,
        metavar="XPATH_EXPRESSION",
        help=(
            "A XPath expression selecting the HTML elements to be included "
            "in the body of the feed entry when option --with-body is used. "
            'By default, newslinkrss will use the entire "body" element, '
            "but sometimes a more restricted selection is welcome, for "
            "example, one that includes only the relevant text of a news "
            "article, leaving out menus, headers, related news, etc. "
            "This option can be used together with --body-csss and any "
            "that matches will be used, with XPath having priority."
        ),
    )

    parser.add_argument(
        "--body-csss",
        action="store",
        type=str,
        default=None,
        metavar="CSS_SELECTOR",
        help=(
            "A CSS Selector to pick the HTML elements to be included in the "
            "body of the feed entry when option --with-body is used. By "
            'default, newslinkrss will use the entire "body" HTML element, '
            "but sometimes a more restricted selection is welcome, for "
            "example, one that includes only the relevant text of a news "
            "article, leaving out menus, headers, related news, etc. "
            "This option can be used together with --body-xpath and any "
            "that matches will be used, with XPath having priority."
        ),
    )

    parser.add_argument(
        "-R",
        "--body-remove-tag",
        action="append",
        type=str,
        default=None,
        metavar="TAG_NAME",
        help=(
            "Remove all occurrences of the given tag from the feed body and "
            "move their child elements to their parents. This only makes "
            "sense if --with-body is used. This option can be used as many "
            "times as required to remove all unwanted elements and all "
            "children elements. For more complex operations that allow "
            "arbitrary expressions, see --body-remove-csss and "
            "--body-remove-xpath."
        ),
    )

    parser.add_argument(
        "-X",
        "--body-remove-xpath",
        action="append",
        type=str,
        default=None,
        metavar="XPATH_EXPRESSION",
        help=(
            "Delete the elements specified by the XPath argument from the "
            "feed body, including all their children. This only makes sense "
            "if --with-body is used. This option can be used as many times "
            "as required to remove all unwanted elements. For a CSS Selector "
            "equivalent, see --body-remove-csss. For a simpler version, "
            "to only remove tags but preserve child elements, see "
            "--body-remove-tag."
        ),
    )

    parser.add_argument(
        "-C",
        "--body-remove-csss",
        action="append",
        type=str,
        default=None,
        metavar="CSS_SELECTOR",
        help=(
            "Delete the elements specified by the CSS Selector from the "
            "feed body, including all their children. This only makes sense "
            "if --with-body is used. This option can be used as many times "
            "as required to remove all unwanted elements. For a XPath "
            "equivalent, see --body-remove-xpath. For a simpler version, "
            "to only remove tags but preserve child elements, see "
            "--body-remove-tag."
        ),
    )

    parser.add_argument(
        "-Q",
        "--qs-remove-param",
        action="append",
        type=str,
        default=None,
        metavar="REGEX",
        help=(
            "If a URL captured from the source page has a query string, "
            "remove parameters with names matching the regular expression "
            "given in this option. This allows stripping tracking parameters "
            "or detecting duplicate URLs that only differ by irrelevant "
            "parameters. Notice that these are regular expressions matching "
            "only against the *name* of the parameter and be aware of the "
            "anchors required to match prefixes only, example: '^utm_.+' . "
            "This option may be used several times if required."
        ),
    )

    parser.add_argument(
        "--require-dates",
        action="store_true",
        default=False,
        help=(
            "Only include an entry in the feed if it has a valid date found "
            "from any supported method. Very useful when filtering blogs and "
            "news sites."
        ),
    )

    parser.add_argument(
        "-U",
        "--user-agent",
        action="store",
        metavar="UA_STRING",
        default=DEFAULT_USER_AGENT,
        help=(
            "Set the user agent to identify ourselves to the site. Some sites "
            "can send different types of content according to it or just "
            "deny access to unknown UAs, so the best option is to just "
            "impersonate a commonly used browser."
        ),
    )

    parser.add_argument(
        "--max-page-length",
        action="store",
        default=2048,
        metavar="NUMBER",
        type=int,
        help=(
            "Maximum amount of data, in kilobytes, to download from a single "
            "HTTP request for the pages followed when using option --follow. "
            "If this limit is exceeded, any remaining data will be discarded. "
            "Very important when following links because any of them can lead "
            "us into downloading a DVD ISO or something like that. This option "
            'does not apply to the "first" page, i.e. the one whose URL '
            "is given in command line and is used as starting point for "
            "the entire process; for this limit, use option "
            "--max-first-page-length"
        ),
    )

    parser.add_argument(
        "--max-first-page-length",
        action="store",
        default=2048,
        metavar="NUMBER",
        type=int,
        help=(
            "Maximum amount of data, in kilobytes, to download from the "
            "first pages, i.e. the ones whose URLs are given in command line "
            "to start the process. This is important because the server can "
            "generate an infinite amount of data, redirect us to a "
            "DVD ISO or anything else. For limiting the pages downloaded "
            "when following links (option --follow), see option "
            "--max-page-length"
        ),
    )

    parser.add_argument(
        "--encoding",
        action="store",
        type=str,
        metavar="CHARSET",
        help="Use this explicit character encoding instead of detecting it "
        "automatically. Usually only required for pages with incorrect "
        "charset information which cause the feed to also be presented "
        'in the wrong encoding (aka "mojibake").',
    )

    parser.add_argument(
        "-t",
        "--http-timeout",
        action="store",
        default=2.0,
        type=float,
        metavar="SECONDS",
        help="Timeout for HTTP(S) requests, in seconds",
    )

    parser.add_argument(
        "--no-cookies",
        action="store_true",
        default=False,
        help=(
            "Do not remember cookies among requests. As cookies are never "
            "persisted across invocations of this command, this will only "
            "have any effect when using --follow, typically for sites that "
            "use cookies to detect too many requests in a row."
        ),
    )

    parser.add_argument(
        "-E",
        "--no-exception-feed",
        action="store_true",
        default=False,
        help=(
            "Do not generate feed entries for runtime errors. "
            "The default behavior is to have the information about any "
            "failures that happen when processing the feed returned as the "
            "feed itself, so the user will see it there. However, this "
            "option allows disabling this for moments where it makes more "
            "sense, e.g. when debugging."
        ),
    )

    parser.add_argument(
        "-o",
        "--output",
        action="store",
        metavar="FILENAME",
        help=(
            "Output file to save the feed. If not given, it will be written "
            "to stdout."
        ),
    )

    parser.add_argument(
        "urls",
        action="store",
        nargs="+",
        metavar="URL",
        help="URL of the website to generate the feed.",
    )

    return parser


def main():
    """Command line entry point: parse the arguments and generate the feed.

    Returns 0 if the feed was generated without errors.  If make_feed
    raises, the exception is logged and reported through
    make_exception_feed and 1 is returned; when --no-exception-feed was
    given, the exception is re-raised instead.
    """
    args = _build_arg_parser().parse_args()
    set_log_level(args)

    try:
        make_feed(args)
    except Exception as exc:
        logger.exception("Unhandled exception")
        if args.no_exception_feed:
            # Bare raise propagates the active exception with its original
            # traceback untouched.
            raise
        make_exception_feed(exc, args)
        return 1
    return 0


# Run the command line interface only when this file is executed as a
# script; the value returned by main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
