#!/usr/bin/env python3

#
# newslinkrss - RSS feed generator for generic sites
# Copyright (C) 2020  Alexandre Erwin Ittner <alexandre@ittner.com.br>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#

import sys
import re
import os
import datetime
import copy
import argparse
import locale
import logging
import traceback
from html.parser import HTMLParser
import http.cookiejar
import http.cookies
import urllib
import urllib3

import dateutil.parser
import PyRSS2Gen
import requests

import lxml.html
import lxml.html.clean
import lxml.etree
import lxml.cssselect
import cssselect


# Browser-like User-Agent string kept as a module-level default.
# NOTE(review): the place where it is consumed is not visible in this part of
# the file; presumably it is the default of the user-agent option.
DEFAULT_USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0"
)

# Names accepted by set_log_level(); the user input is upper-cased before
# being compared against this list.
USER_LOG_LEVELS = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL", "FATAL"]
# Configure the root logger at import time with WARNING as the level, then
# create the logger used by every function in this module.
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)


def set_log_level(args):
    """Apply the log level requested in ``args.log`` to the module logger.

    Nothing happens when ``args.log`` is empty or None.

    Raises ValueError when the (case-insensitive) name is not listed in
    USER_LOG_LEVELS or when the logging module does not expose it as an
    integer level.
    """
    requested = args.log
    if not requested:
        return

    name = requested.upper()
    if name not in USER_LOG_LEVELS:
        raise ValueError("Bad log level string %s" % requested)

    value = getattr(logging, name, None)
    if not isinstance(value, int):
        raise ValueError("Log level %s not defined" % requested)

    logger.setLevel(value)
    logger.info("Log level set to %d (%s)", value, name)


def _first_valid_attr_in_list(attrs, name):
    """Return the value of the first attribute called 'name', or None.

    'attrs' is an attribute list as passed to HTMLParser.handle_starttag,
    i.e. a sequence of (name, value) items.  Items with fewer than two
    elements are skipped.  Only the first match counts: an attribute should
    appear once, but real-world HTML does not always respect that.
    """
    return next(
        (entry[1] for entry in attrs if len(entry) > 1 and entry[0] == name),
        None,
    )


def clean_url_query_string(rx_list, url):
    """Strip unwanted parameters from the query string of a URL.

    Every name/value pair whose name matches (with re.match, i.e. anchored
    at the start) any of the regular expressions in 'rx_list' is dropped;
    the remaining pairs keep their order and the query string is re-encoded.
    With an empty or None 'rx_list' the URL is returned untouched.

    Returns the URL, possibly modified.
    """
    if not rx_list:
        return url

    parts = urllib.parse.urlparse(url)
    pairs = urllib.parse.parse_qsl(parts.query, keep_blank_values=True)
    kept = [
        pair for pair in pairs if not any(re.match(rx, pair[0]) for rx in rx_list)
    ]

    # No surviving pair means no query component at all.
    encoded = urllib.parse.urlencode(kept) if kept else None
    cleaned = urllib.parse.urlunparse(
        (
            parts.scheme,
            parts.netloc,
            parts.path,
            parts.params,
            encoded,
            parts.fragment,
        )
    )
    if cleaned != url:
        logger.debug("query string cleanup: URL %s rewritten to %s", url, cleaned)
    return cleaned


class CollectLinksParser(HTMLParser):
    """HTML parser that collects the links (<a href="...">) of a page.

    Accepted links are stored in 'links' as (url, anchor_text) tuples, in
    document order and without duplicates.  A link is accepted when its URL
    passes test_url_patterns() and the limit 'max_items' (if any) was not
    reached yet; 'limit_reached' tells if the limit stopped the collection.

    url_patt:    list of regexes; a URL must match at least one (None/empty
                 accepts everything)
    ignore_patt: list of regexes; a URL matching any of them is rejected
    max_items:   maximum number of links to collect, or None for no limit
    base_url:    base used to resolve relative links, or None
    """

    def __init__(self, url_patt=None, ignore_patt=None, max_items=None, base_url=None):
        super().__init__()
        self.url_patt = url_patt
        self.ignore_patt = ignore_patt
        self.max_items = max_items
        self.base_url = base_url
        self.links = []
        self.limit_reached = False

        # List of regexes with parameters to strip from URL query strings.
        self.qs_cleanup_rx_list = []

        self._found_links = set()
        self._last_link_text = None
        self._grab_link_text = False
        self._last_link = None

    def reset_parser(self):
        """Reset the parser state for a new document, keeping found links.

        This also resets the underlying HTMLParser: input that was buffered
        but not processed yet (e.g. a document truncated in the middle of a
        tag or inside a <script> element) must not be prepended to the next
        document fed to this parser.
        """
        self.reset()
        self._last_link_text = None
        self._grab_link_text = False
        self._last_link = None

    def handle_starttag(self, tag, attrs):
        if (self.max_items is not None) and (len(self.links) >= self.max_items):
            # Warn only once, when the limit is hit for the first time.
            if not self.limit_reached:
                logger.warning("limit of %d links reached", self.max_items)
            self.limit_reached = True
            return

        if tag == "a":
            href = _first_valid_attr_in_list(attrs, "href")
            if not href:
                return

            href = href.split("#", 1)[0]  # Strip URL fragment.
            if self.base_url:
                href = requests.compat.urljoin(self.base_url, href)
            href = clean_url_query_string(self.qs_cleanup_rx_list, href)

            # Try to not follow the same link more than once. We need to
            # repeat this check later due to redirects.
            if href not in self._found_links and self.test_url_patterns(href):
                self._last_link_text = []
                self._grab_link_text = True
                self._last_link = href

    def test_url_patterns(self, url):
        """Return True if url is valid for visiting, i.e. it matches at least
        one accept pattern (or there are none) and does not match any ignore
        pattern.
        """

        if self.ignore_patt and any(re.match(patt, url) for patt in self.ignore_patt):
            return False
        return not self.url_patt or any(re.match(patt, url) for patt in self.url_patt)

    def handle_data(self, data):
        # Text is only collected while inside an accepted <a> element.
        if self._grab_link_text:
            text = data.strip()
            if text != "":
                self._last_link_text.append(text)

    def handle_endtag(self, tag):
        if tag == "a":
            link_text = ""
            if self._grab_link_text:
                self._grab_link_text = False
                link_text = " ".join(self._last_link_text)
            if self._last_link and self._last_link not in self._found_links:
                self._found_links.add(self._last_link)
                self.links.append((self._last_link, link_text))
                logger.info("New link added: %s %s", self._last_link, link_text)
            # Same "no pending link" marker used by __init__ and reset_parser.
            self._last_link = None


def normalize_rfc1766_lang_tag(loc):
    """Normalize a language tag to the RFC 1766 form used by RSS2 and HTML.

    Open Graph and the "LANG" environment variable separate language and
    country with "_" while RFC 1766 uses "-"; this function converts the
    former into the latter and also normalizes case and surrounding spaces.

    RFC 1766 tags are not case-sensitive, so capitalization is only a
    convention: a two-letter part after the first "-" (the country code) is
    upper-cased for easy reading, everything else is lower-cased.

    Returns None for (some) nonsensical values, namely tags longer than 32
    characters.
    """
    tag = loc.strip().lower().replace("_", "-")
    if len(tag) > 32:
        return None

    language, separator, region = tag.partition("-")
    if separator and len(region) == 2:
        return language + "-" + region.upper()
    return tag


class CollectAttributesParser(HTMLParser):
    """A state machine that parses HTML from a web page and extracts some
    useful attributes.

    The following properties are set with useful information:
    title     - String with the page title or None
    base      - String with the base URL or None
    canonical - String with the Canonical URL for the page or None
    description - String with a best guess for a description or None
    changed   - Datetime with a best guess for the modification time or None
    author    - String with a best guess for the author name or None
    section   - Section where article was published or None
    tags      - Tags attached to the article
    language  - Language code (e.g. en-US) or None
    """

    def __init__(self):
        super().__init__()
        self._title_lst = None
        self._in_head = False
        self.title = None
        self.base = None
        self.description = None
        self.canonical = None
        self.changed = None
        self.author = None
        self.section = None
        self.tags = []
        self.language = None

        # True if a temporary language was found in element <html>. It will
        # be used only until another one is found because too many sites
        # have nonsensical values in it.
        self._html_locale = False

    def reset_parser(self):
        """Reset current parser state, but keep collected data.

        The underlying HTMLParser is reset too, so input left unprocessed by
        a previous document (e.g. one truncated in the middle of a tag or
        inside a <script> element) does not leak into the next one.
        """
        self.reset()
        self._title_lst = None
        self._in_head = False
        self._html_locale = False

    def handle_starttag(self, tag, attrs):
        if tag == "html":
            lang = _first_valid_attr_in_list(attrs, "lang")
            if lang and not self.language:
                self.language = normalize_rfc1766_lang_tag(lang)
                self._html_locale = True

        if tag == "head":
            # Will fail on nested heads, but who is insane enough to do this?!
            self._in_head = True

        if self._in_head and tag == "base":
            self.base = _first_valid_attr_in_list(attrs, "href")

        if self._in_head and (tag == "title") and (not self.title):
            # Start collecting the title text; see handle_data/handle_endtag.
            self._title_lst = []

        if self._in_head and tag == "link":
            # <link rel="xxxx" href="yyyy" />
            rel = _first_valid_attr_in_list(attrs, "rel")
            href = _first_valid_attr_in_list(attrs, "href")
            if rel == "canonical" and not self.canonical:
                self.canonical = href

        if self._in_head and tag.lower() == "meta":
            # <meta name="xxxx" content="yyyy" />
            # <meta property="xxxx" content="yyyy" />
            name = _first_valid_attr_in_list(attrs, "name")
            prop = _first_valid_attr_in_list(attrs, "property")
            content = _first_valid_attr_in_list(attrs, "content")
            if name:
                name = name.lower()
            if prop:
                prop = prop.lower()
            # Many sites just mix "name" and "property".
            name_or_prop = name or prop

            # Attributes defined by the Open Graph Protocol: A lot of sites
            # which refuse to provide feeds have these nice attributes so
            # their contents appear nicely when linked on Facebook, Twitter
            # and so. These can provide a lot of useful information.

            if content and name_or_prop in (
                "article:published_time",
                "article:modified_time",
                "og:updated_time",
            ):
                # Content is a date in ISO format.
                # <meta property="article:published_time" content="2020-09-13T20:00:00+00:00" />
                # <meta property="article:modified_time" content="2020-09-13T20:01:42+00:00" />
                try:
                    dt = dateutil.parser.parse(content)
                    # Keep the most recent date found in the page.
                    if (not self.changed) or (self.changed < dt):
                        self.changed = dt
                        logger.debug("Found new changed date %s", dt)
                except Exception:
                    # Best effort: a malformed date (or a comparison between
                    # naive and timezone-aware dates) must not abort the
                    # parsing of the page.  KeyboardInterrupt and SystemExit
                    # are deliberately not caught here.
                    logger.exception("When parsing changed date")

            if prop == "og:url" and not self.canonical:
                # <meta property="og:url" content="xxxxx">
                self.canonical = content

            if name_or_prop in ("og:description", "twitter:description", "description"):
                # Prefer the longest description, ignoring the very short ones.
                if (
                    content
                    and len(content) > 8
                    and (
                        (not self.description) or (len(content) > len(self.description))
                    )
                ):
                    self.description = content

            if not self.author and name_or_prop in ("author", "article:author"):
                self.author = content

            if name_or_prop == "article:tag":
                if content and content not in self.tags:
                    self.tags.append(content)

            if (name_or_prop == "article:section") and content:
                self.section = content

            if (name_or_prop == "og:locale") and content:
                lang = normalize_rfc1766_lang_tag(content)
                if self._html_locale and self.language:
                    # Replace the temporary language taken from <html lang>.
                    self.language = lang
                    self._html_locale = False
                elif not self.language:
                    self.language = lang

    def handle_data(self, data):
        if self._title_lst is not None:
            self._title_lst.append(data.strip())

    def handle_endtag(self, tag):
        if tag == "head":
            self._in_head = False

        if tag == "title" and self._title_lst is not None:
            self.title = "".join(self._title_lst)
            self._title_lst = None


def try_date_from_str(src, date_rx, date_fmt):
    """Try to extract a date from string 'src'.

    'date_rx' is a regular expression matched against the start of 'src'
    (multi-line, dot matches newlines); its first capture group must hold
    the date text.  That text is parsed with strptime() when 'date_fmt' is
    given and with dateutil's best guess otherwise.

    Returns a datetime, or None when the regex does not match or when the
    date can not be parsed (the error is logged).
    """
    try:
        found = re.match(date_rx, src, re.M | re.S)
        if found is None:
            return None
        captured = found.group(1)
        logger.debug(
            "date regex matched: src=%s, rx=%s, result=%s", src, date_rx, captured
        )
        if not date_fmt:
            # No date format, use dateutil's best guess.
            return dateutil.parser.parse(captured)
        return datetime.datetime.strptime(captured, date_fmt)
    except (AttributeError, IndexError, ValueError, dateutil.parser.ParserError):
        logger.exception(
            "when parsing date with src=%s, fmt=%s, rx=%s", src, date_fmt, date_rx
        )
    return None


def get_regex_first_group(regex, srcstr):
    """Return the first capture group of 'regex' matched against 'srcstr'.

    The match is anchored at the start of the string (re.match) and uses
    the multi-line and dot-matches-all flags.  None is returned when any of
    the arguments is empty/None, when there is no match or when the regex
    has no capture group.  This is used for a few "clean up" filters
    through the code.

    regex: regular expression string or None
    srcstr: source string or None
    """
    if not (regex and srcstr):
        return None

    found = re.match(regex, srcstr, re.M | re.S)
    if not found:
        return None

    try:
        return found[1]
    except IndexError:
        # The expression has no capture group.
        return None


def make_clean_title(args, title):
    """Return 'title' filtered by args.title_regex and limited in length.

    The first capture group of args.title_regex replaces the title when the
    expression matches and the group is not empty; otherwise the title is
    used as given.  The result is cut at args.max_title_length characters.
    """
    extracted = get_regex_first_group(args.title_regex, title)
    chosen = extracted if extracted else title
    return chosen[: args.max_title_length]


def _remove_element_keeping_tail(elem):
    """Detach 'elem' (and its children) from its parent, keeping its tail.

    In lxml the text that follows an element (its "tail") belongs to the
    element, so a plain parent.remove(elem) would also delete text that is
    not part of it.  The tail is moved to the previous sibling or to the
    parent before the removal.

    Returns True if the element was removed, False if it has no parent.
    """
    parent = elem.getparent()
    if parent is None:
        return False
    if elem.tail:
        previous = elem.getprevious()
        if previous is not None:
            previous.tail = (previous.tail or "") + elem.tail
        else:
            parent.text = (parent.text or "") + elem.tail
    parent.remove(elem)
    return True


def _remove_matched_elements(option, expr, matches):
    """Remove every element in 'matches' from its tree, logging each step.

    'option' and 'expr' are only used in the log messages.  Results which
    are not elements (e.g. strings selected by an XPath expression) or that
    have no parent (the root of the body) can not be removed and are
    skipped with a warning.
    """
    for elem in matches:
        if not lxml.etree.iselement(elem):
            logger.warning(
                "%s %s matched something that is not an element, ignoring: %s",
                option,
                expr,
                elem,
            )
            continue
        logger.debug("%s %s matched: deleting element %s", option, expr, elem)
        if not _remove_element_keeping_tail(elem):
            logger.warning(
                "%s %s matched the root element %s, which can not be removed",
                option,
                expr,
                elem,
            )


def post_process_item_body(args, body):
    """Apply the user-requested clean up steps to element 'body', in place.

    The following options from 'args' are applied, in this order:
    body_remove_tag   - tag names to strip (contents are kept)
    body_remove_xpath - XPath expressions selecting elements to delete
    body_remove_csss  - CSS selectors selecting elements to delete
    body_rename_tag   - (old_tag, new_tag) pairs
    body_rename_attr  - (tag, old_attr_name, new_attr_name) triples

    Deleting an element keeps the text that follows it in the document.
    """
    if args.body_remove_tag:
        lxml.etree.strip_tags(body, *args.body_remove_tag)
    if args.body_remove_xpath:
        for expr in args.body_remove_xpath:
            res = body.xpath(expr)
            if res:
                _remove_matched_elements("body-remove-xpath", expr, res)
    if args.body_remove_csss:
        for expr in args.body_remove_csss:
            res = body.cssselect(expr)
            if res:
                _remove_matched_elements("body-remove-csss", expr, res)
    if args.body_rename_tag:
        for old_tag, new_tag in args.body_rename_tag:
            for e in body.iter(old_tag):
                e.tag = new_tag
    if args.body_rename_attr:
        for tag, old_attr_name, new_attr_name in args.body_rename_attr:
            for e in body.iter(tag):
                if old_attr_name in e.attrib:
                    e.attrib[new_attr_name] = e.attrib[old_attr_name]
                    del e.attrib[old_attr_name]


def make_item_body(args, page_text, tree):
    """Build the HTML body of a feed item from the parsed page 'tree'.

    Elements are selected with args.body_xpath, falling back to the CSS
    selector args.body_csss when the XPath is not set or selects nothing;
    when neither option is given, all children of <body> are used.
    Several selected elements are wrapped into a new <div>.  A copy of the
    selection goes through post_process_item_body() and the lxml HTML
    cleaner before being serialized.

    Returns the HTML as a string, or None when nothing was selected or a
    parser/selector error happened (the error is logged).  'page_text' is
    not used.
    """
    try:
        selected = tree.xpath(args.body_xpath) if args.body_xpath else None
        if args.body_csss and not selected:
            selected = tree.cssselect(args.body_csss)
        if not (args.body_xpath or args.body_csss):
            selected = tree.xpath("/html/body/*")
        if not selected:
            return None

        if len(selected) == 1:
            container = selected[0]
        else:
            container = lxml.html.Element("div")
            container.extend(selected)

        # Work on a copy so the clean up steps do not change the page tree.
        container = copy.deepcopy(container)
        post_process_item_body(args, container)
        cleaned = lxml.html.clean.Cleaner().clean_html(container)
        if isinstance(cleaned, str):
            return cleaned
        return lxml.html.tostring(cleaned, pretty_print=False, encoding="unicode")
    except (
        lxml.etree.ParserError,
        cssselect.parser.SelectorSyntaxError,
        lxml.etree.XPathEvalError,
    ):
        logger.exception("When trying to get document body")

    return None


def find_item_title(args, attr_parser, request, tree, anchor_text, base_attrs):
    """Find the best title available for a feed item.

    Sources are tried in this order: the first non-empty result of XPath
    expression args.title_from_xpath, the text of the first element with
    text selected by CSS selector args.title_from_csss, the page <title>,
    the text of the link that led to the page, the canonical URL and,
    finally, the URL of the request.  The chosen title goes through
    make_clean_title().  Selector errors are logged and the next source
    is tried.
    """
    found = None
    has_tree = tree is not None

    if args.title_from_xpath and has_tree:
        try:
            found = next(
                (str(res) for res in tree.xpath(args.title_from_xpath) if res),
                None,
            )
        except lxml.etree.XPathEvalError:
            logger.exception("When trying to find title from XPath")

    if not found and args.title_from_csss and has_tree:
        try:
            for elem in tree.cssselect(args.title_from_csss):
                content = elem.text_content()
                if not content:
                    continue
                found = content
                break
        except (cssselect.parser.SelectorSyntaxError, lxml.etree.XPathEvalError):
            logger.exception("When trying to find title from CSS selector")

    if not found:
        found = (
            attr_parser.title or anchor_text or attr_parser.canonical or request.url
        )
    return make_clean_title(args, found)


def find_item_date(args, attr_parser, request, tree, anchor_text, orig_url):
    """Try to get a meaningful last modification date for an item.

    Only argument 'args' is required, everything else can be set to None and
    will be tried according to availability.  Sources are tried in this
    order, stopping at the first one that gives a date:

    1. XPath expression args.date_from_xpath over 'tree'
    2. CSS selector args.date_from_csss over 'tree'
    3. regex args.date_from_text over 'anchor_text'
    4. regex args.date_from_url over 'orig_url'
    5. date found in the page metadata (attr_parser.changed)
    6. HTTP header "Last-Modified" of 'request'

    Returns a datetime or None.  Selector and parsing errors are logged and
    the next source is tried.
    """
    date = None
    if args.date_from_xpath and tree is not None:
        try:
            for res in tree.xpath(args.date_from_xpath):
                logger.debug("date-from-xpath found candidate text: '%s'", res)
                # XPath results are not necessarily strings; convert them
                # like the title and author finders do, so the regex
                # matching does not fail with a TypeError.
                date = try_date_from_str(
                    str(res), args.xpath_date_regex, args.xpath_date_fmt
                )
                if date:
                    logger.debug("Found date from XPath %s", date)
                    break
        except lxml.etree.XPathEvalError:
            logger.exception("When trying to find date from XPath")
    if not date and args.date_from_csss and tree is not None:
        try:
            for res in tree.cssselect(args.date_from_csss):
                etext = res.text_content()
                if etext is None:
                    continue
                logger.debug("date-from-csss found candidate text: '%s'", etext)
                date = try_date_from_str(
                    etext, args.csss_date_regex, args.csss_date_fmt
                )
                if date:
                    logger.debug("Found date from CSS Selector %s", date)
                    break
        except (cssselect.parser.SelectorSyntaxError, lxml.etree.XPathEvalError):
            logger.exception("When handling a CSS selector")
    if not date and args.date_from_text and anchor_text:
        date = try_date_from_str(anchor_text, args.date_from_text, args.text_date_fmt)
    if not date and args.date_from_url and orig_url:
        date = try_date_from_str(orig_url, args.date_from_url, args.url_date_fmt)
    if not date and attr_parser and attr_parser.changed:
        date = attr_parser.changed
    if not date and request and ("Last-Modified" in request.headers):
        last_mod = request.headers["Last-Modified"]
        try:
            date = dateutil.parser.parse(last_mod)
            logger.debug(
                "No date was found but an HTTP header 'Last-Modified' was. "
                "Assuming its value %s as the date %s",
                last_mod,
                date,
            )
        except dateutil.parser.ParserError:
            logger.exception('Invalid date in HTTP header "Last-modified"')
    return date


def find_item_author(args, attr_parser, tree):
    """Try to get the author of an item.

    The first non-empty result of XPath expression args.author_from_xpath
    is filtered through args.xpath_author_regex; if this gives no author,
    the text of the first element selected by CSS selector
    args.author_from_csss is filtered through args.csss_author_regex.  Only
    the first candidate of each selector is considered.  When these
    explicitly requested elements give nothing, the author from the page
    metadata (attr_parser.author) is returned.
    """
    found = None
    has_tree = tree is not None

    if args.author_from_xpath and has_tree:
        try:
            for node in tree.xpath(args.author_from_xpath):
                if not node:
                    continue
                found = get_regex_first_group(args.xpath_author_regex, str(node))
                break
        except lxml.etree.XPathEvalError:
            logger.exception("When trying to find author from XPath")

    if not found and args.author_from_csss and has_tree:
        try:
            for elem in tree.cssselect(args.author_from_csss):
                content = elem.text_content()
                if content is None:
                    continue
                found = get_regex_first_group(args.csss_author_regex, str(content))
                break
        except (cssselect.parser.SelectorSyntaxError, lxml.etree.XPathEvalError):
            logger.exception("When trying to find author from a CSS selector")

    return found or attr_parser.author


def do_session_http_get(session, url, timeout=2, max_len_kb=0, encoding=None):
    """Do a HTTP(S) GET request for the URL in the context of session,
    subjected to the limits imposed for timeout (in seconds), max_len_kb
    (in kilobytes) and using the given encoding to return the resulting page
    as a *text* string.

    The page is read in chunks and the download stops once at least
    max_len_kb kilobytes were consumed.  Content that the HTTP library can
    not decode by itself (it is returned as bytes) is decoded as UTF-8,
    replacing invalid sequences.

    Returns the text and the response object. The text is None when the
    status code is not 200 or when the request timed out; in these cases
    more error information must be inferred from the response object, which
    is None if the request did not even complete. The response is always
    closed before returning.
    """
    page_text = None
    req = None
    try:
        logger.info("Following URL %s", url)
        req = session.get(url, timeout=timeout, stream=True)
        logger.debug("Request returned status code: %d", req.status_code)
        logger.debug("Request headers: %s", req.request.headers)
        logger.debug("Response headers: %s", req.headers)
        logger.debug("Cookies: %s", session.cookies)
        if encoding:
            req.encoding = encoding
        if req.status_code == 200:
            max_size = 1024 * max_len_kb
            # Never ask for zero-sized chunks, even if the limit is zero.
            chunk_size = max(1, 1024 * min(100, max_len_kb))
            text_parts = []
            raw_parts = []
            consumed_size = 0
            for chunk in req.iter_content(chunk_size=chunk_size, decode_unicode=True):
                if consumed_size >= max_size:
                    break
                consumed_size += len(chunk)
                if isinstance(chunk, bytes):
                    if not raw_parts:
                        logger.warning("Unexpected binary return, trying to fix.")
                    # Decoding is delayed until all adjacent binary chunks
                    # are available: a multi-byte character may be split
                    # between two chunks.
                    raw_parts.append(chunk)
                else:
                    if raw_parts:
                        text_parts.append(
                            b"".join(raw_parts).decode("utf-8", errors="replace")
                        )
                        raw_parts = []
                    text_parts.append(chunk)
            if raw_parts:
                text_parts.append(
                    b"".join(raw_parts).decode("utf-8", errors="replace")
                )
            page_text = "".join(text_parts)
    except (
        urllib3.exceptions.ReadTimeoutError,
        requests.exceptions.Timeout,
    ):
        logger.exception("When downloading %s", url)
        # We should handle this somehow.
        page_text = None
    finally:
        # Do not rely on the truth value here: responses with error status
        # codes evaluate to False but still must be closed.
        if req is not None:
            req.close()
    return page_text, req


def make_feed_item_follow(session, url, used_urls, args, link_text, base_attrs):
    """Download the page at 'url' and build a feed item from its contents.

    'used_urls' is the set of URLs already turned into items; the final URL
    of the request (i.e. after redirects) is checked against it and added
    to it.  'link_text' is the text of the link that led to this page and
    is used as a fallback for the title and the description.

    Returns a PyRSS2Gen.RSSItem, or None when the page could not be
    downloaded, when it was already used, or when dates are required
    (args.require_dates) and none was found.
    """
    page_text, req = do_session_http_get(
        session, url, args.http_timeout, args.max_page_length, args.encoding
    )
    if not page_text:
        return None
    if req.url in used_urls:
        return None

    used_urls.add(req.url)
    attr_parser = CollectAttributesParser()
    if req.status_code == 200:
        attr_parser.feed(page_text)
    # The description from the page metadata is preferred, with the link
    # text as the fallback.
    description = attr_parser.description or link_text

    item_url = attr_parser.canonical or req.url
    tree = None
    try:
        tree = lxml.html.document_fromstring(page_text)
    except lxml.etree.ParserError:
        logger.exception(
            "Failed to parse document, some information won't be available"
        )

    title = find_item_title(args, attr_parser, req, tree, link_text, base_attrs)
    date = find_item_date(args, attr_parser, req, tree, link_text, item_url)
    if args.require_dates and not date:
        # We need a date but the page has none. Skip this entry.
        logger.info("Ignoring feed entry without date %s", url)
        return None
    author = find_item_author(args, attr_parser, tree)
    if args.with_body and tree is not None:
        # The page body, when available, replaces the short description.
        bodyhtml = make_item_body(args, page_text, tree)
        if bodyhtml:
            description = bodyhtml
    if attr_parser.tags:
        categories = attr_parser.tags
    elif attr_parser.section:
        categories = [attr_parser.section]
    else:
        categories = None
    if date:
        # PyRSS2Gen ignores tzinfos and requires the date to be explicitly in UTC.
        date = datetime.datetime.fromtimestamp(date.timestamp(), datetime.timezone.utc)
    return PyRSS2Gen.RSSItem(
        title=title,
        link=item_url,
        author=author,
        description=description,
        guid=PyRSS2Gen.Guid(req.url),
        categories=categories,
        pubDate=date,
    )


def make_feed_item_nofollow(url, used_urls, args, link_text, base_attrs):
    """Build a feed item for 'url' without downloading the target page.

    Only the URL itself and the text of the link ('link_text') are
    available as sources of information.  'used_urls' is the set of URLs
    already turned into items; 'url' is checked against it and added to it.

    Returns a PyRSS2Gen.RSSItem, or None when the URL was already used or
    when dates are required (args.require_dates) and none was found.
    """
    if url in used_urls:
        return None
    used_urls.add(url)

    item_title = make_clean_title(args, link_text)
    pub_date = find_item_date(args, None, None, None, link_text, url)

    if args.require_dates and not pub_date:
        # We need a date but the page have none. Skip this entry.
        logger.info("Ignoring feed entry without date %s", url)
        return None

    if pub_date:
        # PyRSS2Gen ignores tzinfos and requires the date to be explicitly in UTC.
        pub_date = datetime.datetime.fromtimestamp(
            pub_date.timestamp(), datetime.timezone.utc
        )

    item_fields = {
        "title": item_title,
        "link": url,
        "description": link_text,
        "guid": PyRSS2Gen.Guid(url),
        "pubDate": pub_date,
    }
    return PyRSS2Gen.RSSItem(**item_fields)


def write_feed(rss, args):
    """Write the feed 'rss' as UTF-8 XML.

    The feed goes to the file named by args.output, if set, or to the
    standard output otherwise.  'args' may be None (this happens when an
    exception feed is generated before the command line is available); the
    standard output is used in this case.
    """
    output = getattr(args, "output", None)
    if output:
        logger.debug("Writing feed to %s", output)
        with open(output, "w", encoding="utf-8") as fp:
            rss.write_xml(fp, encoding="utf-8")
    else:
        logger.debug("Writing feed to stdout")
        rss.write_xml(sys.stdout, encoding="utf-8")


def make_exception_feed(exc, args=None):
    """Write a feed with a single item describing exception 'exc'.

    The item has the command line, the exception message and the stack
    trace of the exception being handled, so the error shows up in the feed
    reader.  'args' may be None if the error happened before the command
    line was parsed; the feed is written with write_feed().
    """
    # Local import: this is the only place in this file that needs it.
    from html import escape

    logger.warning("Writing exception information to an exception feed.")
    cmdline = " ".join(sys.argv)
    stack_trace = traceback.format_exc()
    # The description is HTML: everything that did not originate here must
    # be escaped, otherwise text like "<module>" from stack traces is taken
    # as a tag by feed readers.
    msg = (
        "An error occurred when generating this feed."
        + "<br/> <br/>"
        + "<strong>Command line:</strong> <code>"
        + escape(cmdline)
        + "</code>"
        + "<br /><br />"
        + "<strong>Exception:</strong> "
        + escape(str(exc))
        + "<br /><br />"
        + "<strong>Stack trace:</strong> <pre>"
        + escape(stack_trace)
        + "\n</pre>"
    )

    itm = PyRSS2Gen.RSSItem(
        title="newslinkrss error: " + str(exc)[:64],
        link="data:" + cmdline,
        description=msg,
    )

    rss = PyRSS2Gen.RSS2(
        title="Error: " + cmdline,
        link=args.urls[0] if args else None,
        description="Failed to generate feed.",
        # PyRSS2Gen ignores tzinfos and requires the date to be explicitly in UTC.
        lastBuildDate=datetime.datetime.now(datetime.timezone.utc),
        items=[itm],
    )
    write_feed(rss, args)


def test_links(link_grabber, args):
    """Print the links collected by 'link_grabber' in a readable format.

    For every link, the URL, its text (if any) and the dates that the
    options args.date_from_url and args.date_from_text can extract from
    them are printed to the standard output.  As a side effect,
    args.no_exception_feed is set to True.
    """
    args.no_exception_feed = True
    if link_grabber.limit_reached:
        print("# Limit of %d links was reached." % (link_grabber.max_items))

    for url, text in link_grabber.links:
        print("- " + url)
        if text:
            print("    text: " + text)

        if args.date_from_url:
            url_date = try_date_from_str(url, args.date_from_url, args.url_date_fmt)
            if url_date:
                print("    url-date:  " + str(url_date))

        if text and args.date_from_text:
            text_date = try_date_from_str(
                text, args.date_from_text, args.text_date_fmt
            )
            if text_date:
                print("    text-date: " + str(text_date))

        print("")


def get_start_page(args, session, base_attrs, link_grabber, base_url):
    """Download the start page 'base_url' and feed it to both parsers.

    The page goes first to 'base_attrs' (page attributes) and then to
    'link_grabber' (links), which resolves relative links against the
    <base> URL of the page or, without one, the final URL of the request.
    Both parsers have their state reset before receiving the page.

    Returns the response object of the request.

    NOTE(review): do_session_http_get() gives None as the text when the
    download fails; no special handling for this case is done here.
    """
    logger.info("Downloading start URL %s", base_url)
    fetched = do_session_http_get(
        session, base_url, args.http_timeout, args.max_first_page_length, args.encoding
    )
    html_text, response = fetched

    # Page attributes first: the link parser needs the <base> URL.
    base_attrs.reset_parser()
    base_attrs.feed(html_text)

    link_grabber.reset_parser()
    link_grabber.base_url = base_attrs.base or response.url
    link_grabber.feed(html_text)

    return response


def make_accept_language_header(args):
    """Build the value for an Accept-Language HTTP header.

    The languages come from args.lang or, when this option is empty, from
    the "LANG" environment variable (without its encoding suffix).  Every
    language gets a quality value, starting at 0.8 and decreasing by 0.2
    for each following language until it reaches 0.2.

    Returns the header value or None if there are no languages to send.
    """
    tags = []
    if args.lang:
        tags = [normalize_rfc1766_lang_tag(item) or item for item in args.lang]

    if not tags:
        env_locale = os.getenv("LANG")
        if env_locale:
            # Drop the encoding, e.g. "en_US.UTF-8" -> "en_US".
            candidate = normalize_rfc1766_lang_tag(env_locale.split(".")[0])
            if candidate:
                tags.append(candidate)

    if not tags:
        return None

    weight = 0.8
    entries = []
    for tag in tags:
        entries.append("%s;q=%.01f" % (tag, weight))
        if weight > 0.3:
            weight -= 0.2
    return ",".join(entries)


def make_default_http_headers(args):
    """Return a dict with the HTTP headers to send in every request.

    A fixed set of browser-like headers is the starting point, with the
    User-Agent taken from args.user_agent.  An Accept-Language header is
    added when make_accept_language_header() gives one, and the custom
    headers from args.header ("Name: value" strings) are applied last, so
    they can replace any of the previous ones.  A custom header without a
    ":" is sent with an empty value.
    """
    headers = {
        "User-Agent": args.user_agent,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "TE": "trailers",
    }

    language_value = make_accept_language_header(args)
    if language_value:
        headers["Accept-Language"] = language_value

    for raw_header in args.header or ():
        header_name, separator, rest = raw_header.partition(":")
        headers[header_name.strip()] = rest.lstrip() if separator else ""

    return headers


class ControlledCookiePolicy(http.cookiejar.DefaultCookiePolicy):
    """The default cookie policy plus a read-only/read-write switch.

    While attribute 'read_only' is True no cookie is accepted; otherwise the
    decision is left to DefaultCookiePolicy.  The policy starts read-write.
    """

    def __init__(self):
        super().__init__()
        self.read_only = False

    def set_ok(self, cookie, request):
        """Tell if 'cookie' received from 'request' may be stored."""
        return False if self.read_only else super().set_ok(cookie, request)


def set_cookie_options_for_session(session, args):
    """Set the cookie options for the session.

    The session gets a new, empty cookie jar controlled by a
    ControlledCookiePolicy.  The cookies given in args.cookie are parsed and
    stored in it; after that, the jar becomes read-only if args.no_cookies
    is set.
    """
    cookie_policy = ControlledCookiePolicy()
    session.cookies = requests.cookies.RequestsCookieJar(policy=cookie_policy)

    custom_cookies = args.cookie
    if custom_cookies:
        # The jar must be writable while the custom cookies are loaded.
        cookie_policy.read_only = False
        for spec in custom_cookies:
            parsed = http.cookies.SimpleCookie(spec)
            logger.info("New custom cookie parsed as %s", repr(parsed))
            for cookie_name, morsel in parsed.items():
                session.cookies[cookie_name] = morsel

    cookie_policy.read_only = bool(args.no_cookies)


def make_feed(args):
    """Generate the feed described by the command line options 'args'.

    All start pages in args.urls are downloaded and have their links
    collected, every link becomes a feed item (visiting the target page if
    args.follow is set) and the resulting feed is written with
    write_feed().  With args.test set, the collected links are only printed
    and no feed is generated.
    """
    session = requests.Session()
    session.headers = make_default_http_headers(args)
    set_cookie_options_for_session(session, args)

    base_attrs = CollectAttributesParser()
    link_grabber = CollectLinksParser(
        url_patt=args.link_pattern,
        ignore_patt=args.ignore_pattern,
        max_items=args.max_links,
        base_url=None,
    )
    link_grabber.qs_cleanup_rx_list = args.qs_remove_param

    for start_url in args.urls:
        response = get_start_page(args, session, base_attrs, link_grabber, start_url)
        if link_grabber.limit_reached:
            break
        # The first start page downloaded becomes the referrer of all the
        # following requests, unless one was already given.
        if "Referer" not in session.headers:
            session.headers["Referer"] = response.url

    # Handle fetch metadata headers according to
    # https://w3c.github.io/webappsec-fetch-metadata/
    if "Sec-Fetch-Site" in session.headers:
        session.headers["Sec-Fetch-Site"] = "same-origin"

    if args.test:
        test_links(link_grabber, args)
        return

    # URLs that were already processed (considering redirects).
    used_urls = set()

    rss_items = []
    for link_url, link_text in link_grabber.links:
        if args.follow:
            item = make_feed_item_follow(
                session, link_url, used_urls, args, link_text, base_attrs
            )
        else:
            item = make_feed_item_nofollow(
                link_url, used_urls, args, link_text, base_attrs
            )
        if item:
            rss_items.append(item)

    default_title = base_attrs.title or ", ".join(args.urls)
    default_title = default_title[: args.max_title_length]
    feed_description = (
        base_attrs.description
        or base_attrs.title
        or base_attrs.canonical
        or ", ".join(args.urls)
    )

    rss = PyRSS2Gen.RSS2(
        title=args.title or default_title,
        link=args.urls[0],
        description=feed_description,
        # PyRSS2Gen ignores tzinfos and requires the date to be explicitly in UTC.
        lastBuildDate=datetime.datetime.now(datetime.timezone.utc),
        language=base_attrs.language,
        items=rss_items,
    )
    write_feed(rss, args)


def set_locale(args):
    """Configure the LC_TIME locale used by this application.

    An explicit locale given in the command line (``args.locale``) is applied
    strictly: any error raised by ``locale.setlocale`` propagates to the
    caller.  Otherwise the first non-empty value among the environment
    variables LC_ALL, LC_TIME and LANG is tried on a best-effort basis, and a
    locale rejected by the system is only reported with a warning.
    """

    explicit = args.locale
    if explicit:
        locale.setlocale(locale.LC_TIME, explicit)
        return

    # Lazily look up the variables in priority order and stop at the first
    # one that is set to a non-empty value.
    env_values = (os.getenv(name) for name in ("LC_ALL", "LC_TIME", "LANG"))
    loc = next((value for value in env_values if value), None)
    if not loc:
        return

    try:
        locale.setlocale(locale.LC_TIME, loc)
    except locale.Error:
        logger.warning("Ignoring wrong/unknown locale %s", loc)

def main():
    """Command line entry point.

    Declares all command line options, parses ``sys.argv``, applies the
    requested log level and locale, and then generates the feed by calling
    ``make_feed``.

    Returns:
        0 when the feed was generated without errors, or 1 when an exception
        happened and was reported through ``make_exception_feed``.

    Raises:
        Exception: Any exception raised by ``make_feed`` is re-raised
            unchanged when option --no-exception-feed is in use.
    """
    parser = argparse.ArgumentParser(
        description=(
            "newslinkrss generates RSS feeds for websites that do not "
            "provide their own. This is done by loading URLs and collecting "
            "links that matches patterns to the of feed items, given as "
            "regular expressions, and optionally visiting them to get more "
            "details and even processing the target pages with XPath and CSS "
            "Selectors if required. It basically works as a purpose specific "
            "crawler or scraper."
        ),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Link collection and feed title options.
    parser.add_argument(
        "-n",
        "--max-links",
        action="store",
        default=50,
        metavar="NUMBER",
        type=int,
        help="Maximum number of links to follow.",
    )

    parser.add_argument(
        "-l",
        "--max-title-length",
        action="store",
        default=150,
        metavar="NUMBER",
        type=int,
        help="Maximum length of a feed title, in characters.",
    )

    parser.add_argument(
        "-p",
        "--link-pattern",
        action="append",
        default=None,
        metavar="REGEX",
        help=(
            "A regular expression to filter the URLs of links that the "
            "script will follow or capture to generate every feed item. "
            "This option can be used multiple times, a URL matching any "
            "expression will be accepted."
        ),
    )

    parser.add_argument(
        "-i",
        "--ignore-pattern",
        action="append",
        default=None,
        metavar="REGEX",
        help=(
            "A regular expression used to ignore URLs even if they match "
            "--link-pattern. This may be used to prevent unwanted items "
            "from appearing in the feed while keeping the link pattern "
            "simpler (i.e. no need to make that regex excessively complex "
            "by embedding the ignored patterns in it). This option can be "
            "used multiple times, a URL matching any expression will be "
            "ignored."
        ),
    )

    parser.add_argument(
        "-T",
        "--title",
        action="store",
        metavar="ALTERNATE_TITLE",
        default=None,
        help=(
            "Use this alternate title for the feed instead of the one "
            "discovered from the URL."
        ),
    )

    parser.add_argument(
        "--title-regex",
        action="store",
        default=None,
        metavar="REGEX",
        help=(
            "Use this regular expression to select only part of the original "
            "title as the item title. This can be used to remove redundant "
            "or irrelevant parts from the title. Regex must have at least one "
            "group and, if it matches, the content of the first group will "
            "be used as the title. If it does not match, the original title "
            "will be used."
        ),
    )

    parser.add_argument(
        "--title-from-xpath",
        action="store",
        default=None,
        metavar="XPATH_EXPRESSION",
        help=(
            "Capture the title from the element given by this XPath "
            "expression in the target document instead of using the usual "
            "document title. This may be useful for sites where the title is "
            "too polluted but there is an alternate element with a "
            "descriptive title readily available. This requires the document "
            "body, so it will only work if used with option --follow."
        ),
    )

    parser.add_argument(
        "--title-from-csss",
        action="store",
        default=None,
        metavar="CSS_SELECTOR",
        help=(
            "Capture the title from the element given by this CSS Selector "
            "in the target document instead of using the usual document "
            "title. This may be useful for sites where the title is too "
            "polluted but there is an alternate element with a descriptive "
            "title readily available. This requires the document body, so "
            "it will only work if used with option --follow."
        ),
    )

    # Date discovery options.
    parser.add_argument(
        "-A",
        "--date-from-url",
        action="store",
        default=None,
        metavar="REGEX",
        help=(
            "Interpret date and time of the feed item from the URL. This can "
            "spare us from downloading the entire target page in some sites "
            "and blogs that put a parseable date in its URL (e.g.: "
            '"https://example.org/posts/2020/09/22/happy-hobbit-day/") or '
            "be our only option if the website provides no date at all. The "
            "argument is a regular expression containing a single capture "
            "group that extracts the date (or date and time) from the URL. "
            "For the previous example, this regex would be "
            '"/posts/(\\d{4}/\\d{2}/\\d{2})/", which will return "2020/09/22" '
            "to be interpreted as year/month/day. For other formats, see "
            "option --url-date-fmt. The date/time will be used only as it "
            "returns at least the year, month and day (this script will use "
            "hours and minutes if they are available, but it is very rare "
            "for sites to put this information in URLs). "
            "When using --follow, the date detected from this option will "
            "only be used if the page provides no date of its own. "
            "KNOWN BUG: Currently the code assumes that the date is in the "
            "same timezone as the system running this script."
        ),
    )

    parser.add_argument(
        "--url-date-fmt",
        action="store",
        default="%Y/%m/%d",
        metavar="DATE_FORMAT",
        help=(
            "The date format to be used with option --date-from-url. This "
            "value is a format string as specified by strftime(), and *must* "
            "contain at least the formats for year, month and date. This "
            "script can use hours and minutes if they are available, but "
            "it is very rare for sites to put this information in URLs. "
            "If this format is empty, the code will try to interpret it as "
            "some common date/time formats."
        ),
    )

    parser.add_argument(
        "-a",
        "--date-from-text",
        action="store",
        default=None,
        metavar="REGEX",
        help=(
            "Interpret date and time of the feed item from the link text. "
            "This can spare us from downloading the entire target page in "
            "some sites and blogs that put a parseable date in its links or "
            "be our only option if the website provides no date at all. The "
            "argument is a regular expression containing a single capture "
            "group that extracts the date (or date and time) from the text, "
            "and the "
            "resulting capture will be interpreted according to the format "
            "given in option --text-date-fmt (unlike --url-date-fmt, there "
            "is no commonly used format, so the default will probably not "
            "work for you). The date/time will be used only as it returns at "
            "least the year, month and day. "
            "When using --follow, the date detected from this option will "
            "only be used if the page provides no date of its own. "
            "KNOWN BUG: Currently the code assumes that the date is in the "
            "same timezone as the system running this script."
        ),
    )

    parser.add_argument(
        "--text-date-fmt",
        action="store",
        default=None,
        metavar="DATE_FORMAT",
        help=(
            "The date format to be used with option --date-from-text. This "
            "value is a format string as specified by strftime(), and *must* "
            "contain at least the formats for year, month and date. "
            "If this format is not given or empty, the code will try to "
            "interpret it as some common date/time formats."
        ),
    )

    parser.add_argument(
        "--date-from-xpath",
        action="store",
        default=None,
        metavar="XPATH_EXPRESSION",
        help=(
            "Use a XPath expression to get the text containing the date "
            "and time the page was published. This allows picking the "
            "date from any element present in the page, but at the cost "
            "of some complexity and *requires* downloading the candidate "
            "pages by passing option --follow (--max-page-length can "
            "also be used to limit the amount of downloaded data, but "
            "if the required element is not in it, the date won't be "
            "available). "
            "Notice that options --date-from-xpath, --xpath-date-regex, "
            "and --xpath-date-fmt work together as a pipeline, first "
            "getting the text from elements in the page, then optionally "
            "selecting a substring from it, and then parsing it as date "
            "and time. "
            "Example: the XPath expression '//span[@class=\"published-date\"]/@datetime' "
            'will pick the date from attribute "datetime" from the first '
            '"span" tag that has an attribute named "class" with value '
            'equal to "published-date".'
        ),
    )

    parser.add_argument(
        "--xpath-date-regex",
        action="store",
        default="(.+)",
        metavar="REGEX",
        help=(
            "A regular expression containing a single group that is used "
            "to select the part of the text returned by --date-from-xpath, "
            "that will then be parsed as a date using the format given "
            "by option --xpath-date-fmt. This option may be useful when "
            "the XPath expression required to select the exact text just "
            "becomes too complicated or verbose and doing it in two steps "
            "just becomes easier. This option can be safely omitted if "
            "this step is not necessary, as the default regex select the "
            "complete input. "
            "Notice that options --date-from-xpath, --xpath-date-regex, "
            "and --xpath-date-fmt work together as a pipeline, first "
            "getting the text from elements in the page, then optionally "
            "selecting a substring from it, and then parsing it as date "
            "and time. "
        ),
    )

    parser.add_argument(
        "--xpath-date-fmt",
        action="store",
        default=None,
        metavar="DATE_FORMAT",
        help=(
            "The date format to be used with option --date-from-xpath. "
            "This value is a format string as specified by strftime(), and "
            "*must* contain at least the formats for year, month and date. "
            "Notice that options --date-from-xpath, --xpath-date-regex, "
            "and --xpath-date-fmt work together as a pipeline, first "
            "getting the text from elements in the page, then optionally "
            "selecting a substring from it, and then parsing it as date "
            "and time. If this format is not given or empty, the code "
            "will try to interpret it as some common date/time formats."
        ),
    )

    parser.add_argument(
        "--date-from-csss",
        action="store",
        default=None,
        metavar="CSS_SELECTOR",
        help=(
            "Use a CSS Selector to get the text containing the date and "
            "time the page was published. This allows picking the date "
            "from any element present in the page, but at the cost of some "
            "complexity and *requires* downloading the candidate pages by "
            "passing option --follow (--max-page-length can also be used "
            "to limit the amount of downloaded data, but if the required "
            "element is not in it, the date won't be available). "
            "Notice that options --date-from-csss, --csss-date-regex, and "
            "--csss-date-fmt work together as a pipeline, first getting the "
            "text from elements in the page, then optionally selecting a "
            "substring from it, and then parsing it as date and time. "
            "Example: the CSS Selector 'span.published-date' will pick the "
            "date from the inner text from the first 'span' tag with class "
            "'published-date' that generates a valid date according to the "
            "regular expression given in option --csss-date-regex and date "
            "format from option --csss-date-fmt."
        ),
    )

    parser.add_argument(
        "--csss-date-regex",
        action="store",
        default="(.+)",
        metavar="REGEX",
        help=(
            "A regular expression containing a single group that is used "
            "to select the part of the text returned by --date-from-csss, "
            "that will then be parsed as a date using the format given "
            "by option --csss-date-fmt. This option may be useful when "
            "the CSS Selector can not select the exact text with the date. "
            "This option can be safely omitted if this step is not "
            "necessary, as the default regex select the complete input. "
            "Notice that options --date-from-csss, --csss-date-regex, "
            "and --csss-date-fmt work together as a pipeline, first "
            "getting the text from elements in the page, then optionally "
            "selecting a substring from it, and then parsing it as date "
            "and time. "
        ),
    )

    parser.add_argument(
        "--csss-date-fmt",
        action="store",
        default=None,
        metavar="DATE_FORMAT",
        help=(
            "The date format to be used with option --date-from-csss. "
            "This value is a format string as specified by strftime(), and "
            "*must* contain at least the formats for year, month and date. "
            "Notice that options --date-from-csss, --csss-date-regex, and "
            "--csss-date-fmt work together as a pipeline, first getting "
            "the text from elements in the page, then optionally selecting "
            "a substring from it, and then parsing it as date and time. If "
            "this format is not given or empty, the code will try to "
            "interpret it as some common date/time formats."
        ),
    )

    # Author discovery options.
    parser.add_argument(
        "--author-from-xpath",
        action="store",
        default=None,
        metavar="XPATH_EXPRESSION",
        help=(
            "Use a XPath expression to get the author of an item, allowing to "
            "find authors from any element in the page, which is particularly "
            "useful for sites that do not cite them in the standard metadata. "
            "Notice that options --author-from-xpath and --xpath-author-regex "
            "work together as a pipeline, first getting the author name "
            "from the page and the second filtering optionally selecting a "
            "substring from it."
        ),
    )

    parser.add_argument(
        "--xpath-author-regex",
        action="store",
        default="(.+)",
        metavar="REGEX",
        help=(
            "A regular expression containing a single capture group that "
            "will be used to select the part of the text returned by "
            "--author-from-xpath and then used as the author name. Example: "
            "'by\\s+(.+)' will remove the 'by ' prefix from a byline, "
            "returning only the name."
        ),
    )

    parser.add_argument(
        "--author-from-csss",
        action="store",
        default=None,
        metavar="CSS_SELECTOR",
        help=(
            "Use a CSS Selector to get the author of an item, allowing to "
            "find authors from any element in the page, which is particularly "
            "useful for sites that do not cite them in the standard metadata. "
            "Notice that options --author-from-csss and --csss-author-regex "
            "work together as a pipeline, first getting the author name "
            "from the page and the second filtering optionally selecting a "
            "substring from it."
        ),
    )

    parser.add_argument(
        "--csss-author-regex",
        action="store",
        default="(.+)",
        metavar="REGEX",
        help=(
            "A regular expression containing a single capture group that "
            "will be used to select the part of the text returned by "
            "--author-from-csss then used as the author name. Example: "
            "'by\\s+(.+)' will remove the 'by ' prefix from a byline, "
            "returning only the name."
        ),
    )

    # Diagnostics.
    parser.add_argument(
        "--log",
        action="store",
        type=str,
        default="WARNING",
        metavar="LOG_LEVEL",
        help=("Define a log level. Valid values are " + ", ".join(USER_LOG_LEVELS)),
    )

    parser.add_argument(
        "--test",
        action="store_true",
        default=False,
        help=(
            "Do not generate the feed, but just print to stdout the "
            "information that was discovered and would be used to generate "
            "the feed. Useful for debugging link and date patterns."
        ),
    )

    # Page following and feed body options.
    parser.add_argument(
        "-f",
        "--follow",
        action="store_true",
        default=False,
        help=(
            "Follow every link matching the pattern and download the page to "
            "gather more information. It is slower, sends extra requests to "
            "the site and transfers more data (sometimes a lot more!), but "
            "allows higher quality feeds."
        ),
    )

    parser.add_argument(
        "-B",
        "--with-body",
        action="store_true",
        default=False,
        help=(
            "Include the page body in the feed, i.e., build a complete "
            "feed. This option requires --follow to work and some "
            "caution is required as it is pretty easy to generate "
            "gigantic feeds by following too many links or reading too "
            "much data from them. Careful usage of options --max-links "
            "and --max-page-length is required! The program will only "
            "pick the contents of the <body> element of the pages, up to "
            "the point that --max-page-length allowed it to be loaded. "
            "SECURITY: this program does some effort to remove malicious "
            "content (e.g. scripts) inserted by the page, but the output "
            "is still considered unsafe and we understand that feed "
            "readers *must* handle it in the same way they deal with "
            "potentially malicious feeds loaded from the network. If "
            "your feed reader treats feeds generated by local commands "
            "more liberally, please do not use this option."
        ),
    )

    parser.add_argument(
        "--body-xpath",
        action="store",
        type=str,
        default=None,
        metavar="XPATH_EXPRESSION",
        help=(
            "A XPath expression selecting the HTML elements to be included "
            "in the body of the feed entry when option --with-body is used. "
            'By default, newslinkrss will use the entire "body" element, '
            "but sometimes a more restricted selection is welcome, for "
            "example, one that includes only the relevant text of a news "
            "article, leaving out menus, headers, related news, etc. "
            "This option can be used together with --body-csss and any "
            "that matches will be used, with XPath having priority."
        ),
    )

    parser.add_argument(
        "--body-csss",
        action="store",
        type=str,
        default=None,
        metavar="CSS_SELECTOR",
        help=(
            "A CSS Selector to pick the HTML elements to be included in the "
            "body of the feed entry when option --with-body is used. By "
            'default, newslinkrss will use the entire "body" HTML element, '
            "but sometimes a more restricted selection is welcome, for "
            "example, one that includes only the relevant text of a news "
            "article, leaving out menus, headers, related news, etc. "
            "This option can be used together with --body-xpath and any "
            "that matches will be used, with XPath having priority."
        ),
    )

    parser.add_argument(
        "-R",
        "--body-remove-tag",
        action="append",
        type=str,
        default=None,
        metavar="TAG_NAME",
        help=(
            "Remove all occurrences of the given tag from the feed body and "
            "move their child elements to their parents. This only makes "
            "sense if --with-body is used. This option can be used as many "
            "times as required to remove all unwanted elements and all "
            "children elements. For a more complex operations that allow "
            "arbitrary expressions, see --body-remove-csss and "
            "--body-remove-xpath."
        ),
    )

    parser.add_argument(
        "-X",
        "--body-remove-xpath",
        action="append",
        type=str,
        default=None,
        metavar="XPATH_EXPRESSION",
        help=(
            "Delete the elements specified by the XPath argument from the "
            "feed body, including all their children. This only makes sense "
            "if --with-body is used. This option can be used as many times "
            "as required to remove all unwanted elements. For a CSS Selector "
            "equivalent, see --body-remove-csss. For a simpler version, "
            "to only remove tags but preserve child elements, see "
            "--body-remove-tag."
        ),
    )

    parser.add_argument(
        "-C",
        "--body-remove-csss",
        action="append",
        type=str,
        default=None,
        metavar="CSS_SELECTOR",
        help=(
            "Delete the elements specified by the CSS Selector from the "
            "feed body, including all their children. This only makes sense "
            "if --with-body is used. This option can be used as many times "
            "as required to remove all unwanted elements. For a XPath "
            "equivalent, see --body-remove-xpath. For a simpler version, "
            "to only remove tags but preserve child elements, see "
            "--body-remove-tag."
        ),
    )

    parser.add_argument(
        "-N",
        "--body-rename-tag",
        action="append",
        type=str,
        nargs=2,
        default=None,
        metavar=("TAG_NAME", "NEW_NAME"),
        help=(
            "Replace all occurences of the given tag by the new one in "
            "the feed body. All attributes and the structure are preserved. "
            "A typical use case for this is replacing tags amp-img for img "
            'in sites "infected" by AMP, e.g.: "--body-rename-tag amp-img '
            'img". '
            "This option may be used as many times as required."
        ),
    )

    parser.add_argument(
        "--body-rename-attr",
        action="append",
        type=str,
        nargs=3,
        default=None,
        metavar=("TAG_NAME", "OLD_ATTR_NAME", "NEW_ATTR_NAME"),
        help=(
            "Rename attributes from the given tag. This can be used, for "
            "example, to recover images that are subjected to some lazy "
            "loading strategy: assuming that some site has all images src "
            "attributes pointing to a placeholder image while the actual "
            'URL for the image is in attribute "data-src", it is possible '
            'to recover it by doing "--body-rename-attr img data-src src". '
            "This option has no effect if the element does not have the "
            "source attribute. The destination attribute will be overriden "
            "if it already exists. This option may be used as many times "
            "as required."
        ),
    )

    parser.add_argument(
        "-Q",
        "--qs-remove-param",
        action="append",
        type=str,
        default=None,
        metavar="REGEX",
        help=(
            "If a URL captured from the source page has a query string, "
            "remove parameters with names matching the regular expression "
            "given in this option. This allows striping tracking parameters "
            "or detecting duplicate URLs that only differ by irrelevant "
            "parameters. Notice that these are regular expressions matching "
            "only against the *name* or the parameter and be aware of the "
            "anchors required to match prefixes only, example: '^utm_.+' . "
            "This option may be used several times if required."
        ),
    )

    parser.add_argument(
        "--require-dates",
        action="store_true",
        default=False,
        help=(
            "Only include an entry in the feed if it has a valid date found "
            "from any supported method. Very useful when filtering blogs and "
            "news sites."
        ),
    )

    # HTTP behavior options.
    parser.add_argument(
        "-U",
        "--user-agent",
        action="store",
        metavar="UA_STRING",
        default=DEFAULT_USER_AGENT,
        help=(
            "Set the user agent to identify ourselves to the site. Some sites "
            "can send different types of content according to it or just "
            "deny access do unknown UAs, so the best option is just "
            "impersonate a commonly used browser."
        ),
    )

    parser.add_argument(
        "--max-page-length",
        action="store",
        default=2048,
        metavar="NUMBER",
        type=int,
        help=(
            "Maximum amount of data, in kilobytes, to download from a single "
            "HTTP request for the pages followed when using option --follow. "
            "If this limit is exceeded, any remaining data will be discarded. "
            "Very important when following links because any of them can led "
            "us into downloading a DVD ISO or something like. This option "
            'does not applies to the "first" page, i.e. the one which URL '
            "is given in command line and it is used as starting point for "
            "the entire process; for this limit, use option "
            "--max-first-page-length"
        ),
    )

    parser.add_argument(
        "--max-first-page-length",
        action="store",
        default=2048,
        metavar="NUMBER",
        type=int,
        help=(
            "Maximum amount of data, in kilobytes, to download from the "
            "first pages, i.e. the ones which URLs are given in command line "
            "to start the process. This is important because the server can "
            "generate and infinite amount of data, redirect us to a "
            "DVD ISO or anything else. For limiting the pages downloaded "
            "when following links (option --follow), see option "
            "--max-page-length"
        ),
    )

    parser.add_argument(
        "--encoding",
        action="store",
        type=str,
        metavar="CHARSET",
        help="Use this explicit character encoding instead of detecting it "
        "automatically. Usually only required for pages with incorrect "
        "charset information which cause the feed to also be presented "
        'in the wrong encoding (aka "mojibake").',
    )

    parser.add_argument(
        "--lang",
        action="append",
        type=str,
        default=None,
        metavar="LANGUAGE_CODE",
        help=(
            "Ask the site to return the content in this particular language, "
            "if available. Languages must be specified in ISO 639 or RFC 1766 "
            "codes (e.g. en, en-US, pt-BR, de-DE). This option may be used "
            "several times if required and the order in which the options "
            "are given will be the preference order of the languages. If no "
            "option is given and environment variable LANG is set, "
            "newslinkrss will try to get a language code from it. Internally, "
            "this sets the HTTP Accept-Language header to the prescribed "
            "value(s)."
        ),
    )

    parser.add_argument(
        "--cookie",
        action="append",
        type=str,
        default=None,
        metavar="COOKIE_DEFINITION",
        help=(
            "Add an arbitrary HTTP cookie to all requests sent to the server. "
            "These cookies may be overwritten by cookies set by the server "
            "and then the new ones will be sent in subsequent requests (if "
            "--follow is used). These explicitly requested cookies are sent "
            "even if option --no-cookies is used but, in this case, the "
            "server will not be able to change or replace them. "
            "The cookie definition is the same used for the Set-Cookie HTTP "
            "header as defined in RFC 2965 and can be pretty complex. The "
            "simplest form (NAME=VALUE) can be used ofr the majority of "
            "cases, however. "
            "This option may be repeated as many times as necessary."
        ),
    )

    parser.add_argument(
        "-H",
        "--header",
        action="append",
        type=str,
        default=None,
        metavar="HTTP_HEADER",
        help=(
            "Add an arbitrary HTTP header to all requests send to the "
            "destination server. Headers must be specified in format "
            '"Name: Value" and will be passed to destination almost verbatim, '
            "with only basic whitespace stripping. This option may be "
            "repeated as many times as necessary."
        ),
    )

    parser.add_argument(
        "-t",
        "--http-timeout",
        action="store",
        default=2.0,
        type=float,
        metavar="SECONDS",
        help="Timeout for HTTP(S) requests, in seconds",
    )

    parser.add_argument(
        "--no-cookies",
        action="store_true",
        default=False,
        help=(
            "Do not remember cookies among requests. As cookies are never "
            "persisted across invocations of this command, this will only "
            "have any effect when using --follow, typically for sites that "
            "use cookies to detect too many requests in a row."
        ),
    )

    # Locale, error reporting and output options.
    parser.add_argument(
        "--locale",
        action="store",
        type=str,
        default=None,
        help=(
            "Use this locale for parsing dates and times. By default, "
            "newslinkrss will use the locale from environment variables and "
            "ignore any failure. If this option is used, it will use the "
            "given locale and abort in the event of a failure (e.g. locale "
            "not available). If you want to keep the default best-effort "
            "strategy for a non-default locale, set LC_ALL for newslinkrss "
            "(i.e. call with 'LC_ALL=pt_BR.UTF-8 newslinkrss <options>')."
        ),
    )

    parser.add_argument(
        "-E",
        "--no-exception-feed",
        action="store_true",
        default=False,
        help=(
            "Do not generate feed entries for runtime errors. "
            "The default behavior is to have the information about any "
            "failures that happen when processing the feed returned as the "
            "feed itself, so the user will see it there. However, this "
            "option allows disabling this for moments where it makes more "
            "sense, e.g. when debugging."
        ),
    )

    parser.add_argument(
        "-o",
        "--output",
        action="store",
        metavar="FILENAME",
        help=(
            "Output file to save the feed. If not given, it will be written "
            "to stdout."
        ),
    )

    parser.add_argument(
        "urls",
        action="store",
        nargs="+",
        metavar="URL",
        help="URL of the website to generate the feed.",
    )

    args = parser.parse_args()
    set_log_level(args)
    set_locale(args)

    logger.debug("URL accept pattern: %s", args.link_pattern)
    logger.debug("URL ignore pattern: %s", args.ignore_pattern)

    try:
        make_feed(args)
    except Exception as exc:
        logger.exception("Unhandled exception")
        if args.no_exception_feed:
            # Bare raise re-raises the active exception unchanged.
            raise
        # Report the failure to the user through the feed itself.
        make_exception_feed(exc, args)
        return 1
    return 0


if __name__ == "__main__":
    # Use the value returned by main() as the process exit status.
    exit_status = main()
    sys.exit(exit_status)
