#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json, sys, os
from urllib.parse import urljoin

import click
from bs4 import BeautifulSoup

from lawfactory_utils.urls import download, enable_requests_cache, clean_url

from senapy.dosleg import opendata


def _log(*args):
    """Print ``args`` to standard error, formatted exactly as ``print`` would."""
    # Diagnostics go to stderr so stdout stays reserved for the commands'
    # actual output (JSON documents, lists of URLs).
    stream = sys.stderr
    print(*args, file=stream)


# Root command group: the subcommands below register themselves on it through
# ``@cli.command``. The docstring doubles as the ``--help`` text shown by click
# (``\b`` keeps click from re-wrapping the paragraph that follows it).
@click.group()
def cli():
    """
    \b
    USAGE:
        - senapy-cli parse <ID_DOSSIER_SENAT|URL_DOSSIER_SENAT|HTML_FILE>
        - senapy-cli doslegs_urls
        - cat urls | senapy-cli parse_many <output_dir>
    """


@cli.command('parse')
@click.argument('url')
@click.option('--enable-cache', is_flag=True)
def cli_parse(url, enable_cache):
    """
    \b
    Parse a senat dosleg given either a:
        - senat ID, ex: pjl07-488
        - URL
        - HTML file
    """
    # Imported lazily so the parser is only loaded by the commands that use it.
    from senapy.dosleg.parser import parse

    if enable_cache:
        enable_requests_cache()

    if os.path.exists(url):
        # The argument names a local file: parse its content directly.
        # The context manager guarantees the file handle is closed.
        with open(url) as html_file:
            html = html_file.read()
        data = parse(html)
    else:
        if not url.startswith('http'):
            # A bare senat ID was given: build the URL of the dosleg page.
            url = "http://www.senat.fr/dossier-legislatif/%s.html" % url
        html = download(url).text
        data = parse(html, url)

    # The parsed dosleg is written to stdout as pretty-printed JSON.
    print(json.dumps(data, ensure_ascii=False, indent=2, sort_keys=True))


@cli.command()
@click.option('--min-year', default=2008)
@click.option('--max-year', default=None, type=int)
@click.option('--filter-on-promulgation', is_flag=True)
@click.option('--in-discussion', is_flag=True)
def doslegs_urls(min_year, max_year, filter_on_promulgation, in_discussion):
    """
    Print the URLs of senat doslegs on stdout, sorted, one per line.

    URLs are collected from the Open Data CSV (skipped with --in-discussion),
    keeping the doslegs whose year lies between --min-year and --max-year, and,
    when --max-year is not given, from the links found on a senat.fr page.
    The year is taken from the promulgation date with --filter-on-promulgation,
    from the initial date otherwise.
    """
    # NOTE: --max-year is declared with type=int. Without an explicit type,
    # click treats an option whose default is None as a string, and the
    # `year <= max_year` comparison below would raise a TypeError.
    urls_open_data = set()
    if not in_discussion:
        _log('finding urls in Open Data CSV...(keeping only doslegs in and after', min_year, ')')
        # rewrite columns titles to make it easier to work with
        lines = [{k.lower().replace(' ', '_'): v for k, v in line.items()}
                 for line in opendata.fetch_csv()]
        filter_on = 'date_promulgation' if filter_on_promulgation else 'date_initiale'
        for line in lines:
            date = line[filter_on]
            if not date:
                # no date in the chosen column: the dosleg cannot be filtered
                continue
            # the year is the last '/'-separated component of the date
            year = int(date.split('/')[-1])
            if year >= min_year and (max_year is None or year <= max_year):
                urls_open_data.add(clean_url(line['url_du_dossier']))
        _log('  =>', len(urls_open_data), 'urls found in CSV')

    urls_html = set()
    if max_year is None:
        # The website is only scraped when no upper bound on the year is set.
        base_url = 'http://www.senat.fr/dossiers-legislatifs/textes-recents.html'
        if in_discussion:
            base_url = 'https://www.senat.fr/ordre-du-jour/ordre-du-jour.html'
        _log('finding urls on', base_url)
        html = download(base_url).text
        soup = BeautifulSoup(html, 'html5lib')

        for link in soup.select('a'):
            href = link.attrs.get('href', '')
            if '/dossier-legislatif/' in href:
                # hrefs may be relative: resolve them against the page URL
                url = urljoin(base_url, href)
                urls_html.add(clean_url(url))
        _log('  =>', len(urls_html), 'doslegs found on website')

    urls = urls_open_data | urls_html
    # Only report a total when it differs from the website count just logged.
    if len(urls) != len(urls_html):
        _log('=>', len(urls), 'doslegs found in total')

    for url in sorted(urls):
        print(url)


@cli.command()
@click.argument('output_dir')
@click.option('--overwrite', is_flag=True)
@click.option('--disable-cache', is_flag=True)
def parse_many(output_dir, overwrite, disable_cache):
    """
    Parse the dosleg URLs read from stdin (one per line) into OUTPUT_DIR.

    Each dosleg is downloaded, parsed and written as JSON to a file named
    after the last path component of its URL, without the ".html" suffix.
    Files that already exist are skipped unless --overwrite is given.
    The requests cache is enabled unless --disable-cache is given.
    """
    # Imported lazily so the parser is only loaded by the commands that use it.
    from senapy.dosleg.parser import parse

    if not disable_cache:
        enable_requests_cache()

    os.makedirs(output_dir, exist_ok=True)

    for url in sys.stdin:
        url = url.strip()
        if not url:
            # Blank lines carry no URL; skip them instead of trying to
            # download an empty address.
            continue

        senat_id = url.split('/')[-1].replace('.html', '')

        filepath = os.path.join(output_dir, senat_id)
        if not overwrite and os.path.exists(filepath):
            continue

        _log(' -- ', url)
        html = download(url).text
        parsed = parse(html, url)
        # The output is written with ensure_ascii=False, so open the file as
        # UTF-8 explicitly rather than relying on the locale's encoding; the
        # context manager closes (and flushes) the file after each dosleg.
        with open(filepath, 'w', encoding='utf-8') as output_file:
            json.dump(parsed, output_file, ensure_ascii=False, indent=2, sort_keys=True)
    _log('parsing finished')


# Invoke the click command group when this file is executed as a script.
if __name__ == '__main__':
    cli()
