#!python

import sys
import os
import logging
import argparse
import socket
import getpass
import json
import multiprocessing
import random

import requests
import requests_cache

from datetime import datetime

from warcio.warcwriter import WARCWriter
from warcio.statusandheaders import StatusAndHeaders

from requests_futures.sessions import FuturesSession
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

import aiu

from aiu import ArchiveItCollection
from aiu import convert_LinkTimeMap_to_dict
from aiu import generate_archiveit_urits
from aiu import process_timemaps_for_mementos
from aiu import discover_raw_urims
from aiu import get_uri_responses

# Number of CPUs reported for this host; used below as the worker count for
# the concurrent memento requests.
cpu_count = multiprocessing.cpu_count()

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Value recorded in the 'software' field of every 'warcinfo' record written.
software_name = "seeds2warc script"

def process_arguments(args):
    """Parse the command-line arguments of the seeds2warc script.

    Args:
        args: The argument vector in the style of ``sys.argv``: the first
            element is the program name, the remaining elements are the
            options to parse.

    Returns:
        The ``argparse.Namespace`` with the attributes ``collection_id``,
        ``output_directory`` and ``cachefile``.

    Raises:
        SystemExit: raised by argparse when a required option is missing or
            an option is not recognized.
    """

    parser = argparse.ArgumentParser(prog="{}".format(args[0]),
        description='*THIS SOFTWARE IS EXPERIMENTAL* Produces a WARC from the seeds of an Archive-It collection.',
        formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument('-c', '--collection_id', dest='collection_id',
        required=True, help="The identifier of the Archive-It collection to download")

    parser.add_argument('-o', '--outputdir', dest='output_directory',
        required=True, help="The directory to use when writing out the WARC")

    parser.add_argument('-cf', dest="cachefile",
        help="the SQLite file to use for caching",
        default="/tmp/fetch_ait_metadata_cache")

    # Parse the vector we were handed instead of implicitly re-reading
    # sys.argv, so the function honours its own parameter.
    return parser.parse_args(args[1:])

def _write_error_record(errorsout, urim, message):
    """Write one JSON-lines error record for `urim` to the open text file `errorsout`."""
    errorsout.write(json.dumps({"URI-M": urim, "Error": message}))
    # one JSON object per line, otherwise the .jsonl file cannot be parsed
    errorsout.write("\n")


def _open_warc(output_filename, warcinfo):
    """Open a gzipped WARC at `output_filename`, write its 'warcinfo' record, return (file, writer)."""
    output = open(output_filename, 'wb')
    try:
        writer = WARCWriter(output, gzip=True)
        record = writer.create_warcinfo_record(os.path.basename(output_filename), warcinfo)
        writer.write_record(record)
    except Exception:
        # do not leak the handle when the first record cannot be written
        output.close()
        raise
    return output, writer


def _original_uri_from_headers(headers):
    """Return the "original_uri" entry parsed from the "link" header of `headers`.

    Raises KeyError when there is no "link" header or the parsed result has
    no "original_uri" entry; the parser may also raise
    aiu.timemap.MalformedLinkFormatTimeMap (callers catch both).
    """
    return convert_LinkTimeMap_to_dict(headers["link"])["original_uri"]


def fetch_mementos_and_write_warcs(timemap_data, working_directory, collection_id,
        max_warc_size=100000000):
    """Download the raw mementos listed in `timemap_data` and store them in WARCs.

    WARC files are written to ``<working_directory>/archives`` and problems
    are recorded, one JSON object per line, in
    ``<working_directory>/capture/memento_errors/errors.jsonl``.

    Args:
        timemap_data: mapping of URI-T to a TimeMap dictionary; each TimeMap
            is read as ``timemap["mementos"]["list"]``, a sequence of
            dictionaries holding the memento's URI-M under ``"uri"``.
        working_directory: base directory under which the ``archives`` and
            ``capture/memento_errors`` directories are created if missing.
        collection_id: Archive-It collection identifier; used in the WARC
            file names and in the 'warcinfo' record.
        max_warc_size: once the current WARC file position exceeds this many
            bytes, a new WARC file is started before the next memento is
            written. Defaults to 100000000.

    Raises:
        Exception: any error raised while reading a TimeMap entry is logged
            and re-raised; errors from the requests themselves propagate.
    """

    output_directory = "{}/archives".format(working_directory)
    error_directory = "{}/capture/memento_errors".format(working_directory)
    errors_filename = "{}/errors.jsonl".format(error_directory)
    warc_filename_stem = "{}/ARCHIVEIT-{}-{}".format(output_directory,
        collection_id, socket.gethostname())

    if not os.path.isdir(output_directory):
        os.makedirs(output_directory)

    if not os.path.isdir(error_directory):
        os.makedirs(error_directory)

    urims = []

    for urit in timemap_data:

        timemap = timemap_data[urit]

        try:
            for memento in timemap["mementos"]["list"]:
                urims.append(memento["uri"])
        except Exception:
            logger.error("Error encountered processing TimeMap at {}".format(urit))
            logger.error("TimeMap Object: {}".format(timemap))
            # bare raise keeps the original traceback
            raise

    raw_urimdata, errordata = discover_raw_urims(urims)

    # 'w' starts a fresh error log for this run, even when there is nothing to report yet
    with open(errors_filename, 'w') as errorsout:
        for urim in errordata:
            _write_error_record(errorsout, urim, errordata[urim])

    # several URI-Ms may lead (e.g. via redirects) to the same raw URI-M,
    # so remember every URI-M behind each raw URI-M
    invert_raw_urimdata_mapping = {}
    raw_urims = []

    for urim in raw_urimdata:
        raw_urim = raw_urimdata[urim]
        invert_raw_urimdata_mapping.setdefault(raw_urim, []).append(urim)
        raw_urims.append(raw_urim)

    logger.info("Issuing requests for {} raw mementos".format(len(raw_urims)))

    retry_session = requests.Session()
    retry = Retry(
        total=10,
        read=10,
        connect=10,
        backoff_factor=0.3,
        status_forcelist=(500, 502, 504)
    )
    adapter = HTTPAdapter(max_retries=retry)
    retry_session.mount('http://', adapter)
    retry_session.mount('https://', adapter)

    with FuturesSession(max_workers=cpu_count, session=retry_session) as session:
        futures = get_uri_responses(session, raw_urims)

    warcinfo = {
        'software': software_name,
        'hostname': socket.gethostname(),
        # this does not always work correctly
        # 'ip': socket.gethostbyname(socket.gethostname()),
        'isPartOf': 'Archive-It Collection {}'.format(collection_id),
        'description': 'Crawl of seed mementos from Archive-It Collection {}'.format(
            collection_id),
        'operator': getpass.getuser()
    }

    incrementor = 0
    output_filename = "{}-{}.warc.gz".format(warc_filename_stem, incrementor)
    output, writer = _open_warc(output_filename, warcinfo)

    try:
        # Visit every distinct raw URI-M exactly once. Asking the future for
        # its result waits for that request, so no polling loop is needed and
        # a skipped memento can never be picked again.
        for raw_urim in invert_raw_urimdata_mapping:

            if output.tell() > max_warc_size:
                output.close()
                incrementor += 1
                output_filename = "{}-{}.warc.gz".format(warc_filename_stem, incrementor)
                logger.info("creating new WARC at {}".format(output_filename))
                logger.info("writing out 'warcinfo' record")
                output, writer = _open_warc(output_filename, warcinfo)

            source_urims = invert_raw_urimdata_mapping[raw_urim]

            logger.info("Processing raw URI-M {} associated with URI-M {}".format(
                raw_urim, source_urims))

            response = futures[raw_urim].result()

            logger.info("Raw URI-M {} is done".format(raw_urim))

            try:
                # TODO: if the original URI used a Link header, it will be overridden
                urir = _original_uri_from_headers(response.headers)
            except (KeyError, aiu.timemap.MalformedLinkFormatTimeMap):
                logger.warning("no original relation in the Link header for raw memento at {}".format(raw_urim))

                sample_urim = source_urims[0]

                logger.warning("recording this at {}/errors.jsonl".format(error_directory))
                with open(errors_filename, 'a') as errorsout:
                    _write_error_record(
                        errorsout, sample_urim,
                        "no original relation in Link header for raw memento at {}, "
                        "attempted to use sample memento at {} instead".format(
                            raw_urim, sample_urim
                        )
                    )

                logger.warning("attempting to acquire original relation from original URI-M at {}".format(sample_urim))

                r = requests.get(sample_urim)

                try:
                    urir = _original_uri_from_headers(r.headers)
                except (KeyError, aiu.timemap.MalformedLinkFormatTimeMap):
                    logger.warning("could not process raw memento at {}, skipping...".format(raw_urim))

                    logger.warning("recording this at {}/errors.jsonl".format(error_directory))
                    with open(errors_filename, 'a') as errorsout:
                        _write_error_record(
                            errorsout, sample_urim,
                            "could not process raw memento at {}, "
                            " no original relation in Link header at sample memento {} either".format(
                                raw_urim, sample_urim
                            )
                        )

                    continue

            headers_list = response.raw.headers.items()

            # sometimes, via redirects, the different URI-Ms end up at the
            # same raw URI-M
            logger.info("There are {} URI-Ms leading to raw URI-M {}".format(
                len(source_urims), raw_urim
            ))

            http_headers = StatusAndHeaders(
                str(response.status_code),
                headers_list,
                protocol="HTTP/1.1")

            #TODO: don't we want the MDT of the URI-M, not the raw URI-M discovered at the end of the redirect chain?
            mdt = datetime.strptime(
                response.headers['memento-datetime'],
                "%a, %d %b %Y %H:%M:%S GMT").strftime(
                    "%Y-%m-%dT%H:%M:%SZ"
                )
            # the trailing "Z" declares UTC, so the timestamp must be taken in UTC
            nowdate = datetime.utcnow().strftime(
                "%Y-%m-%dT%H:%M:%SZ"
            )

            for urim in source_urims:
                logger.info("writing out WARC record for URI-M {}".format(urim))

                warc_headers_dict = {
                    "WARC-Date": mdt,
                    "WARC-Creation-Date": nowdate,
                    "WARC-Source-URI": raw_urim,
                    "WARC-Source-URI-Orig": urim
                }

                # NOTE(review): the same response.raw stream is handed to every
                # record written for this raw URI-M; confirm that records after
                # the first still receive the payload.
                record = writer.create_warc_record(urir, 'response',
                    payload=response.raw, http_headers=http_headers,
                    warc_headers_dict=warc_headers_dict)

                writer.write_record(record)

            logger.debug("Removing raw URI-M {} from processing list".format(raw_urim))

    finally:
        # close the current WARC even when a request or a write failed
        if not output.closed:
            output.close()

if __name__ == '__main__':

    # Script entry point: parse the options, configure logging and the HTTP
    # cache, then run the four steps that turn a collection's seeds into WARCs.
    args = process_arguments(sys.argv)

    # switch to the root logger so the configuration below applies to all modules
    logger = logging.getLogger()
    loglevel = logging.INFO
    logging.basicConfig(
        format='%(asctime)s [%(levelname)s] %(name)s: %(message)s',
        level=loglevel)

    logger.info("Starting Archive-It seed mementos to WARC *NOTE THAT THIS IS EXPERIMENTAL CODE*")
    logger.info("Using collection ID {}".format(args.collection_id))

    # everything for this run lives under <outputdir>/<collection id>
    output_directory = "{}/{}".format(args.output_directory, args.collection_id)

    if not os.path.isdir(output_directory):
        os.makedirs(output_directory)

    logger.info("Data will be written out to {}".format(output_directory))

    # cache HTTP responses in the SQLite file given by -cf
    requests_cache.install_cache(args.cachefile, backend='sqlite')

    # 1. get seeds for Archive-It collection
    aic = ArchiveItCollection(collection_id=args.collection_id)
    seed_uris = aic.list_seed_uris()

    # 2. get list of URI-Ts
    urit_list = generate_archiveit_urits(args.collection_id, seed_uris)

    # 3. process TimeMaps for mementos
    timemap_data = process_timemaps_for_mementos(urit_list, output_directory)

    # 4. download mementos and save them to WARCs
    fetch_mementos_and_write_warcs(timemap_data, output_directory, args.collection_id)

    logger.info("Data has been written out to {}".format(output_directory))

    logger.info("Finished run")
