#!python

import sys
import os
import logging
import argparse
import socket
import getpass
import json
import multiprocessing
import random
import hashlib
import requests

from datetime import datetime

from warcio.warcwriter import WARCWriter
from warcio.statusandheaders import StatusAndHeaders

from requests_futures.sessions import FuturesSession

from aiu import convert_LinkTimeMap_to_dict
from aiu import process_timemaps_for_mementos
from aiu import discover_raw_urims
from aiu import get_uri_responses

# Number of CPUs reported by the host; used below as the worker count
# (max_workers) of the FuturesSession that downloads raw mementos.
cpu_count = multiprocessing.cpu_count()

# Module-level logger. When this file runs as a script, the __main__ section
# rebinds this name to the root logger after configuring logging.
logger = logging.getLogger(__name__)

# Recorded in the 'software' field of every warcinfo record written out.
software_name = "seeds2warc script"

def process_arguments(args):
    """Parse the command-line arguments of the script.

    Args:
        args: Argument vector laid out like ``sys.argv``: ``args[0]`` is the
            program name shown in usage/help output, the remaining items are
            the options to parse.

    Returns:
        argparse.Namespace with two attributes:
            urit: the TimeMap URI (URI-T) given with ``-t``/``--urit``.
            output_directory: the directory given with ``-o``/``--outputdir``.

    Raises:
        SystemExit: raised by argparse when a required option is missing or
            an unknown option is supplied.
    """

    parser = argparse.ArgumentParser(prog="{}".format(args[0]),
        description='*THIS SOFTWARE IS EXPERIMENTAL* Produces a WARC from the mementos in a TimeMap.',
        formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument('-t', '--urit', dest='urit',
        required=True, help="The TimeMap URI (URI-T) of the mementos to download")

    parser.add_argument('-o', '--outputdir', dest='output_directory',
        required=True, help="The directory to use when writing out the WARC")

    # Parse the vector we were handed rather than implicitly re-reading
    # sys.argv; args[0] is the program name and is not an option.
    return parser.parse_args(args[1:])

def _record_memento_error(error_directory, urim, error):
    """Append one JSON object describing a failed memento to errors.jsonl."""
    logger.warning("recording this at {}/errors.jsonl".format(error_directory))

    with open("{}/errors.jsonl".format(error_directory), 'a') as errorsout:
        # one object per line keeps the file valid JSON Lines
        errorsout.write(json.dumps({"URI-M": urim, "Error": error}) + "\n")


def _start_warc_file(output_filename, warcinfo):
    """Open a gzipped WARC at output_filename and write its warcinfo record.

    Returns the (file object, WARCWriter) pair; the caller closes the file.
    """
    output = open(output_filename, 'wb')

    try:
        writer = WARCWriter(output, gzip=True)
        record = writer.create_warcinfo_record(
            os.path.basename(output_filename), warcinfo)
        writer.write_record(record)
    except Exception:
        # do not leak the handle if the very first record cannot be written
        output.close()
        raise

    return output, writer


def fetch_mementos_and_write_warcs(timemap_data, working_directory, urit,
        max_warc_size=100000000):
    """Download the raw mementos listed in TimeMaps and store them in WARCs.

    The URI-Ms of every TimeMap in ``timemap_data`` are collected, mapped to
    raw URI-Ms with ``discover_raw_urims``, requested through a
    ``FuturesSession`` and written as 'response' records to gzipped WARC
    files named ``<working_directory>/archives/TIMEMAP-<sha256 of urit>-
    <hostname>-<n>.warc.gz``. Problems are logged to
    ``<working_directory>/capture/memento_errors/errors.jsonl`` with one JSON
    object per line.

    Args:
        timemap_data: mapping of URI-T to a TimeMap structure; each URI-M is
            read from ``timemap["mementos"]["list"][i]["uri"]``.
        working_directory: directory under which ``archives`` and
            ``capture/memento_errors`` are created.
        urit: the TimeMap URI naming the WARC files and recorded in the
            warcinfo record.
        max_warc_size: once the current WARC file position exceeds this many
            bytes, the next record starts a new WARC file.

    Raises:
        Exception: whatever is raised while reading a TimeMap structure is
            logged and re-raised.
        KeyError, ValueError: if a fetched raw memento has no parseable
            ``memento-datetime`` header.
        Any exception stored in a download future is raised by ``result()``.
    """
    # Local imports: these names are not in the module's import block.
    import io
    from datetime import timezone

    output_directory = "{}/archives".format(working_directory)
    error_directory = "{}/capture/memento_errors".format(working_directory)
    warc_filename_stem = "{}/TIMEMAP-{}-{}".format(output_directory,
        hashlib.sha256(urit.encode('utf8')).hexdigest(), socket.gethostname())

    os.makedirs(output_directory, exist_ok=True)
    os.makedirs(error_directory, exist_ok=True)

    urims = []

    # The loop variable is deliberately not named ``urit``: the parameter is
    # still needed below for the warcinfo record.
    for timemap_urit in timemap_data:

        timemap = timemap_data[timemap_urit]

        try:
            for memento in timemap["mementos"]["list"]:
                urims.append(memento["uri"])
        except Exception:
            logger.error("Error encountered processing TimeMap at {}".format(timemap_urit))
            logger.error("TimeMap Object: {}".format(timemap))
            raise

    # Used here as: raw_urimdata maps URI-M -> raw URI-M, errordata maps
    # URI-M -> error description.
    raw_urimdata, errordata = discover_raw_urims(urims)

    # 'w' starts a fresh error log for this run; later entries are appended.
    with open("{}/errors.jsonl".format(error_directory), 'w') as errorsout:
        for urim in errordata:
            errorsout.write(json.dumps(
                {
                    "URI-M": urim,
                    "Error": errordata[urim]
                }
            ) + "\n")

    # raw URI-M -> every URI-M that led to it
    invert_raw_urimdata_mapping = {}
    raw_urims = []

    for urim in raw_urimdata:
        raw_urim = raw_urimdata[urim]
        invert_raw_urimdata_mapping.setdefault(raw_urim, []).append(urim)
        raw_urims.append(raw_urim)

    logger.info("Issuing requests for {} raw mementos".format(len(raw_urims)))

    with FuturesSession(max_workers=cpu_count) as session:
        # indexed below by raw URI-M; each value is used as a future
        futures = get_uri_responses(session, raw_urims)

    warcinfo = {
        'software': software_name,
        'hostname': socket.gethostname(),
        # this does not always work correctly
        # 'ip': socket.gethostbyname(socket.gethostname()),
        'isPartOf': 'TimeMap {}'.format(urit),
        'description': 'Crawl of seed mementos from TimeMap {}'.format(
            urit),
        'operator': getpass.getuser()
    }

    incrementor = 0
    output_filename = "{}-{}.warc.gz".format(warc_filename_stem, incrementor)
    output, writer = _start_warc_file(output_filename, warcinfo)

    try:
        # Each distinct raw URI-M is handled exactly once, so a memento that
        # has to be skipped can no longer stall the loop.
        for raw_urim, source_urims in invert_raw_urimdata_mapping.items():

            if output.tell() > max_warc_size:
                output.close()
                incrementor += 1
                output_filename = "{}-{}.warc.gz".format(warc_filename_stem, incrementor)
                logger.info("creating new WARC at {}".format(output_filename))
                logger.info("writing out 'warcinfo' record")
                output, writer = _start_warc_file(output_filename, warcinfo)

            logger.info("Processing raw URI-M {} associated with URI-M {}".format(
                raw_urim, source_urims))

            # result() waits for the download instead of polling done()
            response = futures[raw_urim].result()

            logger.info("Raw URI-M {} is done".format(raw_urim))

            try:
                linkdict = convert_LinkTimeMap_to_dict(response.headers["link"])
                urir = linkdict["original_uri"]
            except KeyError:
                logger.warning("no original relation in the Link header for raw memento at {}".format(raw_urim))

                sample_urim = source_urims[0]

                _record_memento_error(error_directory, sample_urim,
                    "no original relation in Link header for raw memento at {}, "
                    "attempted to use sample memento at {} instead".format(
                        raw_urim, sample_urim
                    ))

                logger.warning("attempting to acquire original relation from original URI-M at {}".format(sample_urim))

                r = requests.get(sample_urim)

                try:
                    linkdict = convert_LinkTimeMap_to_dict(r.headers["link"])
                    urir = linkdict["original_uri"]
                except KeyError:
                    logger.warning("could not process raw memento at {}, skipping...".format(raw_urim))

                    _record_memento_error(error_directory, sample_urim,
                        "could not process raw memento at {}, "
                        " no original relation in Link header at sample memento {} either".format(
                            raw_urim, sample_urim
                        ))

                    continue

            headers_list = response.raw.headers.items()

            # sometimes, via redirects, the different URI-Ms end up at the
            # same raw URI-M
            logger.info("There are {} URI-Ms leading to raw URI-M {}".format(
                len(source_urims), raw_urim
            ))

            http_headers = StatusAndHeaders(
                str(response.status_code),
                headers_list,
                protocol="HTTP/1.1")

            #TODO: don't we want the MDT of the URI-M, not the raw URI-M discovered at the end of the redirect chain?
            mdt = datetime.strptime(
                response.headers['memento-datetime'],
                "%a, %d %b %Y %H:%M:%S GMT").strftime(
                    "%Y-%m-%dT%H:%M:%SZ"
                )
            # the trailing 'Z' claims UTC, so take the time in UTC
            nowdate = datetime.now(timezone.utc).strftime(
                "%Y-%m-%dT%H:%M:%SZ"
            )

            # response.raw can only be read once; buffer it so every URI-M
            # sharing this raw URI-M gets the full payload.
            # NOTE(review): if get_uri_responses does not stream, the raw
            # body may already be consumed and empty here - confirm.
            payload_bytes = response.raw.read() or b""

            for urim in source_urims:
                logger.info("writing out WARC record for URI-M {}".format(urim))

                warc_headers_dict = {
                    "WARC-Date": mdt,
                    "WARC-Creation-Date": nowdate,
                    "WARC-Source-URI": raw_urim,
                    "WARC-Source-URI-Orig": urim
                }

                record = writer.create_warc_record(urir, 'response',
                    payload=io.BytesIO(payload_bytes), http_headers=http_headers,
                    warc_headers_dict=warc_headers_dict)

                writer.write_record(record)

            logger.debug("Removing raw URI-M {} from processing list".format(raw_urim))

    finally:
        # close the current WARC even when a record fails to be written
        if not output.closed:
            output.close()

if __name__ == '__main__':

    # Script entry point: parse the command line, set up logging, then
    # turn the mementos of the requested TimeMap into WARC files.
    cli_options = process_arguments(sys.argv)

    # Log at INFO through the root logger; the module-level ``logger`` name
    # is rebound so the functions above log through it as well.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(name)s: %(message)s')
    logger = logging.getLogger()

    timemap_uri = cli_options.urit

    logger.info("Starting TimeMap mementos to WARC")
    logger.info("Using TimeMap URI (URI-T) {}".format(timemap_uri))

    # Each TimeMap gets its own subdirectory named after the SHA-256 hex
    # digest of its URI-T.
    urit_digest = hashlib.sha256(timemap_uri.encode('utf8')).hexdigest()
    run_directory = "{}/{}".format(cli_options.output_directory, urit_digest)

    if not os.path.isdir(run_directory):
        os.makedirs(run_directory)

    logger.info("Data will be written out to {}".format(run_directory))

    # Step 1: process TimeMaps for mementos.
    # Step 2: download mementos and save them to WARCs.
    collected_timemaps = process_timemaps_for_mementos(
        [timemap_uri], run_directory)
    fetch_mementos_and_write_warcs(
        collected_timemaps, run_directory, timemap_uri)

    logger.info("Data has been written out to {}".format(run_directory))

    logger.info("Finished run")