#!/usr/bin/env python
#
# Copyright (C) 2014-2015 DNAnexus, Inc.
#
# This file is part of dx-toolkit (DNAnexus platform client libraries).
#
#   Licensed under the Apache License, Version 2.0 (the "License"); you may not
#   use this file except in compliance with the License. You may obtain a copy
#   of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
#   WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
#   License for the specific language governing permissions and limitations
#   under the License.

import concurrent.futures
import os
import sys
import json
import argparse
import dxpy
from dxpy.utils import file_load_utils
from dxpy.utils.printing import fill, refill_paragraphs, BOLD, RED

# Help text displayed by the argument parser. The examples use the same
# file extension throughout, so the directory listings match the inputs
# they illustrate.
description = BOLD('Note') + ''': this is a utility for use by bash apps
running in the DNAnexus Platform.

Downloads all files that were supplied as inputs to the app.  By
convention, if an input parameter "FOO" has value

    {"$dnanexus_link": "file-xxxx"}

and filename INPUT.TXT, then the linked file will be downloaded into the
path:

    $HOME/in/FOO/INPUT.TXT

If an input is an array of files, then all files will be placed into
numbered subdirectories under a parent directory named for the
input. For example, if the input key is FOO, and the inputs are {A, B,
C}.vcf, then the directory structure will be:

    $HOME/in/FOO/0/A.vcf
                 1/B.vcf
                 2/C.vcf

Zero padding is used to ensure argument order. For example, if there are
12 input files {A, B, C, D, E, F, G, H, I, J, K, L}.vcf, the directory
structure will be:

    $HOME/in/FOO/00/A.vcf
                 ...
                 11/L.vcf

This allows using shell globbing (FOO/*/*.vcf) to get all the files in the input
order.
'''

# Command-line interface
#
#   --except NAME  skip the input called NAME; may be repeated, the names
#                  are collected in args.exclude
#   --parallel     sets args.parallel to True
#   --sequential   sets args.parallel to False (also the default)
parser = argparse.ArgumentParser(
    formatter_class=argparse.RawTextHelpFormatter,
    description=refill_paragraphs(description),
)
parser.add_argument(
    '--except',
    dest="exclude",
    action="append",
    default=[],
    help=fill('Do not download the input with this name. (May be used multiple times.)',
              width_adjustment=-20),
)

# Both switches below write to the same destination; this parser-level
# default is the value used when neither of them is given.
parser.set_defaults(parallel=False)
parser.add_argument("--parallel", dest="parallel", action="store_true",
                    help="Download the files in parallel")
parser.add_argument("--sequential", dest="parallel", action="store_false",
                    help="Download the files sequentially")
args = parser.parse_args()

# Number of worker threads used for parallel downloads
max_num_parallel_downloads = 8

def create_dirs(idir, dirs):
    '''
    Prepare the directory tree that the input files are stored in: the
    top-level directory <idir>, followed by one subdirectory of it for
    each entry of <dirs>. For example, seq1 could be stored under:
        /in/seq1/NC_001122.fasta

    Every entry of <dirs> gets a directory; in particular, a directory
    is created for every file array, even one with zero inputs.

    TODO: directory creation could fail, we need to report a reasonable
    error code
    '''
    make_dir = file_load_utils.ensure_dir

    # the top-level directory comes first, then everything underneath it
    make_dir(idir)
    for name in dirs:
        subdir_path = os.path.join(idir, name)
        make_dir(subdir_path)

def download_one_file(file_rec):
    '''
    Download a single file and return its record unchanged.

    file_rec is indexed by the following keys:
        src_file_id -- passed to dxpy.download_dxfile as the file to fetch
        trg_fname   -- destination path, relative to the module-level
                       input directory <idir>
    '''
    file_id = file_rec['src_file_id']
    local_path = os.path.join(idir, file_rec['trg_fname'])

    # announce the transfer before starting it
    message = "downloading file: " + file_id + " to filesystem: " + local_path
    print(message)

    dxpy.download_dxfile(file_id, local_path)
    return file_rec

def sequential_file_download(to_download):
    '''
    Download the files one after another, in the order they are listed.

    to_download: list of file records; each one is handed to
                 download_one_file as is
    '''
    for record in to_download:
        download_one_file(record)

def parallel_file_download(to_download):
    '''
    Download the files concurrently, using a pool of worker threads.

    to_download: list of file records; each one is handed to
                 download_one_file as is

    The pool has max_num_parallel_downloads workers. If a download
    raises, the failing file is reported on stderr, downloads that have
    not started yet are cancelled, and the exception is re-raised.
    '''
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_num_parallel_downloads) as executor:
        # Map every future back to its record, so that a failure can be
        # attributed to a file.
        future_files = {executor.submit(download_one_file, file_rec): file_rec
                        for file_rec in to_download}
        for future in concurrent.futures.as_completed(future_files):
            try:
                future.result()
            except Exception:
                file_rec = future_files[future]
                sys.stderr.write('%r -> %s generated an exception\n'
                                 % (file_rec['src_file_id'], file_rec['trg_fname']))
                # Leaving the "with" block waits for all submitted work.
                # Cancel whatever is still queued so the failure surfaces
                # without first downloading the remaining files; futures
                # that are running or finished are unaffected by cancel().
                for pending in future_files:
                    pending.cancel()
                raise


# Input directory, where all inputs are downloaded
idir = file_load_utils.get_input_dir()
job_input_file = file_load_utils.get_input_json_file()
# dirs:   directories to create under idir
# inputs: dictionary whose values are lists of file records
#         (presumably keyed by input name -- it is filtered by the
#         excluded names below; verify against file_load_utils)
# rest:   not used by this script
dirs, inputs, rest = file_load_utils.get_job_input_filenames(job_input_file)

# Exclude directories.
# NOTE(review): per the help text, an array input FOO is laid out as
# FOO/0, FOO/1, ... If dirs lists those subdirectories, comparing whole
# entries against the excluded names would still create them, so an entry
# is also dropped when its parent directory is excluded. Confirm the
# shape of dirs against file_load_utils.get_job_input_filenames.
excluded = set(args.exclude)
dirs_to_create = []
for d in dirs:
    parent = os.path.dirname(d)
    if d in excluded or (parent and parent in excluded):
        continue
    dirs_to_create.append(d)

# Create the directory structure, in preparation for download.
# Allows performing the download in parallel.
create_dirs(idir, dirs_to_create)

# Remove excluded inputs
if args.exclude:
    inputs = file_load_utils.filter_dict(inputs, args.exclude)

# Convert to a flat list of elements to download
to_download = []
for ival_list in inputs.values():
    to_download.extend(ival_list)

# Download the files
if args.parallel:
    parallel_file_download(to_download)
else:
    sequential_file_download(to_download)
