#!/usr/bin/env python
"""
Directory merge tool that consolidates files from a source directory into the
destination directory by comparing file names, file hashes, file sizes, and
modification timestamps.

Usage:
  $ gristle_dir_merger --source-dir [source_dir] --dest-dir [dest_dir] [misc options]


{see: helpdoc.HELP_SECTION}


Main Options:
  --source-dir [SOURCE_DIR]
                        The source directory - from which files are deleted and
                        moved into the destination directory.
  --dest-dir [DEST_DIR] The destination directory - where files are moved into.


Match-Controlling Options:

  --match-on [MATCH_ON] Directs the program to match files on either just their names
                        or on the combination of names and md5 hashes.  Valid values
                        include:  name_only and name_and_md5.  The default is:
                        name_and_md5.

  --on-full-match [ON_FULL_MATCH]
                        Describes the Action for files matched completely.
                        Valid values include:
         keep_dest    - Keeps the destination file and deletes the source file.
                        This is the default - because it is the fastest, and
                        the two files are already deemed to be identical.
         keep_source  - Keeps the source and deletes the destination.  This
                        override might be useful if you want to keep the
                        ownership, privs, etc associated with the source file.
                        Or if you're only matching on name.
         keep_both    - Keeps both files by copying the source_file over with a
                        slightly modified name - in which a number is inserted
                        prior to the file extension.
         keep_newest  - Keeps the newest of the two matching files, as
                        determined by modification time.
         keep_biggest - Keeps the largest of the two matching files.

  --on-partial-match [ON_PARTIAL_MATCH]
                        Describes the Action for files matched by name, but not 
                        by md5.  Only applies to match-on=name_and_md5.
                        Valid values include:
         keep_newest  - Keeps the newest of the two matching files, as
                        determined by modification time.  This is the default.
         keep_biggest - Keeps the largest of the two matching files.
         keep_both    - Keeps both files by copying the source_file over with a
                        slightly modified name - in which a number is inserted
                        prior to the file extension.
         keep_dest    - Keeps the destination file and deletes the source file.
         keep_source  - Keeps the source and deletes the destination.

MISC Options:

  --dry-run             Lists files and action, but does not change them.
  -r, --recursive       Walk through directories recursively.  Default is
                        non-recursive.
  --ignore-errors       Will ignore errors that would otherwise stop processing
                        such as encountering a symbolic link.


{see: helpdoc.CONFIG_SECTION}


Examples
  $ gristle_dir_merger --source-dir /tmp/foo --dest-dir /data/foo
        - Compares source of /tmp/foo to dest of /data/foo.
        - Files will be consolidated into /data/foo, and deleted from /tmp/foo.
        - Comparison will be: match-on-name-and-md5 (default)
        - Full matches will use: keep_dest (default)
        - Partial matches will use: keep_newest (default)
        - Bottom line: this is what you normally want

  $ gristle_dir_merger --source-dir /tmp/foo --dest-dir /data/foo --dry-run
        - Same as the first example - except it only prints what it would do
          without actually doing it.
        - Bottom line: this is a good step to take prior to running it for real

  $ gristle_dir_merger --source-dir /tmp/foo --dest-dir /data/foo -r
        - Same as the first example - except it runs recursively through
          the directories.

  $ gristle_dir_merger --source-dir /tmp/foo --dest-dir /data/foo
    --on-partial-match keep_biggest
        - Comparison will be: match-on-name-and-md5 (default)
        - Full matches will use: keep_dest (default)
        - Partial matches will use: keep_biggest (override)
        - Bottom line: this is a good combo if you know that some files
          have been modified on both source & dest, and newest isn't the best.

  $ gristle_dir_merger --source-dir /tmp/foo --dest-dir /data/foo
    --match-on name_only --on-full-match keep_source
        - Comparison will be: match-on-name-only (override)
        - Full matches will use: keep_source (override)
        - Bottom line: this is a good way to go if you have
          files that have changed in both directories, but always want to
          use the source files.

Performance
    Because of a number of optimizations, gristle_dir_merger does as little work
    as it can get away with.  This means using the dest_dir copy rather than the
    source_copy when two files match, and it means stopping the md5 process as
    soon as the files are discovered to be different.

    A simple benchmark was performed with a circa 2010 laptop with an SSD drive
    using two 600 MB directories, each containing the exact same 550 files
    spread across 70 directories.  This is a worst-case scenario, since 100% of
    all files will have to be MD5'd:
        - using defaults (match-on-name-and-md5 and keep-dest):
            - ~9 seconds
        - using worst-case settings  (match-on-name-and-md5 and keep-source):
            - ~13 seconds
    These times would be far slower on magnetic media.

Warnings and Limitations
    - Concurrency - this program is not intended to be used on data while
      other processes are modifying the files.


This source code is protected by the BSD license.  See the file "LICENSE"
in the source code root directory for the full language or refer to it here:
  http://opensource.org/licenses/BSD-3-Clause

Copyright 2014-2021 Ken Farmer
"""
import errno
import glob
import hashlib
import logging
import os
from os.path import dirname, basename
from os.path import isdir, isfile, exists, join as pjoin
from os.path import getsize
from pprint import pprint as pp
import shutil
from signal import signal, SIGPIPE, SIG_DFL
import sys
import time
from typing import Dict

import datagristle.common as comm
import datagristle.configulator as conf
import datagristle.helpdoc as helpdoc

#--- Restore the default SIGPIPE action (SIG_DFL) so that writing to a pipe
#--- whose reader has gone away does not surface as a Python exception
#--- (http://docs.python.org/library/signal.html)
signal(SIGPIPE, SIG_DFL)

#--- Script name and help text handed to ConfigManager in main().
#--- LONG_HELP is built from this module's docstring; SHORT_HELP is derived
#--- from LONG_HELP.
NAME = basename(__file__)
LONG_HELP = helpdoc.expand_long_help(__doc__)
SHORT_HELP = helpdoc.get_short_help_from_long(LONG_HELP)
#--- Runs at import time, before main() is called.
comm.validate_python_version()



def main():
    """ Reads the configuration, sets up logging, merges the source directory
        into the destination directory and optionally reports the counters.

        Outputs:
            - 0 - always, used by the caller as the process exit status
    """
    nconfig, _ = ConfigManager(NAME, SHORT_HELP, LONG_HELP).get_config()

    setup_logs(nconfig.verbosity)

    dir_walker = ProcessDirs(starting_src_dir=nconfig.source_dir,
                             matchon=nconfig.match_on,
                             full_match=nconfig.on_full_match,
                             part_match=nconfig.on_partial_match,
                             recursive=nconfig.recursive,
                             dry_run=nconfig.dry_run,
                             ignore_err=nconfig.ignore_errors)
    dir_walker.walk(nconfig.source_dir, nconfig.dest_dir)

    # The counters are only written for the two most verbose settings.
    wants_stats = nconfig.verbosity in ('high', 'debug')
    if wants_stats:
        dir_walker.write_stats()

    return 0



class ProcessDirs(object):
    """ Handles walking through the directories then running the merge process
        against files within them.  Also keeps the counters reported by
        write_stats().
    """

    def __init__(self,
                 starting_src_dir: str,
                 matchon: str,
                 full_match: str,
                 part_match: str,
                 recursive: bool,
                 dry_run: bool,
                 ignore_err: bool):
        """ Inputs:
                - starting_src_dir - top-level source directory, must exist
                - matchon    - 'name_only' or 'name_and_md5'
                - full_match - key into MergeFiles.action applied to full matches
                - part_match - key into MergeFiles.action applied to partial
                               matches
                - recursive  - if True, walk() descends into subdirectories
                - dry_run    - handed to MergeFiles
                - ignore_err - handed to MergeFiles and to kill_if_symlink()
        """
        assert isdir(starting_src_dir)

        self.start_time = time.time()
        self.ignore_err = ignore_err
        self.recursive = recursive
        self.matchon = matchon
        self.starting_src_dir = starting_src_dir
        self.dest_dirs: Dict[str, int] = {} # key is dirpath, value is count of entries within
        self.dest_files: Dict[str, list] = {} # key is dirpath, value is list of entry names
        self.full_match = full_match
        self.part_match = part_match
        self.merger = MergeFiles(dry_run, ignore_err)
        self.full_match_cnt = 0
        self.partial_match_cnt = 0
        self.no_match_cnt = 0
        self.sdir_cnt = 0
        self.sfile_cnt = 0
        self.sdotfile_cnt = 0
        self.ddir_cnt = 0
        self.dfile_cnt = 0



    def walk(self, src_dir, dst_dir):
        """ Inputs:
                - src_dir - fully qualified source directory
                - dst_dir - fully qualified destination directory
            Processing
                - Will call itself for subdirectories when recursive is set
                - Will terminate the program if it encounters a symbolic link,
                  unless --ignore-errors option was provided
                - Removes source subdirectories that are (or have become) empty
                - After the top-level source directory has been processed it
                  is removed as well if it is empty
        """
        kill_if_symlink(src_dir, 'source', self.ignore_err)
        kill_if_symlink(dst_dir, 'dest', self.ignore_err)

        source_dirs, source_files = self.get_walk_entries(src_dir)

        for sub_dir in source_dirs:
            src_sub_dir = pjoin(src_dir, sub_dir)
            dst_sub_dir = pjoin(dst_dir, sub_dir)

            self.sdir_cnt += 1
            self._is_srcfile_in_destdir(None, dst_dir) # ensure dir in cache
            kill_if_symlink(src_sub_dir, 'source', self.ignore_err)
            kill_if_symlink(dst_sub_dir, 'dest', self.ignore_err)

            # NOTE(review): sub_dir comes from get_walk_entries(), which has
            # already filtered on isdir() of this same source path, so this
            # condition is expected to hold every time and the else branch
            # looks unreachable.  Confirm whether a check against the dest
            # path was intended before changing it: _move_source() creates
            # the dest dir before moving, which would nest a moved directory.
            if isdir(src_sub_dir):
                if self.recursive:
                    self.walk(src_sub_dir, dst_sub_dir)
                if not os.listdir(src_sub_dir):
                    self.merger.keep_dest(src_sub_dir, None)
                    log_reason('source_dir empty - will be removed')

            else:
                self.no_match_cnt += 1
                reason = 'source_dir has no matching dest'
                self.merger.keep_source(src_sub_dir, dst_sub_dir, reason=reason)

        for file_name in source_files:
            self.sfile_cnt += 1
            if file_name.startswith('.'):
                self.sdotfile_cnt += 1
            self._merge_source_file(pjoin(src_dir, file_name), dst_dir)

        if src_dir == self.starting_src_dir:
            self.remove_top_source()


    def get_walk_entries(self, src_dir):
        """ Intended to emulate the os.walk() interaction.
            Inputs:
                - src_dir - a directory
            Outputs:
                - dirs  - a list of names of subdirectories within src_dir
                - files - a list of names of all other entries within src_dir
        """
        dirs = []
        files = []
        for entry in os.listdir(src_dir):
            if isdir(pjoin(src_dir, entry)):
                dirs.append(entry)
            else:
                files.append(entry)
        return dirs, files


    def _merge_source_file(self, src_path, dst_path):
        """ Attempts to match a source file against a corresponding one in the
            dest dir by name_only or name_and_md5.  Then it performs the action
            specified for the level of match that occurred.
            Inputs:
               - src_path - the source path - must specify a file
               - dst_path - the dest path - must specify a dir if it exists
            Processing:
               - exits the program with status 2 if matchon holds an
                 unsupported value
        """
        assert isfile(src_path)
        if exists(dst_path):
            assert isdir(dst_path)

        if self._is_srcfile_in_destdir(basename(src_path), dst_path):
            if self.matchon == 'name_only':
                self.full_match_cnt += 1
                rsn = 'full-match'
                self.merger.action[self.full_match](src_path, dst_path, rsn)
            elif self.matchon == 'name_and_md5':
                if compare_md5s(src_path, pjoin(dst_path, basename(src_path))) == 'matched':
                    self.full_match_cnt += 1
                    rsn = 'full-match'
                    self.merger.action[self.full_match](src_path, dst_path, rsn)
                else:
                    self.partial_match_cnt += 1
                    rsn = 'partial-match'
                    self.merger.action[self.part_match](src_path, dst_path, rsn)
            else:
                logging.critical('Logic Error, bad value: %s', self.matchon)
                sys.exit(2)
        else:
            self.no_match_cnt += 1
            rsn = 'source_file has no matching dest'
            self.merger.keep_source(src_path, dst_path, rsn)



    def _is_srcfile_in_destdir(self, src_file, dest_dir):
        """ Determines if source_file is also in the corresponding dest_dir.
            The listing of each dest_dir is read once and then cached in
            self.dest_dirs / self.dest_files, so entries added to dest_dir
            afterwards are not reflected.
            Inputs:
                - src_file - unqualified file name, or None to only make sure
                             that dest_dir is in the cache
                - dest_dir
            Outputs:
                - Boolean - False when src_file is empty or None
        """
        if dest_dir not in self.dest_dirs:
            # Escape the directory part so that glob metacharacters in the
            # directory name itself ('[', '*', '?') are matched literally;
            # otherwise the entries of such a directory are never found.
            escaped_dir = glob.escape(dest_dir)
            dest_file_list = glob.glob(pjoin(escaped_dir, '*'))
            # '*' does not match names that start with a dot
            dest_file_list.extend(glob.glob(pjoin(escaped_dir, '.*')))
            self.dest_dirs[dest_dir] = len(dest_file_list)
            self.dest_files[dest_dir] = [basename(f) for f in dest_file_list]

        if not src_file:
            return False
        return src_file in self.dest_files[dest_dir]


    def generate_dest_counts(self):
        """ Derives ddir_cnt and dfile_cnt from the cached dest listings.
            Only dest directories with at least one entry are counted.
        """
        ddir_cnt = 0
        dfile_cnt = 0
        for entry_cnt in self.dest_dirs.values():
            if entry_cnt > 0:
                ddir_cnt += 1
                dfile_cnt += entry_cnt
        self.ddir_cnt = ddir_cnt
        # NOTE(review): one entry per counted directory is subtracted here,
        # presumably to avoid counting subdirectories as files - confirm.
        self.dfile_cnt = dfile_cnt - ddir_cnt


    def remove_top_source(self):
        """ Removes the top-level source directory if it is empty.
        """
        if not os.listdir(self.starting_src_dir):
            self.merger.keep_dest(self.starting_src_dir, None)
            log_reason('source_dir empty - will be removed')


    def write_stats(self):
        """ Writes the source, dest, match and action counters plus the
            elapsed time to the log at info level.
        """
        self.generate_dest_counts()

        logging.info('')
        logging.info('---------- Counts -----------')
        logging.info('Total Source Objects:       %d', (self.sdir_cnt + self.sfile_cnt))
        logging.info('   Directory Read Cnt:      %d', self.sdir_cnt)
        logging.info('   File Read Cnt:           %d', self.sfile_cnt)
        logging.info('      DotFile Read Cnt:     %d', self.sdotfile_cnt)
        logging.info('Total Dest Objects:         %d', (self.ddir_cnt + self.dfile_cnt))
        logging.info('   Directory Read Cnt:      %d', self.ddir_cnt)
        logging.info('   File Read Cnt:           %d', self.dfile_cnt)
        logging.info('')
        logging.info('Full Match Cnt:             %d', self.full_match_cnt)
        logging.info('Partial Match Cnt:          %d', self.partial_match_cnt)
        logging.info('No Match Cnt:               %d', self.no_match_cnt)
        logging.info('')
        logging.info('Delete Source Cnt:          %d', self.merger.delete_source_cnt)
        logging.info('Move Source Cnt:            %d', self.merger.move_source_cnt)
        logging.info('Rename And Move Source Cnt: %d', self.merger.rename_and_move_source_cnt)
        logging.info('')
        logging.info('Seconds Duration:     %7.0f', (time.time() - self.start_time))




class MergeFiles(object):
    """ Carries out the individual merge actions (delete, move, rename and
        move) and counts how often each one ran.  With dry_run set the actions
        are logged and counted, but nothing on disk is changed.
    """

    def __init__(self,
                 dry_run: bool,
                 ignore_err: bool):

        self.dry_run = dry_run
        self.ignore_err = ignore_err
        self.move_source_cnt = 0
        self.rename_and_move_source_cnt = 0
        self.delete_source_cnt = 0
        # Dispatch table: action name -> bound method of the same name.
        self.action = {}
        for action_name in ('keep_dest', 'keep_source', 'keep_both',
                            'keep_newest', 'keep_biggest'):
            self.action[action_name] = getattr(self, action_name)


    def keep_dest(self, src_fqfn, dst_dir, reason=''):
        """ Deletes the source; dst_dir is accepted but not used.
        """
        self._delete_source(src_fqfn, rsn=reason)


    def keep_source(self, src_fqfn, dst_dir, reason=''):
        """ Moves the source into dst_dir.
        """
        self._move_source(src_fqfn, dst_dir, reason=reason)


    def keep_biggest(self, src_fqfn, dst_dir, reason=''):
        """ Moves the source into dst_dir if it is strictly larger than the
            same-named dest file, otherwise deletes the source.
        """
        src_size = getsize(src_fqfn)
        dst_size = getsize(pjoin(dst_dir, basename(src_fqfn)))

        if src_size <= dst_size:
            self._delete_source(src_fqfn, rsn=reason)
            return
        self._move_source(src_fqfn, dst_dir, reason=reason)


    def keep_newest(self, src_fqfn, dst_dir, reason=''):
        """ Moves the source into dst_dir if its modification time is strictly
            later than that of the same-named dest file, otherwise deletes the
            source.
        """
        src_mtime = os.path.getmtime(src_fqfn)
        dst_mtime = os.path.getmtime(pjoin(dst_dir, basename(src_fqfn)))

        if src_mtime <= dst_mtime:
            self._delete_source(src_fqfn, rsn=reason)
            return
        self._move_source(src_fqfn, dst_dir, reason=reason)


    def keep_both(self, src_fqfn, dst_dir, reason=''):
        """ Neither file replaces the other - the source is moved into the
            destination directory under a name that is not yet taken there.
        """
        self._rename_and_move_source(src_fqfn, dst_dir, reason=reason)


    def _move_source(self, src_fqfn, dst_dir, reason):
        """ Moves a source file or directory to dst_dir.
            Exits with status 2 if the source is a directory while dst_dir is
            an existing file, unless ignore_err is set.
        """
        log_action('move_source', src_fqfn)
        log_reason(reason)
        for path, side in ((src_fqfn, 'source'), (dst_dir, 'dest')):
            kill_if_symlink(path, side, self.ignore_err)
        self.move_source_cnt += 1

        src_kind = get_fileobj_type(src_fqfn)
        dst_kind = get_fileobj_type(dst_dir)

        if (src_kind, dst_kind) == ('dir', 'file'):
            logging.critical('dest_dir exists, but is a file: %s' % dst_dir)
            if not self.ignore_err:
                sys.exit(2)

        if self.dry_run:
            return

        if not dst_kind:  # dst_dir doesn't exist
            create_dest_dir_if_needed(dst_dir)

        if src_kind == 'file':
            # copy then remove, rather than a single move call
            shutil.copy2(src_fqfn, dst_dir)
            os.remove(src_fqfn)
        else:
            shutil.move(src_fqfn, dst_dir)


    def _rename_and_move_source(self, src_fqfn, dst_dir, reason=None):
        """ Moves a source file into dst_dir under the name returned by
            create_unique_file_name().
            Exits with status 1 if the source is a directory, unless
            ignore_err is set.
        """
        log_action('rename_and_move_source', src_fqfn)
        log_reason(reason)
        for path, side in ((src_fqfn, 'source'), (dst_dir, 'dest')):
            kill_if_symlink(path, side, self.ignore_err)

        if isdir(src_fqfn):
            logging.critical('Invalid files: source must be file')
            logging.critical('Source: %s' % src_fqfn)
            if self.ignore_err is False:
                sys.exit(1)

        self.rename_and_move_source_cnt += 1
        unique_name = create_unique_file_name(dst_dir, basename(src_fqfn))

        if self.dry_run:
            return
        shutil.move(src_fqfn, pjoin(dst_dir, unique_name))


    def _delete_source(self, src_fqfn, rsn=None):
        """ Removes the source: os.remove() for a regular file, os.rmdir()
            for anything else.
        """
        log_action('delete_source', src_fqfn or '')
        log_reason(rsn)
        kill_if_symlink(src_fqfn, 'source', self.ignore_err)
        self.delete_source_cnt += 1

        if self.dry_run:
            return
        remove = os.remove if isfile(src_fqfn) else os.rmdir
        remove(src_fqfn)



def get_fileobj_type(path):
    """ Classifies a path.
        Outputs:
            - None if the path does not exist, 'file' if it is a regular
              file, 'dir' for anything else that exists
    """
    if not exists(path):
        return None
    return 'file' if isfile(path) else 'dir'



def create_dest_dir_if_needed(dest_dir):
    """ Creates dest_dir, including missing parent directories, and logs the
        creation at info level.  Does nothing if the path already exists.
    """
    if exists(dest_dir):
        return
    os.makedirs(dest_dir)
    logging.info('Created dir: %s' % dest_dir)



def compare_md5s(src_path, dst_path):
    """ Compares two files to determine if their content is identical
        through the use of MD5 hashes.   Two optimizations exist:
           - it first checks to see if the file sizes are identical
           - it hashes 128 * md5 block size bytes at a time and stops at the
             first chunk after which the running hashes differ
        Inputs:
           - src_path - fully qualified source file name
           - dst_path - fully qualified dest file name
        Outputs:
           - result of either 'matched' or 'not-matched'
    """
    assert isfile(src_path)
    assert isfile(dst_path)

    #---- first run a cheap check: -----
    if os.path.getsize(src_path) != os.path.getsize(dst_path):
        return 'not-matched'

    #---- next keep comparing sections, hoping to bail out early
    #---- on a difference:
    source_md5 = hashlib.md5()
    dest_md5 = hashlib.md5()
    chunk_size = 128 * source_md5.block_size

    # Binary mode: the raw bytes are hashed, so files that are not valid text
    # can be compared and no newline translation hides a difference.
    with open(src_path, 'rb') as source_f, open(dst_path, 'rb') as dest_f:
        while True:
            source_chunk = source_f.read(chunk_size)
            dest_chunk = dest_f.read(chunk_size)
            if not source_chunk and not dest_chunk:
                break
            # If only one side ran out of data the update below makes the
            # hashes differ, which is reported as a mismatch.
            source_md5.update(source_chunk)
            dest_md5.update(dest_chunk)
            if source_md5.digest() != dest_md5.digest():
                return 'not-matched'

    return 'matched'



def create_unique_file_name(dir_name, file_name):
    """ Finds a file name for the keep_both action that is not yet taken
        within dir_name, so that the source file can be placed there without
        stepping on the matching dest file.

        The name is returned unchanged if nothing with that name exists in
        dir_name.  Otherwise a counter, starting at 1, is inserted between
        the base name and the extension (foo.txt -> foo.1.txt) and increased
        until the resulting name is free.

        Inputs:
           - dir_name  - should be the dest_dir
           - file_name - unqualified name of the source or dest file
        Outputs:
           - file_name - the unqualified, possibly modified, file name
    """
    stem, suffix = os.path.splitext(file_name)

    candidate = file_name
    attempt = 0
    while exists(pjoin(dir_name, candidate)):
        attempt += 1
        candidate = '%s.%d%s' % (stem, attempt, suffix)

    return candidate



def setup_logs(verbosity, log_to_console=True):
    """ Configures the root logger to write bare messages to stdout at the
        level that corresponds to the requested verbosity.
        Inputs:
            - verbosity - one of 'quiet', 'normal', 'high' or 'debug'
            - log_to_console - not used, kept so existing callers keep working
        Raises:
            - KeyError if verbosity is not one of the values listed above
    """
    # 'high' maps to INFO: the actions logged by log_action() and the counters
    # written by ProcessDirs.write_stats() are all emitted at info level, and
    # nothing in this file logs at warning level - so a WARNING threshold
    # would make 'high' indistinguishable from 'normal'.
    translation = {'quiet': 'CRITICAL',
                   'normal': 'ERROR',
                   'high': 'INFO',
                   'debug': 'DEBUG'}
    log_level = translation[verbosity]
    logging.basicConfig(level=log_level,
                        format='%(message)s',
                        stream=sys.stdout)

def log_action(on_f_match, file_name):
    """ Logs an action name and the file it is applied to at info level.
    """
    logging.info('%s - %s', on_f_match, file_name)

def log_reason(reason):
    """ Logs the reason for an action at debug level; an empty or missing
        reason is skipped.
    """
    if not reason:
        return
    logging.debug('   reason=%s', reason)



def kill_if_symlink(fqfn, dir_type, ignore_errors):
    """ Tests a fully-qualified file name to see if it's a symlink. If it
        fails the test it generally exits the program.
        Inputs:
            - fqfn - "fully qualified file name" - may be either a file or a
                     directory
            - dir_type - must be either 'source' or 'dest'
            - ignore_errors - doesn't exit if True
        Process:
            - if the fqfn is a symlink, it will write an error msg to the log
              and immediately exit the program with status 1.  This includes
              symlinks whose target does not exist.
            - if the file doesn't exist, and it's a source dir_type, then it
              will also error & exit the program.  This is because navigation
              occurs through source_dir - so files should never turn up missing.
              But they may have no corresponding directory on the dest side -
              so this could get called for a dest_dir that doesn't exist.
    """
    assert dir_type in ['source', 'dest']

    # Test for a link before testing for existence: exists() follows links
    # and reports a dangling symlink as missing, which would let it slip
    # through on the dest side and mislabel it on the source side.
    if os.path.islink(fqfn):
        logging.critical('Symbolic link not supported: %s', fqfn)
        if not ignore_errors:
            sys.exit(1)
    elif not exists(fqfn) and dir_type == 'source':
        logging.critical('File missing while checking for symlink: %s', fqfn)
        if not ignore_errors:
            sys.exit(1)



class ConfigManager(conf.Config):
    """ Declares the options of this script and the checks applied to them.
    """

    def define_user_config(self) -> None:
        """ Registers the script-specific options, followed by the standard
            dry_run and verbosity options and the config and help options.
        """
        for dir_option in ('source_dir', 'dest_dir'):
            self.add_custom_metadata(name=dir_option,
                                     type=str,
                                     required=True)

        self.add_custom_metadata(name='match_on',
                                 type=str,
                                 default='name_and_md5',
                                 choices=['name_only', 'name_and_md5'])

        match_actions = ('keep_dest', 'keep_source', 'keep_both', 'keep_newest', 'keep_biggest')
        self.add_custom_metadata(name='on_full_match',
                                 type=str,
                                 default='keep_dest',
                                 choices=list(match_actions))
        # on_partial_match gets no default here, see extend_config()
        self.add_custom_metadata(name='on_partial_match',
                                 type=str,
                                 choices=list(match_actions))

        self.add_custom_metadata(name='recursive',
                                 short_name='r',
                                 type=bool,
                                 default=False,
                                 action='store_const',
                                 const=True)
        self.add_custom_metadata(name='ignore_errors',
                                 type=bool,
                                 default=False,
                                 action='store_const',
                                 const=True)

        for standard_option in ('dry_run', 'verbosity'):
            self.add_standard_metadata(standard_option)
        self.add_all_config_configs()
        self.add_all_help_configs()


    def extend_config(self,
                      override_filename=None) -> None:
        """ Resolves on_partial_match against match_on: it must not be given
            together with match_on 'name_only', and it falls back to
            'keep_newest' otherwise.  override_filename is not used.
        """
        partial_action = self.config['on_partial_match']

        if self.config['match_on'] == 'name_only':
            if partial_action is not None:
                comm.abort("on-partial-match must not be provided when match-on is 'name_only'")
        elif partial_action is None:
            self.config['on_partial_match'] = 'keep_newest'


    def validate_custom_config(self,
                               config: conf.CONFIG_TYPE):
        """ Aborts if ignore_errors is set without dry_run, or if source_dir
            or dest_dir is not an existing directory.
        """
        if config.get('ignore_errors') and not config.get('dry_run'):
            comm.abort('ignore-errors requires dry-run')

        for dir_option in ('source_dir', 'dest_dir'):
            if not isdir(config.get(dir_option, '')):
                comm.abort(f"{dir_option} not found: {config.get(dir_option)}")



#--- When run as a script: call main() and use its return value as the
#--- process exit status
if __name__ == '__main__':
    sys.exit(main())
