#!/bin/sh

##############################################################################
# Rearrange columns in a csv file.
#

##############################################################################
# BOOTSTRAP
#
# Include ../lib in the search path before calling python.
# (Thanks to https://unix.stackexchange.com/questions/20880)
#
if "true" : '''\'
then
    # This part is only ever executed by /bin/sh; Python sees it as the body of
    # a string literal and skips it.
    #
    # Prepend ../lib (relative to this script) to the Python module search
    # path.  "$0" is quoted so that an install path containing spaces still
    # works, and the ":$PYTHONPATH" suffix is added only when PYTHONPATH is
    # already set, so no empty trailing entry is created.
    export PYTHONPATH="$(dirname "$0")/../lib${PYTHONPATH:+:$PYTHONPATH}"
    pythoncmd=python

    # Prefer python3 when it is installed.
    if command -v python3 >/dev/null; then
        pythoncmd=python3
    fi

    # Re-run this same file under Python; exit 127 is only reached if the
    # exec itself fails.
    exec "$pythoncmd" "$0" "$@"
    exit 127
fi
'''

##############################################################################
# PYTHON CODE BEGINS HERE

import os
import re
import sys
import errno
import getopts
from csvutils import libcsv

__copyright__ = 'Copyright 2019-2022 Mark Kim'
__license__ = 'Apache 2.0'
__version__ = '1.0.1'
__author__ = 'Mark Kim'


##############################################################################
# GLOBALS

# File name of this script, interpolated into the usage and error messages.
SCRIPTNAME = os.path.basename(__file__)


class opts:
    # Holder for the parsed command-line settings; main() fills these in and
    # the rest of the file reads them as class attributes.

    # --delim: value delimiter; None leaves the choice to the csv reader.
    delim = None

    # --fields: list of field selector objects; None until the option is seen.
    fields = None

    # Input file names given on the command line ('-' stands for stdin).
    files = []

    # --inverse: output only the columns that were NOT selected.
    inverse = False

    # --multitable: tables within one file are separated by an empty line.
    multitable = False


##############################################################################
# USAGE

def usage():
    '''Print the command-line help text to stdout.'''

    # The help text is kept in a local string instead of the docstring so that
    # --help still works when Python discards docstrings (python -OO), in which
    # case __doc__ would be None.  {SCRIPTNAME} and {libcsv.DELIMS} are filled
    # in from the module globals below.
    text = '''\
Extract and rearrange columns in a csv file.

usage: {SCRIPTNAME} [OPTIONS] [FILES]

Options:
  FILE                  csv file(s) to cut.

  -d, --delim=DELIM     Use  DELIM as the value delimiter, where DELIM may be
                        'p' for the pipe (|), 't' for the tab (\\t), 'a' for
                        the SOH (ASCII 1), or other string literal of one or
                        more characters and Python  string  escape  sequences.
                        DELIM  may  include  escape characters.  By default the
                        delimiter is guessed from the characters in the
                        CSV_DELIMS environment variable.

  -f, --fields=FIELDS   Fields to extract.  FIELDS is a comma separated list of
                        field criteria to extract from the files.  Each
                        criterion may be one of:

                        * COLNUM:  The column number.  The first column is 1.
                          The last column is -1.

                        * COLNUM-[COLNUM]:  A range of columns, both inclusive.
                          The first COLNUM must be a lower number than the
                          second COLNUM.  If the second COLNUM is omitted, it
                          is considered to be the last column.

                        * -:  A hyphen extracts all columns that have yet to be
                          extracted by the prior criteria.

                        * COLNAME:  The name of the column from the first row
                          of the CSV file.  If the column name is invalid, a
                          blank value is output.  The column name is assumed to
                          be unique.

                        * /REGEX/[MOD]:  The name of the column, using regex.
                          All columns matching the REGEX are output.  REGEX
                          cannot have a comma.  MOD is a modifier:

                            i  case insensitive match.

                        * ~/REGEX/[MOD]:  The values matching this regex.

                        The order in which the criteria are specified is the
                        order in which they are output.

  -m, --multitable      Multitable support.  One csv file may include multiple
                        csv data by separating them by an empty line.

  -v, --inverse         Extract non-matching fields only.

Environment Variables:
  CSV_DELIMS            A set of characters used to guess the delimiter of a
                        csv file.  The guesswork happens when reading  the
                        first  line  of  the first FILE, by testing each
                        character present in CSV_DELIMS for the character with
                        the most occurrences in the line.  If any of the
                        characters occur the same number  of  times  (including
                        zero),  the  earlier  character  in the variable is
                        chosen.  If the environment variable is not set, it
                        defaults to '{libcsv.DELIMS}'.  CSV_DELIMS may include
                        escape characters.
'''

    print(text.format(**globals()))


##############################################################################
# MAIN

def main():
    '''Parse the command line into `opts`, then cut every input file.

    Exits with status 0 after printing the help text, and with status 1 when
    an option is rejected or no --fields option was given.  When no file is
    named, input is read from stdin.
    '''

    errcount = 0

    getopt = getopts.getopts(sys.argv, {
        'd' : 1, 'delim'      : 1,
        'f' : 1, 'fields'     : 1,
        'm' : 0, 'multitable' : 0,
        'v' : 0, 'inverse'    : 0,
        'h' : 0, 'help'       : 0,
    })

    for c in getopt:
        # '-' marks a non-option argument (a file name).  This is an equality
        # test on purpose: `c in ('-')` is a substring test against the string
        # '-', which is also true for the empty string.
        if c == '-':
            opts.files.append(getopt.optarg)
        elif c in ('d', 'delim'):
            opts.delim = arg_to_delim(getopt.optarg)
        elif c in ('f', 'fields'):
            opts.fields = parse_fieldselector(getopt.optarg)
        elif c in ('m', 'multitable'):
            opts.multitable = True
        elif c in ('v', 'inverse'):
            opts.inverse = True
        elif c in ('h', 'help'):
            usage()
            sys.exit(0)
        else:
            # Anything else is counted as a command-line error.
            errcount += 1

    if opts.fields is None:
        sys.stderr.write('Must specify at least one field using --fields\n')
        errcount += 1

    # Sanity check
    if errcount:
        sys.stderr.write('Type `{SCRIPTNAME} --help` for help.\n'.format(**globals()))
        sys.exit(1)

    # Read from STDIN if no file specified
    if not opts.files:
        opts.files.append("-")

    # Read each file
    for f in opts.files:
        with smart_open(f) as fo:
            csvcut(fo, opts.fields)


def arg_to_delim(delim):
    '''Translate a --delim argument into the delimiter string to use.

    The shorthands 'p', 't' and 'a' stand for the pipe, the tab and SOH
    (ASCII 1) respectively.  Python string escape sequences in the argument
    are decoded, and the decoded form is used only when it is a single
    character; otherwise the argument is returned as typed.  An argument whose
    escapes cannot be decoded (e.g. a lone backslash) is also returned as
    typed.
    '''

    # Delimiter shorthands.  An exact lookup is used so that only these three
    # one-letter arguments are translated (an empty argument is left alone).
    shorthands = {'p': '|', 't': '\t', 'a': '\u0001'}
    delim = shorthands.get(delim, delim)

    # Translate escape characters.  unicode_escape decodes bytes as Latin-1, so
    # the text is encoded as Latin-1 as well, with anything outside that range
    # turned into a \uXXXX escape that the decoder restores.  This keeps
    # non-ASCII delimiters such as the section sign intact.
    try:
        decoded = delim.encode('latin-1', 'backslashreplace').decode('unicode_escape')
    except UnicodeDecodeError:
        # Malformed or truncated escape sequence: use the argument literally.
        return delim

    # Use the decoded string only if it's a single character
    if len(delim) == 1 or len(decoded) == 1:
        delim = decoded

    return delim


def parse_fieldselector(string):
    '''Turn a comma separated --fields argument into field selector objects.

    Each comma separated item is classified by the pattern below and replaced
    by the matching FieldSelector* instance.  An item that fits none of the
    forms is taken to be a column name.  The selectors are returned as a list
    in the order the items were given.
    '''

    spec_rc = re.compile(
        r'^('
        r'(?P<COLNO>[-+]?[0-9]+)'
        r'|(?P<RANGE>(?P<START>[-+]?[0-9]+)[-](?P<END>([-+]?[0-9]+)?))'
        r'|(?P<VMOD>[~])?[/](?P<REGEX>([^\\/]|\\.)+)[/](?P<MOD>[i]*)'
        r'|(?P<REST>[-])'
        r'|=(?P<COLNAME>.+)'
        r')$'
    )

    def build(spec):
        # Map a single item to its selector object.
        found = spec_rc.search(spec)

        if found is None:
            return FieldSelectorByColumnName(spec)

        groups = found.groupdict()

        if groups['COLNO']:
            return FieldSelectorByColumnNumber(int(groups['COLNO']))

        if groups['RANGE']:
            first = groups['START']
            last = groups['END']
            first = None if first == '' else int(first)
            last = None if last == '' else int(last)
            return FieldSelectorByColumnRange(first, last)

        # A leading '~' turns the regex into a match on cell values; without
        # it the regex is matched against column names.
        if groups['VMOD']:
            return FieldSelectorByValueRegex(groups['REGEX'], groups['MOD'])

        if groups['REGEX']:
            return FieldSelectorByColumnRegex(groups['REGEX'], groups['MOD'])

        if groups['COLNAME']:
            return FieldSelectorByColumnName(groups['COLNAME'])

        if groups['REST']:
            return FieldSelectorByUnselected()

        return FieldSelectorByColumnName(spec)

    return [build(spec) for spec in string.split(',')]


def smart_open(filename, mode='r'):
    '''Open a file, treating '-' as stdin or stdout depending on the mode.

    filename -- path to open, or '-' for a standard stream: stdin when `mode`
                contains 'r', stdout otherwise.
    mode     -- mode string, applied to regular files and to the standard
                streams alike.

    Returns a file object the caller may close freely; for '-' it wraps a
    duplicate of the descriptor, so sys.stdin / sys.stdout stay open.
    '''

    if filename == '-':
        # Duplicate stdin/stdout so the caller can close it without closing
        # the process-wide stream.
        fd = sys.stdin.fileno() if 'r' in mode else sys.stdout.fileno()

        # Pass the mode on: os.fdopen() defaults to read mode, which would make
        # a duplicated stdout unwritable.
        return os.fdopen(os.dup(fd), mode)

    return open(filename, mode)


def csvcut(file, fields):
    '''Write the selected columns of every row of `file` to stdout.

    file   -- open file object holding the csv data.
    fields -- list of selector objects; each one's select() is called once per
              row and its results are output in that order.

    Cells are joined by the delimiter the row reports.  A row with no cells is
    echoed as an empty line.  When opts.inverse is set, the selectors are still
    run but the row's remaining unselected cells are output instead.
    '''

    for row in RowIterator(file, opts.delim):
        selectable = FieldSelectableRow(row)
        delim = row.delim()

        # An empty row is passed through as an empty line.
        if len(row) == 0:
            print('')
            continue

        # Gather the cells each selector picks, in selector order.
        output = []

        for selector in fields:
            output.extend(selector.select(selectable))

        # In inverse mode the picks only serve to mark cells as selected; the
        # cells left over are what gets output.
        if opts.inverse:
            output = selectable.unselected()

        # Emit the cells with the delimiter between them.
        need_delim = False

        for value in output:
            if need_delim:
                sys.stdout.write(delim)

            sys.stdout.write(value)
            need_delim = True

        sys.stdout.write('\n')


class RowIterator(object):
    '''Iterate over the rows of a csv file, header row(s) included.

    Wraps a libcsv.Reader created with has_header=True.  Iteration yields the
    header first and then each data row.  In multitable mode an empty row ends
    the current table, so the row after it is fetched as a header again.
    '''

    def __init__(self, file, delim):
        self.__reader = libcsv.Reader(
            file,
            delim=delim,
            has_header=True,
            is_multitable=opts.multitable,
        )

    def __iter__(self):
        header_pending = True

        while True:
            if header_pending:
                row = self.__reader.header()
                header_pending = False
            else:
                row = next(self.__reader)

            # The reader signals the end of input with None.
            if row is None:
                return

            # An empty row separates tables in multitable mode.
            if len(row) == 0 and opts.multitable:
                header_pending = True

            yield row

    def header(self):
        '''Return whatever the underlying reader reports as its header.'''
        return self.__reader.header()


class FieldSelectableRow(object):
    '''A csv row that remembers which of its columns have been selected.

    Selectors fetch cell values through select(); every successful fetch
    records the cell's column number, so that unselected() can later return
    only the cells nobody asked for.
    '''

    def __init__(self, row):
        self.__row = row
        self.__selected = []

    def __len__(self):
        return len(self.__row)

    def __iter__(self):
        # Yields the cell values, not the cell objects.
        for item in self.__row:
            yield item.value()

    def header(self):
        '''Return the values of the row's header cells as a list.'''
        return [cell.value() for cell in self.__row.header()]

    def select(self, key, default=None):
        '''Return the value of one cell and mark its column as selected.

        key     -- a 1-based column number, or any other key the wrapped row
                   accepts (selectors pass column names).  Integers below 1
                   select nothing.
        default -- returned when no cell is found.
        '''
        if isinstance(key, int):
            # Column numbers are 1-based here, 0-based in the wrapped row.
            key = key - 1 if key > 0 else None

        if key is None:
            return default

        cell = self.__row[key]

        if cell is None:
            return default

        value = cell.value()
        self.__selected.append(cell.colnum())

        return value

    def unselected(self):
        '''Return the values of all cells not selected so far, in row order.

        The returned cells are marked as selected as well, and None values are
        returned as empty strings.
        '''
        remaining = []

        for cell in self.__row:
            colnum = cell.colnum()

            if colnum in self.__selected:
                continue

            value = cell.value()
            self.__selected.append(colnum)
            remaining.append('' if value is None else value)

        return remaining


class FieldSelector(object):
    '''Base class of the field selectors; on its own it selects nothing.'''

    class InvalidSelectorException(Exception):
        '''Exception type reserved for invalid selectors.'''

    def select(self, row):
        '''Return the list of values this selector picks from `row`.'''
        return list()


class FieldSelectorByColumnNumber(FieldSelector):
    '''Select a single column by its number.

    Positive numbers count from the first column (1); zero and negative
    numbers are offset by the column count plus one, so -1 is the last column.
    '''

    def __init__(self, index):
        self.__index = index

    def select(self, row):
        '''Return a one-element list with the cell value, or '' if none.'''
        total = len(row)
        position = self.__index

        if position <= 0:
            position += total + 1

        return [row.select(position, '')]


class FieldSelectorByColumnName(FieldSelector):
    '''Select a single column by the key it was constructed with.'''

    def __init__(self, key):
        self.__key = key

    def select(self, row):
        '''Return a one-element list with the cell value, or '' if none.

        A ValueError raised by the lookup is turned into a blank value rather
        than being reported as an invalid selector.
        '''
        try:
            return [row.select(self.__key, '')]
        except ValueError:
            return ['']


class FieldSelectorByColumnRegex(FieldSelector):
    '''Select every column whose header name matches a regular expression.'''

    def __init__(self, regex, mod):
        # The modifier letters, if any, become an inline flag group in front
        # of the pattern, e.g. 'i' -> '(?i)'.
        pattern = '(?%s)%s' % (mod, regex) if mod else regex

        self.__regex_rc = re.compile(pattern)

    def select(self, row):
        '''Return the values of the matching columns, in column order.'''

        # Resolve the matching header names to 1-based column numbers first...
        matching = [
            position
            for position, name in enumerate(row.header(), 1)
            if self.__regex_rc.search(name)
        ]

        # ...then fetch (and thereby mark as selected) each of those columns.
        return [row.select(position, '') for position in matching]


class FieldSelectorByColumnRange(FieldSelector):
    '''Select a run of consecutive columns, both ends inclusive.

    A bound of None means the first column (for start) or the last column
    (for end).  Zero and negative bounds are offset by the column count plus
    one, so -1 is the last column.
    '''

    def __init__(self, start, end):
        self.__start = start
        self.__end = end

    def select(self, row):
        '''Return the non-None values of the columns in the range.

        Nothing is returned when the resolved start lies after the end.
        '''
        colcount = len(row)
        first = self.__start
        last = self.__end

        if first is None:
            first = 1
        elif first <= 0:
            first += colcount + 1

        if last is None:
            last = colcount
        elif last <= 0:
            last += colcount + 1

        # Always walks upward; a descending range is not supported.
        fetched = (row.select(colnum) for colnum in range(first, last + 1))

        return [value for value in fetched if value is not None]


class FieldSelectorByValueRegex(FieldSelector):
    '''Select every cell whose value matches a regular expression.'''

    def __init__(self, regex, mod):
        # The modifier letters, if any, become an inline flag group in front
        # of the pattern, e.g. 'i' -> '(?i)'.
        if mod: regex = ('(?%s)' % mod) + regex

        self.__regex_rc = re.compile(regex)

    def select(self, row):
        '''Return the matching cell values of `row`, in column order.

        Every matching cell is also marked as selected on the row, so that it
        is left out of the row's unselected cells.
        '''
        selected = []

        # row.select() takes 1-based column numbers, so count from 1.  Passing
        # the 0-based position would mark the column to the left of each match
        # and never mark the first column.
        for colnum, value in enumerate(row, 1):
            # Cells may carry None as a value; those cannot match.
            if value is not None and self.__regex_rc.search(value):
                selected.append(value)
                row.select(colnum)

        return selected


class FieldSelectorByUnselected(FieldSelector):
    '''Select every column that no earlier selector has picked.'''

    def select(self, row):
        '''Return the row's not yet selected values.'''
        remaining = row.unselected()

        return remaining


##############################################################################
# ENTRY POINT

if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        # Finish the line the interrupt left on the terminal.
        print('')

        # Exit with 130 (128 + SIGINT), the status shells report for a command
        # ended by Ctrl-C.  errno.EOWNERDEAD has that value on Linux only, so
        # the number is spelled out to get the same status on every platform.
        sys.exit(130)


# vim:ft=python
