#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2011, A. Murat Eren
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Please read the COPYING file.


# when a list of ids and a fastq file are provided, this script removes reads from the fastq
# file that have a matching id in the given ids list.
#
# if something seems to be wrong with it, please send an e-mail to meren@mbl.edu


import os
import sys

import IlluminaUtils.lib.fastqlib as u


# Short aliases for frequently used os.path helpers:
#   E -> os.path.exists (does a path exist?)
#   J -> os.path.join   (join path components)
E, J = os.path.exists, os.path.join


def main(fastq_path, ids_path, delimiter=None, generate_output_for_survived_only=False, keep_ids=False):
    """Split the reads of a FASTQ file into "survived" and "removed" outputs.

    Every entry of `fastq_path` is compared against the IDs listed (one per
    line) in `ids_path`. Entries are written to `fastq_path + '.survived'`
    or, unless suppressed, to `fastq_path + '.removed'`.

    Args:
        fastq_path: Path of the input FASTQ file; also the prefix of the
            output file names.
        ids_path: Path of a text file with one ID per line. Surrounding
            whitespace is stripped and blank lines are ignored.
        delimiter: If given (and non-empty), each FASTQ header line is split
            on this string and only the first part is compared to the IDs.
            Otherwise the whole header line must match exactly.
        generate_output_for_survived_only: If true, the '.removed' file is
            not created and removed entries are simply dropped.
        keep_ids: If true, the selection is inverted: entries whose ID is
            listed survive and all other entries are removed.

    Returns:
        None. A summary is printed to stdout and progress to stderr.

    Note:
        Each listed ID is consumed by its first match, so a later entry with
        the same (possibly delimiter-truncated) ID is treated as not listed.
    """
    with open(ids_path, 'r') as ids_file:
        ids = set(line.strip() for line in ids_file)
    # A blank line in the IDs file must not act as an ID: with a delimiter it
    # would match every header that happens to start with that delimiter.
    ids.discard('')

    # NOTE: the reader must not be bound to the name `input`; doing so hides
    # the builtin input() that the confirmation prompt below relies on.
    fastq = u.FastQSource(fastq_path)

    if any(_id.startswith('@') for _id in ids):
        print("\n\n    Warning: IDs are not supposed to start with '@' character, if you are sure that is the case for your FASTQ file, press ENTER to continue. Otherwise press CTRL+C to quit, and fix your IDs file.\n")
        try:
            input()
        except (KeyboardInterrupt, EOFError):
            # CTRL+C (or a closed stdin) means the user does not want to go on.
            sys.exit()

    removed = None
    if not generate_output_for_survived_only:
        removed = u.FastQOutput(fastq_path + '.removed')
    survived = u.FastQOutput(fastq_path + '.survived')

    keep_ids = bool(keep_ids)
    number_of_reads_removed = 0

    while fastq.next(raw = True):
        if fastq.pos % 5000 == 0:
            sys.stderr.write('\r[fastqlib:removing] Processed: %s, Removed: %s'\
                    % (u.big_number_pretty_print(fastq.pos),
                       u.big_number_pretty_print(number_of_reads_removed)))
            sys.stderr.flush()

        header_line = fastq.entry.header_line
        if delimiter:
            header_line = header_line.split(delimiter)[0]

        listed = header_line in ids
        if listed:
            # consume the ID; see the Note in the docstring.
            ids.remove(header_line)

        # Without keep_ids listed entries are removed; with keep_ids it is the
        # unlisted ones. Either way an entry survives iff listed == keep_ids.
        if listed == keep_ids:
            survived.store_entry(fastq.entry)
        else:
            # Count what is actually removed, so that the report is correct
            # in keep_ids mode and when no '.removed' file is written.
            number_of_reads_removed += 1
            if removed is not None:
                removed.store_entry(fastq.entry)

    sys.stderr.write('\n\n')

    print('Total number of reads processed: %s' % u.big_number_pretty_print(fastq.pos))
    print('Number of reads removed        : %s' % u.big_number_pretty_print(number_of_reads_removed))
    if not generate_output_for_survived_only:
        print('FASTQ file for removed reads   : "%s"' % (fastq_path + '.removed'))
    print('FASTQ file for surviving reads : "%s"' % (fastq_path + '.survived'))
    print()

# Command line entry point: parse the arguments and hand them over to main().
if __name__ == '__main__':
    import argparse

    # Help texts are built with implicit string concatenation rather than
    # backslash continuations inside the literal, so that no runs of
    # indentation whitespace end up embedded in the strings.
    parser = argparse.ArgumentParser(
        description='Remove reads from FASTQ File. Creates two output files '
                    'that have the same path and name as the input FASTQ '
                    'file, but with the added suffix of ".survived", and ".removed".')
    parser.add_argument('-i', '--input-fastq', required=True, metavar='FASTQ_FILE_PATH',
                        help='Sequences file from which reads will be removed in FASTQ format')
    parser.add_argument('-l', '--ids-file-path', required=True, metavar='IDS_FILE_PATH',
                        help='Input file that contains the list of ids for removal')
    parser.add_argument('-d', '--delimiter', required=False, default=None, metavar='CHARACTER',
                        help='By default this script will perform exact match for IDs you listed '
                             'in the IDs file. But using this parameter, you can ask the script to "split" '
                             'the IDs found in the FASTQ file, and then try to match the first part of the '
                             'resulting ID to those you listed in the IDs file.')
    parser.add_argument('-G', '--generate-output-for-survived-only', required=False, action='store_true', default=False,
                        help='If provided then only one output file (the file with "survived" ids) will be produced.')
    parser.add_argument('-K', '--keep-ids', required=False, action='store_true', default=False,
                        help='If provided, then instead of removing the ids in the list, only the ids in the '
                             'list will be kept (and the rest would be removed).')

    args = parser.parse_args()

    # main() returns None, so a normal run ends with exit status 0.
    sys.exit(main(args.input_fastq,
                  args.ids_file_path,
                  delimiter=args.delimiter,
                  generate_output_for_survived_only=args.generate_output_for_survived_only,
                  keep_ids=args.keep_ids))
