#!/usr/bin/env python
# Time-stamp: <2020-12-05 16:03:31 Tao Liu>
"""Description: MACS v3 main executable.

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD License (see the file LICENSE included with
the distribution).
"""

# ------------------------------------
# python modules
# ------------------------------------

import os
import sys
import argparse as ap
import tempfile

# ------------------------------------
# own python modules
# ------------------------------------
from MACS3.Utilities.Constants import *

# ------------------------------------
# Main function
# ------------------------------------
def main():
    """The Main function/pipeline for MACS.

    """
    # Parse options...
    argparser = prepare_argparser()
    args = argparser.parse_args()

    subcommand  = args.subcommand

    if args.outdir:
        # use a output directory to store MACS output
        if not os.path.exists( args.outdir ):
            try:
                os.makedirs( args.outdir )
            except:
                sys.exit( "Output directory (%s) could not be created. Terminating program." % args.outdir )

    if subcommand == "callpeak":
        # General call peak
        from MACS3.Commands.callpeak_cmd import run
        run( args )
    #elif subcommand == "diffpeak":
    #    # differential peak calling w/ bedgraphs + optional peak regions
    #    from MACS3.Commands.diffpeak_cmd import run
    #    run( args )
    elif subcommand == "bdgpeakcall":
        # call peak from bedGraph
        from MACS3.Commands.bdgpeakcall_cmd import run
        run( args )
    elif subcommand == "bdgbroadcall":
        # call broad peak from bedGraph
        from MACS3.Commands.bdgbroadcall_cmd import run
        run( args )
    elif subcommand == "bdgcmp":
        # compare treatment and control to make enrichment scores
        from MACS3.Commands.bdgcmp_cmd import run
        run( args )
    elif subcommand == "bdgopt":
        # operations on the score column of bedGraph file
        from MACS3.Commands.bdgopt_cmd import run
        run( args )
    elif subcommand == "cmbreps":
        # combine replicates
        from MACS3.Commands.cmbreps_cmd import run
        run( args )
    elif subcommand == "randsample":
        # randomly sample sequencing reads, and save as bed file
        from MACS3.Commands.randsample_cmd import run
        run( args )
    elif subcommand == "filterdup":
        # filter out duplicate reads, and save as bed file
        from MACS3.Commands.filterdup_cmd import run
        run( args )
    elif subcommand == "bdgdiff":
        # differential calling
        from MACS3.Commands.bdgdiff_cmd import run
        run( args )
    elif subcommand == "refinepeak":
        # refine peak summits
        from MACS3.Commands.refinepeak_cmd import run
        run( args )
    elif subcommand == "predictd":
        # predict d or fragment size
        from MACS3.Commands.predictd_cmd import run
        run( args )
    elif subcommand == "pileup":
        # pileup alignment results with a given extension method
        from MACS3.Commands.pileup_cmd import run
        run( args )
    #elif subcommand == "hmmratac":
    #    # pileup alignment results with a given extension method
    #    from MACS3.Commands.diffpeak_cmd import run
    #    run( args )
    elif subcommand == "callvar":
        # assemble reads in peak region and call variants
        from MACS3.Commands.callvar_cmd import run
        run( args )


def prepare_argparser ():
    """Prepare optparser object. New options will be added in this
    function first.

    """
    description = "%(prog)s -- Model-based Analysis for ChIP-Sequencing"
    epilog = "For command line options of each command, type: %(prog)s COMMAND -h"
    # top-level parser
    argparser = ap.ArgumentParser( description = description, epilog = epilog ) #, usage = usage )
    argparser.add_argument("--version", action="version", version="%(prog)s "+MACS_VERSION)
    subparsers = argparser.add_subparsers( dest = 'subcommand' ) #help="sub-command help")
    subparsers.required = True

    # command for 'callpeak'
    add_callpeak_parser( subparsers )

    # # command for 'diffpeak'
    # add_diffpeak_parser( subparsers )

    # command for 'bdgpeakcall'
    add_bdgpeakcall_parser( subparsers )

    # command for 'bdgbroadcall'
    add_bdgbroadcall_parser( subparsers )

    # command for 'bdgcmp'
    add_bdgcmp_parser( subparsers )

    # command for 'bdgopt'
    add_bdgopt_parser( subparsers )

    # command for 'cmbreps'
    add_cmbreps_parser( subparsers )

    # command for 'bdgdiff'
    add_bdgdiff_parser( subparsers )

    # command for 'filterdup'
    add_filterdup_parser( subparsers )

    # command for 'predictd'
    add_predictd_parser( subparsers )

    # command for 'pileup'
    add_pileup_parser( subparsers )

    # command for 'randsample'
    add_randsample_parser( subparsers )

    # command for 'refinepeak'
    add_refinepeak_parser( subparsers )

    ## command for 'callvar'
    add_callvar_parser( subparsers )

    ## command for 'refinepeak'
    #add_hmmratac_parser( subparsers )

    return argparser

def add_outdir_option ( parser ):
    parser.add_argument("--outdir", dest = "outdir", type = str, default = '',
                        help = "If specified all output files will be written to that directory. Default: the current working directory")

def add_output_group ( parser, required = True ):
    output_group = parser.add_mutually_exclusive_group( required = required )
    output_group.add_argument( "-o", "--ofile", dest = "ofile", type = str,
                               help = "Output file name. Mutually exclusive with --o-prefix." )
    output_group.add_argument( "--o-prefix", dest = "oprefix", type = str,
                               help = "Output file prefix. Mutually exclusive with -o/--ofile." )

def add_callpeak_parser( subparsers ):
    """Add main function 'peak calling' argument parsers.
    """
    argparser_callpeak = subparsers.add_parser("callpeak", help="Main MACS3 Function: Call peaks from alignment results.",
                                               formatter_class = ap.RawDescriptionHelpFormatter,
                                               epilog = """Examples:
1. Peak calling for regular TF ChIP-seq:
    $ macs3 callpeak -t ChIP.bam -c Control.bam -f BAM -g hs -n test -B -q 0.01
2. Broad peak calling on Histone Mark ChIP-seq:
    $ macs3 callpeak -t ChIP.bam -c Control.bam --broad -g hs --broad-cutoff 0.1
3. Peak calling on ATAC-seq (paired-end mode):
    $ macs3 callpeak -f BAMPE -t ATAC.bam -g hs -n test -B -q 0.01
4. Peak calling on ATAC-seq ( focusing on insertion sites, and using single-end mode):
    $ macs3 callpeak -f BAM -t ATAC.bam -g hs -n test -B -q 0.01 --shift -50 --extension 100
""")

    # group for input files
    group_input = argparser_callpeak.add_argument_group( "Input files arguments" )
    group_input.add_argument( "-t", "--treatment", dest = "tfile", type = str, required = True, nargs = "+",
                              help = "ChIP-seq treatment file. If multiple files are given as '-t A B C', then they will all be read and pooled together. REQUIRED." )
    group_input.add_argument( "-c", "--control", dest = "cfile", type = str, nargs = "*",
                                    help = "Control file. If multiple files are given as '-c A B C', they will be pooled to estimate ChIP-seq background noise.")
    group_input.add_argument( "-f", "--format", dest = "format", type = str,
                              choices = ("AUTO", "BAM", "SAM", "BED", "ELAND",
                                         "ELANDMULTI", "ELANDEXPORT", "BOWTIE",
                                          "BAMPE", "BEDPE"),
                              help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will let MACS decide which format (except for BAMPE and BEDPE which should be implicitly set) the file is. Please check the definition in README. Please note that if the format is set as BAMPE or BEDPE, MACS3 will call its special Paired-end mode to call peaks by piling up the actual ChIPed fragments defined by both aligned ends, instead of predicting the fragment size first and extending reads. Also please note that the BEDPE only contains three columns, and is NOT the same BEDPE format used by BEDTOOLS. DEFAULT: \"AUTO\"",
                              default = "AUTO" )
    group_input.add_argument( "-g", "--gsize", dest = "gsize", type = str, default = "hs",
                              help = "Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2.7e9), 'mm' for mouse (1.87e9), 'ce' for C. elegans (9e7) and 'dm' for fruitfly (1.2e8), Default:hs" )
    group_input.add_argument( "-s", "--tsize",  dest = "tsize", type = int, default = None,
                                    help = "Tag size/read length. This will override the auto detected tag size. DEFAULT: Not set")
    group_input.add_argument( "--keep-dup", dest = "keepduplicates", type = str, default = "1",
                              help = "It controls the  behavior towards duplicate tags at the exact same location -- the same coordination and the same strand. The 'auto' option makes MACS calculate the maximum tags at the exact same location based on binomal distribution using 1e-5 as pvalue cutoff; and the 'all' option keeps every tags. If an integer is given, at most this number of tags will be kept at the same location. Note, if you've used samtools or picard to flag reads as 'PCR/Optical duplicate' in bit 1024, MACS3 will still read them although the reads may be decided by MACS3 as duplicate later. If you plan to rely on samtools/picard/any other tool to filter duplicates, please remove those duplicate reads and save a new alignment file then ask MACS3 to keep all by '--keep-dup all'. The default is to keep one tag at the same location. Default: 1" )

    # group for output files
    group_output = argparser_callpeak.add_argument_group( "Output arguments" )
    add_outdir_option( group_output )
    group_output.add_argument( "-n", "--name", dest = "name", type = str,
                               help = "Experiment name, which will be used to generate output file names. DEFAULT: \"NA\"",
                               default = "NA" )
    group_output.add_argument( "-B", "--bdg", dest = "store_bdg", action = "store_true",
                               help = "Whether or not to save extended fragment pileup, and local lambda tracks (two files) at every bp into a bedGraph file. DEFAULT: False",
                               default = False )
    group_output.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
                               help = "Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2" )
    group_output.add_argument( "--trackline", dest="trackline", action="store_true", default = False,
                               help = "Tells MACS to include trackline with bedGraph files. To include this trackline while displaying bedGraph at UCSC genome browser, can show name and description of the file as well. However my suggestion is to convert bedGraph to bigWig, then show the smaller and faster binary bigWig file at UCSC genome browser, as well as downstream analysis. Require -B to be set. Default: Not include trackline." )

    group_output.add_argument( "--SPMR", dest = "do_SPMR", action = "store_true", default = False,
                               help = "If True, MACS will SAVE signal per million reads for fragment pileup profiles. It won't interfere with computing pvalue/qvalue during peak calling, since internally MACS3 keeps using the raw pileup and scaling factors between larger and smaller dataset to calculate statistics measurements. If you plan to use the signal output in bedGraph to call peaks using bdgcmp and bdgpeakcall, you shouldn't use this option because you will end up with different results. However, this option is recommended for displaying normalized pileup tracks across many datasets. Require -B to be set. Default: False" )
    # group for bimodal
    group_bimodal = argparser_callpeak.add_argument_group( "Shifting model arguments" )
    group_bimodal.add_argument( "--nomodel", dest = "nomodel", action = "store_true", default = False,
                                help = "Whether or not to build the shifting model. If True, MACS will not build model. by default it means shifting size = 100, try to set extsize to change it. It's highly recommended that while you have many datasets to process and you plan to compare different conditions, aka differential calling, use both 'nomodel' and 'extsize' to make signal files from different datasets comparable. DEFAULT: False" )
    group_bimodal.add_argument( "--shift", dest = "shift", type = int, default = 0,
                                help = "(NOT the legacy --shiftsize option!) The arbitrary shift in bp. Use discretion while setting it other than default value. When NOMODEL is set, MACS will use this value to move cutting ends (5') towards 5'->3' direction then apply EXTSIZE to extend them to fragments. When this value is negative, ends will be moved toward 3'->5' direction. Recommended to keep it as default 0 for ChIP-Seq datasets, or -1 * half of EXTSIZE together with EXTSIZE option for detecting enriched cutting loci such as certain DNAseI-Seq datasets. Note, you can't set values other than 0 if format is BAMPE or BEDPE for paired-end data. DEFAULT: 0. " )
    group_bimodal.add_argument( "--extsize", dest = "extsize", type = int, default = 200,
                                help = "The arbitrary extension size in bp. When nomodel is true, MACS will use this value as fragment size to extend each read towards 3' end, then pile them up. It's exactly twice the number of obsolete SHIFTSIZE. In previous language, each read is moved 5'->3' direction to middle of fragment by 1/2 d, then extended to both direction with 1/2 d. This is equivalent to say each read is extended towards 5'->3' into a d size fragment. DEFAULT: 200. EXTSIZE and SHIFT can be combined when necessary. Check SHIFT option." )
    group_bimodal.add_argument( "--bw", dest = "bw", type = int, default = 300,
                                help = "Band width for picking regions to compute fragment size. This value is only used while building the shifting model. Tweaking this is not recommended. DEFAULT: 300")
    group_bimodal.add_argument( "--d-min", dest = "d_min", type = int, default = 20,
                                help = "Minimum fragment size in basepair. Any predicted fragment size less than this will be excluded. DEFAULT: 20")
    group_bimodal.add_argument( "-m", "--mfold", dest = "mfold", type = int, default = [5,50], nargs = 2,
                                help = "Select the regions within MFOLD range of high-confidence enrichment ratio against background to build model. Fold-enrichment in regions must be lower than upper limit, and higher than the lower limit. Use as \"-m 10 30\". This setting is only used while building the shifting model. Tweaking it is not recommended. DEFAULT:5 50" )

    group_bimodal.add_argument( "--fix-bimodal", dest = "onauto", action = "store_true",
                                help = "Whether turn on the auto pair model process. If set, when MACS failed to build paired model, it will use the nomodel settings, the --exsize parameter to extend each tags towards 3' direction. Not to use this automate fixation is a default behavior now. DEFAULT: False",
                                default = False )

    # General options.
    group_callpeak = argparser_callpeak.add_argument_group( "Peak calling arguments" )
    p_or_q_group = group_callpeak.add_mutually_exclusive_group()
    p_or_q_group.add_argument( "-q", "--qvalue", dest = "qvalue", type = float, default = 0.05,
                               help = "Minimum FDR (q-value) cutoff for peak detection. DEFAULT: 0.05. -q, and -p are mutually exclusive." )
    p_or_q_group.add_argument( "-p", "--pvalue", dest = "pvalue", type = float,
                               help = "Pvalue cutoff for peak detection. DEFAULT: not set. -q, and -p are mutually exclusive. If pvalue cutoff is set, qvalue will not be calculated and reported as -1 in the final .xls file." )

    # about scaling
    group_callpeak.add_argument( "--scale-to", dest = "scaleto", type = str, choices = ("large", "small"),
                                help = "When set to 'small', scale the larger sample up to the smaller sample. When set to 'larger', scale the smaller sample up to the bigger sample. By default, scale to 'small'. This option replaces the obsolete '--to-large' option. The default behavior is recommended since it will lead to less significant p/q-values in general but more specific results. Keep in mind that scaling down will influence control/input sample more. DEFAULT: 'small', the choice is either 'small' or 'large'." )
    group_callpeak.add_argument( "--down-sample", dest = "downsample", action = "store_true", default = False,
                                 help = "When set, random sampling method will scale down the bigger sample. By default, MACS uses linear scaling. Warning: This option will make your result unstable and irreproducible since each time, random reads would be selected. Consider to use 'randsample' script instead. <not implmented>If used together with --SPMR, 1 million unique reads will be randomly picked.</not implemented> Caution: due to the implementation, the final number of selected reads may not be as you expected! DEFAULT: False" )
    group_callpeak.add_argument( "--seed", dest = "seed", type = int, default = -1,
                                 help = "Set the random seed while down sampling data. Must be a non-negative integer in order to be effective. DEFAULT: not set" )
    group_callpeak.add_argument( "--tempdir", dest="tempdir", default=tempfile.gettempdir(),
                                help = "Optional directory to store temp files. DEFAULT: %(default)s")
    group_callpeak.add_argument( "--nolambda", dest = "nolambda", action = "store_true",
                                 help = "If True, MACS will use fixed background lambda as local lambda for every peak region. Normally, MACS calculates a dynamic local lambda to reflect the local bias due to the potential chromatin accessibility. ",
                                 default = False )
    group_callpeak.add_argument( "--slocal", dest = "smalllocal", type = int, default = 1000,
                                 help = "The small nearby region in basepairs to calculate dynamic lambda. This is used to capture the bias near the peak summit region. Invalid if there is no control data. If you set this to 0, MACS will skip slocal lambda calculation. *Note* that MACS will always perform a d-size local lambda calculation while the control data is available. The final local bias would be the maximum of the lambda value from d, slocal, and llocal size windows. While control is not available, d and slocal lambda won't be considered. DEFAULT: 1000 " )
    group_callpeak.add_argument( "--llocal", dest = "largelocal", type = int, default = 10000,
                                 help = "The large nearby region in basepairs to calculate dynamic lambda. This is used to capture the surround bias. If you set this to 0, MACS will skip llocal lambda calculation. *Note* that MACS will always perform a d-size local lambda calculation while the control data is available. The final local bias would be the maximum of the lambda value from d, slocal, and llocal size windows. While control is not available, d and slocal lambda won't be considered. DEFAULT: 10000." )
    group_callpeak.add_argument( "--max-gap", dest = "maxgap", type = int,
                                 help = "Maximum gap between significant sites to cluster them together. The DEFAULT value is the detected read length/tag size." )
    group_callpeak.add_argument( "--min-length", dest = "minlen", type = int,
                                 help = "Minimum length of a peak. The DEFAULT value is the predicted fragment size d. Note, if you set a value smaller than the fragment size, it may have NO effect on the result. For BROAD peak calling, try to set a large value such as 500bps. You can also use '--cutoff-analysis' option with default setting, and  check the column 'avelpeak' under different cutoff values to decide a reasonable minlen value." )
    group_callpeak.add_argument( "--broad", dest = "broad", action = "store_true",
                                 help = "If set, MACS will try to call broad peaks using the --broad-cutoff setting. Please tweak '--broad-cutoff' setting to control the peak calling behavior. At the meantime, either -q or -p cutoff will be used to define regions with 'stronger enrichment' inside of broad peaks. The maximum gap is expanded to 4 * MAXGAP (--max-gap parameter). As a result, MACS will output a 'gappedPeak' and a 'broadPeak' file instead of 'narrowPeak' file. Note, a broad peak will be reported even if there is no 'stronger enrichment' inside. DEFAULT: False", default = False )
    group_callpeak.add_argument( "--broad-cutoff", dest = "broadcutoff", type = float, default = 0.1,
                                 help = "Cutoff for broad region. This option is not available unless --broad is set. If -p is set, this is a pvalue cutoff, otherwise, it's a qvalue cutoff. Please note that in broad peakcalling mode, MACS3 uses this setting to control the overall peak calling behavior, then uses -q or -p setting to define regions inside broad region as 'stronger' enrichment. DEFAULT: 0.1 " )
    group_callpeak.add_argument( "--cutoff-analysis", dest="cutoff_analysis", action="store_true",
                                 help = "While set, MACS3 will analyze number or total length of peaks that can be called by different p-value cutoff then output a summary table to help user decide a better cutoff. The table will be saved in NAME_cutoff_analysis.txt file. Note, minlen and maxgap may affect the results. WARNING: May take  ~30 folds longer time to finish. The result can be useful for users to decide a reasonable cutoff value. DEFAULT: False", default = False )

    # post-processing options
    group_postprocessing = argparser_callpeak.add_argument_group( "Post-processing options" )
    postprocess_group = group_postprocessing.add_mutually_exclusive_group()

    postprocess_group.add_argument( "--call-summits", dest="call_summits", action="store_true",
                                    help="If set, MACS will use a more sophisticated signal processing approach to find subpeak summits in each enriched peak region. DEFAULT: False",default=False)
    group_postprocessing.add_argument( "--fe-cutoff", dest="fecutoff", type=float, default = 1.0,
                                       help = "When set, the value will be used to filter out peaks with low fold-enrichment. Note, MACS3 adds one as pseudocount while calculating fold-enrichment. By default, it is set as 1 so there is no filtering. DEFAULT: 1.0")

    # other options
    group_other = argparser_callpeak.add_argument_group( "Other options" )
    group_other.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000",
                                  help = "Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 " )

    # obsolete options
    group_obsolete = argparser_callpeak.add_argument_group( "Obsolete options" )
    group_obsolete.add_argument( "--to-large", dest = "tolarge", action = "store_true", default = False,
                                     help = "Obsolete option. Please use '--scale-to large' instead." )
    group_obsolete.add_argument( "--ratio", dest = "ratio", type = float, default = 1.0,
                                     help = "Obsolete option. Originally designed to normalize treatment and control with customized ratio, now it won't have any effect." )
    return

def add_diffpeak_parser( subparsers ):
    """Add main function 'peak calling' argument parsers.
    """
    argparser_diffpeak = subparsers.add_parser("diffpeak", help="MACS3 Differential Peak Function: Call peaks from bedgraphs (or use optional peak regions) and determine peaks of differential occupancy")


    # group for input files
    group_input = argparser_diffpeak.add_argument_group( "Input files arguments" )
    group_input.add_argument( "--t1", dest = "t1bdg", type = str, required = True,
                                    help = "MACS pileup bedGraph for condition 1. REQUIRED" )
    group_input.add_argument( "--t2", dest="t2bdg", type = str, required = True,
                                    help = "MACS pileup bedGraph for condition 2. REQUIRED" )
    group_input.add_argument( "--c1", dest = "c1bdg", type = str, required = True,
                                    help = "MACS control lambda bedGraph for condition 1. REQUIRED" )
    group_input.add_argument( "--c2", dest="c2bdg", type = str, required = True,
                                    help = "MACS control lambda bedGraph for condition 2. REQUIRED" )
    group_input.add_argument( "--peaks1", dest = "peaks1", type = str, default='',
                                    help = "MACS peaks.xls file for condition 1. Optional but must specify peaks2 if present" )
    group_input.add_argument( "--peaks2", dest="peaks2", type = str, default='',
                                    help = "MACS peaks.xls file for condition 2. Optional but must specify peaks1 if present" )
    group_input.add_argument( "-d", "--depth-multiplier", dest = "depth", type = float, default = [1.0], nargs = "+",
                                    help = "Sequence depth in million reads. If two depths are different, use '-d X -d Y' for X million reads in condition 1 and Y million reads in condition 2. If they are same, use '-d X' for X million reads in both condition 1 and condition 2 (e.g. the bedGraph files are from 'callpeak --SPMR'). Default: 1 (if you use 'macs3 callpeak --SPMR' to generate bdg files, we recommend using the smaller depth as a multiplier)" )
#    group_input.add_argument( "-f", "--format", dest = "format", type = str,
#                              choices = ("AUTO", "BED", "XLS"),
#                              help = "Format of peak regions file, \"AUTO\", \"BED\" or \"XLS\". The default AUTO option will let MACS decide which format the file is based on the file extension. DEFAULT: \"AUTO\"",
#                              default = "AUTO" )

    # group for output files
    group_output = argparser_diffpeak.add_argument_group( "Output arguments" )
    add_outdir_option( group_output )
    group_output.add_argument( "-n", "--name", dest = "name", type = str,
                               help = "Experiment name, which will be used to generate output file names. DEFAULT: \"diffpeak\"",
                               default = "diffpeak" )
    group_output.add_argument( "-B", "--bdg", dest = "store_bdg", action = "store_true",
                               help = "Whether or not to save basewise p/qvalues from every peak region into a bedGraph file. DEFAULT: False",
                               default = False )
    group_output.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
                               help = "Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2" )
    group_output.add_argument( "--trackline", dest="trackline", action="store_true", default = False,
                               help = "Tells MACS to include trackline with bedGraph files. To include this trackline while displaying bedGraph at UCSC genome browser, can show name and description of the file as well. However my suggestion is to convert bedGraph to bigWig, then show the smaller and faster binary bigWig file at UCSC genome browser, as well as downstream analysis. Require -B to be set. Default: Not include trackline." )

    # General options.
    group_diffpeak = argparser_diffpeak.add_argument_group( "Peak calling arguments" )
    p_or_q_group = group_diffpeak.add_mutually_exclusive_group()
    p_or_q_group.add_argument( "-q", "--qvalue", dest = "diff_qvalue", type = float, default = 0.05,
                               help = "Minimum FDR (q-value) cutoff for differences. DEFAULT: 0.05. -q and -p are mutually exclusive." )
    p_or_q_group.add_argument( "-p", "--pvalue", dest = "diff_pvalue", type = float,
                               help = "Pvalue cutoff for differences. DEFAULT: not set. -q and -p are mutually exclusive." )
    p_or_q_group2 = group_diffpeak.add_mutually_exclusive_group()
    p_or_q_group2.add_argument( "--peaks-qvalue", dest = "peaks_qvalue", type = float, default = 0.05,
                               help = "Minimum FDR (q-value) cutoff for peak detection. DEFAULT: 0.05. --peaks-qvalue and --peaks-pvalue are mutually exclusive." )
    p_or_q_group2.add_argument( "--peaks-pvalue", dest = "peaks_pvalue", type = float,
                               help = "Pvalue cutoff for peak detection. DEFAULT: not set. --peaks-qvalue and --peaks-pvalue are mutually exclusive." )
    group_diffpeak.add_argument( "-m", "--peak-min-len", dest = "pminlen", type = int,
                                    help = "Minimum length of peak regions. DEFAULT: 200", default = 200 )
    group_diffpeak.add_argument( "--diff-min-len", dest = "dminlen", type = int,
                                    help = "Minimum length of differential region (must overlap a valid peak). DEFAULT: 50", default = 100 )
    group_diffpeak.add_argument( "--ignore-duplicate-peaks", dest="ignore_duplicate_peaks", action="store_false",
                         help="If set, MACS will ignore duplicate regions with identical coordinates. Helpful if --call-summits was set. DEFAULT: True",default=True)
    return

def add_filterdup_parser( subparsers ):
    argparser_filterdup = subparsers.add_parser( "filterdup",
                                                 help = "Remove duplicate reads at the same position, then save the rest alignments to BED or BEDPE file. If you use '--keep-dup all option', this script can be utilized to convert any acceptable format into BED or BEDPE format." )
    argparser_filterdup.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True, nargs = "+",
                                      help = "Alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. Note that pair-end data is not supposed to work with this command. REQUIRED." )
    argparser_filterdup.add_argument( "-f", "--format", dest = "format", type = str,
                                      choices=("AUTO","BAM","SAM","BED","ELAND","ELANDMULTI","ELANDEXPORT","BOWTIE","BAMPE","BEDPE"),
                                      help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will let '%(prog)s' decide which format the file is. Please check the definition in README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE or BAMPE/BEDPE. DEFAULT: \"AUTO\"",
                                      default = "AUTO" )
    argparser_filterdup.add_argument( "-g", "--gsize", dest = "gsize", type = str, default = "hs",
                                      help = "Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2.7e9), 'mm' for mouse (1.87e9), 'ce' for C. elegans (9e7) and 'dm' for fruitfly (1.2e8), DEFAULT:hs" )
    argparser_filterdup.add_argument( "-s", "--tsize", dest = "tsize", type = int,
                                      help = "Tag size. This will override the auto detected tag size. DEFAULT: Not set" )
    argparser_filterdup.add_argument( "-p", "--pvalue", dest = "pvalue", type = float,
                                      help = "Pvalue cutoff for binomial distribution test. DEFAULT:1e-5" )
    argparser_filterdup.add_argument( "--keep-dup", dest = "keepduplicates", type = str, default = "auto",
                                      help = "It controls the '%(prog)s' behavior towards duplicate tags/pairs at the exact same location -- the same coordination and the same strand. The 'auto' option makes '%(prog)s' calculate the maximum tags at the exact same location based on binomal distribution using given -p as pvalue cutoff; and the 'all' option keeps every tags (useful if you only want to convert formats). If an integer is given, at most this number of tags will be kept at the same location. Note, MACS3 callpeak function uses KEEPDUPLICATES=1 as default. Note, if you've used samtools or picard to flag reads as 'PCR/Optical duplicate' in bit 1024, MACS3 will still read them although the reads may be decided by MACS3 as duplicate later. Default: auto" )
    argparser_filterdup.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000",
                                      help = "Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 " )

    argparser_filterdup.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
                                      help = "Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2" )
    add_outdir_option( argparser_filterdup )
    argparser_filterdup.add_argument( "-o", "--ofile", dest = "outputfile", type = str,
                                      help = "Output BED file name. If not specified, will write to standard output. Note, if the input format is BAMPE or BEDPE, the output will be in BEDPE format. DEFAULT: stdout",
                                      default = "stdout" )
    argparser_filterdup.add_argument( "-d", "--dry-run", dest="dryrun", action="store_true", default=False,
                                      help = "When set, filterdup will only output numbers instead of writing output files, including maximum allowable duplicates, total number of reads before filtering, total number of reads after filtering, and redundant rate. Default: not set" )
    return

def add_bdgpeakcall_parser( subparsers ):
    """Add function 'peak calling on bedGraph' argument parsers.
    """
    argparser_bdgpeakcall = subparsers.add_parser( "bdgpeakcall",
                                                   help = "Call peaks from bedGraph output. Note: All regions on the same chromosome in the bedGraph file should be continuous so only bedGraph files from MACS3 are accpetable." )
    argparser_bdgpeakcall.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True,
                                        help = "MACS score in bedGraph. REQUIRED" )
    argparser_bdgpeakcall.add_argument( "-c", "--cutoff" , dest = "cutoff", type = float,
                                        help = "Cutoff depends on which method you used for score track. If the file contains pvalue scores from MACS3, score 5 means pvalue 1e-5. DEFAULT: 5", default = 5 )
    argparser_bdgpeakcall.add_argument( "-l", "--min-length", dest = "minlen", type = int,
                                       help = "minimum length of peak, better to set it as d value. DEFAULT: 200", default = 200 )
    argparser_bdgpeakcall.add_argument( "-g", "--max-gap", dest = "maxgap", type = int,
                                       help = "maximum gap between significant points in a peak, better to set it as tag size. DEFAULT: 30", default = 30 )
    argparser_bdgpeakcall.add_argument( "--call-summits", dest="call_summits", action="store_true", help=ap.SUPPRESS, default=False)
#                         help="If set, MACS will use a more sophisticated approach to find all summits in each enriched peak region. DEFAULT: False",default=False)
    argparser_bdgpeakcall.add_argument( "--cutoff-analysis", dest="cutoff_analysis", action="store_true",
                                        help = "While set, bdgpeakcall will analyze number or total length of peaks that can be called by different cutoff then output a summary table to help user decide a better cutoff. Note, minlen and maxgap may affect the results. DEFAULT: False", default = False )
    argparser_bdgpeakcall.add_argument("--no-trackline", dest="trackline", action="store_false", default=True,
                         help="Tells MACS not to include trackline with bedGraph files. The trackline is required by UCSC.")

    add_outdir_option( argparser_bdgpeakcall )
    add_output_group( argparser_bdgpeakcall )

    return

def add_bdgbroadcall_parser( subparsers ):
    """Add function 'broad peak calling on bedGraph' argument parsers.
    """
    argparser_bdgbroadcall = subparsers.add_parser( "bdgbroadcall",
                                                    help = "Call broad peaks from bedGraph output. Note: All regions on the same chromosome in the bedGraph file should be continuous so only bedGraph files from MACS3 are accpetable." )
    argparser_bdgbroadcall.add_argument( "-i", "--ifile", dest = "ifile" , type = str, required = True,
                                         help = "MACS score in bedGraph. REQUIRED" )
    argparser_bdgbroadcall.add_argument( "-c", "--cutoff-peak", dest = "cutoffpeak", type = float,
                                         help = "Cutoff for peaks depending on which method you used for score track. If the file contains qvalue scores from MACS3, score 2 means qvalue 0.01. DEFAULT: 2",
                                         default = 2 )
    argparser_bdgbroadcall.add_argument( "-C", "--cutoff-link", dest = "cutofflink", type = float,
                                         help = "Cutoff for linking regions/low abundance regions depending on which method you used for score track. If the file contains qvalue scores from MACS3, score 1 means qvalue 0.1, and score 0.3 means qvalue 0.5. DEFAULT: 1", default = 1 )
    argparser_bdgbroadcall.add_argument( "-l", "--min-length", dest = "minlen", type = int,
                                         help = "minimum length of peak, better to set it as d value. DEFAULT: 200", default = 200 )
    argparser_bdgbroadcall.add_argument( "-g", "--lvl1-max-gap", dest = "lvl1maxgap", type = int,
                                         help = "maximum gap between significant peaks, better to set it as tag size. DEFAULT: 30", default = 30 )
    argparser_bdgbroadcall.add_argument( "-G", "--lvl2-max-gap", dest = "lvl2maxgap", type = int,
                                         help = "maximum linking between significant peaks, better to set it as 4 times of d value. DEFAULT: 800", default = 800)
    argparser_bdgbroadcall.add_argument("--no-trackline", dest="trackline", action="store_false", default=True,
                                         help="Tells MACS not to include trackline with bedGraph files. The trackline is required by UCSC.")
    add_outdir_option( argparser_bdgbroadcall )
    add_output_group( argparser_bdgbroadcall )
    return

def add_bdgcmp_parser( subparsers ):
    """Add function 'peak calling on bedGraph' argument parsers.
    """
    argparser_bdgcmp = subparsers.add_parser( "bdgcmp",
                                              help = "Deduct noise by comparing two signal tracks in bedGraph. Note: All regions on the same chromosome in the bedGraph file should be continuous so only bedGraph files from MACS3 are accpetable." )
    argparser_bdgcmp.add_argument( "-t", "--tfile", dest = "tfile", type = str, required = True,
                                   help = "Treatment bedGraph file, e.g. *_treat_pileup.bdg from MACSv2. REQUIRED")
    argparser_bdgcmp.add_argument( "-c", "--cfile", dest = "cfile", type = str, required = True,
                                   help = "Control bedGraph file, e.g. *_control_lambda.bdg from MACSv2. REQUIRED")
    argparser_bdgcmp.add_argument( "-S", "--scaling-factor", dest = "sfactor", type = float, default = 1.0,
                                   help = "Scaling factor for treatment and control track. Keep it as 1.0 or default in most cases. Set it ONLY while you have SPMR output from MACS3 callpeak, and plan to calculate scores as MACS3 callpeak module. If you want to simulate 'callpeak' w/o '--to-large', calculate effective smaller sample size after filtering redudant reads in million (e.g., put 31.415926 if effective reads are 31,415,926) and input it for '-S'; for 'callpeak --to-large', calculate effective reads in larger sample. DEFAULT: 1.0")
    argparser_bdgcmp.add_argument( "-p", "--pseudocount", dest = "pseudocount", type = float, default = 0.0,
                                   help = "The pseudocount used for calculating logLR, logFE or FE. The count will be applied after normalization of sequencing depth. DEFAULT: 0.0, no pseudocount is applied.")

    argparser_bdgcmp.add_argument( "-m", "--method", dest = "method", type = str, nargs = "+",
                                   choices = ( "ppois", "qpois", "subtract", "logFE", "FE", "logLR", "slogLR", "max" ),
                                   help = "Method to use while calculating a score in any bin by comparing treatment value and control value. Available choices are: ppois, qpois, subtract, logFE, logLR, and slogLR. They represent Poisson Pvalue (-log10(pvalue) form) using control as lambda and treatment as observation, q-value through a BH process for poisson pvalues, subtraction from treatment, linear scale fold enrichment, log10 fold enrichment(need to set pseudocount), log10 likelihood between ChIP-enriched model and open chromatin model(need to set pseudocount), symmetric log10 likelihood between two ChIP-enrichment models, or maximum value between the two tracks. Default option is ppois.",default="ppois")

    add_outdir_option( argparser_bdgcmp )
    output_group = argparser_bdgcmp.add_mutually_exclusive_group( required = True )
    output_group.add_argument( "--o-prefix", dest = "oprefix", type = str,
                               help = "The PREFIX of output bedGraph file to write scores. If it is given as A, and method is 'ppois', output file will be A_ppois.bdg. Mutually exclusive with -o/--ofile." )
    output_group.add_argument( "-o", "--ofile", dest = "ofile", type = str, nargs = "+",
                               help = "Output filename. Mutually exclusive with --o-prefix. The number and the order of arguments for --ofile must be the same as for -m." )
    return

def add_bdgopt_parser( subparsers ):
    """Add function 'operations on score column of bedGraph' argument parsers.
    """
    argparser_bdgopt = subparsers.add_parser( "bdgopt",
                                              help = "Operations on score column of bedGraph file. Note: All regions on the same chromosome in the bedGraph file should be continuous so only bedGraph files from MACS3 are accpetable." )
    argparser_bdgopt.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True,
                                   help = "MACS score in bedGraph. Note: this must be a bedGraph file covering the ENTIRE genome. REQUIRED" )
    argparser_bdgopt.add_argument( "-m", "--method", dest = "method", type = str,
                                   choices = ( "multiply", "add", "p2q", "max", "min" ),
                                   help = "Method to modify the score column of bedGraph file. Available choices are: multiply, add, max, min, or p2q. 1) multiply, the EXTRAPARAM is required and will be multiplied to the score column. If you intend to divide the score column by X, use value of 1/X as EXTRAPARAM. 2) add, the EXTRAPARAM is required and will be added to the score column. If you intend to subtract the score column by X, use value of -X as EXTRAPARAM. 3) max, the EXTRAPARAM is required and will take the maximum value between score and the EXTRAPARAM. 4) min, the EXTRAPARAM is required and will take the minimum value between score and the EXTRAPARAM. 5) p2q, this will convert p-value scores to q-value scores using Benjamini-Hochberg process. The EXTRAPARAM is not required. This method assumes the scores are -log10 p-value from MACS3. Any other types of score will cause unexpected errors.", default="p2q")
    argparser_bdgopt.add_argument( "-p", "--extra-param", dest = "extraparam", type = float, nargs = "*",
                                   help = "The extra parameter for METHOD. Check the detail of -m option.")
    add_outdir_option( argparser_bdgopt )
    argparser_bdgopt.add_argument( "-o", "--ofile", dest = "ofile", type = str,
                                   help = "Output BEDGraph filename.", required = True )
    return

def add_cmbreps_parser( subparsers ):
    """Add function 'combine replicates' argument parsers.
    """
    argparser_cmbreps = subparsers.add_parser( "cmbreps",
                                               help = "Combine BEDGraphs of scores from replicates. Note: All regions on the same chromosome in the bedGraph file should be continuous so only bedGraph files from MACS3 are accpetable." )
    argparser_cmbreps.add_argument( "-i", dest = "ifile", type = str, required = True, nargs = "+",
                                    help = "MACS score in bedGraph for each replicate. Require at least 2 files such as '-i A B C D'. REQUIRED" )
    # argparser_cmbreps.add_argument( "-w", dest = "weights", type = float, nargs = "*",
    #                                 help = "Weight for each replicate. Default is 1.0 for each. When given, require same number of parameters as IFILE." )
    argparser_cmbreps.add_argument( "-m", "--method", dest = "method", type = str,
                                    choices = ( "fisher", "max", "mean" ),
                                    help = "Method to use while combining scores from replicates. 1) fisher: Fisher's combined probability test. It requires scores in ppois form (-log10 pvalues) from bdgcmp. Other types of scores for this method may cause cmbreps unexpected errors. 2) max: take the maximum value from replicates for each genomic position. 3) mean: take the average value. Note, except for Fisher's method, max or mean will take scores AS IS which means they won't convert scores from log scale to linear scale or vice versa.", default="fisher")
    add_outdir_option( argparser_cmbreps )
    argparser_cmbreps.add_argument( "-o", "--ofile", dest = "ofile", type = str, required = True,
                                    help = "Output BEDGraph filename for combined scores." )
    return

def add_randsample_parser( subparsers ):
    argparser_randsample = subparsers.add_parser( "randsample",
                                                  help = "Randomly sample number/percentage of total reads." )
    argparser_randsample.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True, nargs = "+",
                                      help = "Alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. Note that pair-end data is not supposed to work with this command. REQUIRED." )

    p_or_n_group = argparser_randsample.add_mutually_exclusive_group( required = True )
    p_or_n_group.add_argument( "-p", "--percentage", dest = "percentage", type = float,
                               help = "Percentage of tags you want to keep. Input 80.0 for 80%%. This option can't be used at the same time with -n/--num. REQUIRED")
    p_or_n_group.add_argument( "-n", "--number", dest = "number", type = float,
                               help = "Number of tags you want to keep. Input 8000000 or 8e+6 for 8 million. This option can't be used at the same time with -p/--percent. Note that the number of tags in output is approximate as the number specified here. REQUIRED" )
    argparser_randsample.add_argument( "--seed", dest = "seed", type = int, default = -1,
                                       help = "Set the random seed while down sampling data. Must be a non-negative integer in order to be effective. DEFAULT: not set" )
    argparser_randsample.add_argument( "-o", "--ofile", dest = "outputfile", type = str,
                                      help = "Output BED file name. If not specified, will write to standard output. Note, if the input format is BAMPE or BEDPE, the output will be in BEDPE format. DEFAULT: stdout",
                                      default = None)
    add_outdir_option( argparser_randsample )
    argparser_randsample.add_argument( "-s", "--tsize", dest = "tsize", type = int, default = None,
                                       help = "Tag size. This will override the auto detected tag size. DEFAULT: Not set")
    argparser_randsample.add_argument( "-f", "--format", dest = "format", type = str,
                                       choices=("AUTO","BAM","SAM","BED","ELAND","ELANDMULTI","ELANDEXPORT","BOWTIE","BAMPE","BEDPE"),
                                       help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will %(prog)s decide which format the file is. Please check the definition in README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE or BAMPE/BEDPE. DEFAULT: \"AUTO\"",
                                       default = "AUTO" )
    argparser_randsample.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000",
                                       help = "Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 " )

    argparser_randsample.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
                                       help = "Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2" )
    return

def add_bdgdiff_parser( subparsers ):
    argparser_bdgdiff = subparsers.add_parser( "bdgdiff",
                                               help = "Differential peak detection based on paired four bedgraph files. Note: All regions on the same chromosome in the bedGraph file should be continuous so only bedGraph files from MACS3 are accpetable." )
    argparser_bdgdiff.add_argument( "--t1", dest = "t1bdg", type = str, required = True,
                                    help = "MACS pileup bedGraph for condition 1. Incompatible with callpeak --SPMR output. REQUIRED" )
    argparser_bdgdiff.add_argument( "--t2", dest="t2bdg", type = str, required = True,
                                    help = "MACS pileup bedGraph for condition 2. Incompatible with callpeak --SPMR output. REQUIRED" )
    argparser_bdgdiff.add_argument( "--c1", dest = "c1bdg", type = str, required = True,
                                    help = "MACS control lambda bedGraph for condition 1. Incompatible with callpeak --SPMR output. REQUIRED" )
    argparser_bdgdiff.add_argument( "--c2", dest="c2bdg", type = str, required = True,
                                    help = "MACS control lambda bedGraph for condition 2. Incompatible with callpeak --SPMR output. REQUIRED" )
    argparser_bdgdiff.add_argument( "-C", "--cutoff", dest = "cutoff", type = float,
                                    help = "logLR cutoff. DEFAULT: 3 (likelihood ratio=1000)", default = 3 )
    argparser_bdgdiff.add_argument( "-l", "--min-len", dest = "minlen", type = int,
                                    help = "Minimum length of differential region. Try bigger value to remove small regions. DEFAULT: 200", default = 200 )
    argparser_bdgdiff.add_argument( "-g", "--max-gap", dest = "maxgap", type = int,
                                    help = "Maximum gap to merge nearby differential regions. Consider a wider gap for broad marks. Maximum gap should be smaller than minimum length (-g). DEFAULT: 100", default = 100 )
    argparser_bdgdiff.add_argument( "--d1", "--depth1", dest = "depth1", type = float, default = 1.0,
                                    help = "Sequencing depth (# of non-redundant reads in million) for condition 1. It will be used together with --d2. See description for --d2 below for how to assign them. Default: 1" )
    argparser_bdgdiff.add_argument( "--d2", "--depth2", dest = "depth2", type = float, default = 1.0,
                                    help = "Sequencing depth (# of non-redundant reads in million) for condition 2. It will be used together with --d1. DEPTH1 and DEPTH2 will be used to calculate scaling factor for each sample, to down-scale larger sample to the level of smaller one. For example, while comparing 10 million condition 1 and 20 million condition 2, use --d1 10 --d2 20, then pileup value in bedGraph for condition 2 will be divided by 2. Default: 1" )

    add_outdir_option( argparser_bdgdiff )
    output_group = argparser_bdgdiff.add_mutually_exclusive_group( required = True )
    output_group.add_argument( "--o-prefix", dest = "oprefix", type = str,
                               help = "Output file prefix. Actual files will be named as PREFIX_cond1.bed, PREFIX_cond2.bed and PREFIX_common.bed. Mutually exclusive with -o/--ofile." )
    output_group.add_argument( "-o", "--ofile", dest = "ofile", type = str, nargs = 3,
                               help = "Output filenames. Must give three arguments in order: 1. file for unique regions in condition 1; 2. file for unique regions in condition 2; 3. file for common regions in both conditions. Note: mutually exclusive with --o-prefix." )

    return

def add_refinepeak_parser( subparsers ):
    argparser_refinepeak = subparsers.add_parser( "refinepeak",
                                                 help = "(Experimental) Take raw reads alignment, refine peak summits and give scores measuring balance of waston/crick tags. Inspired by SPP." )
    argparser_refinepeak.add_argument( "-b", dest = "bedfile", type = str, required = True,
                                      help = "Candidate peak file in BED format. REQUIRED." )
    argparser_refinepeak.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True, nargs = "+",
                                      help = "ChIP-seq alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. Note that pair-end data is not supposed to work with this command. REQUIRED." )
    argparser_refinepeak.add_argument( "-f", "--format", dest = "format", type = str,
                                      choices=("AUTO","BAM","SAM","BED","ELAND","ELANDMULTI","ELANDEXPORT","BOWTIE"),
                                      help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\". The default AUTO option will let '%(prog)s' decide which format the file is. Please check the definition in README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE. DEFAULT: \"AUTO\"",
                                      default = "AUTO" )
    argparser_refinepeak.add_argument( "-c", "--cutoff" , dest = "cutoff", type = float,
                                        help = "Cutoff  DEFAULT: 5", default = 5 )
    argparser_refinepeak.add_argument( "-w", "--window-size", dest= "windowsize", help = 'Scan window size on both side of the summit (default: 100bp)',
                                        type = int, default = 200)
    argparser_refinepeak.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000",
                                       help = "Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 " )

    argparser_refinepeak.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
                                       help = "Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2" )

    add_outdir_option( argparser_refinepeak )
    add_output_group( argparser_refinepeak )

    return

def add_predictd_parser( subparsers ):
    """Add main function 'predictd' argument parsers.
    """
    argparser_predictd = subparsers.add_parser("predictd", help="Predict d or fragment size from alignment results. In case of PE data, report the average insertion/fragment size from all pairs. *Will NOT filter duplicates*")

    # group for input files
    argparser_predictd.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True, nargs = "+",
                                     help = "ChIP-seq alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. Note that pair-end data is not supposed to work with this command. REQUIRED." )
    argparser_predictd.add_argument( "-f", "--format", dest = "format", type = str,
                                     choices = ("AUTO", "BAM", "SAM", "BED", "ELAND",
                                                "ELANDMULTI", "ELANDEXPORT", "BOWTIE",
                                                "BAMPE", "BEDPE"),
                                     help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will let MACS decide which format the file is. However, if you want to decide the average insertion size/fragment size from PE data such as BEDPE or BAMPE, please specify the format as BAMPE or BEDPE since MACS3 won't automatically recognize three two formats with -f AUTO. Please be aware that in PE mode, -g, -s, --bw, --d-min, -m, and --rfile have NO effect. DEFAULT: \"AUTO\"",
                                     default = "AUTO" )
    argparser_predictd.add_argument( "-g", "--gsize", dest = "gsize", type = str, default = "hs",
                                     help = "Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2.7e9), 'mm' for mouse (1.87e9), 'ce' for C. elegans (9e7) and 'dm' for fruitfly (1.2e8), Default:hs" )
    argparser_predictd.add_argument( "-s", "--tsize",  dest = "tsize", type = int, default = None,
                                     help = "Tag size. This will override the auto detected tag size. DEFAULT: Not set")
    argparser_predictd.add_argument( "--bw", dest = "bw", type = int, default = 300,
                                     help = "Band width for picking regions to compute fragment size. This value is only used while building the shifting model. DEFAULT: 300")
    argparser_predictd.add_argument( "--d-min", dest = "d_min", type = int, default = 20,
                                     help = "Minimum fragment size in basepair. Any predicted fragment size less than this will be excluded. DEFAULT: 20")
    argparser_predictd.add_argument( "-m", "--mfold", dest = "mfold", type = int, default = [5,50], nargs = 2,
                                     help = "Select the regions within MFOLD range of high-confidence enrichment ratio against background to build model. Fold-enrichment in regions must be lower than upper limit, and higher than the lower limit. Use as \"-m 10 30\". DEFAULT:5 50" )

    add_outdir_option( argparser_predictd )
    argparser_predictd.add_argument( "--rfile", dest = "rfile", type = str, default = "predictd",
                                     help = "PREFIX of filename of R script for drawing X-correlation figure. DEFAULT:'predictd' and R file will be predicted_model.R" )
    argparser_predictd.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000",
                                     help = "Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 " )

    argparser_predictd.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
                                     help = "Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2" )

    return

def add_pileup_parser( subparsers ):
    argparser_pileup = subparsers.add_parser( "pileup",
                                              help = "Pileup aligned reads with a given extension size (fragment size or d in MACS language). Note there will be no step for duplicate reads filtering or sequencing depth scaling, so you may need to do certain pre/post-processing." )
    argparser_pileup.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True, nargs = "+",
                                   help = "Alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. Note that pair-end data is not supposed to work with this command. REQUIRED." )
    argparser_pileup.add_argument( "-o", "--ofile", dest = "outputfile", type = str, required = True,
                                   help = "Output bedGraph file name. If not specified, will write to standard output. REQUIRED." )
    add_outdir_option( argparser_pileup )
    argparser_pileup.add_argument( "-f", "--format", dest = "format", type = str,
                                   choices=("AUTO","BAM","SAM","BED","ELAND","ELANDMULTI","ELANDEXPORT","BOWTIE","BAMPE","BEDPE"),
                                   help = "Format of tag file, \"AUTO\", \"BED\", \"ELAND\", \"ELANDMULTI\", \"ELANDEXPORT\", \"SAM\", \"BAM\", \"BOWTIE\", \"BAMPE\", or \"BEDPE\". The default AUTO option will let '%(prog)s' decide which format the file is. DEFAULT: \"AUTO\", MACS3 will pick a format from \"AUTO\", \"BED\", \"ELAND\", \"ELANDMULTI\", \"ELANDEXPORT\", \"SAM\", \"BAM\" and \"BOWTIE\". If the format is BAMPE or BEDPE, please specify it explicitly. Please note that when the format is BAMPE or BEDPE, the -B and --extsize options would be ignored.",
                                   default = "AUTO" )
    argparser_pileup.add_argument( "-B", "--both-direction", dest = "bothdirection", action = "store_true", default = False,
                                   help = "By default, any read will be extended towards downstream direction by extension size. So it's [0,size-1] (1-based index system) for plus strand read and [-size+1,0] for minus strand read where position 0 is 5' end of the aligned read. Default behavior can simulate MACS3 way of piling up ChIP sample reads where extension size is set as fragment size/d. If this option is set as on, aligned reads will be extended in both upstream and downstream directions by extension size. It means [-size,size] where 0 is the 5' end of a aligned read. It can partially simulate MACS3 way of piling up control reads. However MACS3 local bias is calculated by maximizing the expected pileup over a ChIP fragment size/d estimated from 10kb, 1kb, d and whole genome background. This option will be ignored when the format is set as BAMPE or BEDPE. DEFAULT: False" )

    argparser_pileup.add_argument( "--extsize", dest = "extsize", type = int, default = 200,
                                   help = "The extension size in bps. Each alignment read will become a EXTSIZE of fragment, then be piled up. Check description for -B for detail. It's twice the `shiftsize` in old MACSv1 language. This option will be ignored when the format is set as BAMPE or BEDPE. DEFAULT: 200 " )
    argparser_pileup.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000",
                                   help = "Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 " )

    argparser_pileup.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
                                      help = "Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2" )
    return

def add_callvar_parser( subparsers ):
    """Add function 'variant calling' argument parsers. From SAPPER package
    """
    argparser_callvar = subparsers.add_parser("callvar",
                                              formatter_class = ap.RawDescriptionHelpFormatter,
                                              help="Call variants in given peak regions from the alignment BAM files.",
                                              epilog = """Tips to prepare your input BAM files:
*Note: please modify the following command lines accordingly*
Assuming you have two types of BAM files. The first type, what we call
`TREAT`, is from DNA enrichment assay such as ChIP-seq or ATAC-seq
where the DNA fragments in the sequencing library are enriched in
certain genomics regions with potential allele biases; the second
type, called `CTRL` for control, is from genomic assay in which the
DNA enrichment is less biased in multiploid chromosomes and more
uniform across the whole genome (the later one is optional).
1. Clean the BAM files:
    $ samtools view -q 30 -F 4 -F 256 -F 2048 -b TREAT.bam -o TREAT_clean.bam
    $ samtools view -q 30 -F 4 -F 256 -F 2048 -b CTRL.bam -o CTRL_clean.bam
2. Sort the BAM file:
    $ samtools sort  TREAT_clean.bam  TREAT_clean_sorted
    $ samtools sort  CTRL_clean.bam  CTRL_clean_sorted
3. Peak calling (example is for paired-end data):
    $ macs3 callpeak -f BAMPE -t TREAT_clean_sort.bam -c CTRL_clean_sort.bam -n MyFactor
4. Sort peak file:
    $ sort -k1,1 -k2,2n MyFactor_peaks.narrowPeak > MyFactor_peaks.sorted.bed
5. Extract reads in peak regions:
    $ samtools view -b TREAT_clean_sorted.bam -L MyFactor_peaks.sorted.bed -o TREAT_peaks.bam
    $ samtools view -b CTRL_clean_sorted.bam -L MyFactor_peaks.sorted.bed -o CTRL_peaks.bam
Finally, to call variants:
    $ macs3 callvar -b MyFactor_peaks.sorted.bed -t TREAT_peaks.bam -c CTRL_peaks.bam -o MyFactor.vcf
""")
    # group for input files
    group_input = argparser_callvar.add_argument_group( "Input files arguments" )
    group_input.add_argument( "-b", "--peak", dest = "peakbed", type = str, required =True,
                              help = "Peak regions in BED format, sorted by coordinates. REQUIRED." )
    group_input.add_argument( "-t", "--treatment", dest = "tfile", type = str, required = True, 
                              help = "ChIP-seq/ATAC-seq treatment file in BAM format, containing only records in peak regions, sorted by coordinates. Check instruction on how to make the file using samtools. REQUIRED." )
    group_input.add_argument( "-c", "--control", dest = "cfile", type = str, required = False, 
                              help = "Control file in BAM format, containing only records in peak regions, sorted by coordinates. Check instruction on how to make the file using samtools. ")
    # group for output files
    group_output = argparser_callvar.add_argument_group( "Output arguments" )
    add_outdir_option( group_output )
    group_output.add_argument( "-o", "--ofile", dest = "ofile", type = str, required = True,
                               help = "Output VCF file name." )    
    group_output.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
                               help = "Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2" )
    # group for parameters
    group_para = argparser_callvar.add_argument_group( "Variant calling arguments" )
    group_para.add_argument( "-g", "--gq-hetero", dest = "GQCutoffHetero", type = float,
                             help = "Genotype Quality score (-10log10((L00+L11)/(L01+L00+L11))) cutoff for Heterozygous allele type. Default:0, or there is no cutoff on GQ.", default = 0 )
    group_para.add_argument( "-G", "--gq-homo", dest = "GQCutoffHomo", type = float,
                             help = "Genotype Quality score (-10log10((L00+L01)/(L01+L00+L11))) cutoff for Homozygous allele (not the same as reference) type. Default:0, or ther is no cutoff on GQ.", default = 0 )
    group_para.add_argument( "-Q", dest = "Q", type = int, default = 20,
                             help = "Only consider bases with quality score greater than this value. Default: 20, which means Q20 or 0.01 error rate." )
    group_para.add_argument( "-D", dest = "maxDuplicate", type = int, default = 1,
                             help = "Maximum duplicated reads allowed per mapping position, mapping strand and the same CIGAR code. Default: 1. When sequencing depth is high, to set a higher value might help evaluate the correct allele ratio.")
    group_para.add_argument( "-F", "--fermi", dest = "fermi", type = str, default = "auto",
                             help = "Option to control when to apply local assembly through Fermi. By default (set as 'auto'), while SAPPER detects any INDEL variant in a peak region, it will utilize Fermi to recover the actual DNA sequences to refine the read alignments. If set as 'on', Fermi will be always invoked. It can increase specificity however sensivity and speed will be significantly lower. If set as 'off', Fermi won't be invoked at all. If so, speed and sensitivity can be higher but specificity will be significantly lower. Default: auto" )
    group_para.add_argument( "--fermi-overlap", dest = "fermiMinOverlap", type = int,
                             help = "The minimal overlap for fermi to initially assemble two reads. Must be between 1 and read length. A longer fermiMinOverlap is needed while read length is small (e.g. 30 for 36bp read, but 33 for 100bp read may work). Default:30", default = 30 )
    group_para.add_argument( "--top2alleles-mratio", dest = "top2allelesMinRatio", type = float,
                             help = "The reads for the top 2 most frequent alleles (e.g. a ref allele and an alternative allele) at a loci shouldn't be too few comparing to total reads mapped. The minimum ratio is set by this optoin. Must be a float between 0.5 and 1.  Default:0.8 which means at least 80%% of reads contain the top 2 alleles.", default = 0.8 )
    group_para.add_argument( "--altallele-count", dest = "altalleleMinCount", type = int,
                             help = "The count of the alternative (non-reference) allele at a loci shouldn't be too few. By default, we require at least two reads support the alternative allele. Default:2", default = 2 )
    group_para.add_argument( "--max-ar", dest = "maxAR", type = float,
                             help = "The maximum Allele-Ratio allowed while calculating likelihood for allele-specific binding. If we allow higher maxAR, we may mistakenly assign some homozygous loci as heterozygous. Default:0.95", default = 0.95 )
    # group for misc
    group_misc = argparser_callvar.add_argument_group( "Misc arguments" )
    group_misc.add_argument( "-m", "--multiple-processing", dest = "np", type = int, default = 1,
                             help = "CPU used for mutliple processing. Please note that, assigning more CPUs does not guarantee the process being faster. Creating too many parrallel processes need memory operations and  may negate benefit from multi processing. Default: 1" )
    return

if __name__ == '__main__':
    __spec__ = None
    try:
        main()
    except KeyboardInterrupt:
        sys.stderr.write("User interrupted me! ;-) Bye!\n")
        sys.exit(0)
    except MemoryError:
        sys.exit( "MemoryError occurred. If your input file has a large number of contigs/chromosomes, decrease the buffer_size value by setting --buffer-size option." )
        sys.exit(1)
