#!/usr/bin/env python

import sys
from optparse import OptionParser
import errno


from pysam import Samfile, Fastafile
import pysamstats


if __name__ == '__main__':
    # Command-line entry point.
    #
    # Parses the options, validates them, opens the SAM/BAM file (and the
    # reference FASTA file when the chosen statistics type needs one), then
    # dispatches to the matching ``pysamstats.write_<type>`` function, which
    # is handed ``sys.stdout`` as its output file.

    # Every value accepted by --type; each one maps to pysamstats.write_<type>.
    stats_types = ('coverage', 
                   'coverage_strand', 
                   'coverage_ext', 
                   'coverage_ext_strand', 
                   'coverage_normed',
                   'coverage_gc',
                   'coverage_normed_gc',
                   'variation', 
                   'variation_strand',
                   'tlen',
                   'tlen_strand',
                   'mapq',
                   'mapq_strand',
                   'baseq',
                   'baseq_strand',
                   'baseq_ext',
                   'baseq_ext_strand')
    # Subset of stats_types whose writer is also passed a Fastafile.
    stats_types_requiring_fasta = ('variation', 'variation_strand', 'baseq_ext', 'baseq_ext_strand', 'coverage_gc', 'coverage_normed_gc')
    usage = 'usage: %prog [options] FILE'
    description = "Calculate statistics per genome position based on pileups from a SAM or BAM file and print them to stdout."
    epilog = """
Supported statistics types:

    * coverage            - number of reads aligned to each genome position 
                            (total and properly paired)
    * coverage_strand     - as coverage but with forward/reverse strand counts
    * coverage_ext        - various additional coverage metrics, including 
                            coverage for reads not properly paired (mate 
                            unmapped, mate on other chromosome, ...)
    * coverage_ext_strand - as coverage_ext but with forward/reverse strand counts 
    * coverage_normed     - depth of coverage normalised by median or mean
    * coverage_gc         - as coverage but also includes a column for %GC
    * coverage_normed_gc  - as coverage_normed but also includes columns for normalisation
                            by %GC      
    * variation           - numbers of matches, mismatches, deletions, 
                            insertions, etc.
    * variation_strand    - as variation but with forward/reverse strand counts
    * tlen                - insert size statistics
    * tlen_strand         - as tlen but with statistics by forward/reverse strand
    * mapq                - mapping quality statistics
    * mapq_strand         - as mapq but with statistics by forward/reverse strand
    * baseq               - baseq quality statistics
    * baseq_strand        - as baseq but with statistics by forward/reverse strand
    * baseq_ext           - extended base quality statistics, including qualities
                            of bases matching and mismatching reference
    * baseq_ext_strand    - as baseq_ext but with statistics by forward/reverse strand
    
Examples:

    pysamstats --type coverage example.bam > example.coverage.txt
    pysamstats --type coverage --chromosome Pf3D7_v3_01 --start 100000 --end 200000 example.bam > example.coverage.txt

"""

    # Return the epilog verbatim instead of letting optparse format it, so
    # the hand-aligned table above is printed exactly as written.
    OptionParser.format_epilog = lambda self, formatter: self.epilog
    parser = OptionParser(usage=usage, description=description, epilog=epilog)
    parser.add_option('-t', '--type', dest='type', help='type of statistics to print: %s' % ', '.join(stats_types), default='coverage')
    parser.add_option('-c', '--chromosome', dest='chromosome', help='chromosome name', default=None)
    parser.add_option('-s', '--start', dest='start', type='int', help='start position (1-based)', default=None)
    parser.add_option('-e', '--end', dest='end', type='int', help='end position (1-based)', default=None)
    parser.add_option('-z', '--zero-based', dest='zero_based', help='use zero-based coordinates (default is false, i.e., use one-based coords)', action='store_true')
    parser.add_option('-f', '--fasta', dest='fasta', help='reference sequence file, only required for some statistics', default=None)
    parser.add_option('--gc-window-length', dest='gc_window_length', type='int', help='size of window to use for %GC calculations [300]', default=300, metavar='N')
    parser.add_option('--gc-window-offset', dest='gc_window_offset', type='int', help='window offset to use for deciding which genome position to report %GC calculations against [150]', default=150, metavar='N')
    parser.add_option('-o', '--omit-header', dest='omit_header', help='omit header row from output', action='store_true')
    parser.add_option('-p', '--progress', dest='progress', type='int', help='report progress every N rows', metavar='N', default=None)
    options, args = parser.parse_args()

    # Exactly one positional argument (the SAM/BAM file) is expected.
    # parser.error() prints the message and exits, so nothing below runs
    # for an invalid invocation.
    if len(args) < 1:
        parser.error('missing file operand\n\nTry "pysamstats --help" for more information.')
    if len(args) > 1:
        parser.error('too many file operands\n\nTry "pysamstats --help" for more information.')

    # Validate the options before opening any file, so that a bad command
    # line is rejected without touching the inputs.
    if options.type not in stats_types:
        parser.error('unsupported statistics type: "%s"\nTry one of %s or "pysamstats --help" for more information.' % (options.type, stats_types))
    needs_fasta = options.type in stats_types_requiring_fasta
    if needs_fasta and options.fasta is None:
        parser.error('missing --fasta option\n\nTry "pysamstats --help" for more information.')

    # Keyword arguments passed to every writer function.
    common_kwargs = dict(write_header=not options.omit_header,
                         chrom=options.chromosome,
                         start=options.start,
                         end=options.end,
                         one_based=not options.zero_based,
                         progress=options.progress)

    samfile = Samfile(args[0])
    fafile = None

    try:

        # Writers are looked up by name, e.g. --type coverage -> write_coverage.
        write_stats = getattr(pysamstats, 'write_' + options.type)

        if needs_fasta:
            # These writers additionally receive the reference sequence and
            # the GC window settings.
            fafile = Fastafile(options.fasta)
            write_stats(sys.stdout, samfile, fafile,
                        gc_window_length=options.gc_window_length,
                        gc_window_offset=options.gc_window_offset,
                        **common_kwargs)
        else:
            write_stats(sys.stdout, samfile, **common_kwargs)

    except IOError as e:
        # A broken pipe (e.g. output piped into `head`) is ignored; any
        # other I/O error is re-raised.
        if e.errno != errno.EPIPE:
            raise

    finally:
        # Release the input file handles whether or not writing succeeded.
        if fafile is not None:
            fafile.close()
        samfile.close()
