configfile: "config.yml"

import acanthophis


# Hand `config` and the locations of the metadata tables / sample-set lists to
# acanthophis.
# NOTE(review): presumably this is what makes the sample-set and region
# lookups used further down available -- confirm against the acanthophis API.
acanthophis.populate_metadata(
    config,
    runlib="metadata/runlib2sample.tsv",
    sample_meta="metadata/samples.tsv",
    setfile_glob="metadata/samplesets/*.txt",
)

# `shell.prefix` is a method and has to be *called*; assigning a string to it
# only rebinds the attribute and never registers the prefix with Snakemake.
shell.prefix("set -euo pipefail; ")

# None of these wildcards may contain a path separator, so each one always
# matches exactly one path component (or part of one).
wildcard_constraints:
    run="[^/]+",
    lib="[^/]+",
    aligner="[^/]+",
    sample="[^/]+",
    ref="[^/]+",
    type="[^/]+",





def raw_variant_calls_input(wildcards):
    """List every per-region raw BCF requested by the ``varcall`` config.

    For each sample set under ``config["varcall"]["samplesets"]``, every
    combination of its ``callers``, ``aligners`` and ``refs`` is expanded over
    the regions found in ``VARCALL_REGIONS[caller][ref]``.

    ``wildcards`` is accepted so the function can be used as a rule input
    function, but it is not read. ``config``, ``expand`` and
    ``VARCALL_REGIONS`` come from the enclosing workflow; ``VARCALL_REGIONS``
    is not defined in the visible part of this file.

    Returns a flat list of paths, ordered by sample set, caller, aligner,
    reference and then region.
    """
    pattern = "data/variants/raw_split/{caller}~{aligner}~{ref}~{sampleset}/{region}.bcf"
    samplesets = config["varcall"]["samplesets"]
    targets = []
    for sampleset in samplesets:
        spec = samplesets[sampleset]
        # Lazily enumerate caller x aligner x ref in the same nesting order as
        # three explicit loops would.
        combos = (
            (caller, aligner, ref)
            for caller in spec["callers"]
            for aligner in spec["aligners"]
            for ref in spec["refs"]
        )
        for caller, aligner, ref in combos:
            targets.extend(
                expand(
                    pattern,
                    caller=caller,
                    aligner=aligner,
                    ref=ref,
                    sampleset=sampleset,
                    region=VARCALL_REGIONS[caller][ref],
                )
            )
    return targets


# Target rule: requests every file listed by raw_variant_calls_input().
rule raw_variant_calls:
    input:
        raw_variant_calls_input,


## Haplotype Caller

### We don't do BQSR, as it requires a set of known polymorphisms. We don't
### have one, so the original BAMs will have to do.

# Run `gatk HaplotypeCaller` in GVCF mode (-ERC GVCF) on one sample's BAM,
# restricted to a single region (-L). The reference FASTA is looked up in
# config["refs"] by the {ref} wildcard. The resulting gVCF is marked temp(),
# and all tool output is captured in the log file.
rule gatk_hapcall:
    input:
        bam="data/alignments/samples/{aligner}/{ref}/{sample}.bam",
        bai="data/alignments/samples/{aligner}/{ref}/{sample}.bam.bai",
        ref=lambda wildcards: config["refs"][wildcards.ref]["fasta"],
    output:
        gvcf=temp("data/variants/gatk/hapcall/{aligner}~{ref}~{sample}/{region}.gvcf"),
    log:
        "data/variants/gatk/hapcall/{aligner}~{ref}~{sample}/{region}.gvcf.log",
    threads: 2
    shell:
        "gatk   HaplotypeCaller"
        "   -R {input.ref}   -I {input.bam}   -O {output.gvcf}"
        "   -ERC GVCF   -L {wildcards.region}"
        "   --heterozygosity 0.05   --heterozygosity-stdev 0.01"
        "   --indel-heterozygosity 0.01"
        "   --max-reads-per-alignment-start 50"
        "   --native-pair-hmm-threads {threads}"
        "   --create-output-variant-index   --create-output-variant-md5"
        "   --contamination-fraction-to-filter 0.03"
        " >{log} 2>&1"


# Combine the per-sample gVCFs of one sample set into a single gVCF for one
# region with `gatk CombineGVCFs`. The list of samples comes from
# SAMPLESETS[<sampleset wildcard>]; SAMPLESETS is not defined in the visible
# part of this file. Each input gVCF is passed with its own -V flag.
rule gatk_combinegvcfs:
    input:
        gvcfs=lambda wildcards: expand(
            "data/variants/gatk/hapcall/{aligner}~{ref}~{sample}/{region}.gvcf",
            aligner=wildcards.aligner,
            ref=wildcards.ref,
            region=wildcards.region,
            sample=SAMPLESETS[wildcards.sampleset],
        ),
        ref=lambda wildcards: config["refs"][wildcards.ref]["fasta"],
    output:
        gvcf="data/variants/gatk/combinedgvcf/{aligner}~{ref}~{sampleset}/{region}.gvcf.gz",
    log:
        "data/variants/gatk/combinedgvcf/{aligner}~{ref}~{sampleset}/{region}.gvcf.gz.log",
    threads: 1
    run:
        # "a -V b -V c"; the leading -V is supplied in the command below.
        per_sample_args = " -V ".join(input.gvcfs)
        shell(
            "gatk   CombineGVCFs"
            "   -R {input.ref}   -L {wildcards.region}"
            f"  -V {per_sample_args}"
            "   -O {output.gvcf}"
            "   --create-output-variant-index   --create-output-variant-md5"
            " >{log} 2>&1"
        )


# Run `gatk GenotypeGVCFs` on the combined gVCF of one sample set, restricted
# to a single region (-L), writing a bgzipped VCF. The reference FASTA is
# looked up in config["refs"] by the {ref} wildcard.
#
# The redirection literal starts with a space so that the last option value
# and `>` are separate shell words; without it the command only worked because
# "0.01" is not an all-digit token (bash treats digits directly before `>` as a
# file-descriptor number).
#
# NOTE(review): the log is named *.gvcf.gz.log although the output is
# *.vcf.gz; the path is left unchanged here so existing logs do not move.
rule gatk_genotypegvcfs:
    input:
        gvcf="data/variants/gatk/combinedgvcf/{aligner}~{ref}~{sampleset}/{region}.gvcf.gz",
        ref=lambda wc: config['refs'][wc.ref]["fasta"],
    output:
        vcf="data/variants/gatk/genotypedgvcf/{aligner}~{ref}~{sampleset}/{region}.vcf.gz",
    log:
        "data/variants/gatk/genotypedgvcf/{aligner}~{ref}~{sampleset}/{region}.gvcf.gz.log"
    threads:
        1
    shell:
        "gatk"
        "   GenotypeGVCFs"
        "   -R {input.ref}"
        "   -V {input.gvcf}"
        "   -O {output.vcf}"
        "   -L {wildcards.region}"
        "   --create-output-variant-index"
        "   --create-output-variant-md5"
        "   --heterozygosity 0.05"
        "   --heterozygosity-stdev 0.01"
        "   --indel-heterozygosity 0.01"
        " >{log} 2>&1"


#GenotypeGVCFs  #region
#VariantRecalibrator, ApplyRecalibration  # per region

# Merge the per-region genotyped VCFs of one aligner/ref/sample-set combination
# into a single final VCF with `gatk MergeVcfs`. Regions are taken from
# VARCALL_REGIONS["gatk-hc"][<ref>] (not defined in the visible part of this
# file) and passed in sorted order, each with its own -I flag.
rule gatk_mergevariants:
    input:
        vcf=lambda wildcards: expand(
            "data/variants/gatk/genotypedgvcf/{aligner}~{ref}~{sampleset}/{region}.vcf.gz",
            aligner=wildcards.aligner,
            ref=wildcards.ref,
            sampleset=wildcards.sampleset,
            region=sorted(VARCALL_REGIONS["gatk-hc"][wildcards.ref]),
        ),
    output:
        vcf="data/variants/final/gatk-hc~{aligner}~{ref}~{sampleset}.vcf.gz",
    log:
        "data/variants/final/gatk-hc~{aligner}~{ref}~{sampleset}.vcf.gz.log",
    run:
        # "a -I b -I c"; the leading -I is supplied in the command below.
        merge_inputs = " -I ".join(input.vcf)
        shell(
            "gatk MergeVcfs   -O {output.vcf}"
            f"  -I {merge_inputs}"
            " >{log} 2>&1"
        )


### ANGSD

# ANGSD: a somewhat hacky version of Rose's logic, with hardcoded sites per step1 files

# Run angsd for one chromosome (-r) of one aligner/ref/sample-set combination,
# restricted to the positions in that chromosome's pre-existing sites file
# under rawdata/angsd-sites-rose/. The same FASTA is passed as both -ref and
# -anc. angsd is given an output prefix (-out) rather than file names; the
# files listed under `output` all share that prefix.
rule angsd_step2_maf_chrom:
    input:
        ref=lambda wildcards: config["refs"][wildcards.ref]["fasta"],
        sites="rawdata/angsd-sites-rose/{chrom}.sites_mm",
        bamlist="data/alignments/bamlists/{aligner}~{ref}~{sampleset}.bamlist",
    output:
        "data/angsd/step2/maf_{aligner}~{ref}~{sampleset}_{chrom}.arg",
        "data/angsd/step2/maf_{aligner}~{ref}~{sampleset}_{chrom}.hwe.gz",
        "data/angsd/step2/maf_{aligner}~{ref}~{sampleset}_{chrom}.mafs.gz",
        "data/angsd/step2/maf_{aligner}~{ref}~{sampleset}_{chrom}.qs",
        "data/angsd/step2/maf_{aligner}~{ref}~{sampleset}_{chrom}.saf.gz",
        "data/angsd/step2/maf_{aligner}~{ref}~{sampleset}_{chrom}.saf.idx",
        "data/angsd/step2/maf_{aligner}~{ref}~{sampleset}_{chrom}.saf.pos.gz",
        "data/angsd/step2/maf_{aligner}~{ref}~{sampleset}_{chrom}.snpStat.gz",
    log:
        "data/log/angsd/step2/maf_{aligner}~{ref}~{sampleset}_{chrom}.log",
    shell:
        "angsd   -P 1"
        "   -bam {input.bamlist}"
        "   -out data/angsd/step2/maf_{wildcards.aligner}~{wildcards.ref}~{wildcards.sampleset}_{wildcards.chrom}"
        "   -ref {input.ref} -anc {input.ref}"
        "   -GL 2   -doMajorMinor 3   -skipTriallelic 1"
        "   -doHWE 1   -doMaf 1   -doSaf 1"
        "   -doCounts 1   -doQsDist 1   -doSnpStat 1"
        "   -C 50   -baq 1   -minMapQ 20   -minQ 20"
        "   -r {wildcards.chrom}   -sites {input.sites}"
        "   > {log} 2>&1"


# Target rule: requests the .mafs.gz output of angsd_step2_maf_chrom for every
# combination of aligner, ref, sample set and chromosome configured under
# config["angsd"].
# NOTE(review): `ref` is read from config["angsd"]["refs"]["fasta"], whereas
# the other rules use the {ref} wildcard as a key into config["refs"] --
# confirm this yields reference names rather than FASTA paths.
rule all_angsd_step2_maf:
    input:
        expand(
            "data/angsd/step2/maf_{aligner}~{ref}~{sampleset}_{chrom}.mafs.gz",
            aligner=config["angsd"]["aligners"],
            ref=config["angsd"]["refs"]["fasta"],
            sampleset=config["angsd"]["samplesets"],
            chrom=config["angsd"]["chroms"],
        ),


#######################################################################
#                              All rule                               #
#######################################################################


# Aggregate target: depends on the inputs of the `denovo`, `reads`, `align`
# and `varcall` rules.
# NOTE(review): none of those four rules is defined in the visible part of
# this file -- confirm they are defined or included elsewhere.
rule all:
    input:
        rules.denovo.input,
        rules.reads.input,
        rules.align.input,
        rules.varcall.input,
