import os.path

#: Reference FASTA path for each genome build, taken from the ``b37_path`` and
#: ``b38_path`` config keys.  A build whose key is not configured maps to
#: ``/dev/null``.
REF = dict(
    b37=config.get('b37_path', '/dev/null'),
    b38=config.get('b38_path', '/dev/null'),
)


# Aggregate target: requests, for both genome builds, the final bgzip-ed TSV
# under output/ together with its tabix index and the MD5 file of each.
rule default:
    input:
        expand(
            "output/clinvar.{build}.tsv{ext}",
            build=["b37", "b38"],
            ext=[".gz", ".gz.tbi", ".gz.md5", ".gz.tbi.md5"],
        )


# Download the ClinVar full-release XML from the NCBI FTP server.
#
# wget's stderr goes to a log file next to the download; it is declared via
# ``log:`` (same path as before) so Snakemake knows about it.
# ``-N`` is intentionally not passed: wget switches timestamping off whenever
# ``-O`` is given and only prints a warning about the combination.
rule download_xml:
    output:
        "downloads/ClinVarFullRelease_00-latest.xml.gz"
    log:
        "downloads/ClinVarFullRelease_00-latest.xml.gz.log"
    shell:
        r"""
        set -euo pipefail

        wget -c -O {output} ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_00-latest.xml.gz \
        2> {log}
        """


# Download the tab-delimited ClinVar variant summary from the NCBI FTP server.
#
# wget's stderr goes to a log file next to the download; it is declared via
# ``log:`` (same path as before) so Snakemake knows about it.
# ``-N`` is intentionally not passed: wget switches timestamping off whenever
# ``-O`` is given and only prints a warning about the combination.
rule download_txt:
    output:
        "downloads/variant_summary.txt.gz"
    log:
        "downloads/variant_summary.txt.gz.log"
    shell:
        r"""
        set -euo pipefail

        wget -c -O {output} ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz \
        2> {log}
        """


# Parse the ClinVar release XML into one raw TSV per genome build (b37, b38).
#
# Both outputs are temp() files.  The variant summary is listed as an input
# but is not passed to the command below; as written it only acts as a
# dependency of this rule.
rule parse_clinvar:
    input:
        release_xml="downloads/ClinVarFullRelease_00-latest.xml.gz",
        summary="downloads/variant_summary.txt.gz",
    output:
        b37=temp("parsed/clinvar_table_raw.b37.tsv"),
        b38=temp("parsed/clinvar_table_raw.b38.tsv"),
    params:
        # Extra arguments for debug runs: restrict parsing to 100 RCVs when the
        # "debug" config value renders as "True".  config.get() keeps the rule
        # usable when "debug" is not configured at all, matching how the other
        # config values in this file are read.
        debug_args=(
            "--max-rcvs 100" if str(config.get("debug", False)) == "True" else ""
        ),
    shell:
        r"""
        set -euo pipefail

        clinvar_tsv parse_xml \
            --clinvar-xml {input.release_xml} \
            --output-b37 {output.b37} \
            --output-b38 {output.b38} \
            {params.debug_args}
        """


def _reference_for_build(wildcards):
    """Return the REF entry registered for the requested ``genome_build``."""
    return REF[wildcards.genome_build]


# Normalize the raw per-build TSV against the build's reference, drop empty
# lines, and store the result bgzip-compressed as a temp() file.
rule normalize_clinvar:
    input:
        tsv="parsed/clinvar_table_raw.{genome_build}.tsv",
        reference=_reference_for_build,
    output:
        temp("normalized/clinvar_table_normalized.{genome_build}.tsv.gz"),
    shell:
        r"""
        set -euo pipefail

        clinvar_tsv normalize_tsv \
            --input-tsv {input.tsv} \
            --reference {input.reference} \
            --output-tsv /dev/stdout \
        | grep -v '^$' \
        | bgzip -c \
        > {output}
        """


# Sort the normalized table (header line kept first), bgzip it, index it with
# tabix and write MD5 checksum files for the table and the index.
#
# The input is decompressed once and streamed through a single pipeline, so a
# failure of zcat or sort is caught by ``pipefail``.  (Commands placed inside
# ``<(...)`` process substitutions do not propagate their exit status, which
# would let a truncated result pass as success.)
rule sort_tabix_clinvar:
    input:
        "normalized/clinvar_table_normalized.{genome_build}.tsv.gz",
    output:
        tsv="unmerged/clinvar.{genome_build}.tsv.gz",
        tbi="unmerged/clinvar.{genome_build}.tsv.gz.tbi",
        tsv_md5="unmerged/clinvar.{genome_build}.tsv.gz.md5",
        tbi_md5="unmerged/clinvar.{genome_build}.tsv.gz.tbi.md5",
    shell:
        r"""
        set -euo pipefail

        zcat {input} \
        | (
            if IFS= read -r header || [[ -n "$header" ]]; then
                printf '%s\n' "$header"
            fi
            sort -k2,2V -k3,3n -k4,4n -k11,11
        ) \
        | bgzip -c \
        > {output.tsv}
        tabix -S 1 -s 2 -b 3 -e 4 -f {output.tsv}

        cd $(dirname {output.tsv})
        md5sum $(basename {output.tsv}) >$(basename {output.tsv}).md5
        md5sum $(basename {output.tbi}) >$(basename {output.tbi}).md5
        """

# Stream the sorted per-build table through ``clinvar_tsv merge_tsvs``, bgzip
# the result into output/, index it with tabix and write MD5 checksum files
# for the table and the index.
rule merge_clinvar:
    input:
        "unmerged/clinvar.{genome_build}.tsv.gz",
    output:
        tsv="output/clinvar.{genome_build}.tsv.gz",
        tbi="output/clinvar.{genome_build}.tsv.gz.tbi",
        tsv_md5="output/clinvar.{genome_build}.tsv.gz.md5",
        tbi_md5="output/clinvar.{genome_build}.tsv.gz.tbi.md5",
    params:
        # Value handed to --clinvar-version; "." when the config does not set one.
        clinvar_version=config.get("clinvar_version", "."),
    shell:
        r"""
        set -euo pipefail

        zcat {input} \
        | clinvar_tsv merge_tsvs \
            --clinvar-version {params.clinvar_version} \
            --input-tsv /dev/stdin \
            --output-tsv /dev/stdout \
        | bgzip -c \
        > {output.tsv}
        tabix -S 1 -s 2 -b 3 -e 4 -f {output.tsv}

        cd $(dirname {output.tsv})
        md5sum $(basename {output.tsv}) >$(basename {output.tsv}).md5
        md5sum $(basename {output.tbi}) >$(basename {output.tbi}).md5
        """
