from os import path
from os.path import join, splitext, dirname
from segzoo.gene_biotypes import BIOTYPES

# OPTIONS
# Flags inserted into the segtools command lines of the rules below
QUIET = ""
CLOBBER = "--clobber"
NOPLOT = "--noplot"

# Values taken from the workflow configuration
OUTDIR = config["outdir"]
SEGMENTATION = config["segmentation"]
# TODO: try to set default values in argparse instead of falling back inline
# A falsy configuration entry is replaced by an empty list
GMTK_PARAMS = config["parameters"] or []
MNE_FILENAME = config["mne"] or []
DENDROGRAM = config["dendrogram"]

PREFIX = config["prefix"]  # e.g. "anaconda_path/envs/your_env/"
SPECIES = config["species"]  # e.g. "Homo_sapiens"
BUILD = config["build"]  # e.g. "hg38"

recipeGTF = "rnaseq"
recipeSEQ = "sequence"

fileGTF = BUILD + ".gtf.gz"
fileSEQ = BUILD + ".fa"

BUILDPATH = join(PREFIX, "share", "ggd", SPECIES, BUILD)
HERE = path.abspath(dirname(__file__))

# Names of the sub-directories and files created below OUTDIR
RESULTS = "results"
RESULTFILE = "results.tsv"
DATA = "data"
PLOTS = "plots"

# Two plots are requested when GMTK parameters are configured, a single one otherwise
PLOTFILES = ["unnormalized_plot", "normalized_plot"] if GMTK_PARAMS else ["plot"]

# Value of the `normalize_gmtk` parameter used for each plot file name
FILENAME_TO_NORMALIZE = dict(unnormalized_plot=False, normalized_plot=True, plot=False)

# Temporary solution for setting predictable recipe names and file paths
def _ggd_recipe_entry(species, build, recipe, extension):
	"""Return the recipe name together with the path of its file below PREFIX."""
	relative_path = "/".join(
		("share", "ggd", species, build, recipe, "1", recipe + extension)
	)
	return {"recipe": recipe, "filepath": join(PREFIX, relative_path)}


# For every supported build: the recipe providing the genome sequence and
# the recipe providing the gene annotation
BUILD_RECIPE_MAPPING = {
	"hg38": {
		"sequence": _ggd_recipe_entry(
			"Homo_sapiens", "hg38", "hg38-reference-genome-ucsc-v1", ".fa"
		),
		"gtf": _ggd_recipe_entry(
			"Homo_sapiens", "hg38", "hg38-gtf-ensembl-v1", ".gtf.gz"
		),
	},
	"danRer10": {
		"sequence": _ggd_recipe_entry(
			"Danio_rerio", "danRer10", "danrer10-reference-genome-ucsc-v1", ".fa.gz"
		),
		"gtf": _ggd_recipe_entry(
			"Danio_rerio", "danRer10", "danrer10-gtf-ensembl-v1", ".gtf.gz"
		),
	},
}

# Recipes of the configured build (a KeyError is raised for any other build)
RECIPE_FILEPATH = BUILD_RECIPE_MAPPING[BUILD]

# File written by segtools-preprocess and read by the segtools analyses
preprocessed_segmentation = join(OUTDIR, DATA, "segmentation", "segmentation.pkl.gz")


# Default target: every requested plot, both as PNG and as PDF
rule all:
	input:
		[join(OUTDIR, PLOTS, plot_name + '.png') for plot_name in PLOTFILES],
		[join(OUTDIR, PLOTS, plot_name + '.pdf') for plot_name in PLOTFILES]

# Runs visualization.py on the result tables to create one plot (PNG and PDF)
rule job:
	input:
		# Result tables of the individual analyses
		len_dist=join(OUTDIR, RESULTS, "length_distribution", RESULTFILE),
		nuc=join(OUTDIR, RESULTS, "nucleotide", RESULTFILE),
		olp=join(OUTDIR, RESULTS, "overlap", RESULTFILE),
		# One aggregation table per gene biotype
		aggs=expand(join(OUTDIR, RESULTS, "aggregation", "gene_biotype", "{biotype}", RESULTFILE), biotype=BIOTYPES),
		stats=join(BUILDPATH, recipeGTF, "gene_biotype", "gene_biotype_stats"),
		# The next two inputs are empty lists when they are not configured
		mne=MNE_FILENAME,
		gmtk=join(OUTDIR, RESULTS, "gmtk_parameters", RESULTFILE) if config['parameters'] else []
	output:
		png=join(OUTDIR, PLOTS, '{file}.png'),
		pdf=join(OUTDIR, PLOTS, '{file}.pdf')
	params:
		# Output path without extension
		outfile=join(OUTDIR, PLOTS, '{file}'),
		dendrogram=DENDROGRAM,
		# Looked up from the name of the requested plot file
		normalize_gmtk=lambda wildcards: FILENAME_TO_NORMALIZE[wildcards.file]
	script:
		"visualization.py"


# DOWNLOADING RULES

# Target rule gathering the GGD files installed into the anaconda environment
# folder plus the gene biotype separation output
rule download_targets:
	input:
		RECIPE_FILEPATH['sequence']['filepath'],  # genome sequence
		RECIPE_FILEPATH['gtf']['filepath'],  # gene annotation
		join(BUILDPATH, recipeGTF, "gene_biotype", "gene_biotype_stats")

# Installs the GTF recipe of the configured build with ggd
rule download_ggd_annotation:
	params:
		recipe=RECIPE_FILEPATH['gtf']['recipe'],
		# Directory of the expected file; it is removed before installing
		outdir=dirname(RECIPE_FILEPATH['gtf']['filepath'])
	output:
		RECIPE_FILEPATH['gtf']['filepath']
	shell:
		"""
		rm -R {params.outdir}
		ggd install --prefix {PREFIX} {params.recipe}
		"""

# Installs the genome sequence recipe of the configured build with ggd
rule download_ggd_sequence:
	input:
	output:
		RECIPE_FILEPATH['sequence']['filepath']
	params:
		recipe=RECIPE_FILEPATH['sequence']['recipe'],
		# Directory of the expected file; it is removed before installing
		outdir=dirname(RECIPE_FILEPATH['sequence']['filepath'])
	# Decompress only when a "<output>.gz" companion exists. When the expected
	# file itself already ends in ".gz" (the danRer10 entry), an unconditional
	# "zcat {output}.gz > {output}" would truncate the expected file through the
	# redirection and then fail on the missing "<output>.gz".
	shell:
		"""
		rm -R {params.outdir}
		ggd install --prefix {PREFIX}  {params.recipe}
		if [ -f {output}.gz ]; then
			zcat {output}.gz > {output}
		fi
		"""


# PREPROCESSING RULES

# Preprocess the segmentation once so that later steps can parse it faster
rule run_segtools_preprocess:
	input:
		SEGMENTATION
	params:
		# Both extensions are stripped because segtools-preprocess adds them itself
		outfile=splitext(splitext(preprocessed_segmentation)[0])[0]
	output:
		preprocessed_segmentation
	shell:
		"segtools-preprocess {QUIET} {CLOBBER} {SEGMENTATION} {params.outfile}"

# Split the main GTF file into one file per gene biotype (uses pybedtools)
rule create_gene_biotype_gtf:
	input:
		gtf=RECIPE_FILEPATH['gtf']['filepath']
	params:
		outdir=join(BUILDPATH, recipeGTF, "gene_biotype"),
		outfile=fileGTF
	output:
		# For every biotype a file in "general" and a file in "gene"
		expand(join(BUILDPATH, recipeGTF, "gene_biotype", "{biotype}", "general", fileGTF), biotype=BIOTYPES),
		expand(join(BUILDPATH, recipeGTF, "gene_biotype", "{biotype}", "gene", fileGTF), biotype=BIOTYPES),
		stats=join(BUILDPATH, recipeGTF, "gene_biotype", "gene_biotype_stats")
	script:
		"create_gene_biotype_gtfs.py"


# ANALYZING RULES

# Create the GMTK parameters result table from the configured parameters
rule run_segtools_gmtk_parameters:
	output:
		join(OUTDIR, RESULTS, "gmtk_parameters", RESULTFILE)
	input:
		GMTK_PARAMS
	script:
		"create_gmtk_results.py"

# Segtools Aggregation over the complete GTF annotation
rule run_segtools_aggregation_general:
	input:
		preprocessed_segmentation,
		gtf=RECIPE_FILEPATH['gtf']['filepath']
	params:
		# Directory handed to --outdir; the declared output lies inside it
		outdir=join(OUTDIR, DATA, "aggregation", "general")
	output:
		join(OUTDIR, DATA, "aggregation", "general", "feature_aggregation.tab")
	shell:
		"segtools-aggregation --mode=gene {QUIET} {CLOBBER} {NOPLOT} --outdir {params.outdir} {preprocessed_segmentation} {input.gtf}"

# Segtools Aggregation over the GTF of a single gene biotype
rule run_segtools_aggregation_gene_biotype:
	input:
		preprocessed_segmentation,
		# "general" GTF of the requested biotype
		gtf=join(BUILDPATH, recipeGTF, "gene_biotype", "{biotype}", "general", fileGTF)
	params:
		# Directory handed to --outdir; the declared output lies inside it
		outdir=join(OUTDIR, DATA, "aggregation", "gene_biotype", "{biotype}")
	output:
		join(OUTDIR, DATA, "aggregation", "gene_biotype", "{biotype}", "feature_aggregation.tab")
	shell:
		"segtools-aggregation --mode=gene {QUIET} {CLOBBER} {NOPLOT} --outdir {params.outdir} {preprocessed_segmentation} {input.gtf}"

# Create the result table from the general aggregation output
rule create_aggregation_results_general:
	output:
		outfile=join(OUTDIR, RESULTS, "aggregation", "general", RESULTFILE)
	input:
		infile=join(OUTDIR, DATA, "aggregation", "general", "feature_aggregation.tab")
	script:
		"create_aggregation_results.py"

# Create the result table from the aggregation output of a single gene biotype
rule create_aggregation_results_gene_biotype:
	output:
		outfile=join(OUTDIR, RESULTS, "aggregation", "gene_biotype", "{biotype}", RESULTFILE)
	input:
		infile=join(OUTDIR, DATA, "aggregation", "gene_biotype", "{biotype}", "feature_aggregation.tab")
	script:
		"create_aggregation_results.py"

# Segtools Length Distribution execution; the results file is a plain copy
# of the segment sizes table
# TODO: Don't just copy, preprocess for easier obtention of results
rule run_segtools_length_distribution:
	input:
		preprocessed_segmentation,
	params:
		outdir=join(OUTDIR, DATA, "length_distribution")
	output:
		join(OUTDIR, DATA, "length_distribution", "segment_sizes.tab"),  # copy source
		join(OUTDIR, RESULTS, "length_distribution", RESULTFILE)  # copy destination
	shell:
		"segtools-length-distribution {QUIET} {CLOBBER} {NOPLOT} --outdir {params.outdir} {preprocessed_segmentation};"
		"cp {output[0]} {output[1]}"

# Segtools Feature Distance against the complete GTF annotation
rule run_segtools_feature_distance_general:
	input:
		preprocessed_segmentation,
		gtf=RECIPE_FILEPATH['gtf']['filepath']
	params:
		outdir=join(OUTDIR, DATA, "feature_distance", "general")
	output:
		join(OUTDIR, DATA, "feature_distance", "general", "feature_distance.tab"),
		# Receives the standard output of the command
		outfile=join(OUTDIR, DATA, "feature_distance", "general", "feature_distance_segments.tab")
	shell:
		"segtools-feature-distance {QUIET} {CLOBBER} {NOPLOT} --outdir {params.outdir} {preprocessed_segmentation} {input.gtf} > {output.outfile}"

# Segtools Feature Distance against the GTF of a single gene biotype
rule run_segtools_feature_distance_gene_biotype:
	input:
		preprocessed_segmentation,
		# Use the annotation of the requested biotype: with the complete GTF
		# every {biotype} would produce the same distances.
		# NOTE(review): the "general" split mirrors
		# run_segtools_aggregation_gene_biotype; confirm that the "gene" split
		# was not the intended one.
		gtf=join(BUILDPATH, recipeGTF, "gene_biotype", "{biotype}", "general", fileGTF)
	output:
		join(OUTDIR, DATA, "feature_distance", "gene_biotype", "{biotype}", "feature_distance.tab"),
		# Receives the standard output of the command
		outfile=join(OUTDIR, DATA, "feature_distance", "gene_biotype", "{biotype}", "feature_distance_segments.tab")
	params:
		outdir=join(OUTDIR, DATA, "feature_distance", "gene_biotype", "{biotype}")
	shell:
		"segtools-feature-distance {QUIET} {CLOBBER} {NOPLOT} --outdir {params.outdir} {preprocessed_segmentation} {input.gtf} > {output.outfile}"

# Run bedtools nuc on the segmentation with the genome sequence of the build
rule run_bedtools_nuc:
	output:
		# Receives the standard output of the command
		join(OUTDIR, DATA, "nucleotide", "bedtools_output")
	input:
		SEGMENTATION,
		fasta=RECIPE_FILEPATH['sequence']['filepath']
	shell:
		"bedtools nuc -fi {input.fasta} -bed {SEGMENTATION} > {output}"

# Create the nucleotide result table from the bedtools output
rule create_nucleotide_results:
	output:
		outfile=join(OUTDIR, RESULTS, "nucleotide", RESULTFILE)
	input:
		infile=join(OUTDIR, DATA, "nucleotide", "bedtools_output")
	script:
		"create_nucleotide_results.py"

# Segtools Overlap (by bases) between the segmentation and the GTF annotation
rule run_segtools_overlap:
	input:
		SEGMENTATION,
		gtf=RECIPE_FILEPATH['gtf']['filepath']
	params:
		# Directory handed to --outdir; the declared output lies inside it
		outdir=join(OUTDIR, DATA, "overlap")
	output:
		join(OUTDIR, DATA, "overlap", "overlap.tab")
	shell:
		"segtools-overlap --by=bases {QUIET} {CLOBBER} {NOPLOT} --outdir {params.outdir} {SEGMENTATION} {input.gtf}"

# Create the overlap result table from the segtools-overlap output
rule create_overlap_results:
	output:
		outfile=join(OUTDIR, RESULTS, "overlap", RESULTFILE)
	input:
		infile=join(OUTDIR, DATA, "overlap", "overlap.tab")
	script:
		"create_overlap_results.py"


# Visualization

# Draws the rule graph of the workflow as SVG and as PNG
rule create_diagram:
	input:
	output:
		svg="dag.svg",
		png="dag.png"
	# Write exactly the declared outputs: the previous command only wrote
	# {PLOTS}/dag.svg, so neither declared file was ever produced.
	shell:
		"""
		snakemake --rulegraph all | dot -Tsvg > {output.svg}
		snakemake --rulegraph all | dot -Tpng > {output.png}
		"""