#!/bin/bash

#A simple set of commands to perform the standard 3C mapping operations and
#generate GRAAL-compatible matrices.
#
#This script must be run in the same directory as fraglist.py as that's what
#it relies on to perform some of the GRAAL-specific stuff like matrix/fragment data
#file generation.

# Command line words that are not recognised as options are collected here
arguments=()

# Default names of the files written to the output directory
frag_out="fragments_list.txt"
contig_out="info_contigs.txt"
matrix_out="abs_fragments_contacts_weighted.txt"

# Settings that take a value on the command line
enzyme=5000
quality_min=30
size=0
threads=1
start_stage=1
output_dir="."
tmp_dir=""
prefix=""
fig_path=""

# On/off switches (0 = off, 1 = on)
trigger_help=0
bedgraph=0 duplicate=0 minimap=0
iterative=0 filter_events=0
clean_up=1

# Optional flag strings, empty unless the matching option is given
circular=""
plot=""
#Argument parsing

# Print the command line help on stdout.
usage() {
  cat <<USAGE
Usage: ${0##*/} [options]

Inputs:
  -1, --forward FILE      first input: forward reads (stage 1), forward
                          alignments (stage 2) or pairs file (stage 3)
  -2, --reverse FILE      second input: reverse reads (stage 1) or reverse
                          alignments (stage 2)
  -f, --fasta FILE        reference genome

Options taking a value:
  -e, --enzyme VALUE      restriction enzyme, or a number (default: 5000)
  -o, --outdir DIR        output directory (default: .)
  -q, --quality_min INT   mapping quality threshold (default: 30)
  -s, --size INT          value given to "hicstuff digest --size" (default: 0)
  -t, --threads INT       number of threads (default: 1)
  -T, --tmpdir DIR        directory for temporary files (default: output dir)
  -P, --prefix STR        prefix for the log and output file names
  -S, --start-stage N     stage to start from: 1, 2 or 3 (default: 1)

Switches:
  -p, --plot              save plots in <outdir>/plots
  -m, --minimap2          align with minimap2 instead of bowtie2
  -b, --bedgraph          write the matrix as a 2D bedgraph
  -d, --duplicates        remove adapters and PCR duplicates before mapping
  -n, --no-cleanup        keep intermediate files
  -C, --circular          give --circular to "hicstuff digest"
  -i, --iterative         map reads with "hicstuff iteralign"
  -F, --filter            filter spurious 3C events (needs a named enzyme)
  -h, --help              show this message and exit
USAGE
}

# Parse the command line into the global option variables.
# Arguments: the script's command line ("$@").
# Words that are not recognised as options are appended to the global
# "arguments" array. Exits with status 1 when an option that needs a value
# is given without one, and with status 0 after printing the help text.
parse_args() {
  local key

  while [[ $# -gt 0 ]]; do

    key="$1"

    # Options that consume the next word must actually have one
    case $key in
    -1 | --forward | -2 | --reverse | -f | --fasta | -e | --enzyme | \
      -o | --outdir | -q | --quality_min | -s | --size | -t | --threads | \
      -T | --tmpdir | -P | --prefix | -S | --start-stage)
      if [[ $# -lt 2 ]]; then
        echo "Error! Option $key requires a value." >&2
        exit 1
      fi
      ;;
    esac

    case $key in
    -1 | --forward)
      input1="$2"
      shift 2
      ;;
    -2 | --reverse)
      input2="$2"
      shift 2
      ;;
    -f | --fasta)
      fasta="$2"
      shift 2
      ;;
    -e | --enzyme)
      enzyme="$2"
      shift 2
      ;;
    -o | --outdir)
      output_dir="$2"
      shift 2
      ;;
    -p | --plot)
      plot=" -p"
      shift
      ;;
    -q | --quality_min)
      quality_min="$2"
      shift 2
      ;;
    -s | --size)
      size="$2"
      shift 2
      ;;
    -t | --threads)
      threads="$2"
      shift 2
      ;;
    -T | --tmpdir)
      tmp_dir="$2"
      shift 2
      ;;
    -m | --minimap | --minimap2)
      minimap=1
      shift
      ;;
    -b | --bedgraph)
      bedgraph=1
      shift
      ;;
    -d | --duplicates)
      duplicate=1
      shift
      ;;
    -n | --no-cleanup)
      clean_up=0
      shift
      ;;
    -C | --circular)
      circular=" --circular"
      shift
      ;;
    -i | --iterative)
      iterative=1
      shift
      ;;
    -F | --filter)
      filter_events=1
      shift
      ;;
    -P | --prefix)
      prefix="$2"
      shift 2
      ;;
    -S | --start-stage)
      start_stage="$2"
      shift 2
      ;;
    -h | --help)
      trigger_help=1
      shift
      ;;
    *)
      arguments+=("$1")
      shift
      ;;
    esac
  done

  # The help flag used to be recorded but never acted upon
  if [ "${trigger_help:-0}" -eq 1 ]; then
    usage
    exit 0
  fi
}

parse_args "$@"

# From here on, stop at the first failing command or failing pipeline stage
set -eo pipefail
# The words left over by option parsing become the positional parameters
set -- "${arguments[@]}"
# Reset bash's elapsed-seconds counter; it is read back when the run finishes
SECONDS=0

# Check whether the version reported by a command is strictly higher than a
# given major.minor version.
# usage: version_check <command> <major version> <minor version>
# Runs "<command> --version" and reads the last whitespace-separated word of
# the first output line as "major.minor[...]".
# Prints 1 if that version is higher than <major>.<minor>, 0 otherwise.
# Also prints 0 when the command fails or reports nothing, so that callers
# running under "set -e" fall back gracefully instead of aborting.
version_check() {
  local version_text
  version_text=$("$1" --version 2>/dev/null) || version_text=""
  printf '%s\n' "$version_text" |
    awk -v major="$2" -v minor="$3" '
      NR == 1 {
        split($NF, vnum, ".")
        newer = (vnum[1] > major || (vnum[1] == major && vnum[2] > minor))
      }
      END { print (newer ? 1 : 0) }'
}

# Prepare a run: resolve directories, start the log file and check that the
# required software is available.
# Globals read:    output_dir, tmp_dir, plot, prefix, enzyme, input1, input2,
#                  fasta, minimap, start_stage, threads
# Globals written: tmp_dir, plot_dir, now, logfile, t
# Exits with status 1 when Biopython or a required tool is missing.
setup_yahcp00() {

  # If no temporary directory is specified, fall back to the output directory
  if [ -z "$tmp_dir" ]; then
    tmp_dir="$output_dir"
  fi

  # Initialize output folders
  mkdir -p "$output_dir"
  mkdir -p "$tmp_dir"

  # Define plot output directory
  if [ -n "$plot" ]; then
    plot_dir="$output_dir/plots"
    mkdir -p "$plot_dir"
  fi

  # Initialize logfile:
  now=$(date "+%Y%m%d%H%M%S")
  if [ -n "$prefix" ]; then
    logfile="${output_dir}/${prefix}_${now}.log"
  else
    logfile="${output_dir}/hicstuff_${now}.log"
  fi

  echo "LOGFILE: $logfile"
  cat <<LOGHEAD >"$logfile"
  hicstuff v$(hicstuff --version) log file
  date: $(date)
  enzyme: $enzyme
  input1: $input1
  input2: $input2
  ref: $fasta
  ---
LOGHEAD

  # Check everything's in place
  python3 -c "import Bio" >/dev/null 2>&1 || {
    echo "Error! Biopython is missing from your python libraries. Please install it (using either your package manager or pip)" |
      tee -a "$logfile"
    exit 1
  }

  # The aligner is only chosen later, by the alignment stage, so its name is
  # derived here from the minimap switch. It is only required when the
  # pipeline actually starts with the alignment stage.
  local required_tools=(samtools bedtools)
  if [ "${start_stage:-1}" -le 1 ]; then
    if [ "${minimap:-0}" -eq 1 ]; then
      required_tools=(minimap2 "${required_tools[@]}")
    else
      required_tools=(bowtie2 "${required_tools[@]}")
    fi
  fi

  local tool
  for tool in "${required_tools[@]}"; do
    command -v "$tool" >/dev/null 2>&1 || {
      echo "Error! $tool is needed and could not be found on your machine." |
        tee -a "$logfile"
      exit 1
    }
  done

  # Get number of threads to use per job when running two jobs at once
  t=$((threads / 2 < 1 ? 1 : threads / 2))
}

# Stage 1: align both read files and log mapping statistics.
# FQ + FA -> SAM
# Globals read:    minimap, fasta, duplicate, iterative, reads_for, reads_rev,
#                  t, tmp_dir, quality_min, logfile
# Globals written: aligner, index, miniflag, reads_for, reads_rev (when
#                  duplicates are removed), sam_for, sam_rev, n_pairs,
#                  n_reads, n_mapped, perc_mapped
# Exits with status 1 if pcr_duplicates is requested but missing, or if one
# of the two alignment jobs fails.
align01() {
  if [ "$minimap" -eq 0 ]; then
    aligner=bowtie2
    index=${fasta%.fa}
    if [ ! -f "${index}".1.bt2 ]; then
      echo "Building bowtie2 index $index of $fasta"
      #Build fasta index files if they don't exist
      bowtie2-build --quiet "$fasta" "$index"
    fi
  else
    aligner=minimap2
  fi

  # Remove adapters and PCR duplicates
  if [ "$duplicate" -eq 1 ]; then
    echo "Removing adapters and PCR duplicates..."

    command -v pcr_duplicates >/dev/null 2>&1 || {
      echo "Error! pcr_duplicates is needed and could not be found on your machine." |
        tee -a "$logfile"
      exit 1
    }

    pcr_duplicates "$reads_for" "$reads_rev" "$reads_for".trimmed "$reads_rev".trimmed
    reads_for=${reads_for}.trimmed
    reads_rev=${reads_rev}.trimmed
  fi

  # Pick the mapping command; map_cmd <reads> <output sam>
  if [ "$iterative" -eq 1 ]; then
    # Conditionally append minimap option flag to command
    miniflag=""
    if [ "$minimap" -eq 1 ]; then
      miniflag="-m"
    fi
    # $miniflag is left unquoted on purpose: it must vanish when empty
    map_cmd() { hicstuff iteralign -f "$fasta" -t "$t" -o "$2" -T "$tmp_dir" $miniflag "$1"; }
  else
    if [ "$minimap" -eq 0 ]; then
      map_cmd() { bowtie2 --very-sensitive-local -p "$t" -x "$index" -U "$1" >"$2"; }
    else
      map_cmd() { minimap2 -2 -t "$t" -ax sr "$fasta" "$1" >"$2"; }
    fi
  fi

  echo "Performing alignment and generating bed files..."
  sam_for="$tmp_dir/for.sam"
  sam_rev="$tmp_dir/rev.sam"
  map_cmd "$reads_for" "$sam_for" &
  local pid_for=$!
  map_cmd "$reads_rev" "$sam_rev" &
  local pid_rev=$!

  # A bare "wait" always succeeds; wait on each job to see its exit status
  local map_status=0
  wait "$pid_for" || map_status=$?
  wait "$pid_rev" || map_status=$?
  if [ "$map_status" -ne 0 ]; then
    echo "Error! Read alignment with $aligner failed." |
      tee -a "$logfile"
    exit 1
  fi

  # Add mapping statistics to log file
  # A fastq record spans 4 lines; the file may be gzipped or plain
  n_pairs=$( (zcat "$reads_for" 2>/dev/null || cat "$reads_for") |
    wc -l |
    awk '{print $1/4}')
  echo "$n_pairs read pairs in fastq files." |
    tee -a "$logfile"
  n_reads=$((n_pairs * 2))
  # Count alignments with flag 0 or 16 and a mapping quality above the threshold
  n_mapped=$(
    awk -v qmin="$quality_min" '($2 == 0 || $2 == 16) &&
                               $5 > qmin {print $0}' "$sam_for" "$sam_rev" |
      wc -l |
      tail -n 1 |
      awk '{print $1}'
  )
  # python3 is the interpreter verified during setup; it also guarantees
  # true division. Avoid dividing by zero on empty input.
  if [ "$n_reads" -gt 0 ]; then
    perc_mapped=$(python3 -c "print(round(100*$n_mapped/$n_reads, 2))")
  else
    perc_mapped=0
  fi
  echo "${perc_mapped}% single end reads uniquely mapped with MQ > $quality_min (${n_mapped}/${n_reads})" |
    tee -a "$logfile"

}

# Stage 2: attribute each read end to a fragment and build the pairs file.
# BAM/SAM + FRAGS -> 2DBED
# Globals read:    enzyme, filter_events, circular, plot, plot_dir, prefix,
#                  output_dir, tmp_dir, size, fasta, sam_for, sam_rev, t,
#                  threads, quality_min, now, clean_up, logfile
# Globals written: sort_par, pairs
# Exits with status 1 if event filtering is requested with a numeric enzyme
# value, or if sorting the alignments fails.
attrib_filter02() {
  # A purely numeric enzyme value is not a restriction enzyme, so 3C events
  # cannot be filtered. The pattern must stay unquoted: a quoted right-hand
  # side of =~ is matched as a literal string and would never match.
  if [[ "$enzyme" =~ ^[0-9]+$ ]] && [ "$filter_events" -eq 1 ]; then
    echo "A restriction enzyme must be specified to filter 3C events." |
      tee -a "$logfile"
    exit 1
  fi
  # Write fragments_list.txt info_contigs.txt (optionally save plot)
  # $circular and $plot are unquoted on purpose: they vanish when empty
  echo "Writing fragment information..."
  hicstuff digest $circular $plot ${plot:+-f "$plot_dir"} \
    --enzyme "$enzyme" \
    --outdir "$output_dir" \
    --size "$size" "$fasta" 2>&1 | tee -a "$logfile"

  # Sort alignments by read names and make sure no duplicate alignments are added
  # (-F 2539 drops secondary/supplementary records and any paired-end flag)
  samtools sort -@ "$t" -n "$sam_for" |
    samtools view -@ "$t" -F 2539 >"$tmp_dir/for.sorted.sam" &&
    mv "$tmp_dir/for.sorted.sam" "$sam_for" &
  local sort_for_pid=$!
  samtools sort -@ "$t" -n "$sam_rev" |
    samtools view -@ "$t" -F 2539 >"$tmp_dir/rev.sorted.sam" &&
    mv "$tmp_dir/rev.sorted.sam" "$sam_rev" &
  local sort_rev_pid=$!

  # A bare "wait" always succeeds; wait on each job to see its exit status
  local sort_status=0
  wait "$sort_for_pid" || sort_status=$?
  wait "$sort_rev_pid" || sort_status=$?
  if [ "$sort_status" -ne 0 ]; then
    echo "Error! Sorting alignments with samtools failed." |
      tee -a "$logfile"
    exit 1
  fi

  # Convert to BED and paste together pairs where both reads map with a good quality
  # Filtered reads are then split again into end1 and end2
  # Note: BAM positions are 1-based, BED are 0-based
  # NOTE(review): pasting line by line assumes both files list the same reads
  # in the same order after name sorting - confirm.
  local sam_to_bed='{
      start = $4 - 1
      end = start + length($10)
      strand = ($2 == 0) ? "+" : "-"
      print $3, start, end, $1, $5, strand
    }'
  paste <(awk -v OFS="\t" "$sam_to_bed" "$sam_for") \
    <(awk -v OFS="\t" "$sam_to_bed" "$sam_rev") |
    awk -v file1="$tmp_dir/unsorted_contacts_for.bed" \
      -v file2="$tmp_dir/unsorted_contacts_rev.bed" \
      -v qual="$quality_min" -v OFS='\t' \
      '{if($5 > qual && $11 > qual) {print $1,$2,$3,$4,$6 > file1; print $7,$8,$9,$10,$12 > file2}}'

  # Check if version of UNIX sort supports parallelization and add flag if it does
  sort_par=""
  local par
  par=$(version_check sort 8 20)
  if [ "$par" -eq 1 ]; then
    sort_par="--parallel=$threads"
  fi

  # Sort by genomic position for bedtools intersect
  # ${sort_par} is unquoted on purpose: it must vanish when empty
  sort -S 2G ${sort_par} -T "$tmp_dir" -k1,1 -k2,2n -k6,6n \
    "$tmp_dir"/unsorted_contacts_for.bed \
    >"$tmp_dir"/total_contacts_for.bed

  sort -S 2G ${sort_par} -T "$tmp_dir" -k1,1 -k2,2n -k6,6n \
    "$tmp_dir"/unsorted_contacts_rev.bed \
    >"$tmp_dir"/total_contacts_rev.bed
  # Make a bed out of fragments_list.txt with 0-based fragment index for GRAAL
  awk 'NR>1 { print $2"\t"$3"\t"$4"\t"(NR-2) }' \
    "$output_dir"/fragments_list.txt |
    sort -k 1,1 -k2,2n \
      >"$tmp_dir"/fragments_list.bed

  pairs="$tmp_dir/contact_intersect_sorted.bed"
  echo "Intersecting bed files..."
  # Function to get longest overlaps between each read and fragment
  # usage: attrib_frags <reads bed> <fragments bed> <label>
  # 1. split bed into 1 chunk/thread
  # 2. intersect chunks with restriction fragments in parallel
  # 3. only keep longest overlap for each read
  # 4. merge back overlaps from all chunks and sort by read ID
  # Prints, for each read, the fragment's chromosome, start, end and index
  # followed by the read's strand.
  attrib_frags() {
    local split_ver split_method tot_lines lines_per_chunk file

    # Check if version of GNU split supports --number. Otherwise
    # Splits by lines (will not be as fast)
    split_ver=$(version_check split 8 7)
    if [ "$split_ver" -eq 1 ]; then
      split_method="--number=l/$threads"
    else
      tot_lines=$(wc -l <"$1")
      # Round up and never go below 1: split rejects --lines=0
      lines_per_chunk=$(((tot_lines + threads - 1) / threads))
      if [ "$lines_per_chunk" -lt 1 ]; then
        lines_per_chunk=1
      fi
      split_method="--lines=$lines_per_chunk"
    fi

    local -i i=1
    split "$split_method" -a 4 --additional-suffix="$3_${now}.bedsplit" "$1" "$tmp_dir/x"
    for file in "$tmp_dir"/*"$3_${now}".bedsplit; do
      (
        bedtools intersect -sorted -a "$file" -b "$2" -wo |
          sort -k4,4d -T "$tmp_dir" |
          bedtools groupby -g 4 -c 10 -o max -full >"$tmp_dir/${now}_${i}_$3_grouped.bed"
      ) &
      i=$((i + 1))
    done
    # Chunk jobs are deliberately not checked one by one: their status is
    # ignored here, as before.
    wait
    rm "$tmp_dir"/*"$3_${now}".bedsplit
    sort -k4,4d -S 2G ${sort_par} -T "$tmp_dir" "$tmp_dir/${now}"_*_"$3"_grouped.bed |
      awk '{print $6,$7,$8,$9,$5}'
  }

  # Intersect fragment list with mapping data (end1 and end2 separately)
  attrib_frags "$tmp_dir"/total_contacts_for.bed \
    "$tmp_dir"/fragments_list.bed "for" \
    >"$tmp_dir"/attrib_for.bed
  attrib_frags "$tmp_dir"/total_contacts_rev.bed \
    "$tmp_dir"/fragments_list.bed "rev" \
    >"$tmp_dir"/attrib_rev.bed
  rm "$tmp_dir/${now}"_*_for_grouped.bed "$tmp_dir/${now}"_*_rev_grouped.bed

  # And paste ends together
  paste "$tmp_dir"/attrib_for.bed \
    "$tmp_dir"/attrib_rev.bed |
    tr ' ' '\t' \
      >"$pairs"
  rm "$tmp_dir"/attrib_*.bed
  # Filter spurious (loops and uncuts) 3C events
  if [ "$filter_events" -eq 1 ]; then
    if ! [ "$clean_up" -eq 1 ]; then
      # Keep unfiltered in case user wants it
      cp "$pairs" "$tmp_dir/contact_intersect_sorted.unfiltered.bed"
    fi
    hicstuff filter $plot ${plot:+-f "$plot_dir"} ${prefix:+-P "$prefix"} \
      "$pairs" "$tmp_dir/contact.filtered.bed" 2>&1 |
      tee -a "$logfile"
    mv "$tmp_dir/contact.filtered.bed" "$pairs"
  fi

}

# Stage 3: build the contact map from the pairs file.
# 2DBED -> SPARSEMAT
# Globals read:    pairs, bedgraph, output_dir, matrix_out, tmp_dir,
#                  sort_par (may be empty or unset), logfile
# Globals written: matext ("tsv" or "2bg")
# Writes "$output_dir/$matrix_out" and logs the number of pairs used.
gen_matrix03() {
  local n_used
  n_used=$(wc -l <"$pairs" | awk '{print $1}')
  echo "$n_used pairs used to build the contact map" |
    tee -a "$logfile"
  echo "Generating contact map..."
  # "uniq -c" puts the count first; the awk step moves it to the last column.
  # Plain awk is used so that no gawk installation is required.
  # ${sort_par} is unquoted on purpose: it must vanish when empty
  if [ "$bedgraph" -eq 0 ]; then
    # Write GRAAL matrix out of intersecting bed file:
    # tab-separated fragment id pairs (columns 4 and 9) and their count
    printf 'id_fragment_a\tid_fragment_b\tn_contact\n' >"$output_dir/$matrix_out"
    cut -f4,9 "$pairs" |
      sort -S 2G ${sort_par} -T "$tmp_dir" -V |
      uniq -c |
      awk '{count = $1; sub(/^ *[0-9]+ /, ""); print $0 "\t" count}' \
        >>"$output_dir/$matrix_out"
    matext="tsv"
  else
    # Write 2D bedgraph matrix:
    # space-separated coordinates of both ends and their count
    cut -f4,5,9,10 --complement "$pairs" |
      sort -S 2G ${sort_par} -T "$tmp_dir" -V |
      uniq -c |
      awk '{count = $1; sub(/^ *[0-9]+ /, ""); gsub(/\t/, " "); print $0 " " count}' \
        >"${output_dir}/${matrix_out}"
    matext="2bg"
  fi
}

# Final stage: rename outputs when a prefix is set and delete temporary files.
# Globals read: prefix, output_dir, contig_out, frag_out, matrix_out, matext,
#               clean_up, sam_for, sam_rev, tmp_dir
# With a prefix, the matrix becomes <prefix>.mat.<matext>; the contig and
# fragment files become <prefix>.chr.tsv and <prefix>.frag.tsv when both exist.
cleanup_files04() {
  if [ -n "$prefix" ]; then
    if [[ -f "${output_dir}/${contig_out}" && -f "${output_dir}/${frag_out}" ]]; then
      mv "${output_dir}/${contig_out}" "${output_dir}/${prefix}.chr.tsv"
      mv "${output_dir}/${frag_out}" "${output_dir}/${prefix}.frag.tsv"
    fi
    mv "${output_dir}/${matrix_out}" "${output_dir}/${prefix}.mat.${matext}"
  fi

  if [ "$clean_up" -eq 1 ]; then
    # Which temporary files exist depends on the stage the run started from:
    # the SAM paths can be unset and some files may never have been created.
    # "rm -f" keeps a missing file from aborting the run under "set -e".
    local leftovers=()
    if [ -n "${sam_for:-}" ]; then
      leftovers+=("$sam_for")
    fi
    if [ -n "${sam_rev:-}" ]; then
      leftovers+=("$sam_rev")
    fi
    rm -f -- "${leftovers[@]}" \
      "$tmp_dir"/fragments_list.bed \
      "$tmp_dir"/unsorted_contacts_*.bed \
      "$tmp_dir"/contact_intersect_sorted.bed \
      "$tmp_dir"/total_contacts_*.bed
  fi
}

# The meaning of the input files depends on the stage the run starts from
if [[ "$start_stage" == "1" ]]; then
  # Stage 1 starts from the two read files
  reads_for=$input1
  reads_rev=$input2
elif [[ "$start_stage" == "2" ]]; then
  # Stage 2 starts from the two alignment files
  sam_for=$input1
  sam_rev=$input2
elif [[ "$start_stage" == "3" ]]; then
  # Stage 3 starts from a single pairs file
  pairs=$input1
else
  echo "Wrong pipeline start stage."
  exit 1
fi

# Run the set-up, then every stage from the requested one up to the last
setup_yahcp00
pipeline_stages=(align01 attrib_filter02 gen_matrix03)
for ((stage = start_stage; stage <= 3; stage++)); do
  "${pipeline_stages[stage - 1]}"
done
cleanup_files04

# Report the time elapsed since the timer was reset
elapsed=$SECONDS
echo "Contact map generated after $((elapsed / 3600)):$(((elapsed / 60) % 60)):$((elapsed % 60)) (H:M:S)." |
  tee -a "$logfile"

echo "Finito"
