#! /usr/bin/bash
####################
# count
#     Description:
#         Count sgRNA guides in paired-end FASTQ files
####################
# Write the one-line usage synopsis (prefixed with the script name, $0)
# to stderr so it never mixes with the data written to stdout.
printusage() {
    printf '%s\n' "Usage: $0 [-h] -p pattern -f input1.fq -r input2.fq -l library.txt" >&2
}

# Write the full help text to stderr: the usage synopsis (via printusage)
# followed by the parameter, input and output descriptions.
printhelp() {
    printusage
    # One argument per output line; the empty strings produce blank lines.
    printf '%s\n' \
        "" \
        "Parameters:" \
        "    h: print this help" \
        "    p: the sgRNA pattern to be used, for example: ACCG([ATGC]{20})GTTT" \
        "    f: the forward fastq file to be count reads." \
        "    r: the reverse fastq file to be count reads." \
        "    l: the library file: gene <tab> guide." \
        "" \
        "Input:" \
        "    the input files should be fastq files." \
        "" \
        "Output:" \
        "    the reads count will be output to /dev/stdout (1>)" >&2
}

# NOTE(review): ov is not referenced anywhere in the visible script; kept in
# case something outside this view relies on it -- confirm before removing.
ov=1

####################
# parse_args
#     Description:
#         Parse the command-line options and store them in globals.
#     Globals written:
#         GPATTERN (-p), fqf (-f), fqr (-r), lib (-l), OPTIND
#     Arguments:
#         the script's command-line arguments ("$@")
#     Exit:
#         0 after printing the help for -h;
#         1 after an unknown option or an option missing its argument.
####################
function parse_args {
  local opt
  OPTIND=1
  # The leading ':' selects getopts' silent mode. Only in that mode does
  # OPTARG carry the offending option letter, and only then is a missing
  # argument reported separately (as ':') from an unknown option ('?').
  while getopts ":hp:f:r:l:" opt; do
    case "${opt}" in
      h)
        printhelp; exit 0
        ;;
      p)
        GPATTERN=${OPTARG}
        ;;
      f)
        fqf=${OPTARG}
        ;;
      r)
        fqr=${OPTARG}
        ;;
      l)
        lib=${OPTARG}
        ;;
      :)
        # Diagnostics go to stderr; stdout is reserved for the count table.
        echo "Option -${OPTARG} requires an argument" 1>&2
        printusage; exit 1
        ;;
      \?)
        echo "Invalid option: -${OPTARG}" 1>&2
        printusage; exit 1
        ;;
    esac
  done
}

parse_args "$@"


# check the required parameters
# -f, -r and -l are all mandatory. Without -l the final pipeline stage would
# receive only stdin and misread the guide counts as the library, so it is
# rejected here together with the two FASTQ inputs.
if [ -z "${fqf}" ] || [ -z "${fqr}" ] || [ -z "${lib}" ]; then
    echo "Error: options -f, -r and -l are all required" 1>&2
    printusage
    exit 1
fi

# -p is optional: fall back to the default pattern, whose single capture
# group is the 20-nt guide between the ACCG and GTTT flanks.
if [ -z "${GPATTERN}" ]; then
    GPATTERN="ACCG([ATGC]{20})GTTT"
fi

####################

# Reverse-complement every line read from stdin and write it to stdout.
# Only uppercase A/T/G/C are complemented; any other character is left
# as it is (but still takes part in the reversal).
reverse_complement() {
    # Complementing is per-character, so it commutes with reversing the line.
    tr 'ACGT' 'TGCA' | rev
}

# Pair up the sequence lines of two FASTQ files.
#     $1: first FASTQ file  (emitted as is)
#     $2: second FASTQ file (every line is passed through reverse_complement)
# Output (stdout): one line per record, "<seq of $1><tab><revcomp seq of $2>".
# Only line 2 of every 4 pasted lines is kept, i.e. this assumes plain
# 4-line FASTQ records with both files in the same read order.
function fq2_to_seqpair {
    # Paths are quoted so names containing spaces or glob characters work.
    reverse_complement < "$2" | paste -- "$1" - | awk 'FNR % 4 == 2 {print $0}'
}

# Pair up the sequence lines of two FASTQ files in both orientations.
#     $1: forward FASTQ file
#     $2: reverse FASTQ file
# Output (stdout): first every "<seq of $1><tab><revcomp seq of $2>" line,
# then every "<seq of $2><tab><revcomp seq of $1>" line.
# Only line 2 of every 4 pasted lines is kept, i.e. this assumes plain
# 4-line FASTQ records with both files in the same read order. Each input
# file is read twice, so both must be regular (re-readable) files.
function fq2_to_seqpair2 {
    # Paths are quoted so names containing spaces or glob characters work.
    reverse_complement < "$2" | paste -- "$1" - | awk 'FNR % 4 == 2 {print $0}'
    reverse_complement < "$1" | paste -- "$2" - | awk 'FNR % 4 == 2 {print $0}'
}


function seqpair_guide {
    awk -v GPAT=$GPATTERN -F '\t' 'BEGIN {
            GPATTERN=GPAT;
        }
        $0 ~ HPATTERN {
            match($1, GPATTERN, a);
            match($2, GPATTERN, b);
            print substr($1, a[1, "start"], a[1, "length"]) FS substr($2, b[1, "start"], b[1, "length"]);
        }' $@
}

# Count how often each guide occurs in tab-separated guide pairs.
#     Arguments: input files for awk (stdin when none are given)
# For every line the guide is column 1 when that is non-empty, otherwise
# column 2; lines with both columns empty are ignored.
# Output (stdout): "<guide><tab><count>", one line per distinct guide, in
# awk's (unspecified) array traversal order.
function count_guide {
    awk -F '\t' '{
        guide = ($1 != "") ? $1 : $2;
        if (guide != "") {
            lcount[guide] += 1;
        }
    }
    END {
        for (x in lcount) {
            print x FS lcount[x];
        }
    }' "$@"
}

# Join guide counts onto the guide library.
#     Arguments: $1 - library file, "gene<tab>guide", first line is a header
#                $2 - guide counts, "guide<tab>count" ("-" for stdin)
# Output (stdout): "<gene><tab><guide><tab><guide><tab><count>" for every
# library entry (the guide is printed twice), with a count of 0 for guides
# that were never seen; counts for guides not in the library are dropped.
# Lines come out in awk's (unspecified) array traversal order.
function map_to_lib {
    awk -F '\t' 'FNR == NR && FNR !=1 {
        # first file (library), header line skipped
        lib[($1 FS $2 FS $2)] = 0;
        guide_gene[$2] = $1;
    }
    FNR != NR {
        # Test membership with "in" first: a bare guide_gene[$1] lookup would
        # insert an empty entry for every guide that is not in the library.
        if (($1 in guide_gene) && guide_gene[$1] != "") {
            lib[(guide_gene[$1] FS $1 FS $1)] += $2;
        }
    }
    END {
        for (i in lib) {
           print i FS lib[i];
        }
    }' "$@"
}
####################

# Header row of the result table; the data rows follow from the pipeline.
printf 'gene\tguide\tbarcode\tcount\n'

# Sequence pairs (both orientations) -> guides -> per-guide counts ->
# joined onto the library, then version-sorted. The file paths are quoted
# so that names containing spaces or glob characters are passed on intact.
fq2_to_seqpair2 "${fqf}" "${fqr}" | \
    seqpair_guide - | \
    count_guide | \
    map_to_lib "${lib}" - | sort -V

####################
