#!/bin/bash
#
# This shell script prepares the data so that Proper can handle it better
#
# use -h to see options
#
# FracPete

# Prints the command-line help text to stdout.
#
# Globals read:
#   SRC   - shown as the current default of -i
#   DEST  - shown as the current default of -o
# The script name shown is the basename of $0.
usage() {
   local script=${0##*/}

   # one argument per output line; the empty strings are the blank lines
   printf '%s\n' \
      "" \
      "usage: $script -i <input-dir> -o <output-dir> [-h]" \
      "" \
      "combines the names file with the data file, it also performs a " \
      "binarization for 'CELL GROWTH, CELL DIVISION AND DNA SYNTHESIS' (_growth) " \
      "and for 'nucleus' (_nucleus)" \
      "" \
      " -h   this help" \
      " -i   <input-dir>" \
      "      the directory where the data is located" \
      "      default: $SRC" \
      " -o   <output-dir>" \
      "      where to put the processed files" \
      "      default: $DEST" \
      ""
}

# Cleans up temporary files: force-removes every entry in the current
# working directory whose name starts with an underscore.
clean_up() {
   # the glob is deliberately unquoted so that the shell expands it;
   # -f keeps rm quiet when nothing matches
   rm -f -- _*
}

# Builds a comma-separated header from the names file TMPFILE and stores
# it in TMP.
#
# Globals read:    TMPFILE - path of the names file
# Globals written: TMP     - the attribute names joined by ","
#
# For every non-empty line the text before the first ":" is taken as the
# attribute name; spaces and tabs are removed from the names.
get_header() {
   # tr strips carriage returns first, so CRLF files yield neither "\r"
   # inside names nor pseudo-fields for blank lines; paste joins the
   # remaining names with commas
   TMP=$(tr -d '\r' < "$TMPFILE" \
      | grep -v '^$' \
      | cut -d ':' -f 1 \
      | tr -d ' \t' \
      | paste -sd ',' -)
}


# variables
# ROOT is the directory part of the path this script was invoked with.
# dirname yields "." when $0 contains no slash (e.g. "bash script.sh"),
# so the defaults below never collapse to "/original" and the filesystem
# root.
ROOT=$(dirname -- "$0")
# default input directory (overridden by -i)
SRC="$ROOT/original"
# default output directory (overridden by -o)
DEST="$ROOT"

# interpret parameters
# The leading ":" in the option string switches getopts to silent error
# reporting; unknown options and missing arguments then end up in the
# catch-all arm, which prints the usage and exits with status 1.
while getopts ":hi:o:" flag; do
   case "$flag" in
      i) SRC=$OPTARG ;;       # input directory
      o) DEST=$OPTARG ;;      # output directory
      h) usage; exit 0 ;;     # help was requested explicitly
      *) usage; exit 1 ;;     # anything else is an error
   esac
done

# ---------------------------------------------------------------------
# helpers - all filters read stdin and write stdout
# ---------------------------------------------------------------------

# Removes the "." that terminates each record. Only a literal period is
# dropped (the dot is escaped), so lines without a terminator keep their
# last character. Carriage returns are stripped first so that CRLF files
# are handled as well.
strip_trailing_dot() {
   tr -d '\r' | sed 's/\.$//'
}

# Binarizes the function attribute: 'CELL GROWTH, CELL DIVISION AND DNA
# SYNTHESIS' becomes pos, every other listed function class becomes neg;
# the double quotes around the resulting pos/neg are removed.
binarize_growth() {
   # alternation of all non-growth classes (sed basic regex, "\|")
   local others
   others='CELL RESCUE, DEFENSE, CELL DEATH AND AGEING'
   others+='\|CELLULAR BIOGENESIS (proteins are not localized to the corresponding organelle)'
   others+='\|CELLULAR COMMUNICATION\/SIGNAL TRANSDUCTION'
   others+='\|CELLULAR ORGANIZATION (proteins are localized to the corresponding organelle)'
   others+='\|CELLULAR TRANSPORT AND TRANSPORTMECHANISMS'
   others+='\|ENERGY'
   others+='\|IONIC HOMEOSTASIS'
   others+='\|METABOLISM'
   others+='\|PROTEIN DESTINATION'
   others+='\|PROTEIN SYNTHESIS'
   others+='\|TRANSCRIPTION'
   others+='\|TRANSPORT FACILITATION'
   others+='\|TRANSPOSABLE ELEMENTS VIRAL AND PLASMID PROTEINS'

   sed -e 's/CELL GROWTH, CELL DIVISION AND DNA SYNTHESIS/pos/g' \
       -e "s/${others}/neg/g" \
       -e 's/"pos"/pos/g' \
       -e 's/"neg"/neg/g'
}

# Binarizes the last attribute: "nucleus" becomes pos, any other value
# becomes neg. No temporary marker value is used, so values containing
# the character "1" are classified as neg like every other non-nucleus
# value.
binarize_nucleus() {
   sed -e '/,nucleus$/!s/,[^,]*$/,neg/' \
       -e 's/,nucleus$/,pos/'
}

# Converts the relation $1: <SRC>/$1.data -> <DEST>/$1 and
# <SRC>/$1.test -> <DEST>/$1.test, stripping the record terminator.
# Sets the globals NAME and TMPFILE (and TMP via get_header).
convert_relation() {
   NAME=$1
   # the header is still computed into TMP, but writing it to the output
   # files is disabled
   TMPFILE=$SRC/$NAME.names
   get_header
   strip_trailing_dot < "$SRC/$NAME.data" > "$DEST/$NAME"
   strip_trailing_dot < "$SRC/$NAME.test" > "$DEST/$NAME.test"
}

# the genes
echo "genes relation..."
convert_relation Genes_relation

# the relations
echo "interactions relation..."
convert_relation Interactions_relation

# binarize them
echo "binarizing..."
NAME=Genes_relation
binarize_growth  < "$DEST/$NAME" > "$DEST/${NAME}_growth"
binarize_nucleus < "$DEST/$NAME" > "$DEST/${NAME}_nucleus"

# clean up
clean_up

