#!/bin/bash
#
# This shell script prepares the data, s.t. Proper can handle it better
#
# use -h to see options
#
# FracPete

# Prints the command-line help to stdout.
# Reads the globals SRC and DEST to show the directories that are currently
# in effect as the "default" values.
function usage()
{
   # one argument per output line; empty arguments produce the blank lines
   printf '%s\n' \
      "" \
      "usage: ${0##*/} -i <input-dir> -o <output-dir> [-h]" \
      "" \
      "changes the class labels from \"1.\" and \"0.\" to \"'1'\" and \"'0'\"" \
      "this is done, since it is a binary class and not a numerical" \
      "it also creates 'relational' datasets from it, i.e. one files contains" \
      "the bag and the class, while the other contains the rest of the cols" \
      "(the reference is via the bag-id)." \
      "" \
      " -h   this help" \
      " -i   <input-dir>" \
      "      the directory where the two musk-files (*.data) are located" \
      "      default: $SRC" \
      " -o   <output-dir>" \
      "      where to put the processed files" \
      "      default: $DEST" \
      ""
}

# Cleans up the temporary files of a run.
# Globals read: HEADERS (the generated header file), DEST (output directory).
# Temporary files are the ones whose name ends in "_". They are removed from
# the output directory explicitly instead of from whatever the current
# working directory happens to be, so unrelated "*_" files elsewhere are
# never touched.
function clean_up
{
   # the header file may live outside of DEST, remove it by name
   if [ -n "$HEADERS" ]
   then
      rm -f -- "$HEADERS"
   fi

   # falls back to the current directory if DEST is not set;
   # "--" protects against file names that start with a dash
   rm -f -- "${DEST:-.}"/*_
}

# Extracts the column headers from the description file.
# Globals read: DEST, DESCRIPTION (input file), HEADERS (output file).
# Every line of DESCRIPTION that contains a ":" contributes the text in front
# of the first ":" as a column name (all blanks removed). The names are
# written to HEADERS as one comma-separated line, followed by a final
# "class" column.
# Returns non-zero if DEST cannot be entered.
function create_headers()
{
   local names

   # NOTE: this changes the working directory for the rest of the script, so
   # relative paths in DESCRIPTION/HEADERS are resolved against DEST
   cd "$DEST" || return 1

   # "name: description" -> "name,"; then glue all lines together and drop
   # every blank, which yields "name1,name2,...,"
   names=$(grep ":" "$DESCRIPTION" | sed -e 's/:.*//' -e 's/$/,/' | tr -d ' \t\n')

   # the trailing comma becomes the separator in front of the class column
   printf '%s\n' "$names" | sed 's/,$/,class/' > "$HEADERS"
}

# Prepends the header line to the data file $DEST/$FILE.
# Globals read: HEADERS (file with the header line), DEST, FILE.
# The result is assembled in the temporary file "$DEST/$FILE"_ and then moved
# over the original, so the data file is replaced in one step.
function add_header()
{
   local target="$DEST/$FILE"
   local tmp="${target}_"

   {
      cat "$HEADERS"
      # removes every literal "{r}" from the data
      # NOTE(review): possibly meant to strip carriage returns - confirm
      sed 's/{r}//g' "$target"
   } > "$tmp"

   mv -f "$tmp" "$target"
}

# Changes the class labels of $SRC/$FILE and stores the result in $DEST/$FILE.
# Globals read: SRC, DEST, FILE.
# A line ending in "1." gets the label '1', a line ending in "0." gets the
# label '0'. The dot is matched literally, so lines that merely end in a "1"
# or "0" followed by some other character are left alone.
# Prints the name of the processed file as progress information.
function change_class()
{
   printf '%s...\n' "$FILE"
   sed -e "s/1\.\$/'1'/" -e "s/0\.\$/'0'/" "$SRC/$FILE" > "$DEST/$FILE"
}

# Determines the number of columns of $DEST/$FILE.
# Globals read: DEST, FILE. Global written: TMP (the column count).
# The count is the number of comma-separated fields in the first line, so
# empty fields and fields containing blanks are counted correctly. An empty
# or missing file yields 0.
function count_cols()
{
   TMP=$(head -n1 "$DEST/$FILE" | awk -F',' '{ n = NF } END { print n + 0 }')
}

# Creates relational data from $DEST/$FILE, i.e. it splits the file into two:
#   <name>-bag.rel   first column (bag-id) and last column (class), every
#                    distinct row once, sorted case-insensitively
#   <name>-data.rel  all columns except the last one (class)
# The bag-id is the reference between the two files. <name> is FILE without
# its ".data" suffix; only the file name is shortened, never the directory.
# The first line of the input is the header row and stays the first line of
# both output files.
# Globals read: DEST, FILE; uses count_cols, which reports through TMP.
# Returns non-zero if the file has fewer than two columns.
function create_relational()
{
   local source="$DEST/$FILE"
   local base="$DEST/${FILE%.data}"
   local count

   count_cols
   count=$TMP

   # with fewer than two columns there is nothing to split
   # (a non-numeric count makes the test fail as well)
   if ! [ "$count" -ge 2 ] 2>/dev/null
   then
      printf '%s\n' "create_relational: cannot split $source (columns: '$count')" >&2
      return 1
   fi

   # bag file: the header is written first, only the data rows are sorted
   head -n1 "$source" | cut -d"," -f"1,$count" > "$base-bag.rel"
   tail -n +2 "$source" | cut -d"," -f"1,$count" | sort -fu >> "$base-bag.rel"

   # data file: everything but the class column, in the original order
   cut -d"," -f"1-$((count - 1))" "$source" > "$base-data.rel"
}

# variables
# ROOT is the absolute directory that contains this script. Resolving it to
# an absolute path keeps the defaults valid when the script is started
# without a directory part (e.g. "bash script.sh") and after the working
# directory has been changed.
ROOT=$(CDPATH= cd -- "$(dirname -- "$0")" 2>/dev/null && pwd)
if [ -z "$ROOT" ]
then
   # the script directory could not be resolved, use the current directory
   ROOT="$PWD"
fi
SRC="$ROOT/original"      # default input directory
DEST="$ROOT"              # default output directory
HEADERS="$DEST/headers_"  # temporary file holding the header line

# interpret parameters

# Prints the given path as an absolute path: a relative path is taken
# relative to the current working directory, an absolute one is unchanged.
function absolute_path()
{
   case "$1" in
      /*) printf '%s\n' "$1"
          ;;
      *)  printf '%s\n' "$PWD/$1"
          ;;
   esac
}

# Parses the command-line options given as arguments.
#   -i <dir>  sets SRC
#   -o <dir>  sets DEST
#   -h        prints the help and exits with 0
# Any other option, or a missing option argument, prints the help and exits
# with 1. Directories are stored as absolute paths, so they stay valid after
# the working directory has been changed. HEADERS is derived from DEST after
# parsing, so the header file follows the output directory chosen with -o.
function parse_options()
{
   local flag

   # start with the first argument, also on repeated calls
   OPTIND=1
   while getopts ":hi:o:" flag
   do
      case "$flag" in
         i) SRC=$(absolute_path "$OPTARG")
            ;;
         o) DEST=$(absolute_path "$OPTARG")
            ;;
         h) usage
            exit 0
            ;;
         *) usage
            exit 1
            ;;
      esac
   done

   HEADERS="$DEST/headers_"
}

parse_options "$@"

# perform changing of class labels
# The list of input files is collected up front, before any of the steps
# below changes the working directory. If the pattern matches nothing it
# stays a literal "*.data" that does not exist; in that case nothing is
# processed instead of working on a file named "*.data".
DATAFILES=("$SRC"/*.data)
if [ -e "${DATAFILES[0]}" ]
then
   for i in "${DATAFILES[@]}"
   do
      # file name without directory, e.g. "x.data"
      FILE=${i##*/}
      # the description file has the same base name with the suffix ".names"
      DESCRIPTION="$SRC/${FILE%.*}.names"
      change_class
      create_headers
      add_header
      create_relational
   done
else
   echo "no *.data files found in $SRC" >&2
fi

# clean up
clean_up

