#!/bin/bash
#
# This shell script prepares the data, s.t. Proper can handle it better
#
# use -h to see options
#
# FracPete

# the usage of this script
# writes the synopsis, the list of generated files and the option summary to
# stdout; the defaults shown are the current values of the globals SRC and DEST
usage()
{
   # a single printf emits the whole text, one argument per output line
   printf '%s\n' \
      "" \
      "usage: ${0##*/} -i <input-dir> -o <output-dir> [-h]" \
      "" \
      "adds field descriptions to the CSV-files, it generates these files:" \
      " tic-data.txt           the training data" \
      " tic-class.txt          classified examples (= unclass. with label)" \
      " tic-unclass.txt        unclassified exmaples" \
      " tic-combined.txt       training data + unclass. examples" \
      " tic-combined-bag.txt   training data + unclass. examples (bag+class)" \
      " tic-combined-data.txt  training data + unclass. examples (w/o class)" \
      "" \
      " -h   this help" \
      " -i   <input-dir>" \
      "      the directory with the unprocessed files" \
      "      default: $SRC" \
      " -o   <output-dir>" \
      "      where to put the processed files" \
      "      default: $DEST" \
      ""
}

# cleans up temporary files
# works on the current working directory: removes every file whose name ends
# in "_" and every file whose name starts with "xx"
function clean_up
{
   # the "./" prefix and "--" keep a file name that starts with a dash from
   # being parsed as an rm option; -f keeps rm quiet when a glob matches nothing
   rm -f -- ./*_ ./xx*
}

# extracts the headers from description file
# globals read: DEST (working directory), DESCRIPTION (description file),
#               HEADERS (file that receives the single tab-separated line)
# side effect:  the working directory is changed to DEST and stays there;
#               csplit leaves its xx* pieces in that directory
# returns:      non-zero if DEST cannot be entered or a csplit step fails
function create_headers()
{
   local tab header_lines

   cd -- "$DEST" || return 1

   # 1st cut: everything from the table heading onwards ends up in xx01
   csplit -s -- "$DESCRIPTION" '/Nr Name Description Domain/' || return 1
   # 2nd cut: xx00 now holds only the attribute table (up to the "L0:" line)
   csplit -s xx01 '/L0:/' || return 1

   # strip the CRs, drop empty lines and the table heading, mark the end of
   # every name with "#", then remove the leading number and the " see Ln"
   # cross reference
   header_lines=$(tr -d '\r' < xx00 \
      | grep -v '^$\|^Nr' \
      | sed -e 's/$/#/' -e 's/^[0-9]* //' -e 's/ see L[0-9]//g')

   # the unquoted expansion is intentional: word splitting joins all lines
   # into one; set -f stops glob characters in a description from expanding.
   # a literal tab is used since "\t" in a sed replacement is a GNU extension
   tab=$(printf '\t')
   # shellcheck disable=SC2086
   ( set -f; echo $header_lines ) | sed "s/# */${tab}/g" > "$HEADERS"
}

# filters the file TMP (removes the CR)
# reads the path from the global TMP and rewrites that file in place with all
# carriage returns removed; "${TMP}_" is used as scratch file
# returns non-zero on failure, in which case TMP itself is left untouched
function filter()
{
   # tr removes the CR characters themselves; the sed expression used here
   # before only removed the literal text "{r}"
   tr -d '\r' < "$TMP" > "${TMP}_" || { rm -f -- "${TMP}_"; return 1; }
   mv -f -- "${TMP}_" "$TMP"
}

# adds the headers to file TMP
# globals read: HEADERS (file with the header line), TMP (file to modify)
# rewrites TMP in place as: content of HEADERS, followed by the old content of
# TMP with all carriage returns removed; "${TMP}_" is used as scratch file
# returns non-zero on failure, in which case TMP itself is left untouched
function add_header()
{
   # tr removes the CR characters themselves; the sed expression used here
   # before only removed the literal text "{r}"
   { cat -- "$HEADERS" && tr -d '\r' < "$TMP"; } > "${TMP}_" \
      || { rm -f -- "${TMP}_"; return 1; }
   mv -f -- "${TMP}_" "$TMP"
}

# changes the class labels for file TMP
# reads the path from the global TMP and rewrites that file in place: a line
# whose last character is 1 or 0 gets that character wrapped in single quotes
# (only the final character is looked at, not the whole last field)
# "${TMP}_" is used as scratch file; returns non-zero on failure, in which
# case TMP itself is left untouched
function change_class()
{
   echo "change classes for $TMP..."
   # one sed run with both substitutions; a line already rewritten by the
   # first expression ends in a quote and is therefore skipped by the second
   sed -e "s/1\$/'1'/" -e "s/0\$/'0'/" -- "$TMP" > "${TMP}_" \
      || { rm -f -- "${TMP}_"; return 1; }
   mv -f -- "${TMP}_" "$TMP"
}

# stores the count of columns of SRCFILE (taken from its first line) in TMP
# globals: reads SRCFILE (path of the file to inspect), writes TMP (the count)
# only the first line is examined; the non-empty tab-separated fields are
# counted, so a trailing tab does not add a column
function count_cols()
{
   # blanks inside a field become "_" and tabs become blanks, which lets
   # wc -w count the fields; the last tr drops the padding some wc
   # implementations print
   TMP=$(head -n1 -- "$SRCFILE" | tr ' ' '_' | tr '\t' ' ' | wc -w | tr -d ' ')
}

# creates relational data from the file, i.e. it splits the file into two,
# where one contains the bag and the class and the other the rest of it (the
# bag-id is the reference)
# input:   global TMP = path of the tab-separated source file
# output:  <name>-bag.txt   first and last column, reverse sorted, duplicates
#                           removed
#          <name>-data.txt  all columns except the last one
#          where <name> is the source path without its ".txt" suffix
# globals written: SRCFILE, TMPFILE, COUNT and (through count_cols) TMP
# returns: non-zero if the file has fewer than two columns or cut fails
function create_relational()
{
   SRCFILE=$TMP
   # strip only a literal ".txt" at the very end; a sed pattern ".txt" would
   # also eat e.g. "atxt" anywhere in the directory part of the path
   TMPFILE=${SRCFILE%.txt}
   count_cols;COUNT=$TMP

   # with fewer than two columns there is nothing to split (and the field
   # list "1-0" would be invalid)
   if ! [ "$COUNT" -ge 2 ] 2>/dev/null
   then
      echo "create_relational: less than 2 columns in $SRCFILE" >&2
      return 1
   fi

   cut -f"1,$COUNT" -- "$SRCFILE" | sort -ru > "$TMPFILE-bag.txt"
   cut -f"1-$((COUNT - 1))" -- "$SRCFILE" > "$TMPFILE-data.txt"
}

# variables
# directory of this script; dirname yields "." when the script is started
# without a path component (e.g. "bash prepare.sh"), so the defaults below
# can never collapse to "/original" and an empty DEST
ROOT=$(dirname -- "$0")
SRC="$ROOT/original"
DEST="$ROOT"

# interpret parameters
while getopts ":hi:o:" flag
do
   case "$flag" in
      i) SRC=$OPTARG
         ;;
      o) DEST=$OPTARG
         ;;
      h) usage
         exit 0
         ;;
      *) usage
         exit 1
         ;;
   esac
done

# create_headers changes the working directory, after which relative paths
# would no longer resolve: make both directories absolute (this also checks
# that they exist)
ABS_DIR=$(CDPATH= cd -- "$SRC" 2>/dev/null && pwd) || {
   echo "${0##*/}: input directory not found: $SRC" >&2
   exit 1
}
SRC=$ABS_DIR
ABS_DIR=$(CDPATH= cd -- "$DEST" 2>/dev/null && pwd) || {
   echo "${0##*/}: output directory not found: $DEST" >&2
   exit 1
}
DEST=$ABS_DIR

# derived only after the options are known, so that -i and -o are honoured
DESCRIPTION="$SRC/TicDataDescr.txt"
HEADERS="$DEST/headers_"

# generate headers
echo "creating headers..."
create_headers || exit 1

# copy data file
echo "create data file..."
cp -- "$SRC/ticdata2000.txt" "$DEST/tic-data.txt" || exit 1
TMP="$DEST/tic-data.txt";filter

# create unclassified file
echo "create unclassified file..."
cp -- "$SRC/ticeval2000.txt" "$DEST/tic-unclass.txt" || exit 1
TMP="$DEST/tic-unclass.txt";filter

# create classified file
echo "create classified file..."
cp -- "$SRC/tictgts2000.txt" "$DEST/tic-targets_" || exit 1
# the targets file is the one that was just copied and still needs filtering
# (tic-data.txt has already been filtered above)
TMP="$DEST/tic-targets_";filter
paste -- "$DEST/tic-unclass.txt" "$DEST/tic-targets_" > "$DEST/tic-class.txt"

# change class labels
TMP="$DEST/tic-data.txt";change_class
TMP="$DEST/tic-class.txt";change_class

# create combined file (data+unclass)
echo "create combined file..."
cat -- "$DEST/tic-data.txt" "$DEST/tic-unclass.txt" > "$DEST/tic-combined.txt"

# add headers
echo "adding headers..."
TMP="$DEST/tic-data.txt";add_header
TMP="$DEST/tic-unclass.txt";add_header
TMP="$DEST/tic-class.txt";add_header
TMP="$DEST/tic-combined.txt";add_header

# create relational data
echo "creating relational data..."
TMP="$DEST/tic-class.txt";create_relational
TMP="$DEST/tic-combined.txt";create_relational

# clean up (clean_up removes files from the current directory, so make sure
# that this is the output directory)
cd -- "$DEST" || exit 1
clean_up

