#!/usr/bin/perl -w

#gff_compare.pl

use strict;
use vars qw ($opt_c $opt_f $opt_g $opt_i $opt_l $opt_o $opt_x);  # required if strict used
use Getopt::Std;
use constant GNUPLOT => '/usr/bin/gnuplot';

getopts ('c:f:g:il:xo:');     # ('aci:p:o:') means 'ac' are flags, 'i:p:o:' gets following scalar.


# Print a helpful message if the user provides no input file.
if (!@ARGV) { 
        print "usage:  gff_compare.pl [options] gffDB gff\n\n";
	print "options:\n";
	print "-c <cover>         :  overlap required to associate two regions [ default is 0 nts ]\n";
	print "-f <featureDB>     :  which type of feature from gffDB    [ default is all]\n";
        print "                        possible types of loci are:\n";
        print "                        OTH | COD | RNA \n";
	print "-g <feature>       :  which type of feature from gff    [ default is all]\n";
        print "                        possible types of loci are:\n";
        print "                        OTH | COD | RNA \n";
        print "-i                 :  ignore strand, substract anyway     [ default removes only if in the same strand]\n";
        print "-l <len>           :  until which coordinate    [ default is all]\n";
	print "-o <output>        :  output file [default = gff1_intersec_gff2.gff]\n";
	print "-x                 :  search for NON overlaps\n";
       exit;
}

my $gffDB = shift;
my $gff   = shift;

my $output;
if ($opt_o) { $output = $opt_o; }
else        { $output = "gff.intersec_gffDB.gff";  }

my $cover; 
if ($opt_c) { $cover = $opt_c; } 
else        { $cover = 0;      }

my $typetargetDB;
if ($opt_f) { $typetargetDB = $opt_f; }
else        { $typetargetDB = "all";  }

my $typetarget;
if ($opt_g) { $typetarget = $opt_g; }
else        { $typetarget = "all";  }

my $exclude; 
if ($opt_x) { $exclude = 1; } 
else        { $exclude = 0; }

my $len;
if ($opt_l) { if ($opt_l < 0) { print "bad len\n"; die; } else { $len = $opt_l; } }
else        { $len = -1;  }

gff_intersect_gffDB ("$output", $len, $cover, $gff, $typetarget, $gffDB, $typetargetDB, $exclude);


sub gff_intersect_gffDB {
    
    my ($outgff, $len, $cover, $gff, $typetarget, $gffdb, $typetargetdb, $exclude) = @_;
    
    my $line;
    
    my $seq;
    my $source;
    my $feature;
    my $lend;
    my $rend;
    my $strand;
    my $ave_size;

    my $n_win       = 0;
    my $n_win_FN    = 0;
    my $n_win_type  = 0;
    my $n_win_total = 0;


    my $n_windb_total = 0;
    my $n_windb_type  = 0;
   
    my $sourceDB;
    my @seqdb;
    my @featuredb;
    my @lenddb;
    my @renddb;
    my @stranddb;

    my $value;

    parse_gff ($gffdb, $typetargetdb, $len, \$n_windb_total, \$n_windb_type, \$sourceDB, \@seqdb, \@featuredb, \@lenddb, \@renddb, \@stranddb);
    
    open (OUT,">$outgff") || die;

    open (GFF,"$gff") || die "coudn't open $gff: $!"; 
    while (<GFF>) {
	
	if (/^(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+\S+\s+(\S+)\s+\S+\s+/) {
	    $line = $_;
	    
	    $seq     = $1;
	    $source  = $2; if (!$source) { print "no source:\n$_\n"; }
	    $feature = $3;
	    $lend    = $4;
	    $rend    = $5;
	    $strand  = "$6";
	    
	    if ($rend < $lend) { print "gff_intersect_gffDB():parsing error feature $feature lend = $lend rend = $rend\n"; die; }
	    
	    if ($len == -1 || ($len > 0 && $rend <= $len)) {
		$n_win_total ++;
		
		if ($typetarget =~ /^all$/ || $feature =~ /$typetarget/) {
		    $n_win_type ++;
		    
		    $ave_size += $rend-$lend+1;

		    # yeast interconversion of names
		    yeast_name_interconversion(\$seq);
		    
		    if ($seq =~ /^(\S+)\-(\d+)[\>\<]\d+/) { 
			$seq = $1; 
			$lend += $2;
			$rend += $2;
		    }

		    $value = search_annotation($cover, $seq, $lend, $rend, $strand, $n_windb_type, 
					       \@seqdb, \@featuredb, \@lenddb, \@renddb, \@stranddb);
		    
		    if ($exclude == 0) { if ($value == 1) { $n_win ++; print OUT $line; }                                      }
		    else               { if ($value == 1) { next;                       } else { $n_win ++; print OUT $line; } }
		    
		}
	    } 
	}
    }
    close (GFF);
    close (OUT);
    
    my $n_win_FP     = $n_win_type - $n_win;
    my $n_win_notype = $n_win_total - $n_win_type;
    
    $ave_size /= $n_win_type;

    print "gff:\t$gff\n";
    if ($len == -1) {
        print "\t\tTotal $source regions:\t$n_win_total\n";
    }
    else {
        print "\t\tTotal $source regions [<= $len ] :\t$n_win_total\n";
    }
    print "\t\t $typetarget   $source regions:\t$n_win_type [ave_size = $ave_size]\n";
    print "\t\t!$typetarget   $source regions:\t$n_win_notype\n\n";
    
    my $frac_win    = 0; if ($n_win_type   > 0) { $frac_win    = $n_win    / $n_win_type   * 100.0; }
    my $frac_win_FN = 0; if ($n_win_notype > 0) { $frac_win_FN = $n_win_FN / $n_win_notype * 100.0; }

    my $say;
    if  ($exclude == 0) { $say = "intersect [$cover nts] with"; }
    else                { $say = "NOT intersect [$cover nts] with"; }

    printf "[ $typetarget $source regions] $say [$typetargetDB $sourceDB regions]: $n_win / $n_win_type (%.2f)\n\n", $frac_win;
}


#
#
###
sub overlap {
    my ($lend_loci, $rend_loci, $lend, $rend, $overlap)  = @_;

    my $is_same_loci = 0;

    if ($lend >= $lend_loci && $rend <= $rend_loci) { $is_same_loci = 1; } #is included
    if ($lend <= $lend_loci && $rend >= $rend_loci) { $is_same_loci = 1; } #extends over

    if ($rend < $rend_loci && $rend >= $lend_loci+$overlap)  { $is_same_loci = 1; } #left-end overlap
    if ($lend > $lend_loci && $lend <= $rend_loci-$overlap)  { $is_same_loci = 1; } #right-end overlap


    return $is_same_loci;

}


sub parse_gff {

    my ($gff, $typetarget, $len, $n_win_ref, $n_win_type_ref, $source_ref, $seq_ref, $feature_ref, 
	$lend_ref, $rend_ref, $strand_ref) = @_;

    my $seq;
    my $source;
    my $feature;
    my $lend;
    my $rend;
    my $strand;
    my $size = 0;

    my $idx = 0;

    open (GFF,"$gff") || die "coudn't open $gff: $!"; 
    while (<GFF>) {
	
	if (/^(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+\S+\s+(\S+)\s+\S+\s+/) {
	    $seq     = $1;
	    $source  = $2; if (!$source) { print "no source:\n$_\n"; }
	    $feature = $3;
	    $lend    = $4;
	    $rend    = $5;
	    $strand  = "$6";
	    
	    if ($rend < $lend) { print "parse_gff():parsing error feature $feature lend = $lend rend = $rend\n"; die; }

            if ($len == -1 || ($len > 0 && $rend < $len)) {
		$$n_win_ref ++;
		
		if ($typetarget =~ /^all$/ || $feature =~ /$typetarget/) {
		    
		    $size += $rend - $lend + 1;

		    # yeast interconversion of names
		    yeast_name_interconversion(\$seq);
		    
		    if ($seq =~ /^(\S+)\-(\d+)[\>\<]\d+/) { 
			$seq = $1; 
			$lend += $2;
			$rend += $2;
		    }
		    
		    $seq_ref->[$idx]     = $seq;
		    $feature_ref->[$idx] = $feature;
		    $lend_ref->[$idx]    = $lend;
		    $rend_ref->[$idx]    = $rend;
		    $strand_ref->[$idx]  = $strand;
		    $idx ++;
		}
	    }
	}
    }
    close (GFF);

    if ($idx == 0 && !$source) { $source = "empty"; }

    $$source_ref = $source;  # we assume that all elements in a given gff file have the same source

    $$n_win_type_ref = $idx;

    my $n_win_notype = $$n_win_ref - $$n_win_type_ref;
    
    $size /= $$n_win_type_ref;

    print "gffDB:\t$gff\n";
    if ($len == -1) {
        print "\t\tTotal $source regions:\t$$n_win_ref\n";
    }
    else {
        print "\t\tTotal $source regions [<= $len ] :\t$$n_win_ref\n";
    }
    print "\t\t $typetarget   $source regions:\t$$n_win_type_ref [ave_size = $size]\n";
    print "\t\t!$typetarget   $source regions:\t$n_win_notype\n\n";

}

sub search_annotation {

    my ($cover, $seq, $lend, $rend, $strand, $idx, $seq_ref, $feature_ref, $lend_ref, $rend_ref, $strand_ref) = @_;

    my $isthere = 0;
    
    for (my $x = 0; $x < $idx; $x++) {
	
	if ($seq =~ /^$seq_ref->[$x]$/ &&
	    ($opt_i || $strand eq $strand_ref->[$x] || $strand =~ /^\.$/) &&
	    overlap($lend, $rend, $lend_ref->[$x], $rend_ref->[$x], $cover) == 1) { 
	    $isthere = 1;
	}
    }
    
    
    return $isthere;
}

sub yeast_name_interconversion {
    my ($seq_ref) = @_;
    
    my $seq = "chr";

    # yeast interconversion of names
    if ($$seq_ref =~ /NC\_00(\d+)/) { 

	my $num = $1;

	if    ($num == 1133) { $seq .= "I";    }
	elsif ($num == 1134) { $seq .= "II";   }
	elsif ($num == 1135) { $seq .= "III";  }
	elsif ($num == 1136) { $seq .= "IV";   }
	elsif ($num == 1137) { $seq .= "V";    }
	elsif ($num == 1138) { $seq .= "VI";   }
	elsif ($num == 1139) { $seq .= "VII";  }
	elsif ($num == 1140) { $seq .= "VIII"; }
	elsif ($num == 1141) { $seq .= "IX";   }
	elsif ($num == 1142) { $seq .= "X";    }
	elsif ($num == 1143) { $seq .= "XI";   }
	elsif ($num == 1144) { $seq .= "XII";  } 
	elsif ($num == 1145) { $seq .= "XIII"; }
	elsif ($num == 1146) { $seq .= "XIV";  }
	elsif ($num == 1147) { $seq .= "XV";   }
	elsif ($num == 1148) { $seq .= "XVI";  }
	elsif ($num == 1224) { $seq .= "Mito";  }
	else                 { print "wrong SC chromosome\n"; die; }

	$$seq_ref = $seq;
   }
}
