#!/bin/perl
use strict;
use warnings;

use Data::Dumper;

my $mode = $ARGV[0] || "dpl";

# Structure: fam_prot_ortho{seq_name}=family; seqs_prot_ortho{fam}{seq}=1
my ($fam_prot_ortho_ref, $seqs_prot_ortho_ref) = &get_prot_ortho_fam();
my %fam_prot_ortho = %{$fam_prot_ortho_ref};
my %seqs_prot_ortho = %{$seqs_prot_ortho_ref};

# Structure: our_seqs{fam}{seq}=1; our_fams{seq}=fam
my ($our_seqs_ref, $our_fams_ref) = &get_own_fam();
my %our_seqs = %{$our_seqs_ref};
my %our_fams = %{$our_fams_ref};

#print Dumper(\%fam_prot_ortho); exit;
#print Dumper(\%our_fams); exit;
#print Dumper(\%our_seqs); exit;

# TODO: print # of distinct Sequences. How to handle dpl3 and dpl?
#print "# of all distinct Sequences: " .
    #qx#cat /homes/prak01/praktikum/ORGA/Family/*.fa | grep ">" | wc -l#;
print "Nr. of Sequences in proteinortho input: " .
    qx#cat /homes/prak09/praktikum/PRAK09/Annotation/Species_Sequences/*.fa | grep ">" | wc -l#;
print "Nr. of distinct Sequences the proteinortho output: " .
    scalar(keys %fam_prot_ortho) . "\n\n";

my @missing_fams;

for my $our_fam (sort keys %our_seqs) {
    my @ortho_fams;

    my @same_seqs;
    for my $seq (sort keys %{$our_seqs{$our_fam}}) {
        if (exists $fam_prot_ortho{$seq}) {
            push @same_seqs, $seq;

            if (grep {/$fam_prot_ortho{$seq}/} @ortho_fams) {
                next;
            }
            else { push @ortho_fams, $fam_prot_ortho{$seq}; }
        }
    }


    if (@ortho_fams == 1) {
        my $ortho_fam = $ortho_fams[0];
        my @additional_seqs = grep { !exists $our_seqs{$our_fam}{$_} } sort keys %{$seqs_prot_ortho{$ortho_fam}};
        my @missing_seqs = grep { !exists $seqs_prot_ortho{$ortho_fam}{$_} } sort keys %{$our_seqs{$our_fam}};

        print "$our_fam: *MATCH*\n";
        print "\tOur family:\t$our_fam\n";
        print "\tProteinortho orthologs:\t$ortho_fam\n";
        print "\t\tSame Seqs:\t@same_seqs\n";

        print "\t\tAdditional:\t@additional_seqs\n";
        #&print_hash(\%{$seqs_prot_ortho{$ortho_fam}});
        print "\t\tMissing:\t@missing_seqs\n\n";

        my %add_families;
        for my $seq (@additional_seqs) {
            if (exists $our_fams{$seq}) {
                $add_families{$our_fams{$seq}} = 1;
            }
        }
        print "\t\tFamilies of additional findings: ";
        &print_hash(\%add_families);
        print "\n\n\n";
    }
    elsif (@ortho_fams > 1) {
        print "$our_fam sequences are not considered orthologous in the Proteinortho output.\n";
        print "\tOur family: $our_fam\n\tProteinortho Lines: @ortho_fams\n";

        print "\t\tOur Sequences: ";
        &print_hash(\%{$our_seqs{$our_fam}});
        print "\n\n";

        my @prot_ortho_seqs;

        for my $ortho_fam (@ortho_fams) {
            print "\t\t$ortho_fam: ";
            &print_hash(\%{$seqs_prot_ortho{$ortho_fam}});
            print "\n";

            push @prot_ortho_seqs, grep { $seqs_prot_ortho{$ortho_fam}{$_} } sort keys %{$seqs_prot_ortho{$ortho_fam}};
        }

        my @additional_seqs = grep { !exists $our_seqs{$our_fam}{$_} } @prot_ortho_seqs;
        my @missing_seqs;

        for my $seq (sort keys %{$our_seqs{$our_fam}}) {
            my @found = grep { $_ eq $seq } @prot_ortho_seqs;
            if (!@found) {
                push @missing_seqs, $seq;
            }
        }

        print "\n";
        print "\t\tAdditional:\t@additional_seqs\n";
        print "\t\tMissing:\t@missing_seqs\n\n";

        my %add_families;
        for my $seq (@additional_seqs) {
            if (exists $our_fams{$seq}) {
                $add_families{$our_fams{$seq}} = 1;
            }
        }
        print "\t\tFamilies of additional findings: ";
        &print_hash(\%add_families);
        print "\n\n\n";
    }
    else {
        push @missing_fams, $our_fam;
    }
}


print "Following Families do not exist in the proteinortho output:\n\n@missing_fams\n";

# Print Sequences from Prothortho that didn't exist
#my @missing_seqs_in_our_orthos;
#for my $fam (sort keys %seqs_prot_ortho) {
    #for my $seq (sort keys %{$seqs_prot_ortho{$fam}}) {
        #if (!exists $our_fams{$seq}) {
            #push @missing_seqs_in_our_orthos, $seq;
        #}
    #}
#}

#print "Sequences proteinortho found additionally:\n\n@missing_seqs_in_our_orthos\n";


sub get_prot_ortho_fam {
    my %fam_hash;
    my %seq_hash;
    my $id_cnt = 0;

    open my $prot_ortho_fh, "<", "/homes/prak01/praktikum/ORGA/proteinortho_microRNA.proteinortho"
        || die;

    while (<$prot_ortho_fh>) {
        if ($_ =~ /^#/) { next; }

        my $fam = "protortho_lineID_" . $id_cnt++;

        # We don't need info about number of genes etc. from the first 3 columns
        my @columns = split /\s/, qx/echo "$_" | cut -d"\t" -f4-/;

        for (@columns) {
            if ($_ eq "*") { next; }

            my @seqs = split ',', $_;

            for my $seq (@seqs) {
                #my ($seq_name) = $seq =~ /^[^-]*-([^_]*)/;
                #my $fam = &get_MIPF_fam($seq_name);
                if ($mode eq "dpl") {
                    if ($seq =~ /^dpl3-/) { next; }
                }
                else {
                    if ($seq =~ /^dpl-/) { next; }
                }

                if (! exists $fam_hash{$seq}) {
                    $fam_hash{$seq} = $fam;
                    $seq_hash{$fam}{$seq} = 1;
                }
                else {
                    die "We have a duplicate Sequence in the Proteinortho output.";
                }
            }
        }

        #my $crnt_fam;
        #for my $seq (@columns) {
        #}
    }
    #print Dumper(\%fam_hash); exit;
    return (\%fam_hash, \%seq_hash);
}

sub get_own_fam {
    my @files = </homes/prak01/praktikum/ORGA/Family/*.fa>;
    my %seq_hash;
    my %fam_hash;
    
    for my $file (@files) {
        my ($fam) = $file =~ /([^\/]*).fa$/;

        open my $fh, "<", $file;
        while (<$fh>) {
            if ($_ =~ /^>([^\s]*)/) {
                if ($mode eq "dpl") {
                    if ($_ =~ /^dpl3-/) { next; }
                }
                else {
                    if ($_ =~ /^dpl-/) { next; }
                }

                my $seq_name = $1;
                $seq_hash{$fam}{$seq_name} = 1;
                $fam_hash{$seq_name} = $fam;
            }
        }
    }

    #print Dumper(\%fam_hash); exit;
    return (\%seq_hash, \%fam_hash);
}

sub get_MIPF_fam {
    my $seq_name = shift;

    my ($fam_id, $fam) = split /\s/, qx#grep "$seq_name" /homes/prak05/praktikum/PRAK05/Annotation/miFam.1line.dat | awk '{print \$1, \$2}'#;
    #print "$fam_id, $fam\n"; exit;

    if ($fam_id) {
        return "$fam_id" . "_$fam";
    }
    else {
        return undef;
    }
}

sub print_hash {
    my $ref = shift;
    my %h = %{$ref};

    for (sort keys %h) {
        print "$_, ";
    }

    return;
}