#!/bin/perl use strict; use warnings; use Data::Dumper; my $mode = $ARGV[0] || "dpl"; # Structure: fam_prot_ortho{seq_name}=family; seqs_prot_ortho{fam}{seq}=1 my ($fam_prot_ortho_ref, $seqs_prot_ortho_ref) = &get_prot_ortho_fam(); my %fam_prot_ortho = %{$fam_prot_ortho_ref}; my %seqs_prot_ortho = %{$seqs_prot_ortho_ref}; # Structure: our_seqs{fam}{seq}=1; our_fams{seq}=fam my ($our_seqs_ref, $our_fams_ref) = &get_own_fam(); my %our_seqs = %{$our_seqs_ref}; my %our_fams = %{$our_fams_ref}; #print Dumper(\%fam_prot_ortho); exit; #print Dumper(\%our_fams); exit; #print Dumper(\%our_seqs); exit; # TODO: print # of distinct Sequences. How to handle dpl3 and dpl? #print "# of all distinct Sequences: " . #qx#cat /homes/prak01/praktikum/ORGA/Family/*.fa | grep ">" | wc -l#; print "Nr. of Sequences in proteinortho input: " . qx#cat /homes/prak09/praktikum/PRAK09/Annotation/Species_Sequences/*.fa | grep ">" | wc -l#; print "Nr. of distinct Sequences the proteinortho output: " . scalar(keys %fam_prot_ortho) . "\n\n"; my @missing_fams; for my $our_fam (sort keys %our_seqs) { my @ortho_fams; my @same_seqs; for my $seq (sort keys %{$our_seqs{$our_fam}}) { if (exists $fam_prot_ortho{$seq}) { push @same_seqs, $seq; if (grep {/$fam_prot_ortho{$seq}/} @ortho_fams) { next; } else { push @ortho_fams, $fam_prot_ortho{$seq}; } } } if (@ortho_fams == 1) { my $ortho_fam = $ortho_fams[0]; my @additional_seqs = grep { !exists $our_seqs{$our_fam}{$_} } sort keys %{$seqs_prot_ortho{$ortho_fam}}; my @missing_seqs = grep { !exists $seqs_prot_ortho{$ortho_fam}{$_} } sort keys %{$our_seqs{$our_fam}}; print "$our_fam: *MATCH*\n"; print "\tOur family:\t$our_fam\n"; print "\tProteinortho orthologs:\t$ortho_fam\n"; print "\t\tSame Seqs:\t@same_seqs\n"; print "\t\tAdditional:\t@additional_seqs\n"; #&print_hash(\%{$seqs_prot_ortho{$ortho_fam}}); print "\t\tMissing:\t@missing_seqs\n\n"; my %add_families; for my $seq (@additional_seqs) { if (exists $our_fams{$seq}) { $add_families{$our_fams{$seq}} = 1; } } print "\t\tFamilies of additional findings: "; &print_hash(\%add_families); print "\n\n\n"; } elsif (@ortho_fams > 1) { print "$our_fam sequences are not considered orthologous in the Proteinortho output.\n"; print "\tOur family: $our_fam\n\tProteinortho Lines: @ortho_fams\n"; print "\t\tOur Sequences: "; &print_hash(\%{$our_seqs{$our_fam}}); print "\n\n"; my @prot_ortho_seqs; for my $ortho_fam (@ortho_fams) { print "\t\t$ortho_fam: "; &print_hash(\%{$seqs_prot_ortho{$ortho_fam}}); print "\n"; push @prot_ortho_seqs, grep { $seqs_prot_ortho{$ortho_fam}{$_} } sort keys %{$seqs_prot_ortho{$ortho_fam}}; } my @additional_seqs = grep { !exists $our_seqs{$our_fam}{$_} } @prot_ortho_seqs; my @missing_seqs; for my $seq (sort keys %{$our_seqs{$our_fam}}) { my @found = grep { $_ eq $seq } @prot_ortho_seqs; if (!@found) { push @missing_seqs, $seq; } } print "\n"; print "\t\tAdditional:\t@additional_seqs\n"; print "\t\tMissing:\t@missing_seqs\n\n"; my %add_families; for my $seq (@additional_seqs) { if (exists $our_fams{$seq}) { $add_families{$our_fams{$seq}} = 1; } } print "\t\tFamilies of additional findings: "; &print_hash(\%add_families); print "\n\n\n"; } else { push @missing_fams, $our_fam; } } print "Following Families do not exist in the proteinortho output:\n\n@missing_fams\n"; # Print Sequences from Prothortho that didn't exist #my @missing_seqs_in_our_orthos; #for my $fam (sort keys %seqs_prot_ortho) { #for my $seq (sort keys %{$seqs_prot_ortho{$fam}}) { #if (!exists $our_fams{$seq}) { #push @missing_seqs_in_our_orthos, $seq; #} #} #} #print "Sequences proteinortho found additionally:\n\n@missing_seqs_in_our_orthos\n"; sub get_prot_ortho_fam { my %fam_hash; my %seq_hash; my $id_cnt = 0; open my $prot_ortho_fh, "<", "/homes/prak01/praktikum/ORGA/proteinortho_microRNA.proteinortho" || die; while (<$prot_ortho_fh>) { if ($_ =~ /^#/) { next; } my $fam = "protortho_lineID_" . $id_cnt++; # We don't need info about number of genes etc. from the first 3 columns my @columns = split /\s/, qx/echo "$_" | cut -d"\t" -f4-/; for (@columns) { if ($_ eq "*") { next; } my @seqs = split ',', $_; for my $seq (@seqs) { #my ($seq_name) = $seq =~ /^[^-]*-([^_]*)/; #my $fam = &get_MIPF_fam($seq_name); if ($mode eq "dpl") { if ($seq =~ /^dpl3-/) { next; } } else { if ($seq =~ /^dpl-/) { next; } } if (! exists $fam_hash{$seq}) { $fam_hash{$seq} = $fam; $seq_hash{$fam}{$seq} = 1; } else { die "We have a duplicate Sequence in the Proteinortho output."; } } } #my $crnt_fam; #for my $seq (@columns) { #} } #print Dumper(\%fam_hash); exit; return (\%fam_hash, \%seq_hash); } sub get_own_fam { my @files = ; my %seq_hash; my %fam_hash; for my $file (@files) { my ($fam) = $file =~ /([^\/]*).fa$/; open my $fh, "<", $file; while (<$fh>) { if ($_ =~ /^>([^\s]*)/) { if ($mode eq "dpl") { if ($_ =~ /^dpl3-/) { next; } } else { if ($_ =~ /^dpl-/) { next; } } my $seq_name = $1; $seq_hash{$fam}{$seq_name} = 1; $fam_hash{$seq_name} = $fam; } } } #print Dumper(\%fam_hash); exit; return (\%seq_hash, \%fam_hash); } sub get_MIPF_fam { my $seq_name = shift; my ($fam_id, $fam) = split /\s/, qx#grep "$seq_name" /homes/prak05/praktikum/PRAK05/Annotation/miFam.1line.dat | awk '{print \$1, \$2}'#; #print "$fam_id, $fam\n"; exit; if ($fam_id) { return "$fam_id" . "_$fam"; } else { return undef; } } sub print_hash { my $ref = shift; my %h = %{$ref}; for (sort keys %h) { print "$_, "; } return; }