#!/usr/bin/perl -w

=head1 DESCRIPTION

Reads a directory of annotation files and creates a matrix with the following specification:

Every GO term on the X axis with every sample on the Y, and each cell showing the percentage 
of all genes in that sample that were assigned that term.

Because some of these values can be quite small, a sprint("%.2e") is done on them.

=head1 INPUT

This script expects a specific directory structure as input.  Edit the $annotation_dir to 
provide a source path, and under that it expects one directory per sample.  Within each
sample directory it looks for a file that is the output from a GO slim mapping.  This file
name is specified at the top of this script within the $SLIM_FILE_NAMES variable.

=head1 OUTPUT

Output is written to STDOUT, so you should redirect the output of this script to a file
of your choice.

=cut

use strict;
$|++;

my $USE_SLIM_FILES = 1;
my $SLIM_FILE_NAMES = 'annotation.slimmap.v2';  ## expects one of these files in each sample dir

## finds all files under this directory named like $sample_id/annotation.tab
my $annotation_dir = '/usr/local/projects/dacc/jcvi_metagenomic_autoannotate/output/PGA';

my $sample_dirs = load_sample_dirs( $annotation_dir );

## structure like:
#   $h{$sample_id}{$go_term} = $count;
my %samples = ();

## this is mostly used to index the column order of GO terms
my %go_count = ( 'unassigned' => 0, 'assigned' => 0 );

## this is used to count the number of genes per sample (key)
my %gene_counts = ();

for my $sample_id ( @$sample_dirs ) {
    
    print STDERR "INFO: processing sample $sample_id\n";
    
    my $tab_in = "$annotation_dir/$sample_id/annotation.tab";
    
    open(my $tab_fh, $tab_in) || die "failed to read annotation tab file ($tab_in): $!";

    $samples{$sample_id}{'unassigned'} = 0;
    $samples{$sample_id}{'assigned'} = 0;

    while (my $line = <$tab_fh>) {
        chomp $line;
        my @cols = split("\t", $line);
        next unless scalar @cols > 15;
        
        $gene_counts{$sample_id}++;
        
        my @go_terms = split(' \|\| ', $cols[8]);
        
        $samples{$sample_id}{'unassigned'}++ if scalar(@go_terms) < 1;
        $samples{$sample_id}{'assigned'}++ if scalar(@go_terms) > 0;
        
        ## if we're using a slim file we're done after logging the gene count
        if (! $USE_SLIM_FILES ) {

            for my $term ( @go_terms ) {
                next unless $term =~ /GO\:/;
                $samples{$sample_id}{$term}++;
                $go_count{$term}++;
            }
        }
    }
    
    if ( $USE_SLIM_FILES ) {
        open(my $slim_map, "$annotation_dir/$sample_id/$SLIM_FILE_NAMES") || die "can't read slim file for sample $sample_id: $!";
        
        while ( my $line = <$slim_map> ) {
            chomp $line;

            if ( $line =~ /^\s*((GO\:\d+).+)/ ) {
                my @cols = split("\t", $1);
                my $term = $2;
                
                $samples{$sample_id}{$term} += $cols[1];
                $go_count{$term} += $cols[1];
            }
        }
    }
    
    ## debugging
    #last if scalar(keys %samples) == 10;
}

my @go_col = sort keys %go_count;

print "\t", join("\t", @go_col), "\n";

for my $sample_id ( keys %samples ) {
    print "$sample_id\t";
    
    for ( my $i=0; $i<scalar(@go_col); $i++ ) {
        
        print sprintf( "%.2e", ($samples{$sample_id}{$go_col[$i]} || 0) / $gene_counts{$sample_id} );
        
        print "\t" unless $i == scalar(@go_col) - 1;
    }
    
    print "\n";
}


sub load_sample_dirs {
    my $base = shift;
    
    opendir(my $base_dh, $base) || die "failed to read annotation base directory: $!";
    
    my $dirs = [];
    
    while ( my $thing = readdir($base_dh) ) {
        next unless $thing =~ /^SR/;
        next unless -d "$base/$thing";
        
        push @$dirs, $thing;
    }
    
    return $dirs;
}