#!/usr/bin/env perl 

use warnings;
use strict;
use Data::Dumper;
use Getopt::Long;
use File::Basename qw/basename/;
use Bio::SeqIO;
use Storable qw/nstore_fd/;
use List::MoreUtils qw/uniq/;

# Quick hash implementation that is core-perl
#use B qw/hash/;
use Digest::MD5 qw/md5_hex md5/;
use Digest::SHA qw/sha1_hex sha1/;

use version 0.77;
our $VERSION="0.5.1";

local $0 = basename $0;
sub logmsg{local $0=basename $0; print STDERR "$0: @_\n";}
exit(main());

sub main{
  my $settings={};
  GetOptions($settings,qw(help hashing=s version output=s k=i)) or die $!;
  if($$settings{version}){
    print "$0 $VERSION\n";
    return 0;
  }

  usage() if($$settings{help} || !@ARGV);

  $$settings{k} ||= 16;
  $$settings{output} ||= die("ERROR: need --output");
  $$settings{hashing}||= "md5_hex";

  my %index = (
    settings => {
      k => $$settings{k},
      version => $VERSION,
      hashing => $$settings{hashing},
      locusArray => [], # To be filled out later as an array of locus names
    }
  );

  for my $f(@ARGV){
    my $indices = indexFasta($f, $$settings{k}, $settings);

    # Combine the associative arrays into the larger associative array
    for my $key(qw(locus allele)){
      while(my($hash, $values) = each(%{ $$indices{$key} })){
        $index{$key}{$hash} = $values;
      }
    }

    # Record the stops
    while(my($stop, $count) = each(%{ $$indices{stops} })){
      $index{stops}{$stop} += $count;
    }

  }

  my @loci = sort {$a cmp $b} uniq(values(%{$index{locus}}));
  $index{locusArray} = \@loci;

  # Save the indices
  my $indexfile = $$settings{output};
  open (my $fh, '>:raw', $indexfile) or die "ERROR: could not write to $indexfile: $!";
  nstore_fd \%index, $fh;
  close $fh;

  return 0;
}

sub indexFasta{
  my($file, $k, $settings) = @_;
  
  my %indexLocus;
  my %indexAllele;
  my $hashing_sub = \&{$$settings{hashing}};
  my %stops;

  my $in = Bio::SeqIO->new(-file=>$file);
  while(my $seq = $in->next_seq){
    my $sequence = $seq->seq;
    my $id = $seq->id;
    my(@F) = split(/_/, $id);
    my $allele = pop(@F);
    my $locus = join("_", @F);

    my $locusHash  = &$hashing_sub(substr($sequence, 0, $k));
    #my $alleleHash = md5_hex($sequence);
    $indexLocus{$locusHash} = $locus;
    $indexAllele{$locus}{$sequence} = [$locus, $allele];

    # Record the stop codon
    my $stop = substr($sequence, -3, 3);
    $stops{$stop}++;
  }
  return {locus=>\%indexLocus, allele=>\%indexAllele, stops=>\%stops};
}


sub usage{
  print "$0: indexes a fasta file
  Fasta file have deflines in the format of >locus_allele
    where locus is a string and allele is an int
  Usage: $0 [options] *.fasta [*.gbk...]
  --k       kmer length [default: 16]
  --hashing Hashing algorithm md5_hex, sha1_hex [default: md5_hex]
  --output  Output prefix for index files
  --version print version and exit
  --help    This useful help menu
  ";
  exit 0;
}