#!/usr/bin/env perl use warnings; use strict; use Data::Dumper; use Getopt::Long; use File::Basename qw/basename/; use Bio::SeqIO; use Storable qw/nstore_fd/; use List::MoreUtils qw/uniq/; # Quick hash implementation that is core-perl #use B qw/hash/; use Digest::MD5 qw/md5_hex md5/; use Digest::SHA qw/sha1_hex sha1/; use version 0.77; our $VERSION="0.5.1"; local $0 = basename $0; sub logmsg{local $0=basename $0; print STDERR "$0: @_\n";} exit(main()); sub main{ my $settings={}; GetOptions($settings,qw(help hashing=s version output=s k=i)) or die $!; if($$settings{version}){ print "$0 $VERSION\n"; return 0; } usage() if($$settings{help} || !@ARGV); $$settings{k} ||= 16; $$settings{output} ||= die("ERROR: need --output"); $$settings{hashing}||= "md5_hex"; my %index = ( settings => { k => $$settings{k}, version => $VERSION, hashing => $$settings{hashing}, locusArray => [], # To be filled out later as an array of locus names } ); for my $f(@ARGV){ my $indices = indexFasta($f, $$settings{k}, $settings); # Combine the associative arrays into the larger associative array for my $key(qw(locus allele)){ while(my($hash, $values) = each(%{ $$indices{$key} })){ $index{$key}{$hash} = $values; } } # Record the stops while(my($stop, $count) = each(%{ $$indices{stops} })){ $index{stops}{$stop} += $count; } } my @loci = sort {$a cmp $b} uniq(values(%{$index{locus}})); $index{locusArray} = \@loci; # Save the indices my $indexfile = $$settings{output}; open (my $fh, '>:raw', $indexfile) or die "ERROR: could not write to $indexfile: $!"; nstore_fd \%index, $fh; close $fh; return 0; } sub indexFasta{ my($file, $k, $settings) = @_; my %indexLocus; my %indexAllele; my $hashing_sub = \&{$$settings{hashing}}; my %stops; my $in = Bio::SeqIO->new(-file=>$file); while(my $seq = $in->next_seq){ my $sequence = $seq->seq; my $id = $seq->id; my(@F) = split(/_/, $id); my $allele = pop(@F); my $locus = join("_", @F); my $locusHash = &$hashing_sub(substr($sequence, 0, $k)); #my $alleleHash = md5_hex($sequence); $indexLocus{$locusHash} = $locus; $indexAllele{$locus}{$sequence} = [$locus, $allele]; # Record the stop codon my $stop = substr($sequence, -3, 3); $stops{$stop}++; } return {locus=>\%indexLocus, allele=>\%indexAllele, stops=>\%stops}; } sub usage{ print "$0: indexes a fasta file Fasta file have deflines in the format of >locus_allele where locus is a string and allele is an int Usage: $0 [options] *.fasta [*.gbk...] --k kmer length [default: 16] --hashing Hashing algorithm md5_hex, sha1_hex [default: md5_hex] --output Output prefix for index files --version print version and exit --help This useful help menu "; exit 0; }