#!/usr/bin/env perl
use warnings;
use strict;
use Data::Dumper;
use Getopt::Long;
use File::Basename qw/basename/;
use Bio::SeqIO;
use Storable qw/retrieve/;
use threads;
use Thread::Queue;
# Quick hash implementation that is core-perl
use Digest::MD5 qw/md5_hex md5/;
use Digest::SHA qw/sha1_hex sha1/;
local $0 = basename $0;
sub logmsg{my $TID=threads->tid; local $0=basename $0; print STDERR "$0 (TID $TID): @_\n";}
exit(main());
sub main{
my $settings={};
GetOptions($settings,qw(help novel-alleles=s putatives numcpus=i version k dump db|database=s)) or die $!;
usage() if($$settings{help});
$$settings{db} ||= die("ERROR: need --db");
# TODO might be smart to get the actual locus max length in the database
$$settings{maxGeneLength}=10000;
$$settings{version} && die "ERROR: you can only use --version with hashest-index.pl";
$$settings{k} && die "ERROR: you can only use --k with hashest-index.pl";
$$settings{numcpus} ||= 1;
logmsg "START: loading index $$settings{db}";
my $index = retrieve($$settings{db});
if($$settings{dump}){
print Dumper $index;
logmsg "Dumped database to stdout";
return 0;
}
logmsg "DONE: loading index $$settings{db}";
usage() if(!@ARGV);
# Form a regex to capture stop codons
my $stopsRegexStr = '(?:' . join("|", sort{$a cmp $b} keys(%{ $$index{stops} })) . ')$';
$$settings{stopsRegex} = qr/$stopsRegexStr/i;
my @thr;
my $asmQ = Thread::Queue->new(@ARGV);
# Kick off the printer queue and give it the header to print first
my $printQ = Thread::Queue->new;
$printQ->enqueue(
[
'stdout',
# The header is the assembly and then all the loci
join("\t", "Assembly", @{ $$index{locusArray} })
]
);
# Kick off threads
for(my $i=0;$i<$$settings{numcpus};$i++){
$thr[$i] = threads->new(\&searchAsmWorker, $index, $asmQ, $printQ, $settings);
}
# Start off printer thread
my $printer = threads->new(\&printer, $printQ, $settings);
# Send termination signal to threads
for(@thr){
$asmQ->enqueue(undef);
}
# Wait for threads to finish
for(@thr){
$_->join;
}
# Wait for the printer to finish
$printQ->enqueue(undef);
$printer->join;
return 0;
}
# Separate printer thread to make sure there is no stdout collisions.
sub printer{
my($Q, $settings) = @_;
my $naFh;
if($$settings{'novel-alleles'}){
my $novelAlleles = $$settings{'novel-alleles'};
logmsg "Opening $novelAlleles for writing...";
open($naFh, '>', $novelAlleles) or die "ERROR: could not write to novel alleles file $novelAlleles: $!";
}
while(defined(my $toPrint = $Q->dequeue)){
my ($fhStr, $str) = @$toPrint;
if($fhStr eq 'stdout'){
print $str . "\n";
}
elsif($fhStr eq 'novel'){
print $naFh $str . "\n";
}
else {
die "ERROR: did not understand filehandle $fhStr to write the string $str";
}
}
}
sub searchAsmWorker{
my($index, $asmQ, $printQ, $settings) = @_;
my $k = $$index{settings}{k} || die "ERROR: could not find k in database $$settings{db}";
my $hashing_sub = \&{$$index{settings}{hashing}};
my @locusName = @{ $$index{locusArray} };
my $numSearched = 0;
while(defined(my $asm = $asmQ->dequeue)){
logmsg "Typing for $asm";
my $loci = searchAsm($asm, $k, $hashing_sub, $index, $settings);
my $printLine = $asm;
for my $locus(@locusName){
my $allele = $$loci{$locus} // 0;
$printLine .= "\t$allele";
}
$printQ->enqueue(['stdout', $printLine]);
if($$settings{'novel-alleles'}){
my $novelAlleles = $$loci{_novel};
for my $locus(@locusName){
for my $alleleSeq(@{ $$novelAlleles{$locus} }){
#my $allele = $hashing_sub($$novelAlleles
my $hashsum = &$hashing_sub($alleleSeq);
$printQ->enqueue(['novel', ">".$locus."_".$hashsum."\n$alleleSeq"]);
}
}
}
$numSearched++;
}
return $numSearched;
}
sub searchAsm{
my($asm, $k, $hashing_sub, $index, $settings) = @_;
my %locus;
my $stopsRegex = $$settings{stopsRegex};
my $in=Bio::SeqIO->new(-file=>$asm);
while(my $seqOrig = $in->next_seq){
my $revcom = $seqOrig->revcom;
for my $seq($seqOrig, $revcom){
my $sequence = $seq->seq;
my $seqLength = length($sequence);
# sliding window to get hashes and match against db
SLIDING_WINDOW:
for(my $i=0; $i<$seqLength-$k; $i++){
my $subseq = substr($sequence, $i, $k);
my $locusHash = &$hashing_sub($subseq);
#logmsg "Compare $locusHash";
# Test if we found the locus hash which would indicate we found the locus
if($$index{locus}{$locusHash}){
# Get the name of the putative locus
my $locus = $$index{locus}{$locusHash};
#logmsg "Testing locus $locus from ".$seq->id." pos $i";
# Get downstream sequence to see if it matches an allele
my $candidateSequence = "";
for(my $j=$k;$j<$$settings{maxGeneLength};$j++){
$candidateSequence = substr($sequence, $i, $j);
if($candidateSequence !~ $stopsRegex){
next;
}
if($$index{allele}{$locus}{$candidateSequence}){
#logmsg "Found $candidateSequence";
#push(@locus, $$index{allele}{$locus}{$candidateSequence});
my $allele = $$index{allele}{$locus}{$candidateSequence}[1];
if(defined $locus{$locus}){
logmsg "WARNING: locus $locus is defined more than once. Appending allele $locus{$locus} with ~$allele";
$allele = "~$allele";
}
$locus{$locus} .= $allele;
# push ahead the search to wherever $j is but back it up the kmer length
$i += $j-$k;
next SLIDING_WINDOW;
}
}
# Figure out the ORF but it should be at least 20aa or 60nt.
for(my $pos=60; $pos < length($candidateSequence); $pos+=3){
my $candidateOrf = substr($candidateSequence, 0, $pos);
my $candidateStop = substr($candidateOrf, -3, 3);
#logmsg $pos;
if($$index{stops}{$candidateStop}){
push(@{ $locus{_novel}{$locus} }, $candidateOrf);
last;
}
}
# The next part of this loop is only when we care about putative alleles
# for a locus we might have detected.
next if(!$$settings{putatives});
# If we get to this point, then `next SLIDING_WINDOW` was not run,
# but a locus was identified.
# Mark that we think we know the locus but not the allele.
my $allele = "?";
if(defined $locus{$locus}){
logmsg "WARNING: locus $locus is defined more than once. Appending allele $locus{$locus} with ~$allele";
$allele = "~$allele";
}
$locus{$locus} .= $allele;
}
}
}
}
$in->close;
#print Dumper \@locus, "Num loci: ".scalar(@locus);die;
#print "Found ". scalar(@locus)." loci with k=$k\n"; die;
return \%locus;
}
sub usage{
print "$0: reports an MLST profile for a genome assembly
Usage: $0 [options] *.fasta [*.gbk...] > out.tsv
--db Database from hashest-index.pl
--numcpus Number of threads to use [default: 1]
--dump Dump the database instead of analyzing anything
--novel-alleles (optional) A filename to write novel alleles
into a fasta format. Defline will be
>locusname_hashsum
--putatives Print a '?' instead of an int when a locus
has been detected but no exact allele was
found
--help This useful help menu
";
exit 0;
}