use strictures 1;
use utf8;
use 5.018;
=head1 NAME
Bio::WebService::LANL::SequenceLocator - Locate sequences within HIV using LANL's web tool
=head1 SYNOPSIS
use Bio::WebService::LANL::SequenceLocator;
my $locator = Bio::WebService::LANL::SequenceLocator->new(
agent_string => 'Your Organization - you@example.com',
);
my @sequences = $locator->find([
"agcaatcagatggtcagccaaaattgccctatagtgcagaacatccaggggcaagtggtacatcaggccatatcacctagaactttaaatgca",
]);
See L</EXAMPLE RESULTS> below.
=head1 DESCRIPTION
This library provides simple programmatic access to
L<LANL's HIV sequence locator|http://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html>
web tool and is also used to power
L<a simple, JSON-based web API|https://indra.mullins.microbiol.washington.edu/locate-sequence/>
for the same tool (via L<Bio::WebService::LANL::SequenceLocator::Server>).
Nearly all of the information output by LANL's sequence locator is parsed and
provided by this library, though the results do vary slightly depending on the
base type of the query sequence. Multiple query sequences can be located at
the same time and results will be returned for all.
Results are extracted from both tab-delimited files provided by LANL as well as
the HTML itself.
=head1 EXAMPLE RESULTS
# Using @sequences from the SYNOPSIS above
use JSON;
print encode_json(\@sequences);
__END__
[
{
"query" : "sequence_1",
"query_sequence" : "AGCAATCAGATGGTCAGCCAAAATTGCCCTATAGTGCAGAACATCCAGGGGCAAGTGGTACATCAGGCCATATCACCTAGAACTTTAAATGCA",
"base_type" : "nucleotide",
"reverse_complement" : "0",
"alignment" : "\n Query AGCAATCAGA TGGTCAGCCA AAATTGCCCT ATAGTGCAGA ACATCCAGGG 50\n :::::::: ::::::::: ::::: :::: :::::::::: :::::::::: \n HXB2 AGCAATCA-- -GGTCAGCCA AAATTACCCT ATAGTGCAGA ACATCCAGGG 1208\n\n Query GCAAGTGGTA CATCAGGCCA TATCACCTAG AACTTTAAAT GCA 93\n :::: ::::: :::::::::: :::::::::: :::::::::: ::: \n HXB2 GCAAATGGTA CATCAGGCCA TATCACCTAG AACTTTAAAT GCA 1251\n\n ",
"hxb2_sequence" : "AGCAATCA---GGTCAGCCAAAATTACCCTATAGTGCAGAACATCCAGGGGCAAATGGTACATCAGGCCATATCACCTAGAACTTTAAATGCA",
"similarity_to_hxb2" : "94.6",
"start" : "373",
"end" : "462",
"genome_start" : "1162",
"genome_end" : "1251",
"polyprotein" : "Gag",
"region_names" : [
"Gag",
"p17",
"p24"
],
"regions" : [
{
"cds" : "Gag",
"aa_from_protein_start" : [ "125", "154" ],
"na_from_cds_start" : [ "373", "462" ],
"na_from_hxb2_start" : [ "1162", "1251" ],
"na_from_query_start" : [ "1", "93" ],
"protein_translation" : "SNQMVSQNCPIVQNIQGQVVHQAISPRTLNA"
},
{
"cds" : "p17",
"aa_from_protein_start" : [ "125", "132" ],
"na_from_cds_start" : [ "373", "396" ],
"na_from_hxb2_start" : [ "1162", "1185" ],
"na_from_query_start" : [ "1", "27" ],
"protein_translation" : "SNQMVSQNC"
},
{
"cds" : "p24",
"aa_from_protein_start" : [ "1", "22" ],
"na_from_cds_start" : [ "1", "66" ],
"na_from_hxb2_start" : [ "1186", "1251" ],
"na_from_query_start" : [ "28", "93" ],
"protein_translation" : "PIVQNIQGQVVHQAISPRTLNA"
}
]
}
]
=cut
package Bio::WebService::LANL::SequenceLocator;
use Moo;
use Data::Dumper;
use HTML::LinkExtor;
use HTML::TableExtract;
use HTML::TokeParser;
use HTTP::Request::Common;
use List::AllUtils qw< pairwise part min max >;
our $VERSION = 20170324;
=head1 METHODS
=head2 new
Returns a new instance of this class. An optional parameter C<agent_string>
should be provided to identify yourself to LANL out of politeness. See the
L</SYNOPSIS> for an example.
=cut
has agent_string => (
is => 'ro',
lazy => 1,
builder => sub { '' },
);
has agent => (
is => 'ro',
lazy => 1,
builder => sub {
require LWP::UserAgent;
my $self = shift;
my $agent = LWP::UserAgent->new(
agent => join(" ", __PACKAGE__ . "/$VERSION", $self->agent_string),
);
$agent->env_proxy;
return $agent;
},
);
has lanl_base => (
is => 'ro',
lazy => 1,
builder => sub { 'https://www.hiv.lanl.gov' },
);
has lanl_endpoint => (
is => 'ro',
lazy => 1,
builder => sub { shift->lanl_base . '/cgi-bin/LOCATE/locate.cgi' },
);
has _bogus_slug => (
is => 'ro',
default => sub { 'BOGUS_SEQ_SO_TABULAR_FILES_ARE_LINKED_IN_OUTPUT' },
);
sub _request {
my $self = shift;
my $req = shift;
my $response = $self->agent->request($req);
if (not $response->is_success) {
warn sprintf "Request failed: %s %s -> %s\n",
$req->method, $req->uri, $response->status_line;
return;
}
return $response->decoded_content;
}
=head2 find
Takes an array ref of sequence strings. Sequences may be in amino acids or
nucleotides and mixed freely. Sequences should not be in FASTA format.
If sequence bases are not clearly nucleotides or clearly amino acids, LANL
seems to default to nucleotides. This can be an issue for some sequences since
the full alphabet for nucleotides overlaps with the alphabet for amino acids.
To overcome this problem, you may specify C<< base => 'nucleotide' >>
or C<< base => 'amino acid' >> after the array ref of sequences. This forces
every sequence to be interpreted as nucleotides or amino acids, so you cannot
mix base types in your sequences if you use this option. C<n>, C<nuc>, and
C<nucleotides> are accepted aliases for C<nucleotide>. C<a>, C<aa>, C<amino>,
and C<amino acids> are accepted aliases for C<amino acid>.
Returns a list of hashrefs when called in list context, otherwise returns an
arrayref of hashrefs.
See L</EXAMPLE RESULTS> for the structure of the data returned.
=cut
sub find {
my ($self, $sequences, %args) = @_;
my $content = $self->submit_sequences($sequences, %args)
or return;
return $self->parse_html($content);
}
sub submit_sequences {
my ($self, $sequences, %args) = @_;
if (defined $args{base}) {
my $base = lc $args{base};
if ($base =~ /^n(uc(leotides?)?)?$/i) {
$args{base} = 1;
} elsif ($base =~ /^(a(mino( acids?)?)?|aa)$/i) {
$args{base} = 0;
} else {
warn "Unknown base type <$args{base}>, ignoring";
delete $args{base};
}
}
# Submit multiple sequences at once using FASTA
my $fasta = join "\n", map {
("> sequence_$_", $sequences->[$_ - 1])
} 1 .. @$sequences;
# LANL only presents the parseable table.txt we want if there's more
# than a single sequence. We always add it so we can reliably skip it.
$fasta .= "\n> " . $self->_bogus_slug . "\n";
return $self->_request(
POST $self->lanl_endpoint,
Content_Type => 'form-data',
Content => [
organism => 'HIV',
DoReverseComplement => 0,
seq_input => $fasta,
(defined $args{base}
? ( base => $args{base} )
: ()),
],
);
}
sub parse_html {
my ($self, $content) = @_;
# Fetch and parse the two tables provided as links which removes the need
# to parse all of the HTML.
my @results = $self->parse_tsv($content);
# Now parse the table data from the HTML
my @tables = $self->parse_tables($content);
# Extract the alignments, parsing the HTML a third time!
my @alignments = $self->parse_alignments($content);
unless (@results and @tables and @alignments) {
warn "Didn't find all three of TSV, tables, and alignments!\n";
warn "TSV: ", scalar @results, "\n";
warn "HTML tables: ", scalar @tables, "\n";
warn "HTML alignments: ", scalar @alignments, "\n";
warn "Content:\n$content\n", "=" x 80, "\n";
return;
}
unless (@results == @tables and @results == @alignments) {
warn "Tab-delimited results count doesn't match parsed HTML result count. Bug!\n";
warn "TSV: ", scalar @results, "\n";
warn "HTML tables: ", scalar @tables, "\n";
warn "HTML alignments: ", scalar @alignments, "\n";
warn "Content:\n$content\n", "=" x 80, "\n";
return;
}
@results = pairwise {
my $new = {
%$a,
base_type => $b->{base_type},
regions => $b->{rows},
region_names => [ map { $_->{cds} } @{$b->{rows}} ],
};
delete $new->{$_} for qw(protein protein_start protein_end);
$new;
} @results, @tables;
@results = pairwise { +{ %$b, %$a } } @results, @alignments;
# Fill in genome start/end for amino acid sequences
for my $r (@results) {
next unless $r->{base_type} eq 'amino acid';
if ($r->{genome_start} or $r->{genome_end}) {
warn "Amino acid sequence with genome start/end already?!",
" query <$r->{query_sequence}>";
next;
}
$r->{genome_start} = min map { $_->{na_from_hxb2_start}[0] } @{$r->{regions}};
$r->{genome_end} = max map { $_->{na_from_hxb2_start}[1] } @{$r->{regions}};
}
return wantarray ? @results : \@results;
}
sub parse_tsv {
my ($self, $content) = @_;
my @results;
my %urls;
my $extract = HTML::LinkExtor->new(
sub {
my ($tag, %attr) = @_;
return unless $tag eq 'a' and $attr{href};
return unless $attr{href} =~ m{/(table|simple_results)\.txt$};
$urls{$1} = $attr{href};
},
$self->lanl_base,
);
$extract->parse($content);
for my $table_name (qw(table simple_results)) {
next unless $urls{$table_name};
my $table = $self->_request(GET $urls{$table_name})
or next;
my (@these_results, %seen);
my @lines = split "\n", $table;
my @fields = map {
s/^SeqName$/query/; # standard key
s/(?<=[a-z])(?=[A-Z])/_/g; # undo CamelCase
s/ +/_/g; # no spaces
y/A-Z/a-z/; # normalize to lowercase
# Account for the same field twice in the same data table
if ($seen{$_}++) {
$_ = /^(start|end)$/
? "protein_$_"
: join "_", $_, $seen{$_};
}
$_;
} split "\t", shift @lines;
for (@lines) {
my @values = split "\t";
my %data;
@data{@fields} = @values;
next if $data{query} eq $self->_bogus_slug;
$data{query_sequence} =~ s/\s+//g
if $data{query_sequence};
push @these_results, \%data;
}
# Merge with existing results, if any
@results = @results
? pairwise { +{ %$a, %$b } } @results, @these_results
: @these_results;
}
return @results;
}
sub parse_tables {
my ($self, $content) = @_;
my @tables;
my %columns_for = (
'amino acid' => [
"CDS" => "cds",
"AA position relative to protein start in HXB2" => "aa_from_protein_start",
"AA position relative to query sequence start" => "aa_from_query_start",
"AA position relative to polyprotein start in HXB2" => "aa_from_polyprotein_start",
"NA position relative to CDS start in HXB2" => "aa_from_cds_start",
"NA position relative to HXB2 genome start" => "na_from_hxb2_start",
],
'nucleotide' => [
"CDS" => "cds",
"Nucleotide position relative to CDS start in HXB2" => "na_from_cds_start",
"Nucleotide position relative to query sequence start" => "na_from_query_start",
"Nucleotide position relative to HXB2 genome start" => "na_from_hxb2_start",
"Amino Acid position relative to protein start in HXB2" => "aa_from_protein_start",
],
);
for my $base_type (sort keys %columns_for) {
my ($their_cols, $our_cols) = part {
state $i = 0;
$i++ % 2
} @{ $columns_for{$base_type} };
my $extract = HTML::TableExtract->new( headers => $their_cols );
$extract->parse($content);
# Examine all matching tables
for my $table ($extract->tables) {
my %table = (
coords => [$table->coords],
base_type => $base_type,
columns => $our_cols,
rows => [],
);
for my $row ($table->rows) {
@$row = map { defined $_ ? s/^\s+|\s*$//gr : $_ } @$row;
# An empty row with only a sequence string in the first column.
if ( $row->[0]
and $row->[0] =~ /^[A-Za-z]+$/
and not grep { defined and length } @$row[1 .. scalar @$row - 1])
{
$table{rows}->[-1]{protein_translation} = $row->[0];
next;
}
# Not all rows are data, some are informational sentences.
next if grep { not defined } @$row;
my %row;
@row{@$our_cols} =
map { ($_ and $_ eq "NA") ? undef : $_ }
map { ($_ and /(\d+) → (\d+)/) ? [$1, $2] : $_ }
@$row;
push @{$table{rows}}, \%row;
}
push @tables, \%table
if @{$table{rows}};
}
}
# Sort by depth, then within each depth by count
@tables = sort {
$a->{coords}[0] <=> $b->{coords}[0]
or $a->{coords}[1] <=> $b->{coords}[1]
} @tables;
if (@tables > 1) {
unless ( $tables[-1]->{rows}[0]{na_from_query_start} eq "1 →"
and $tables[-1]->{rows}[0]{protein_translation} eq "X") {
warn "Last table appears to be real!? It should be the bogus table of the bogus sequence.";
warn "Table is ", Dumper($tables[-1]), "\n";
return;
} else {
pop @tables;
}
}
return @tables;
}
sub parse_alignments {
my ($self, $content) = @_;
my @alignments;
my $doc = HTML::TokeParser->new(
\$content,
unbroken_text => 1,
);
my $expect_alignment = 0;
while (my $tag = $doc->get_tag("b", "pre")) {
my $name = lc $tag->[0];
my $text = $doc->get_text;
next unless defined $text;
# <pre>s are preceeded by a bold header, which we use as an indicator
if ($name eq 'b') {
$expect_alignment = $text =~ /Alignment\s+of\s+the\s+query\s+sequence\s+to\s+HXB2/i;
} elsif ($name eq 'pre') {
if ($text =~ /^\s*Query\b/m and $text =~ /^\s*HXB2\b/m) {
push @alignments, $text;
warn "Not expecting alignment, but found one‽"
unless $expect_alignment;
}
elsif ($text =~ /^\s+$/ and $expect_alignment) {
push @alignments, undef; # We appear to have found an unaligned sequence.
}
$expect_alignment = 0;
}
}
if (defined $alignments[-1]) {
warn "Last alignment is non-null! It should be the empty alignment of the bogus sequence.";
warn "Alignment is <$alignments[-1]>\n";
return;
} else {
pop @alignments;
}
my @results;
for (@alignments) {
my @hxb2;
if (defined) {
push @hxb2, $1 =~ s/\s+//gr
while /^\s*HXB2\b\s+(.+?)(?:\s+\d+|\s*)$/gm;
}
push @results, {
alignment => $_,
hxb2_sequence => @hxb2 ? join("", @hxb2) : undef,
};
}
return @results;
}
=head1 AUTHOR
Thomas Sibley E<lt>trsibley@uw.eduE<gt>
=head1 COPYRIGHT
Copyright 2014 by the Mullins Lab, Department of Microbiology, University of
Washington.
=head1 LICENSE
Licensed under the same terms as Perl 5 itself.
=cut
42;