package Bio::ToolBox::db_helper::big;
# modules
require Exporter;
use strict;
use Carp;
use List::Util qw(min max sum);
use Bio::ToolBox::db_helper::constants;
use Bio::DB::Big;
our $VERSION = '1.66';
# Initialize CURL buffers
BEGIN {
# not clear if this should be done only once or if it's harmless to re-init
# for every new file, so I guess best to just do it here at the very beginning
# initialization is only for remote files
Bio::DB::Big->init();
}
# Exported names
our @ISA = qw(Exporter);
our @EXPORT = qw(
open_bigwig_db
collect_bigwig_score
collect_bigwig_scores
collect_bigwig_position_scores
open_bigbed_db
collect_bigbed_scores
collect_bigbed_position_scores
open_bigwigset_db
collect_bigwigset_score
collect_bigwigset_scores
collect_bigwigset_position_scores
sum_total_bigbed_features
);
# Hash of Bigfile chromosomes
my %BIG_CHROMOS;
# sometimes user may request a chromosome that's not in the bigfile
# that could lead to an exception
# we will record the chromosomes list in this hash
# $BIG_CHROMOS{bigfile}{altchromo} = chromo
# we also record the chromosome name variant with or without chr prefix
# to accommodate different naming conventions
# Hash of Bigfile chromosome lengths
my %BIG_CHROMOLENGTHS;
# since libBigWig doesn't internally clip chromosome lengths
# we will cache chromosome lengths and check them before it leads to an exception
# $BIG_CHROMOLENGTHS{bigfile}{chromo} = length
# Opened bigFile db objects
my %OPENED_BIG;
# a cache for opened Bigfile databases, primarily for collecting scores
# caching here is only for local purposes of collecting scores
# db_helper also provides caching of db objects but with option to force open in
# the case of forking processes - we don't have that here
# BigWigSet bigWig IDs
my %BIGWIGSET_WIGS;
# cache for the bigwigs from a BigWigSet used in a query
# we want to use low level bigWig access which isn't normally
# available from the high level BigWigSet, so we identify the
# bigWigs from the bigWigSet and cache them here
# The true statement
1;
#### BigWig Subroutines
sub open_bigwig_db {
my $path = shift;
my $bw = _open_big($path) or
croak " Unable to open bigWig file '$path'! $@\n";
unless ($bw->is_big_wig) {
croak " $path is not a bigWig file!\n";
}
return $bw; # we do not cache here
}
sub collect_bigwig_score {
# passed parameters as array ref
# chromosome, start, stop, strand, strandedness, method, db, dataset(s)
my $param = shift;
# this will use the built-in summary statistics
# the following methods are available: mean, min, max, std, cov
# adjust old method name for compatibility
$param->[METH] = 'std' if $param->[METH] eq 'stddev';
# check how many features we have
if (scalar @$param == 9) {
# only one bw, great!
my $bw = _get_bigwig($param->[DATA]);
my $chromo = $BIG_CHROMOS{$param->[DATA]}{$param->[CHR]} or return;
$param->[STRT] = $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo} if
$param->[STRT] > $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo};
$param->[STOP] = $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo} if
$param->[STOP] > $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo};
my $s = $bw->get_stats($chromo, $param->[STRT] - 1, $param->[STOP], 1,
$param->[METH]);
return $s->[0];
}
else {
# we have multiple bigwigs
my @scores;
for (my $d = DATA; $d < scalar @$param; $d++) {
my $bw = _get_bigwig($param->[$d]);
my $chromo = $BIG_CHROMOS{$param->[$d]}{$param->[CHR]} or next;
$param->[STRT] = $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo} if
$param->[STRT] > $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo};
$param->[STOP] = $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo} if
$param->[STOP] > $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo};
my $s = $bw->get_stats($chromo, $param->[STRT] - 1, $param->[STOP], 1,
$param->[METH]);
push @scores, $s->[0];
}
if ($param->[METH] eq 'min') {
return min(@scores);
}
elsif ($param->[METH] eq 'max') {
return max(@scores);
}
else {
confess sprintf " how did we get here? unable to calculate %s from multiple bigwigs!\n",
$param->[METH];
}
}
}
sub collect_bigwig_scores {
# passed parameters as array ref
# chromosome, start, stop, strand, strandedness, method, db, dataset(s)
my $param = shift;
# check how many features we have
if (scalar @$param == 9) {
# only one, great!
my $bw = _get_bigwig($param->[DATA]);
my $chromo = $BIG_CHROMOS{$param->[DATA]}{$param->[CHR]} or return;
$param->[STRT] = $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo} if
$param->[STRT] > $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo};
$param->[STOP] = $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo} if
$param->[STOP] > $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo};
my $raw_scores = $bw->get_values($chromo, $param->[STRT] - 1, $param->[STOP]);
my @scores = grep {defined} @$raw_scores;
return wantarray ? @scores : \@scores;
}
else {
# we have multiple bigwigs
my @scores;
for (my $d = DATA; $d < scalar @$param; $d++) {
my $bw = _get_bigwig($param->[$d]);
my $chromo = $BIG_CHROMOS{$param->[$d]}{$param->[CHR]} or next;
$param->[STRT] = $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo} if
$param->[STRT] > $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo};
$param->[STOP] = $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo} if
$param->[STOP] > $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo};
my $raw = $bw->get_values($chromo, $param->[STRT] - 1, $param->[STOP]);
push @scores, grep {defined} @$raw;
}
return wantarray ? @scores : \@scores;
}
}
sub collect_bigwig_position_scores {
# passed parameters as array ref
# chromosome, start, stop, strand, strandedness, method, db, dataset(s)
my $param = shift;
my %pos2score;
# check how many features we have
if (scalar @$param == 9) {
# only one, great!
my $bw = _get_bigwig($param->[DATA]);
my $chromo = $BIG_CHROMOS{$param->[DATA]}{$param->[CHR]} or return;
$param->[STRT] = $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo} if
$param->[STRT] > $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo};
$param->[STOP] = $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo} if
$param->[STOP] > $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo};
my $intervals = $bw->get_intervals($chromo, $param->[STRT] - 1, $param->[STOP]);
# record intervals into hash
foreach my $i (@$intervals) {
for (my $p = $i->{start} + 1; $p <= $i->{end}; $p++) {
$pos2score{$p} = $i->{value};
}
}
}
else {
# we have multiple bigwigs
my %duplicates; # hash of duplicate positions, position => number
# collect from each one
for (my $d = DATA; $d < scalar @$param; $d++) {
my $bw = _get_bigwig($param->[$d]);
my $chromo = $BIG_CHROMOS{$param->[$d]}{$param->[CHR]} or next;
$param->[STRT] = $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo} if
$param->[STRT] > $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo};
$param->[STOP] = $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo} if
$param->[STOP] > $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo};
my $intervals = $bw->get_intervals($chromo, $param->[STRT] - 1, $param->[STOP]);
# record intervals into hash
foreach my $i (@$intervals) {
for (my $p = $i->{start} + 1; $p <= $i->{end}; $p++) {
# check every position to see if it's a duplicate
if (exists $pos2score{$p} ) {
if (exists $duplicates{$p} ) {
# append an incrementing number at the end
$duplicates{$p} += 1; # increment first
my $new = sprintf("%d.%d", $p, $duplicates{$p});
$pos2score{$new} = $i->{value};
}
else {
# first time duplicate
my $new = $p . '.1';
$pos2score{$new} = $i->{value};
$duplicates{$p} = 1;
}
}
else {
$pos2score{$p} = $i->{value};
}
}
}
}
# check for duplicate positions - we may not have any
if (%duplicates) {
_remove_duplicate_positions(\%pos2score, \%duplicates);
}
}
return wantarray ? %pos2score : \%pos2score;
}
#### BigBed Subroutines
sub open_bigbed_db {
my $path = shift;
my $bb = _open_big($path) or
croak " Unable to open bigBed file '$path'! $@\n";
unless ($bb->is_big_bed) {
croak " $path is not a bigBed file!\n";
}
return $bb;
}
sub collect_bigbed_scores {
# passed parameters as array ref
# chromosome, start, stop, strand, strandedness, method, db, dataset
my $param = shift;
# look at each bedfile
# usually there is only one, but for stranded data there may be
# two bedfiles (+ and -), so we'll check each bed file for strand info
my @scores;
for (my $d = DATA; $d < scalar @$param; $d++) {
# open the bedfile
my $bb = _get_bigbed($param->[$d]);
# first check chromosome is present
my $chromo = $BIG_CHROMOS{$param->[$d]}{$param->[CHR]} or next;
$param->[STRT] = $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo} if
$param->[STRT] > $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo};
$param->[STOP] = $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo} if
$param->[STOP] > $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo};
# collect the features overlapping the region
# we are using the high level API rather than the low-level
# since we getting the individual scores from each bed element
my $bb_stream = Bio::ToolBox::db_helper::big::BedIteratorWrapper->new(
$bb, $chromo, $param->[STRT], $param->[STOP]);
# process each feature
while (my $bed = $bb_stream->next_seq) {
# First check whether the strand is acceptable
if (
$param->[STND] eq 'all' # all data is requested
or $bed->strand == 0 # unstranded data
or (
# sense data
$param->[STR] == $bed->strand
and $param->[STND] eq 'sense'
)
or (
# antisense data
$param->[STR] != $bed->strand
and $param->[STND] eq 'antisense'
)
) {
# we have acceptable data to collect
# store the appropriate datapoint
if ($param->[METH] eq 'count') {
push @scores, 1;
}
elsif ($param->[METH] eq 'pcount') {
push @scores, 1 if ($bed->start >= $param->[STRT] and
$bed->end <= $param->[STOP]);
}
elsif ($param->[METH] eq 'ncount') {
push @scores, $bed->display_name || $bed->primary_id;
}
else {
push @scores, $bed->score;
}
}
}
}
# return collected data
return wantarray ? @scores : \@scores;
}
sub collect_bigbed_position_scores {
# passed parameters as array ref
# chromosome, start, stop, strand, strandedness, method, db, dataset
my $param = shift;
# look at each bedfile
# usually there is only one, but there may be more
my %pos2data;
for (my $i = DATA; $i < scalar @$param; $i++) {
# open the bedfile
my $bb = _get_bigbed($param->[$i]);
# first check that the chromosome is present
my $chromo = $BIG_CHROMOS{$param->[$i]}{$param->[CHR]} or next;
$param->[STRT] = $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo} if
$param->[STRT] > $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo};
$param->[STOP] = $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo} if
$param->[STOP] > $BIG_CHROMOLENGTHS{$param->[DATA]}{$chromo};
# collect the features overlapping the region
my $bb_stream = Bio::ToolBox::db_helper::big::BedIteratorWrapper->new(
$bb, $chromo, $param->[STRT], $param->[STOP]);
# process each feature
while (my $bed = $bb_stream->next_seq) {
# First check whether the strand is acceptable
if (
$param->[STND] eq 'all' # all data is requested
or $bed->strand == 0 # unstranded data
or (
# sense data
$param->[STR] == $bed->strand
and $param->[STND] eq 'sense'
)
or (
# antisense data
$param->[STR] != $bed->strand
and $param->[STND] eq 'antisense'
)
) {
# we have acceptable data to collect
# determine position to record
my $position;
if ($bed->start == $bed->end) {
# just one position recorded
$position = $bed->start;
}
else {
# calculate the midpoint
$position = int(
( ($bed->start + $bed->end) / 2) + 0.5
);
}
# check the position
next unless (
# want to avoid those whose midpoint are not technically
# within the region of interest
$position >= $param->[STRT] and $position <= $param->[STOP]
);
# store the appropriate datapoint
# for score and length, we're putting these into an array
if ($param->[METH] eq 'count') {
$pos2data{$position} += 1;
}
elsif ($param->[METH] eq 'pcount') {
$pos2data{$position} += 1 if
($bed->start <= $param->[STRT] and $bed->end <= $param->[STOP]);
}
elsif ($param->[METH] eq 'ncount') {
$pos2data{$position} ||= [];
push @{ $pos2data{$position} }, $bed->display_name ||
$bed->primary_id;
}
else {
# everything else we just take the score
push @{ $pos2data{$position} }, $bed->score + 0;
}
}
}
}
# combine multiple datapoints at the same position
if ($param->[METH] eq 'ncount') {
foreach my $position (keys %pos2data) {
my %name2count;
foreach (@{$pos2data{$position}}) { $name2count{$_} += 1 }
$pos2data{$position} = scalar(keys %name2count);
}
}
elsif ($param->[METH] eq 'count' or $param->[METH] eq 'pcount') {
# do nothing, these aren't arrays
}
elsif ($param->[METH] eq 'mean') {
foreach my $position (keys %pos2data) {
$pos2data{$position} = sum( @{$pos2data{$position}} ) /
scalar( @{$pos2data{$position}} );
}
}
elsif ($param->[METH] eq 'median') {
foreach my $position (keys %pos2data) {
$pos2data{$position} = median( @{$pos2data{$position}} );
}
}
elsif ($param->[METH] eq 'min') {
foreach my $position (keys %pos2data) {
$pos2data{$position} = min( @{$pos2data{$position}} );
}
}
elsif ($param->[METH] eq 'max') {
foreach my $position (keys %pos2data) {
$pos2data{$position} = max( @{$pos2data{$position}} );
}
}
elsif ($param->[METH] eq 'sum') {
foreach my $position (keys %pos2data) {
$pos2data{$position} = sum( @{$pos2data{$position}} );
}
}
else {
# just take the mean for everything else
foreach my $position (keys %pos2data) {
$pos2data{$position} = sum( @{$pos2data{$position}} ) /
scalar( @{$pos2data{$position}} );
}
}
# return collected data
return wantarray ? %pos2data : \%pos2data;
}
sub sum_total_bigbed_features {
# there is no easy way to do this with this adapter, except to literally
# walk through the entire file.
# well, we do this with bam files, I guess we could do the same here
# honestly, who uses this????? it's legacy. skip for now until someone complains
return undef;
}
#### BigWigSet Subroutines
sub open_bigwigset_db {
my $path = shift;
return Bio::ToolBox::db_helper::big::BigWigSet->new($path);
}
sub collect_bigwigset_score {
# passed parameters as array ref
# chromosome, start, stop, strand, strandedness, method, db, dataset
my $param = shift;
# lookup the bigWig files based on the parameters
my $ids = _lookup_bigwigset_wigs($param);
return unless scalar(@$ids) > 0;
croak("multiple selected bigWig files from a BigWigSet is not supported with single score method")
if scalar(@$ids) > 1;
push @$param, @$ids;
# use the low level single bigWig API
return collect_bigwig_score($param);
}
sub collect_bigwigset_scores {
# passed parameters as array ref
# chromosome, start, stop, strand, strandedness, method, db, dataset
my $param = shift;
# lookup the bigWig files based on the parameters
my $ids = _lookup_bigwigset_wigs($param);
return unless scalar(@$ids) > 0;
push @$param, @$ids;
# use the low level single bigWig API
return collect_bigwig_scores($param);
}
sub collect_bigwigset_position_scores {
# passed parameters as array ref
# chromosome, start, stop, strand, strandedness, method, db, dataset
my $param = shift;
# lookup the bigWig files based on the parameters
my $ids = _lookup_bigwigset_wigs($param);
return unless scalar(@$ids) > 0;
push @$param, @$ids;
# use the low level single bigWig API
return collect_bigwig_position_scores($param);
}
#### Internal
sub _open_big {
my $path = shift;
$path =~ s/^file://; # clean up file prefix if present
my $big;
eval {
$big = Bio::DB::Big->open($path);
};
return $big if $big;
return;
}
sub _get_bigwig {
my $file = shift;
return $OPENED_BIG{$file} if exists $OPENED_BIG{$file};
# open and cache the bigFile object
my $bw = _open_big($file) or
croak " Unable to open big file '$file'! $@\n";
unless ($bw->is_big_wig) {
croak " $file is not a bigWig file!\n";
}
$OPENED_BIG{$file} = $bw;
_record_seqids($file, $bw);
return $bw;
}
sub _get_bigbed {
my $file = shift;
return $OPENED_BIG{$file} if exists $OPENED_BIG{$file};
# open and cache the bigFile object
my $bb = _open_big($file) or
croak " Unable to open big file '$file'! $@\n";
unless ($bb->is_big_bed) {
croak " $file is not a bigBed file!\n";
}
$OPENED_BIG{$file} = $bb;
_record_seqids($file, $bb);
return $bb;
}
sub _record_seqids {
my ($file, $big) = @_;
$BIG_CHROMOS{$file} = {};
my $chroms = $big->chroms(); # returns hash seq_id => length
foreach my $c (keys %$chroms) {
my $chr = $chroms->{$c}{name};
my $len = $chroms->{$c}{length};
# store length
$BIG_CHROMOLENGTHS{$file}{$chr} = $len;
$BIG_CHROMOS{$file}{$chr} = $chr; # itself
# now store alternate names, with or without chr prefix
if ($chr =~ /^chr(.+)$/i) {
$BIG_CHROMOS{$file}{$1} = $chr;
}
else {
$BIG_CHROMOS{$file}{"chr$chr"} = $chr;
}
}
}
sub _remove_duplicate_positions {
# collect the feature and hashes
my ($pos2data, $duplicates) = @_;
# remove the duplicates
# we will combine all of them with a simple mean, what else to do?
foreach my $pos (keys %{ $duplicates } ) {
my $num = $duplicates->{$pos};
# collect all the values
my @values;
push @values, $pos2data->{$pos}; # initial value
for my $i (1..$num) {
push @values, $pos2data->{ "$pos\.$i" };
delete $pos2data->{ "$pos\.$i" };
}
$pos2data->{$pos} = sum(@values) / scalar(@values);
}
}
sub _lookup_bigwigset_wigs {
# passed parameters as array ref
# chromosome, start, stop, strand, strandedness, method, db, dataset
my $param = shift;
# the datasets, could be either types or names, unfortunately
my @types = splice(@$param, DATA);
# we cache the list of looked up bigwigs to avoid doing this over and over
my $lookup = sprintf("%s_%s_%s", join('_', @types), $param->[STND],
$param->[STR]);
return $BIGWIGSET_WIGS{$lookup} if exists $BIGWIGSET_WIGS{$lookup};
# filter first by the name or type
my @ids = $param->[DB]->filter_bigwigs(@types);
# then check for strand if necessary
if ($param->[STND] ne 'all' and $param->[STR] != 0) {
# looks like we are collecting stranded data
# try to filter again based on strand attribute
my $strand_to_keep;
if ($param->[STND] eq 'sense') {
$strand_to_keep = $param->[STR];
}
elsif ($param->[STND] eq 'antisense') {
$strand_to_keep = $param->[STR] == -1 ? 1 :
$param->[STR] == 1 ? -1 : 0;
}
else {
confess sprintf "bad strandedness value: %s", $param->[STND];
}
@ids = $param->[DB]->filter_bigwigs_by_strand($strand_to_keep, @ids);
}
# map the names back to full bigwig paths
my @paths = map { $param->[DB]->get_bigwig_path($_) } @ids;
# cache and return
$BIGWIGSET_WIGS{$lookup} = \@paths;
return \@paths;
}
package Bio::ToolBox::db_helper::big::BigWigSet;
# this package borrows concepts from Bio::DB::BigWigSet by Lincoln Stein
# in order to make it compatible with Bio::DB::Big
# it is NOT a drop-in replacement or equivalent
# if you need more functionality, install Bio::DB::BigWigSet
use Carp;
use IO::Dir;
use IO::File;
use File::Spec;
sub new {
my ($class, $dir) = @_;
croak "must call method new with a directory path!" unless $dir;
croak "BigWigSet '$dir' is not a directory path!" unless -d $dir;
my $self = {
dir => $dir,
bwfiles => {},
metadata => {}
};
# read directory
my $D = IO::Dir->new($dir) or croak "unable to open $dir! $!";
while (my $f = $D->read) {
my $f_path = File::Spec->catfile($dir, $f);
next unless -f $f_path;
if ($f =~ /\.(?:bw|bigwig)$/i) {
# bigwig file
$self->{bwfiles}{$f} = $f_path;
}
elsif ($f =~ /^meta.*\.txt$/i) {
# a genuine metadata file - excellent!
# let's open it and parse the contents
my $fh = IO::File->new($f_path) or
croak "unable to read metadata file '$f_path'! $!";
my $current_bw; # the current bigwig file we're describing
while (my $line = $fh->getline) {
next if $line =~ /^#/;
next if $line !~ /\w+/;
chomp $line;
if ($line =~ /\[(.+\.(?:bw|bigwig))\]/i) {
# start of a new metadata block for the current bw file
$current_bw = $1;
next;
}
elsif ($line =~ /^([\w\-\.]+)\s*=\s*([\w\-\.]+)/) {
# a metadata line
croak "malformed metadata file! no bw file stanza header before metadata line!"
unless defined $current_bw;
$self->{metadata}{$current_bw}{$1} = $2;
}
# ignore everything else
}
$fh->close;
}
# ignore all other files
}
undef $D;
# fill out the metadata
foreach my $f (keys %{$self->{bwfiles}}) {
# a genuine metadata file isn't absolutely required
# therefore we will always put in our own metadata gleaned from file name
# do a regex to pull out basename and possibly strand information from filename
my $name = $f;
$name =~ s/\.(?:bw|bigwig)$//i; # remove extension for the name
my ($type, $strand);
if ($f =~ /^(.+)[\._](?:f|for|forward|plus|\+)\.(?:bw|bigwig)$/i) {
$type = $1;
$strand = 1;
}
elsif ($f =~ /^(.+)[\._](?:r|rev|reverse|minus|\-)\.(?:bw|bigwig)$/i) {
$type = $1;
$strand = -1;
}
else {
$type = $name;
$strand = 0;
}
# be careful and do not overwrite pre_existing key value
$self->{metadata}{$f}{name} ||= $name;
$self->{metadata}{$f}{type} ||= $type;
$self->{metadata}{$f}{strand} ||= $strand;
}
bless $self, $class;
return $self;
}
sub bigwig_names {
my $self = shift;
my @b = keys %{ $self->{bwfiles} };
return wantarray ? @b : \@b;
}
sub bigwigs {
# to maintain compatiblity with the old BigWigSet, we will return the full path
# of all the bigwig files
my $self = shift;
my @b = values %{ $self->{bwfiles} };
return wantarray ? @b : \@b;
}
sub get_bigwig {
my ($self, $bwfile) = @_;
return unless $bwfile;
my $path = $self->get_bigwig_path($bwfile);
if ($path) {
return Bio::ToolBox::db_helper::big::open_bigwig_db($path);
}
else {
carp "unrecognized bigWig file name '$bwfile'!\n";
return;
}
}
sub get_bigwig_path {
my ($self, $bwfile) = @_;
return unless $bwfile;
return $self->{bwfiles}{$bwfile} || undef;
}
sub metadata {
my $self = shift;
return $self->{metadata};
}
sub filter_bigwigs {
my ($self, @names) = @_;
# we start with all of the bigwigs available, then filter out
my $start_list = $self->bigwig_names;
my $md = $self->metadata;
my @filtered;
foreach my $name (@names) {
# there may be more than one name of a dataset passed
# in all likelihood, probably not, but just in case....
# we are filtering on basically any value that matches and are not really
# restricting on a specific attribute key
# so type, name, display_name, or primary_tag are all checked in that order
# let's hope this is adequate and won't create too many problems
# I'm betting on the fact that bigwigset databases are so obscure that
# most users won't even bother with a genuine metadata file
# in fact, why am I even bothering with this at all?
# because they are a cool concept and I occasionally use them. huh.
foreach my $b (@$start_list) {
if (exists $md->{$b}{type}) {
push @filtered, $b if $name eq $md->{$b}{type};
}
elsif (exists $md->{$b}{name}) {
push @filtered, $b if $name eq $md->{$b}{name};
}
elsif (exists $md->{$b}{display_name}) {
push @filtered, $b if $name eq $md->{$b}{display_name};
}
elsif (exists $md->{$b}{primary_tag}) {
push @filtered, $b if $name eq $md->{$b}{primary_tag};
}
}
}
return wantarray ? @filtered : \@filtered;
}
sub filter_bigwigs_by_strand {
my $self = shift;
my $strand = shift;
# check what files were passed
my @files;
if (scalar(@_) == 1 and ref($files[0]) eq 'ARRAY') {
@files = @$_[0];
}
elsif (scalar(@_) == 0) {
@files = $self->bigwig_names;
}
else {
@files = @_;
}
# filter the bigWigs based on strand
# we keep anything that matches the given strand
# because the old BigWigSet adapter didn't really handle strand properly,
# and certainly not standard BioPerl 1,0,-1 values, I had adopted plus, none, minus
# now I'm doomed as I have to support this too. ugh.
my @keepers;
my $md = $self->metadata;
if ($strand >= 0) {
foreach my $f (@files) {
my $str = $md->{$f}{strand} || 0;
if ($str =~ /[a-z]/) {
# ugh old school
$str = $str eq 'plus' ? 1 :
$str eq 'minus' ? -1 :
$str eq 'none' ? 0 : 0;
}
push @keepers, $f if $str >= 0;
}
}
elsif ($strand < 0) {
foreach my $f (@files) {
my $str = $md->{$f}{strand} || 0;
if ($str =~ /[a-z]/) {
# ugh old school
$str = $str eq 'plus' ? 1 :
$str eq 'minus' ? -1 :
$str eq 'none' ? 0 : 0;
}
push @keepers, $f if $str < 0;
}
}
return @keepers;
}
package Bio::ToolBox::db_helper::big::BedIteratorWrapper;
use Carp;
use Bio::ToolBox::SeqFeature;
sub new {
my $class = shift;
# check the bigBed object
my $bb = shift;
unless (ref($bb) eq 'Bio::DB::Big::File' and $bb->is_big_bed) {
confess "passed big object is not a bigBed file!";
}
# get coordinates
my ($seqid, $start, $end) = @_;
confess "no coordinates!" unless (defined $seqid and defined $start and defined $end);
$start -= 1; # compensate for 0-based coordinates
# create an iterator
# include all string entries and just one per iteration
# return 10 at a time, not too many, not too little
my $iterator = $bb->get_entries_iterator($seqid, $start, $end, 1, 10);
# return object wrapper
my $self = {
bb => $bb,
iter => $iterator,
seqid => $seqid,
start => $start,
end => $end,
entries => [],
};
return bless $self, $class;
}
sub next_seq {
my $self = shift;
my $entry = shift @{ $self->{entries} } || undef;
unless ($entry) {
$self->{entries} = $self->{iter}->next || undef;
return unless $self->{entries};
$entry = shift @{ $self->{entries} } || undef;
}
return unless $entry;
my @bits = split('\t', $entry->{string});
return Bio::ToolBox::SeqFeature->new(
-seq_id => $self->{seqid},
-start => $entry->{start} + 1, # compensate for 0-based coordinates
-end => $entry->{end},
-display_name => $bits[0] || undef,
-score => $bits[1] || 1,
-strand => $bits[2] || 0,
);
}
__END__
=head1 NAME
Bio::ToolBox::db_helper::big
=head1 DESCRIPTION
This module provides support for binary BigWig and BigBed files to
the L<Bio::ToolBox> package. It also provides minimal support for a
directory of one or more bigWig files as a combined database, known as a
BigWigSet.
=head1 USAGE
The module requires L<Bio::DB::Big> to be installed, which in turn
requires the L<libBigWig | https://github.com/dpryan79/libBigWig> C
library to be installed. This provides a simpler and easier-to-install
library compared to the UCSC Kent C libraries.
In general, this module should not be used directly. Use the methods
available in L<Bio::ToolBox::Data> or L<Bio::ToolBox::db_helper>.
All subroutines are exported by default.
=head2 Available subroutines
=over
=item collect_bigwig_score()
This subroutine will collect a single value from a binary bigWig file.
It uses the low-level summary method to collect the statistical
information and is therefore significantly faster than the other
methods, which rely upon parsing individual data points across the
region.
The subroutine is passed a parameter array reference. See below for details.
The object will return either a valid score or a null value.
=item collect_bigwigset_score()
Similar to collect_bigwig_score() but using a BigWigSet database of
BigWig files. Unlike individual BigWig files, BigWigSet features support
stranded data collection if a strand attribute is defined in the metadata
file.
The subroutine is passed a parameter array reference. See below for details.
=item collect_bigwig_scores()
This subroutine will collect only the score values from a binary BigWig file
for the specified database region. The positional information of the
scores is not retained.
The subroutine is passed a parameter array reference. See below for details.
The subroutine returns an array or array reference of the requested dataset
values found within the region of interest.
=item collect_bigwigset_scores()
Similar to collect_bigwig_scores() but using a BigWigSet database of
BigWig files. Unlike individual BigWig files, BigWigSet features support
stranded data collection if a strand attribute is defined in the metadata
file.
The subroutine is passed a parameter array reference. See below for details.
=item collect_bigwig_position_scores()
This subroutine will collect the score values from a binary BigWig file
for the specified database region keyed by position.
The subroutine is passed a parameter array reference. See below for details.
The subroutine returns a hash of the defined dataset values found within
the region of interest keyed by position. Note that only one value is
returned per position, regardless of the number of dataset features
passed. Usually this isn't a problem as only one dataset is examined at a
time.
=item collect_bigwigset_position_scores()
Similar to collect_bigwig_position_scores() but using a BigWigSet database
of BigWig files. Unlike individual BigWig files, BigWigSet features support
stranded data collection if a strand attribute is defined in the metadata
file.
The subroutine is passed a parameter array reference. See below for details.
=item open_bigwig_db()
This subroutine will open a BigWig database connection. Pass either the
local path to a bigWig file (.bw extension) or the URL of a remote bigWig
file. It will return the opened database object.
=item open_bigwigset_db()
This subroutine will open a BigWigSet database connection using a directory
of BigWig files and one metadata index file, as described in
Bio::DB::BigWigSet. Essentially, this treats a directory of BigWig files as
a single database with each BigWig file representing a different feature
with unique attributes (type, source, strand, etc).
Pass the subroutine a scalar value representing the local path to the
directory. It presumes a feature_type of 'region', as expected by the other
Bio::ToolBox::db_helper subroutines and modules. It will return the opened database
object.
=back
=head2 Data Collection Parameters Reference
The data collection subroutines are passed an array reference of parameters.
The recommended method for data collection is to use get_segment_score() method from
L<Bio::ToolBox::db_helper>.
The parameters array reference includes these items:
=over 4
=item 1. The chromosome or seq_id
=item 1. The start position of the segment to collect
=item 3. The stop or end position of the segment to collect
=item 4. The strand of the segment to collect
Should be standard BioPerl representation: -1, 0, or 1.
=item 5. The strandedness of the data to collect
A scalar value representing the desired strandedness of the data
to be collected. Acceptable values include "sense", "antisense",
or "all". Only those scores which match the indicated
strandedness are collected.
=item 6. The method for combining scores.
Acceptable values include mean, min, and max when collecting
single score over a genomic segment. This uses the built-in
statistic zoom levels of the bigWig.
=item 7. A database object.
Pass the opened L<Bio::DB::BigWigSet> database object when working
with BigWigSets.
=item 8 and higher. BigWig file names or BigWigSet database types.
Opened BigWig objects are cached. Both local and remote BigWig files
are supported.
=back
=head1 SUPPORT MODULES
This includes two additional object-oriented modules for supporting
BigWigSets and bigBed SeqFeature iteration.
=head2 Bio::ToolBox::db_helper::big::BigWigSet
This provides support for a BigWigSet, which is not natively supported
by the L<Bio::DB::Big> adapter, and is based on the concepts from the
L<Bio::DB::BigWigSet> adapter. However, it is B<NOT> a drop-in replacement,
only a few methods are provided, and only a few of these are similar to
the original adapter.
This adapter will still read a C<INI>-style F<metadata.txt>
file as described in L<Bio::DB::BigWigSet> for metadata. Briefly, this
file format is similar to below
[file1.bw]
name = mydata
type = ChIPSeq
[file2.bw]
name = mydata2
type = ChIPSeq
Each bigWig file in the directory should have a stanza entry with the
path and file name in the stanza header in square brackets. Metadata
is included as simple key = value pairs, where keys can be typical
SeqFeature attributes, including C<display_name> or C<name>,
C<primary_tag> or C<type>, and C<strand>.
B<NOTE:> Metadata text files are ideal, but not required. If a
metadata file is not present, appropriate metadata will be determined
from the bigWig file names, using the basename as the metadata C<name>
and possibly extracting the strand from the end of the filename, if
it ends in a F<_f> or F<_r>.
The following methods are available.
=over 4
=item new
Generate a new BigWigSet object. The path of the directory must be
passed as an argument. The contents of the directory will be read,
bigWig files located, metadata files (if any) read and processed.
Fasta sequences are not supported.
=item bigwig_names
Returns an array or array reference of the bigWig file names. These
are just the file names, without the path.
=item bigwigs
Returns an array or array reference of the full path for the bigWig
files. This is identical to the L<Bio::DB::BigWigSet> method.
=item get_bigwig
Given a bigwig name, this will return an opened bigwig database
L<Bio::DB::Big> object. This is identical to the L<Bio::DB::BigWigSet>
method.
=item get_bigwig_path
Given a bigwig name, this will return the full path to the corresponding
bigWig file.
=item metadata
This will return a hash reference pointing to the metadata hash structure,
the keys of which are bigWig names, and the values are hash references
for the metadata key = value metadata pairs. This is identical to the
L<Bio::DB::BigWigSet> method.
=item filter_bigwigs
Provide one or more names, primary tags, or types to filter the bigWig
files in the Set. For the purposes of this simple method, no distinction is
made whether the filtering criteria is a C<display_name>, C<primary_tag>,
or C<type>. The provided text strings will be used to search all the
metadata values, and the names of files with exact matches are returned.
For purposes of filtering, the following metadata keys are searched in the
following order: C<type>, C<name>, C<display_name>, and C<primary_tag>, and
the first match is kept.
=item filter_bigwigs_by_strand
Pass first the strand, and then optionally a list of bigWig names,
perhaps the results from filter_bigwigs(). If no names were passed, all
the names in the BigWigSet will be considered. The names of files whose
strand matches the given strand will be returned.
=back
=head2 Bio::ToolBox::db_helper::big::BedIteratorWrapper
This is an object wrapper around a bigBed database for retrieving items
and returning them as convenient SeqFeature objects.
Only the first 3 to 6 standard BED columns are supported: seq_id,
start, stop, name, score, and strand.
The following methods are provided.
=over 4
=item new
Pass the new method the following items: opened L<Bio::DB::Big> bigBed object,
chromosome, start, and end coordinates.
=item next_seq
This will return the next available feature in the established search interval
as a L<Bio::ToolBox::SeqFeature> object. The method name is consistent with
other L<Bio::Perl> compatible objects and iterators. Yeah, it sucks, and not
very apropos to the actual function. Oh well.
=back
=head1 AUTHOR
Timothy J. Parnell, PhD
Dept of Oncological Sciences
Huntsman Cancer Institute
University of Utah
Salt Lake City, UT, 84112
This package is free software; you can redistribute it and/or modify
it under the terms of the Artistic License 2.0.