package Statistics::Running;
use 5.006;
use strict;
use warnings;
use Data::Dumper;
our $VERSION = '0.13';
# overload these operators to have special meaning when
# operand(s) are Statistics::Running:
use overload
# add two stats object and adjust summed mean,stdev etc.
'+' => \&concatenate,
# check if two stats objects are same wrt mean,stdev,N BUT NOT histogram
'==' => \&equals,
# convert a stats object into a string, e.g. print $obj."\n";
'""' => \&stringify,
;
use Try::Tiny;
use Statistics::Histogram;
# this is for all numerical equality comparisons
use constant SMALL_NUMBER_FOR_EQUALITY => 1E-10;
# creates an obj. There are no input params
sub new {
my $class = $_[0];
my $parent = ( caller(1) )[3] || "N/A";
my $whoami = ( caller(0) )[3];
my $self = {
# these are internal variables to store mean etc. or used to calculate Kurtosis
'M1' => 0.0,
'M2' => 0.0,
'M3' => 0.0,
'M4' => 0.0,
'ABS-SUM' => 0.0,
'SUM' => 0.0,
'MIN' => 0.0,
'MAX' => 0.0,
'N' => 0, # number of data items inserted
# this histogram is updated each time a new data point is pushed in the object
# it just holds the number of items in each bin, so it is not too expensive.
# with this we get an idea of the Probability Distribution of the pushed data.
# Which may or may not be useful to users.
# Should you want to avoid this then use Statistics::Running::Tiny
'histo' => {
'num-bins' => -1,
'bins' => {
# b: [histo-left-boundary, bin1_right_boundary, bin2_right_boundary, ... binN-1_right_boundary, histo-right-boundary]
'b' => [], # length is 'num-bins'+1
# c: contains the counts, its size is equal to the number of bins
# the first cell contains counts in the interval [histo-left-boundary, bin1_right_boundary]
# the last cell contains counts of [binN-1_right_boundary, histo-right-boundary]
'c' => [], # length 'num-bins'
},
# cached stringified histogram, it is re-calculated only if data points added
# and asked to print histogram
'stringified' => undef,
# when asked to stringify a hist we actually use a cached string
# which needs to be recalculated whenever data is added or hist re-created
'needs-recalculate' => 1,
},
};
bless($self, $class);
$self->clear();
return $self
}
# returns the histogram bins (can be empty) in our internal format
sub histogram { return $_[0]->{'histo'} }
# if no params, it returns our bins as a hash
# otherwise it imports input bins in the form of a hash
# and before that it erases previous histogram and forms it according to input, e.g.
# sets bin-widths and numbins etc.
sub histogram_bins_hash {
my $self = $_[0];
my $bins = $_[1];
if( ! defined($bins) ){
# export to a hash
return $self->_bins2hash()
}
# import from a hash
$self->_hash2bins($bins);
}
# if no params, it returns our bins as a hash of the form returned by Statistics::Descriptive::frequency_distribution()
# otherwise it imports input bins in the form of a hash in the form returned by Statistics::Descriptive::frequency_distribution()
sub histogram_bins_stathash {
my $self = $_[0];
my $bi = $_[1];
if( ! defined($bi) ){
# export to a hash
return $self->_bins2stathash()
}
# import from a hash
$self->_stathash2bins($bi);
}
# return a string showing this histogram by calling Statistics::Histogram::print_histogram()
# we first convert our hist to stathash format
sub histogram_stringify {
my ($self, @opts) = @_;
if( $self->{'histo'}->{'needs-recalculate'} == 1 ){ $self->_histogram_recalculate(@opts) }
return $self->{'histo'}->{'stringified'}
}
# we need to recalculate each time a new data is added.
# but we do recalculate whenever it is needed, i.e. when we asked to print histogram
sub _histogram_recalculate {
my ($self, @stringify_opts) = @_;
my $histstr = "<no-bins>";
if( $self->{'histo'}->{'num-bins'} > 0 ){
Try::Tiny::try {
$histstr = Statistics::Histogram::print_histogram(
'hist' => $self->_bins2stathash(),
'x_min' => $self->{'histo'}->{'bins'}->{'b'}->[0],
use_linear_axes => 1,
@stringify_opts
)
} Try::Tiny::catch {
print STDERR "_histogram_recalculate() : error caught trying to stringify: $_\n";
$histstr = "<no-bins>";
};
}
$self->{'histo'}->{'stringified'} = $histstr;
$self->{'histo'}->{'needs-recalculate'} = 0;
}
# disable histogram logging, all existing histogram data is erased
sub histogram_disable {
my $self = $_[0];
$self->{'histo'}->{'num-bins'} = -1;
$self->{'histo'}->{'bins'}->{'b'} = [];
$self->{'histo'}->{'bins'}->{'c'} = [];
$self->{'histo'}->{'needs-recalculate'} = 1;
}
# returns the count in bin specified as 1st input param
sub histogram_count { return $_[0]->{'histo'}->{'c'}->[$_[1]] }
# enables histogram logging
# it expects some parameters for creating the histogram in various forms,
# e.g. by specifying the number of bins, bin-width and left boundary or
# by specifying a HASH or ARRAY of bin specifications for non-uniform bin
# sizes. HASH must be of the form 'FROM:TO'->counts
# ARRAY of bin boundaries of the form
# [histo-left-boundary, bin1_right_boundary, bin2_right_boundary, ... binN-1_right_boundary, histo-right-boundary]
# the number of bins is 1 less than the length of this array
sub histogram_enable {
my $self = $_[0];
my $params = $_[1]; # $_[1] // {} does not work for perl<5.10, ? : requests $_[1] twice, so Cish if( ! defined below...
if( ! defined($params) ){ $params = {} }
my ($m1, $m2, $m3);
if( defined($m1=$params->{'bins'}) ){
my $aref = ref($m1);
if( $aref eq 'ARRAY' ){
# an array of bin boundaries of the form
# [histo-left-boundary, bin1_right_boundary, bin2_right_boundary, ... binN-1_right_boundary, histo-right-boundary]
# the number of bins is 1 less than the length of this array
my @mm = @$m1;
$self->{'histo'}->{'num-bins'} = scalar(@mm)-1;
$self->{'histo'}->{'bins'}->{'b'} = [@mm];
$self->{'histo'}->{'bins'}->{'c'} = (0) x $self->{'histo'}->{'num-bins'};
} elsif( $aref eq 'HASH' ){
# a hashref keyed on bin-intervals in the form FROM:TO->counts
$self->_hash2bins($m1);
} else { die "parameter 'bins' expects either a HASHREF keyed on bin-intervals in the form FROM:TO->counts (and counts can be non-zero if that is a previous histogram), or an ARRAYREF with bin boundaries of the form [histo-left-boundary, bin1_right_boundary, bin2_right_boundary, ... binN-1_right_boundary, histo-right-boundary]. In this case the number of bins is 1 less than the length of the array." }
} elsif( defined($m1=$params->{'bin-width'})
&& defined($m2=$params->{'num-bins'})
&& defined($m3=$params->{'left-boundary'})
){
# we re-create our own bins based on num-bins etc.
$self->_histogram_create_bins_from_spec($m1, $m2, $m3)
} else {
# no params! we need params
print STDERR "enable_histogram() : failed to enable histogram because no histogram specification was supplied. Try enable_histogram({bin-width=>1, nun-bins=>10, left-boundary=>-5});\n";
}
$self->{'histo'}->{'needs-recalculate'} = 1;
}
# set existing histogram to zero counts
sub histogram_reset {
my $self = $_[0];
# no params, set all counts OF ALREADY EXISTING histogram to zero
my $m1 = $self->{'histo'}->{'bins'}->{'c'};
for(my $i=$self->{'histo'}->{'num-bins'};$i-->0;){ $m1->[$i] = 0 }
$self->{'histo'}->{'needs-recalculate'} = 1;
}
# push Data: a sample and process/update mean and all other stat measures
# also insert it in histogram
sub add {
my $self = $_[0];
my $x = $_[1];
my $aref = ref($x);
if( $aref eq '' ){
# a scalar input
my ($delta, $delta_n, $delta_n2, $term1);
my $n1 = $self->{'N'};
if( $n1 == 0 ){ $self->{'MIN'} = $self->{'MAX'} = $x }
else {
if( $x < $self->{'MIN'} ){ $self->{'MIN'} = $x }
if( $x > $self->{'MAX'} ){ $self->{'MAX'} = $x }
}
$self->{'SUM'} += $x; # add x to the total SUM
$self->{'ABS-SUM'} += abs($x); # add abs(x) to the total SUM
$self->{'N'} += 1; # increment sample size push in
my $n0 = $self->{'N'};
$delta = $x - $self->{'M1'};
$delta_n = $delta / $n0;
$delta_n2 = $delta_n * $delta_n;
$term1 = $delta * $delta_n * $n1;
$self->{'M1'} += $delta_n;
$self->{'M4'} += $term1 * $delta_n2 * ($n0*$n0 - 3*$n0 + 3)
+ 6 * $delta_n2 * $self->{'M2'}
- 4 * $delta_n * $self->{'M3'}
;
$self->{'M3'} += $term1 * $delta_n * ($n0 - 2)
- 3 * $delta_n * $self->{'M2'}
;
$self->{'M2'} += $term1;
# add data point to the internal histogram
$self->_histogram_add($x);
} elsif( $aref eq 'ARRAY' ){
# an array input
foreach (@$x){ $self->add($_) }
} else {
die "add(): only ARRAY and SCALAR can be handled (input was type '$aref')."
}
}
# copies input(=src) Running obj into current/self overwriting our data, this is not a clone()!
sub copy_from {
my $self = $_[0];
my $src = $_[1];
$self->{'M1'} = $src->M1();
$self->{'M2'} = $src->M2();
$self->{'M3'} = $src->M3();
$self->{'M4'} = $src->M4();
$self->set_N($src->get_N());
$self->_histogram_copy_from($src);
}
# clones current obj into a new Running obj with same values
sub clone {
my $self = $_[0];
my $newO = Statistics::Running->new();
$newO->{'M1'} = $self->M1();
$newO->{'M2'} = $self->M2();
$newO->{'M3'} = $self->M3();
$newO->{'M4'} = $self->M4();
$newO->set_N($self->get_N());
return $newO
}
# clears all data entered/calculated including histogram
sub clear {
my $self = $_[0];
$self->{'M1'} = 0.0;
$self->{'M2'} = 0.0;
$self->{'M3'} = 0.0;
$self->{'M4'} = 0.0;
$self->{'MIN'} = 0.0;
$self->{'MAX'} = 0.0;
$self->{'ABS-SUM'} = 0.0;
$self->{'SUM'} = 0.0;
$self->{'N'} = 0;
$self->histogram_reset();
}
# return the mean of the data entered so far
sub mean { return $_[0]->{'M1'} }
sub sum { return $_[0]->{'SUM'} }
sub abs_sum { return $_[0]->{'ABS-SUM'} }
sub min { return $_[0]->{'MIN'} }
sub max { return $_[0]->{'MAX'} }
# get number of total elements entered so far
sub get_N { return $_[0]->{'N'} }
sub variance {
my $self = $_[0];
my $m = $self->{'N'};
if( $m == 1 ){ return 0 }
return $self->{'M2'}/($m-1.0)
}
sub standard_deviation { return sqrt($_[0]->variance()) }
sub skewness {
my $self = $_[0];
my $m = $self->{'M2'};
if( $m == 0 ){ return 0 }
return sqrt($self->{'N'})
* $self->{'M3'} / ($m ** 1.5)
;
}
sub kurtosis {
my $self = $_[0];
my $m = $self->{'M2'};
if( $m == 0 ){ return 0 }
return $self->{'N'}
* $self->{'M4'}
/ ($m * $m)
- 3.0
;
}
# concatenates another Running obj with current
# AND returns a new Running obj with concatenated stats
# Current object is not modified.
sub concatenate {
my $self = $_[0]; # us
my $other = $_[1]; # another Running obj
my $combined = Statistics::Running->new();
my $selfN = $self->get_N();
my $otherN = $other->get_N();
my $selfM2 = $self->M2();
my $otherM2 = $other->M2();
my $selfM3 = $self->M3();
my $otherM3 = $other->M3();
my $combN = $selfN + $otherN;
$combined->set_N($combN);
my $delta = $other->M1() - $self->M1();
my $delta2 = $delta*$delta;
my $delta3 = $delta*$delta2;
my $delta4 = $delta2*$delta2;
$combined->{'M1'} = ($selfN*$self->M1() + $otherN*$other->M1()) / $combN;
$combined->{'M2'} = $selfM2 + $otherM2 +
$delta2 * $selfN * $otherN / $combN;
$combined->{'M3'} = $selfM3 + $otherM3 +
$delta3 * $selfN * $otherN * ($selfN - $otherN)/($combN*$combN) +
3.0*$delta * ($selfN*$otherM2 - $otherN*$selfM2) / $combN
;
$combined->{'M4'} = $self->{'M4'} + $other->{'M4'}
+ $delta4*$selfN*$otherN * ($selfN*$selfN - $selfN*$otherN + $otherN*$otherN) /
($combN*$combN*$combN)
+ 6.0*$delta2 * ($selfN*$selfN*$otherM2 + $otherN*$otherN*$selfM2)/($combN*$combN) +
4.0*$delta*($selfN*$otherM3 - $otherN*$selfM3) / $combN
;
$combined->{'SUM'} = $self->{'SUM'} + $other->{'SUM'};
$combined->{'ABS-SUM'} = $self->{'ABS-SUM'} + $other->{'ABS-SUM'};
$combined->{'MIN'} = $self->{'MIN'} < $other->{'MIN'} ? $self->{'MIN'} : $other->{'MIN'};
$combined->{'MAX'} = $self->{'MAX'} > $other->{'MAX'} ? $self->{'MAX'} : $other->{'MAX'};
# add the histograms only if structure matches:
if( $self->_equals_histograms_structure($other) ){
$combined->_histogram_copy_from($self);
$combined->_add_histograms($other);
}
return $combined;
}
# appends another Running obj INTO current
# histogram data is appended only if histogram specs are the same
# current obj (self) IS MODIFIED
sub append {
my $self = $_[0]; # us
my $other = $_[1]; # another Running obj
$self->copy_from($self+$other);
}
# equality only wrt to stats BUT NOT histogram
sub equals {
my $self = $_[0]; # us
my $other = $_[1]; # another Running obj
return
$self->get_N() == $other->get_N() &&
$self->equals_statistics($other)
}
sub equals_statistics {
my $self = $_[0]; # us
my $other = $_[1]; # another Running obj
return
abs($self->M1()-$other->M1()) < Statistics::Running::SMALL_NUMBER_FOR_EQUALITY &&
abs($self->M2()-$other->M2()) < Statistics::Running::SMALL_NUMBER_FOR_EQUALITY &&
abs($self->M3()-$other->M3()) < Statistics::Running::SMALL_NUMBER_FOR_EQUALITY &&
abs($self->M4()-$other->M4()) < Statistics::Running::SMALL_NUMBER_FOR_EQUALITY
}
# checks if structure is same and then if bin contents (counts) are same
# returns 1 if equals
# returns 0 if either structure or counts are not the same
sub equals_histograms {
my $self = $_[0]; # us
my $other = $_[1]; # another Running obj
# structure is not the same
if( $self->_equals_histograms_structure($other) == 0 ){ return 0 }
my $selfC = $self->{'histo'}->{'bins'}->{'c'};
my $otherC = $other->{'histo'}->{'bins'}->{'c'};
my $i;
for($i=$self->{'histo'}->{'num-bins'};$i-->0;){
if( $selfC->[$i] != $otherC->[$i] ){ return 0 }
}
return 1 # equal in structure and counts
}
# adds counts of histograms to us from other
# returns 0 if structures do not match
# returns 1 if counts added OK
sub _add_histograms {
my $self = $_[0]; # us
my $other = $_[1]; # another Running obj
# structure is not the same
if( $self->_equals_histograms_structure($other) == 0 ){ return 0 }
my $selfC = $self->{'histo'}->{'bins'}->{'c'};
my $otherC = $other->{'histo'}->{'bins'}->{'c'};
my $i;
for($i=$self->{'histo'}->{'num-bins'};$i-->0;){
$selfC->[$i] += $otherC->[$i];
}
$self->{'histo'}->{'needs-recalculate'} = 1;
return 1 # counts added
}
# print object as a string, string concat/printing is overloaded on this method
sub stringify {
my $self = $_[0];
return "N: ".$self->get_N()
.", mean: ".$self->mean()
.", range: ".$self->min()." to ".$self->max()
.", sum: ".$self->sum()
.", standard deviation: ".$self->standard_deviation()
.", kurtosis: ".$self->kurtosis()
.", skewness: ".$self->skewness()
.", histogram:\n".$self->histogram_stringify()
}
# internal methods, no need for anyone to know or use externally
sub set_N { $_[0]->{'N'} = $_[1] }
sub M1 { return $_[0]->{'M1'} }
sub M2 { return $_[0]->{'M2'} }
sub M3 { return $_[0]->{'M3'} }
sub M4 { return $_[0]->{'M4'} }
# copy src's histogram to us, erasing previous data and histo-format
sub _histogram_copy_from {
my $self = $_[0];
my $src = $_[1]; # a src stats object whose histogram we are copying onto us
$self->histogram_bins_hash($src->histogram_bins_hash());
}
# given bin-width, num-bins and left-boundary create the bin arrays
sub _histogram_create_bins_from_spec {
my ($self, $bw, $nb, $lb) = @_;
$self->{'histo'}->{'num-bins'} = $nb;
my @B = (0)x($nb+1);
my ($i);
my $v = $lb;
for($i=0;$i<=$nb;$i++){
$B[$i] = $v;
$v += $bw;
}
$self->{'histo'}->{'bins'}->{'b'} = \@B;
$self->{'histo'}->{'bins'}->{'c'} = [(0)x$nb];
$self->{'histo'}->{'needs-recalculate'} = 1;
}
# add a datapoint to the histogram, this is usually called only via the public add()
sub _histogram_add {
my $self = $_[0];
my $x = $_[1]; # value to add
my ($n, $i);
if( ($n=$self->{'histo'}->{'num-bins'}) <= 0 ){ return }
my $B = $self->{'histo'}->{'bins'}->{'b'};
for($i=0;$i<$n;$i++){
if( ($x > $B->[$i]) && ($x <= $B->[$i+1]) ){
$self->{'histo'}->{'bins'}->{'c'}->[$i]++;
$self->{'histo'}->{'needs-recalculate'} = 1; # need to recalc stringify
return
}
}
}
# given the bins and bin counts arrays, return a hash in the natural form:
# from-bin:to-bin -> count
# see also _bins2stathash for returning a hash of the format specified in Statistics::Descriptive
sub _bins2hash {
my $self = $_[0];
my %ret = ();
my $B = $self->{'histo'}->{'bins'}->{'b'};
my $C = $self->{'histo'}->{'bins'}->{'c'};
my $i;
for($i=$self->{'histo'}->{'num-bins'};$i-->0;){
$ret{$B->[$i].":".$B->[$i+1]} = $C->[$i]
}
return \%ret
}
# given the bins and bin counts arrays, return a hash with keys
# to-bin -> count
# whereas count is the count of the bin specified by to-bin and its previous key of the hash
sub _bins2stathash {
my $self = $_[0];
my %ret = ();
my $B = $self->{'histo'}->{'bins'}->{'b'};
my $C = $self->{'histo'}->{'bins'}->{'c'};
my $i;
for($i=$self->{'histo'}->{'num-bins'}-1;$i-->0;){
$ret{$B->[$i+1]} = $C->[$i]
}
return \%ret
}
# given a hash with keys
# from-bin:to-bin -> count
# erase and re-create the bin and counts arrays of histo.
# for a way to import Statistics::Descriptive frequency_distribution hash check _stathash2bins()
sub _hash2bins {
my $self = $_[0];
my $H = $_[1];
my @B = ();
my @C = ();
my @K = keys %$H;
$self->{'histo'}->{'num-bins'} = scalar(@K);
my ($acount, $akey);
my @X = map {
push(@B, $_->[1]); # left-bin (from)
push(@C, $H->{$_->[0]}); # counts
$_->[2]; # spit out the right-bin (to)
}
sort { $a->[1] <=> $b->[1] }
map { [ $_, split(/\:/, $_) ] }
@K
;
push(@B, $X[-1]);
$self->{'histo'}->{'bins'}->{'b'} = \@B;
$self->{'histo'}->{'bins'}->{'c'} = \@C;
$self->{'histo'}->{'needs-recalculate'} = 1;
}
# given a hash with keys
# to-bin -> count
# erase and re-create the bin and counts arrays of histo.
# the hash is exactly what Statistics::Descriptive::frequency_distribution() returns
# there is only one problem: what is the left-boundary? we will set it to -infinity.
sub _stathash2bins {
my $self = $_[0];
my $H = $_[1]; # hashref: exactly what Statistics::Descriptive::frequency_distribution() returns
my @B = ();
my @C = ();
my @K = keys %$H;
$self->{'histo'}->{'num-bins'} = scalar(@K);
my ($acount, $akey);
push(@B, -(~0 >> 1)); # -MAX_INT fuck you.
foreach my $k (sort { $a <=> $b } keys %$H){
push(@B, $k);
push(@C, $H->{$k});
}
$self->{'histo'}->{'bins'}->{'b'} = \@B;
$self->{'histo'}->{'bins'}->{'c'} = \@C;
$self->{'histo'}->{'needs-recalculate'} = 1;
}
# compares the structure of the histograms of us and another obj
# if histograms have same number of bins and same bin-specs (boundaries)
# then histograms are equal and returns 1
# if both histograms contain zero bins (not initialised) then also returns 1
# else, histogram structure differs and returns 0
sub _equals_histograms_structure {
my ($self, $other) = @_;
my $NB1 = $self->{'histo'}->{'num-bins'};
if( $NB1 != $other->{'histo'}->{'num-bins'} ){ return 0 }
# no bins, so equal!
if( $NB1 == -1 ){ return 1 }
my $b1 = $self->{'histo'}->{'bins'}->{'b'};
my $b2 = $other->{'histo'}->{'bins'}->{'b'};
for(my $i=$NB1+1;$i-->0;){
if( $b1->[$i] != $b2->[$i] ){ return 0 }
}
return 1 # equal histogram STRUCTURES (not bincounts)
}
1;
__END__
# end program, below is the POD
=pod
=encoding UTF-8
=head1 NAME
Statistics::Running - Basic descriptive statistics (mean/stdev/min/max/skew/kurtosis) and discrete Probability Distribution (via histogram) over data without the need to store data points ever. OOP style.
=head1 VERSION
Version 0.11
=head1 SYNOPSIS
use Statistics::Running;
my $ru = Statistics::Running->new();
for(1..100){
$ru->add(rand());
}
print "mean: ".$ru->mean()."\n";
$ru->add(12345);
print "mean: ".$ru->mean()."\n";
my $ru2 = Statistics::Running->new();
$ru2->histogram_enable({
'num-bins' => 10,
'bin-width' => 0.01,
'left-boundary' => 0
});
for(1..100){
$ru2->add(rand());
}
print "Probability Distribution of data:\n".$ru2->histogram_stringify()."\n";
# add two stat objects together (histograms are not!)
my $ru3 = $ru + $ru2;
print "mean of concatenated data: ".$ru3->mean()."\n";
$ru += $ru2;
print "mean after appending data: ".$ru->mean()."\n";
print "stats: ".$ru->stringify()."\n";
# example output:
print $ru2."\n";
N: 100, mean: 0.488978434779093, range: 0.0056063539679414 to 0.99129297226348, standard deviation: 0.298129905728534, kurtosis: -1.22046199974301, skewness: -0.0268827866000826, histogram:
0.000 - 0.010: 2 #####################################################
0.010 - 0.020: 2 #####################################################
0.020 - 0.030: 2 #####################################################
0.030 - 0.041: 2 #####################################################
0.041 - 0.051: 1 ###########################
0.051 - 0.062: 0 |
0.062 - 0.073: 2 #####################################################
0.073 - 0.083: 0 |
0.083 - 0.094: 1 ###########################
=head1 DESCRIPTION
Statistics are updated every time a new data point
is added in. The common practice to calculate descriptive
statistics for 5 data points as well as 1 billion points
is to store them in an array,
loop over the array to calculate the mean, then loop over the array
again to calculate standard deviation, as Sum (x_i-mean)**2.
Standard deviation is the reason data is stored in the array.
This module uses B.P.Welford's method to calculate descriptive
statistics by continually adjusting the stats and not storing
a single data point. Except from the computational and environmental
benefits of such an approach, B.P.Welford's method is also
immune to accumulated precision errors. It is stable and accurate.
For more details on the method and its stability look at this:
L<John D. Cook's article and C++ implementation|https://www.johndcook.com/blog/skewness_kurtosis>
A version without the histogram exists under L<Statistics::Running::Tiny>
and is faster, obviously. About 25% faster.
There are three amazing things about B.P.Welford's algorithm implemented here:
=over 4
=item 1. It calculates and keeps updating mean/standard-deviation etc. on
data without the need to store that data. As new data comes in, the
statistics are updated based on the state of a few variables (mean, number
of data points, etc.) but not the past data points. This includes the
calculation of standard deviation which most of us knew (wrongly) that
it requires a second pass on the data points, after the mean is calculated.
Well, B.P.Welford found a way to avoid this.
=item 2. The standard formula for standard deviation requires to sum
the square of the difference of each sample from the mean.
If samples are large numbers then you are summing differences of large
numbers. If further there is little difference between samples, and the
discrepancy from the mean is small, then you are prone to
precision errors which accumulate to destructive effect if the number of
samples is large. In contrast, B.P.Welford's algorithm does
not suffer from this, it is stable and accurate.
=item 3. B.P.Welford's online statistics algorithm
is quite a revolutionary idea and why is not an obligatory subject
in first-year programming courses is beyond comprehension.
Here is a way to decrease those CO2 emissions.
=back
The basis for the code in this module comes from
L<John D. Cook's article and C++ implementation|https://www.johndcook.com/blog/skewness_kurtosis>
=head1 EXPORT
Nothing, this is an Object Oriented module. Once you instantiate
an object all its methods are yours.
=head1 SUBROUTINES/METHODS
=head2 new
Constructor, initialises internal variables.
=head2 add
Update our statistics after one more data point/sample (or an
array of them) is presented to us.
my $ru1 = Statistics::Running->new();
for(1..100){
$ru1->add(rand());
print $ru1."\n";
}
Input can be a single data point (a scalar) or a reference
to an array of data points.
=head2 copy_from
Copy state of input object into current effectively making us like
them. Our previous state is forgotten. After that adding a new data point into
us will be with the new state copied.
my $ru1 = Statistics::Running->new();
for(1..100){
$ru1->add(rand());
}
my $ru2 = Statistics::Running->new();
for(1..100){
$ru2->add(rand());
}
# copy the state of ru1 into ru2. state of ru1 is forgotten.
$ru2->copy_from($ru1);
=head2 clone
Clone state of our object into a newly created object which is returned.
Our object and returned object are identical at the time of cloning.
my $ru1 = Statistics::Running->new();
for(1..100){
$ru1->add(rand());
}
my $ru2 = $ru1->clone();
=head2 clear
Clear our internal state as if no data points have ever been added into us.
As if we were just created. All state is forgotten and reset to zero, including histogram.
=head2 mean
Returns the mean of all the data pushed in us
=head2 sum
Returns the sum of all the data pushed in us (algebraic sum, not absolute sum)
=head2 abs_sum
Returns the sum of the absolute value of all the data pushed in us (this is not algebraic sum)
=head2 min
Returns the minimum data sample added in us
=head2 max
Returns the maximum data sample added in us
=head2 get_N
Returns the number of data points/samples inserted, and had
their descriptive statistics calculated, so far.
=head2 variance
Returns the variance of the data points/samples added onto us so far.
=head2 standard_deviation
Returns the standard deviation of the data points/samples added onto us so far. This is the square root of the variance.
=head2 skewness
Returns the skewness of the data points/samples added onto us so far.
=head2 kurtosis
Returns the kurtosis of the data points/samples added onto us so far.
=head2 concatenate
Concatenates our state with the input object's state and returns
a newly created object with the combined state. Our object and
input object are not modified. The overloaded symbol '+' points
to this sub.
=head2 append
Appends input object's state into ours.
Our state is modified. (input object's state is not modified)
The overloaded symbol '+=' points
to this sub.
=head2 histogram_enable
Enables histogram logging by creating a histogram with specified
parameters. These parameters can be of different formats:
my $ru1 = Statistics::Running->new();
$ru1->histogram_enable({
'num-bins' => 10,
'bin-width' => 0.01,
'left-boundary' => 0
});
# or, 2 bins: 0-1 and 1-2
$ru1->histogram_enable({
'0:1' => 0,
'1:2' => 1,
});
# or, 2 bins: 0-1 and 1-2
$ru1->histogram_enable([0,1,2]);
=over 3
=item 1. by specifying the number of bins, bin-width and left boundary as a
parameters hash, e.g. C< $ru->enable_histogram({'num-bins'=>5, 'bin-width'=>1, 'left-boundary'=>-2}); >
=item 2. by specifying a HASH where keys are 'FROM:TO' and values are the bin counts,
which can be zero, or even a positive integer if you want to start with some counts already.
=item 3. ARRAY of bin boundaries of the form
C< [histo-left-boundary, bin1_right_boundary, bin2_right_boundary, ... binN-1_right_boundary, histo-right-boundary] >
It follows that the number of bins will be 1 less than the length of this array.
=back
=head2 histogram_disable
Disable histogram logging, all existing histogram data is erased. Number of bins
is forgotten, along with bin boundaries, etc.
=head2 histogram_reset
Set existing histogram to zero counts.
=head2 histogram_count
Returns the count in bin specified by bin index (which is 0 to number-of-bins - 1)
=head2 equals
Check if our state (number of samples and all internal state) is
the same with input object's state. Equality here implies that
ALL statistics are equal (within a small number Statistics::Running::SMALL_NUMBER_FOR_EQUALITY)
=head2 equals_statistics
Check if our statistics only (and not sample size)
are the same with input object. E.g. it checks mean, variance etc.
but not sample size (as with the real equals()).
It returns 0 on non-equality. 1 if equal.
=head2 equals_histograms
Check if our histogram only (and not statitstics)
are the same with input object.
It returns 0 on non-equality. 1 if equal.
=head2 stringify
Returns a string description of descriptive statistics we know about
(mean, standard deviation, kurtosis, skewness) as well as the
number of data points/samples added onto us so far. Note that
this method is not necessary because stringification is overloaded
and the follow C< print $stats_obj."\n" > is equivalent to
C< print $stats_obj->stringify()."\n" >
=head1 Overloaded functionality
=over 3
=item 1. Addition of two statistics objects: C< my $ru3 = $ru1 + $ru2 >
=item 2. Test for equality: C< if( $ru2 == $ru3 ){ ... } >
=item 3. Stringification: C< print $ru1."\n" >
=back
=head1 Testing for Equality
In testing if two objects are the same, their means, standard deviations
etc. are compared. This is done using
C< if( ($self->mean() - $other->mean()) < Statistics::Running::SMALL_NUMBER_FOR_EQUALITY ){ ... } >
=head1 BENCHMARKS
Run C< make bench > for benchmarks which report the maximum number of data points inserted
per second (in your system).
=head1 SEE ALSO
=over 4
=item 1. L<Wikipedia|http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm>
=item 2. L<John D. Cook's article and C++ implementation|https://www.johndcook.com/blog/skewness_kurtosis>
was used both as inspiration and as the basis for the formulas for C< kurtosis() >
and C< skewness() >
=item 3. L<Statistics::Welford> This module is equivalent but it
does not provide C< kurtosis() > and C< skewness() > which
current module does. Additionally,
current module builds a Histogram for inserted data as a discrete
approximation of the Probability Distribution data comes from.
=item 4. L<Statistics::Running::Tiny> This is the exact same module but
without histogram capabilities. That makes it
a bit faster than current module only when data is inserted.
Space-wise, the histogram does not take much
space. It is just an array of bins and the number
of items (not the original data items themselves!) it contains.
Run C< make bench > to get a report on the maximum number
of data point insertions per unit time in your system.
L<Statistics::Running::Tiny> is approximately 25% faster than this module.
=back
=head1 AUTHOR
Andreas Hadjiprocopis, C<< <bliako at cpan.org> >>
=head1 BUGS
Please report any bugs or feature requests to C<bug-statistics-running at rt.cpan.org>, or through
the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Statistics-Running>. I will be notified, and then you'll
automatically be notified of progress on your bug as I make changes.
=head1 SUPPORT
You can find documentation for this module with the perldoc command.
perldoc Statistics::Running
You can also look for information at:
=over 4
=item * RT: CPAN's request tracker (report bugs here)
L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Statistics-Running>
=item * AnnoCPAN: Annotated CPAN documentation
L<http://annocpan.org/dist/Statistics-Running>
=item * CPAN Ratings
L<http://cpanratings.perl.org/d/Statistics-Running>
=item * Search CPAN
L<http://search.cpan.org/dist/Statistics-Running/>
=back
=head1 DEDICATIONS
Almaz
=head1 ACKNOWLEDGEMENTS
B.P.Welford, John Cook.
=head1 LICENSE AND COPYRIGHT
Copyright 2018-2019 Andreas Hadjiprocopis.
This program is free software; you can redistribute it and/or modify it
under the terms of the the Artistic License (2.0). You may obtain a
copy of the full license at:
L<http://www.perlfoundation.org/artistic_license_2_0>
Any use, modification, and distribution of the Standard or Modified
Versions is governed by this Artistic License. By using, modifying or
distributing the Package, you accept this license. Do not use, modify,
or distribute the Package, if you do not accept this license.
If your Modified Version has been derived from a Modified Version made
by someone other than you, you are nevertheless required to ensure that
your Modified Version complies with the requirements of this license.
This license does not grant you the right to use any trademark, service
mark, tradename, or logo of the Copyright Holder.
This license includes the non-exclusive, worldwide, free-of-charge
patent license to make, have made, use, offer to sell, sell, import and
otherwise transfer the Package with respect to any patent claims
licensable by the Copyright Holder that are necessarily infringed by the
Package. If you institute patent litigation (including a cross-claim or
counterclaim) against any party alleging that the Package constitutes
direct or contributory patent infringement, then this Artistic License
to you shall terminate on the date that such litigation is filed.
Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER
AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY
YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR
CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR
CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE,
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
=cut