package Statistics::Running; use 5.006; use strict; use warnings; use Data::Dumper; our $VERSION = '0.13'; # overload these operators to have special meaning when # operand(s) are Statistics::Running: use overload # add two stats object and adjust summed mean,stdev etc. '+' => \&concatenate, # check if two stats objects are same wrt mean,stdev,N BUT NOT histogram '==' => \&equals, # convert a stats object into a string, e.g. print $obj."\n"; '""' => \&stringify, ; use Try::Tiny; use Statistics::Histogram; # this is for all numerical equality comparisons use constant SMALL_NUMBER_FOR_EQUALITY => 1E-10; # creates an obj. There are no input params sub new { my $class = $_[0]; my $parent = ( caller(1) )[3] || "N/A"; my $whoami = ( caller(0) )[3]; my $self = { # these are internal variables to store mean etc. or used to calculate Kurtosis 'M1' => 0.0, 'M2' => 0.0, 'M3' => 0.0, 'M4' => 0.0, 'ABS-SUM' => 0.0, 'SUM' => 0.0, 'MIN' => 0.0, 'MAX' => 0.0, 'N' => 0, # number of data items inserted # this histogram is updated each time a new data point is pushed in the object # it just holds the number of items in each bin, so it is not too expensive. # with this we get an idea of the Probability Distribution of the pushed data. # Which may or may not be useful to users. # Should you want to avoid this then use Statistics::Running::Tiny 'histo' => { 'num-bins' => -1, 'bins' => { # b: [histo-left-boundary, bin1_right_boundary, bin2_right_boundary, ... binN-1_right_boundary, histo-right-boundary] 'b' => [], # length is 'num-bins'+1 # c: contains the counts, its size is equal to the number of bins # the first cell contains counts in the interval [histo-left-boundary, bin1_right_boundary] # the last cell contains counts of [binN-1_right_boundary, histo-right-boundary] 'c' => [], # length 'num-bins' }, # cached stringified histogram, it is re-calculated only if data points added # and asked to print histogram 'stringified' => undef, # when asked to stringify a hist we actually use a cached string # which needs to be recalculated whenever data is added or hist re-created 'needs-recalculate' => 1, }, }; bless($self, $class); $self->clear(); return $self } # returns the histogram bins (can be empty) in our internal format sub histogram { return $_[0]->{'histo'} } # if no params, it returns our bins as a hash # otherwise it imports input bins in the form of a hash # and before that it erases previous histogram and forms it according to input, e.g. # sets bin-widths and numbins etc. sub histogram_bins_hash { my $self = $_[0]; my $bins = $_[1]; if( ! defined($bins) ){ # export to a hash return $self->_bins2hash() } # import from a hash $self->_hash2bins($bins); } # if no params, it returns our bins as a hash of the form returned by Statistics::Descriptive::frequency_distribution() # otherwise it imports input bins in the form of a hash in the form returned by Statistics::Descriptive::frequency_distribution() sub histogram_bins_stathash { my $self = $_[0]; my $bi = $_[1]; if( ! defined($bi) ){ # export to a hash return $self->_bins2stathash() } # import from a hash $self->_stathash2bins($bi); } # return a string showing this histogram by calling Statistics::Histogram::print_histogram() # we first convert our hist to stathash format sub histogram_stringify { my ($self, @opts) = @_; if( $self->{'histo'}->{'needs-recalculate'} == 1 ){ $self->_histogram_recalculate(@opts) } return $self->{'histo'}->{'stringified'} } # we need to recalculate each time a new data is added. # but we do recalculate whenever it is needed, i.e. when we asked to print histogram sub _histogram_recalculate { my ($self, @stringify_opts) = @_; my $histstr = ""; if( $self->{'histo'}->{'num-bins'} > 0 ){ Try::Tiny::try { $histstr = Statistics::Histogram::print_histogram( 'hist' => $self->_bins2stathash(), 'x_min' => $self->{'histo'}->{'bins'}->{'b'}->[0], use_linear_axes => 1, @stringify_opts ) } Try::Tiny::catch { print STDERR "_histogram_recalculate() : error caught trying to stringify: $_\n"; $histstr = ""; }; } $self->{'histo'}->{'stringified'} = $histstr; $self->{'histo'}->{'needs-recalculate'} = 0; } # disable histogram logging, all existing histogram data is erased sub histogram_disable { my $self = $_[0]; $self->{'histo'}->{'num-bins'} = -1; $self->{'histo'}->{'bins'}->{'b'} = []; $self->{'histo'}->{'bins'}->{'c'} = []; $self->{'histo'}->{'needs-recalculate'} = 1; } # returns the count in bin specified as 1st input param sub histogram_count { return $_[0]->{'histo'}->{'c'}->[$_[1]] } # enables histogram logging # it expects some parameters for creating the histogram in various forms, # e.g. by specifying the number of bins, bin-width and left boundary or # by specifying a HASH or ARRAY of bin specifications for non-uniform bin # sizes. HASH must be of the form 'FROM:TO'->counts # ARRAY of bin boundaries of the form # [histo-left-boundary, bin1_right_boundary, bin2_right_boundary, ... binN-1_right_boundary, histo-right-boundary] # the number of bins is 1 less than the length of this array sub histogram_enable { my $self = $_[0]; my $params = $_[1]; # $_[1] // {} does not work for perl<5.10, ? : requests $_[1] twice, so Cish if( ! defined below... if( ! defined($params) ){ $params = {} } my ($m1, $m2, $m3); if( defined($m1=$params->{'bins'}) ){ my $aref = ref($m1); if( $aref eq 'ARRAY' ){ # an array of bin boundaries of the form # [histo-left-boundary, bin1_right_boundary, bin2_right_boundary, ... binN-1_right_boundary, histo-right-boundary] # the number of bins is 1 less than the length of this array my @mm = @$m1; $self->{'histo'}->{'num-bins'} = scalar(@mm)-1; $self->{'histo'}->{'bins'}->{'b'} = [@mm]; $self->{'histo'}->{'bins'}->{'c'} = (0) x $self->{'histo'}->{'num-bins'}; } elsif( $aref eq 'HASH' ){ # a hashref keyed on bin-intervals in the form FROM:TO->counts $self->_hash2bins($m1); } else { die "parameter 'bins' expects either a HASHREF keyed on bin-intervals in the form FROM:TO->counts (and counts can be non-zero if that is a previous histogram), or an ARRAYREF with bin boundaries of the form [histo-left-boundary, bin1_right_boundary, bin2_right_boundary, ... binN-1_right_boundary, histo-right-boundary]. In this case the number of bins is 1 less than the length of the array." } } elsif( defined($m1=$params->{'bin-width'}) && defined($m2=$params->{'num-bins'}) && defined($m3=$params->{'left-boundary'}) ){ # we re-create our own bins based on num-bins etc. $self->_histogram_create_bins_from_spec($m1, $m2, $m3) } else { # no params! we need params print STDERR "enable_histogram() : failed to enable histogram because no histogram specification was supplied. Try enable_histogram({bin-width=>1, nun-bins=>10, left-boundary=>-5});\n"; } $self->{'histo'}->{'needs-recalculate'} = 1; } # set existing histogram to zero counts sub histogram_reset { my $self = $_[0]; # no params, set all counts OF ALREADY EXISTING histogram to zero my $m1 = $self->{'histo'}->{'bins'}->{'c'}; for(my $i=$self->{'histo'}->{'num-bins'};$i-->0;){ $m1->[$i] = 0 } $self->{'histo'}->{'needs-recalculate'} = 1; } # push Data: a sample and process/update mean and all other stat measures # also insert it in histogram sub add { my $self = $_[0]; my $x = $_[1]; my $aref = ref($x); if( $aref eq '' ){ # a scalar input my ($delta, $delta_n, $delta_n2, $term1); my $n1 = $self->{'N'}; if( $n1 == 0 ){ $self->{'MIN'} = $self->{'MAX'} = $x } else { if( $x < $self->{'MIN'} ){ $self->{'MIN'} = $x } if( $x > $self->{'MAX'} ){ $self->{'MAX'} = $x } } $self->{'SUM'} += $x; # add x to the total SUM $self->{'ABS-SUM'} += abs($x); # add abs(x) to the total SUM $self->{'N'} += 1; # increment sample size push in my $n0 = $self->{'N'}; $delta = $x - $self->{'M1'}; $delta_n = $delta / $n0; $delta_n2 = $delta_n * $delta_n; $term1 = $delta * $delta_n * $n1; $self->{'M1'} += $delta_n; $self->{'M4'} += $term1 * $delta_n2 * ($n0*$n0 - 3*$n0 + 3) + 6 * $delta_n2 * $self->{'M2'} - 4 * $delta_n * $self->{'M3'} ; $self->{'M3'} += $term1 * $delta_n * ($n0 - 2) - 3 * $delta_n * $self->{'M2'} ; $self->{'M2'} += $term1; # add data point to the internal histogram $self->_histogram_add($x); } elsif( $aref eq 'ARRAY' ){ # an array input foreach (@$x){ $self->add($_) } } else { die "add(): only ARRAY and SCALAR can be handled (input was type '$aref')." } } # copies input(=src) Running obj into current/self overwriting our data, this is not a clone()! sub copy_from { my $self = $_[0]; my $src = $_[1]; $self->{'M1'} = $src->M1(); $self->{'M2'} = $src->M2(); $self->{'M3'} = $src->M3(); $self->{'M4'} = $src->M4(); $self->set_N($src->get_N()); $self->_histogram_copy_from($src); } # clones current obj into a new Running obj with same values sub clone { my $self = $_[0]; my $newO = Statistics::Running->new(); $newO->{'M1'} = $self->M1(); $newO->{'M2'} = $self->M2(); $newO->{'M3'} = $self->M3(); $newO->{'M4'} = $self->M4(); $newO->set_N($self->get_N()); return $newO } # clears all data entered/calculated including histogram sub clear { my $self = $_[0]; $self->{'M1'} = 0.0; $self->{'M2'} = 0.0; $self->{'M3'} = 0.0; $self->{'M4'} = 0.0; $self->{'MIN'} = 0.0; $self->{'MAX'} = 0.0; $self->{'ABS-SUM'} = 0.0; $self->{'SUM'} = 0.0; $self->{'N'} = 0; $self->histogram_reset(); } # return the mean of the data entered so far sub mean { return $_[0]->{'M1'} } sub sum { return $_[0]->{'SUM'} } sub abs_sum { return $_[0]->{'ABS-SUM'} } sub min { return $_[0]->{'MIN'} } sub max { return $_[0]->{'MAX'} } # get number of total elements entered so far sub get_N { return $_[0]->{'N'} } sub variance { my $self = $_[0]; my $m = $self->{'N'}; if( $m == 1 ){ return 0 } return $self->{'M2'}/($m-1.0) } sub standard_deviation { return sqrt($_[0]->variance()) } sub skewness { my $self = $_[0]; my $m = $self->{'M2'}; if( $m == 0 ){ return 0 } return sqrt($self->{'N'}) * $self->{'M3'} / ($m ** 1.5) ; } sub kurtosis { my $self = $_[0]; my $m = $self->{'M2'}; if( $m == 0 ){ return 0 } return $self->{'N'} * $self->{'M4'} / ($m * $m) - 3.0 ; } # concatenates another Running obj with current # AND returns a new Running obj with concatenated stats # Current object is not modified. sub concatenate { my $self = $_[0]; # us my $other = $_[1]; # another Running obj my $combined = Statistics::Running->new(); my $selfN = $self->get_N(); my $otherN = $other->get_N(); my $selfM2 = $self->M2(); my $otherM2 = $other->M2(); my $selfM3 = $self->M3(); my $otherM3 = $other->M3(); my $combN = $selfN + $otherN; $combined->set_N($combN); my $delta = $other->M1() - $self->M1(); my $delta2 = $delta*$delta; my $delta3 = $delta*$delta2; my $delta4 = $delta2*$delta2; $combined->{'M1'} = ($selfN*$self->M1() + $otherN*$other->M1()) / $combN; $combined->{'M2'} = $selfM2 + $otherM2 + $delta2 * $selfN * $otherN / $combN; $combined->{'M3'} = $selfM3 + $otherM3 + $delta3 * $selfN * $otherN * ($selfN - $otherN)/($combN*$combN) + 3.0*$delta * ($selfN*$otherM2 - $otherN*$selfM2) / $combN ; $combined->{'M4'} = $self->{'M4'} + $other->{'M4'} + $delta4*$selfN*$otherN * ($selfN*$selfN - $selfN*$otherN + $otherN*$otherN) / ($combN*$combN*$combN) + 6.0*$delta2 * ($selfN*$selfN*$otherM2 + $otherN*$otherN*$selfM2)/($combN*$combN) + 4.0*$delta*($selfN*$otherM3 - $otherN*$selfM3) / $combN ; $combined->{'SUM'} = $self->{'SUM'} + $other->{'SUM'}; $combined->{'ABS-SUM'} = $self->{'ABS-SUM'} + $other->{'ABS-SUM'}; $combined->{'MIN'} = $self->{'MIN'} < $other->{'MIN'} ? $self->{'MIN'} : $other->{'MIN'}; $combined->{'MAX'} = $self->{'MAX'} > $other->{'MAX'} ? $self->{'MAX'} : $other->{'MAX'}; # add the histograms only if structure matches: if( $self->_equals_histograms_structure($other) ){ $combined->_histogram_copy_from($self); $combined->_add_histograms($other); } return $combined; } # appends another Running obj INTO current # histogram data is appended only if histogram specs are the same # current obj (self) IS MODIFIED sub append { my $self = $_[0]; # us my $other = $_[1]; # another Running obj $self->copy_from($self+$other); } # equality only wrt to stats BUT NOT histogram sub equals { my $self = $_[0]; # us my $other = $_[1]; # another Running obj return $self->get_N() == $other->get_N() && $self->equals_statistics($other) } sub equals_statistics { my $self = $_[0]; # us my $other = $_[1]; # another Running obj return abs($self->M1()-$other->M1()) < Statistics::Running::SMALL_NUMBER_FOR_EQUALITY && abs($self->M2()-$other->M2()) < Statistics::Running::SMALL_NUMBER_FOR_EQUALITY && abs($self->M3()-$other->M3()) < Statistics::Running::SMALL_NUMBER_FOR_EQUALITY && abs($self->M4()-$other->M4()) < Statistics::Running::SMALL_NUMBER_FOR_EQUALITY } # checks if structure is same and then if bin contents (counts) are same # returns 1 if equals # returns 0 if either structure or counts are not the same sub equals_histograms { my $self = $_[0]; # us my $other = $_[1]; # another Running obj # structure is not the same if( $self->_equals_histograms_structure($other) == 0 ){ return 0 } my $selfC = $self->{'histo'}->{'bins'}->{'c'}; my $otherC = $other->{'histo'}->{'bins'}->{'c'}; my $i; for($i=$self->{'histo'}->{'num-bins'};$i-->0;){ if( $selfC->[$i] != $otherC->[$i] ){ return 0 } } return 1 # equal in structure and counts } # adds counts of histograms to us from other # returns 0 if structures do not match # returns 1 if counts added OK sub _add_histograms { my $self = $_[0]; # us my $other = $_[1]; # another Running obj # structure is not the same if( $self->_equals_histograms_structure($other) == 0 ){ return 0 } my $selfC = $self->{'histo'}->{'bins'}->{'c'}; my $otherC = $other->{'histo'}->{'bins'}->{'c'}; my $i; for($i=$self->{'histo'}->{'num-bins'};$i-->0;){ $selfC->[$i] += $otherC->[$i]; } $self->{'histo'}->{'needs-recalculate'} = 1; return 1 # counts added } # print object as a string, string concat/printing is overloaded on this method sub stringify { my $self = $_[0]; return "N: ".$self->get_N() .", mean: ".$self->mean() .", range: ".$self->min()." to ".$self->max() .", sum: ".$self->sum() .", standard deviation: ".$self->standard_deviation() .", kurtosis: ".$self->kurtosis() .", skewness: ".$self->skewness() .", histogram:\n".$self->histogram_stringify() } # internal methods, no need for anyone to know or use externally sub set_N { $_[0]->{'N'} = $_[1] } sub M1 { return $_[0]->{'M1'} } sub M2 { return $_[0]->{'M2'} } sub M3 { return $_[0]->{'M3'} } sub M4 { return $_[0]->{'M4'} } # copy src's histogram to us, erasing previous data and histo-format sub _histogram_copy_from { my $self = $_[0]; my $src = $_[1]; # a src stats object whose histogram we are copying onto us $self->histogram_bins_hash($src->histogram_bins_hash()); } # given bin-width, num-bins and left-boundary create the bin arrays sub _histogram_create_bins_from_spec { my ($self, $bw, $nb, $lb) = @_; $self->{'histo'}->{'num-bins'} = $nb; my @B = (0)x($nb+1); my ($i); my $v = $lb; for($i=0;$i<=$nb;$i++){ $B[$i] = $v; $v += $bw; } $self->{'histo'}->{'bins'}->{'b'} = \@B; $self->{'histo'}->{'bins'}->{'c'} = [(0)x$nb]; $self->{'histo'}->{'needs-recalculate'} = 1; } # add a datapoint to the histogram, this is usually called only via the public add() sub _histogram_add { my $self = $_[0]; my $x = $_[1]; # value to add my ($n, $i); if( ($n=$self->{'histo'}->{'num-bins'}) <= 0 ){ return } my $B = $self->{'histo'}->{'bins'}->{'b'}; for($i=0;$i<$n;$i++){ if( ($x > $B->[$i]) && ($x <= $B->[$i+1]) ){ $self->{'histo'}->{'bins'}->{'c'}->[$i]++; $self->{'histo'}->{'needs-recalculate'} = 1; # need to recalc stringify return } } } # given the bins and bin counts arrays, return a hash in the natural form: # from-bin:to-bin -> count # see also _bins2stathash for returning a hash of the format specified in Statistics::Descriptive sub _bins2hash { my $self = $_[0]; my %ret = (); my $B = $self->{'histo'}->{'bins'}->{'b'}; my $C = $self->{'histo'}->{'bins'}->{'c'}; my $i; for($i=$self->{'histo'}->{'num-bins'};$i-->0;){ $ret{$B->[$i].":".$B->[$i+1]} = $C->[$i] } return \%ret } # given the bins and bin counts arrays, return a hash with keys # to-bin -> count # whereas count is the count of the bin specified by to-bin and its previous key of the hash sub _bins2stathash { my $self = $_[0]; my %ret = (); my $B = $self->{'histo'}->{'bins'}->{'b'}; my $C = $self->{'histo'}->{'bins'}->{'c'}; my $i; for($i=$self->{'histo'}->{'num-bins'}-1;$i-->0;){ $ret{$B->[$i+1]} = $C->[$i] } return \%ret } # given a hash with keys # from-bin:to-bin -> count # erase and re-create the bin and counts arrays of histo. # for a way to import Statistics::Descriptive frequency_distribution hash check _stathash2bins() sub _hash2bins { my $self = $_[0]; my $H = $_[1]; my @B = (); my @C = (); my @K = keys %$H; $self->{'histo'}->{'num-bins'} = scalar(@K); my ($acount, $akey); my @X = map { push(@B, $_->[1]); # left-bin (from) push(@C, $H->{$_->[0]}); # counts $_->[2]; # spit out the right-bin (to) } sort { $a->[1] <=> $b->[1] } map { [ $_, split(/\:/, $_) ] } @K ; push(@B, $X[-1]); $self->{'histo'}->{'bins'}->{'b'} = \@B; $self->{'histo'}->{'bins'}->{'c'} = \@C; $self->{'histo'}->{'needs-recalculate'} = 1; } # given a hash with keys # to-bin -> count # erase and re-create the bin and counts arrays of histo. # the hash is exactly what Statistics::Descriptive::frequency_distribution() returns # there is only one problem: what is the left-boundary? we will set it to -infinity. sub _stathash2bins { my $self = $_[0]; my $H = $_[1]; # hashref: exactly what Statistics::Descriptive::frequency_distribution() returns my @B = (); my @C = (); my @K = keys %$H; $self->{'histo'}->{'num-bins'} = scalar(@K); my ($acount, $akey); push(@B, -(~0 >> 1)); # -MAX_INT fuck you. foreach my $k (sort { $a <=> $b } keys %$H){ push(@B, $k); push(@C, $H->{$k}); } $self->{'histo'}->{'bins'}->{'b'} = \@B; $self->{'histo'}->{'bins'}->{'c'} = \@C; $self->{'histo'}->{'needs-recalculate'} = 1; } # compares the structure of the histograms of us and another obj # if histograms have same number of bins and same bin-specs (boundaries) # then histograms are equal and returns 1 # if both histograms contain zero bins (not initialised) then also returns 1 # else, histogram structure differs and returns 0 sub _equals_histograms_structure { my ($self, $other) = @_; my $NB1 = $self->{'histo'}->{'num-bins'}; if( $NB1 != $other->{'histo'}->{'num-bins'} ){ return 0 } # no bins, so equal! if( $NB1 == -1 ){ return 1 } my $b1 = $self->{'histo'}->{'bins'}->{'b'}; my $b2 = $other->{'histo'}->{'bins'}->{'b'}; for(my $i=$NB1+1;$i-->0;){ if( $b1->[$i] != $b2->[$i] ){ return 0 } } return 1 # equal histogram STRUCTURES (not bincounts) } 1; __END__ # end program, below is the POD =pod =encoding UTF-8 =head1 NAME Statistics::Running - Basic descriptive statistics (mean/stdev/min/max/skew/kurtosis) and discrete Probability Distribution (via histogram) over data without the need to store data points ever. OOP style. =head1 VERSION Version 0.11 =head1 SYNOPSIS use Statistics::Running; my $ru = Statistics::Running->new(); for(1..100){ $ru->add(rand()); } print "mean: ".$ru->mean()."\n"; $ru->add(12345); print "mean: ".$ru->mean()."\n"; my $ru2 = Statistics::Running->new(); $ru2->histogram_enable({ 'num-bins' => 10, 'bin-width' => 0.01, 'left-boundary' => 0 }); for(1..100){ $ru2->add(rand()); } print "Probability Distribution of data:\n".$ru2->histogram_stringify()."\n"; # add two stat objects together (histograms are not!) my $ru3 = $ru + $ru2; print "mean of concatenated data: ".$ru3->mean()."\n"; $ru += $ru2; print "mean after appending data: ".$ru->mean()."\n"; print "stats: ".$ru->stringify()."\n"; # example output: print $ru2."\n"; N: 100, mean: 0.488978434779093, range: 0.0056063539679414 to 0.99129297226348, standard deviation: 0.298129905728534, kurtosis: -1.22046199974301, skewness: -0.0268827866000826, histogram: 0.000 - 0.010: 2 ##################################################### 0.010 - 0.020: 2 ##################################################### 0.020 - 0.030: 2 ##################################################### 0.030 - 0.041: 2 ##################################################### 0.041 - 0.051: 1 ########################### 0.051 - 0.062: 0 | 0.062 - 0.073: 2 ##################################################### 0.073 - 0.083: 0 | 0.083 - 0.094: 1 ########################### =head1 DESCRIPTION Statistics are updated every time a new data point is added in. The common practice to calculate descriptive statistics for 5 data points as well as 1 billion points is to store them in an array, loop over the array to calculate the mean, then loop over the array again to calculate standard deviation, as Sum (x_i-mean)**2. Standard deviation is the reason data is stored in the array. This module uses B.P.Welford's method to calculate descriptive statistics by continually adjusting the stats and not storing a single data point. Except from the computational and environmental benefits of such an approach, B.P.Welford's method is also immune to accumulated precision errors. It is stable and accurate. For more details on the method and its stability look at this: L A version without the histogram exists under L and is faster, obviously. About 25% faster. There are three amazing things about B.P.Welford's algorithm implemented here: =over 4 =item 1. It calculates and keeps updating mean/standard-deviation etc. on data without the need to store that data. As new data comes in, the statistics are updated based on the state of a few variables (mean, number of data points, etc.) but not the past data points. This includes the calculation of standard deviation which most of us knew (wrongly) that it requires a second pass on the data points, after the mean is calculated. Well, B.P.Welford found a way to avoid this. =item 2. The standard formula for standard deviation requires to sum the square of the difference of each sample from the mean. If samples are large numbers then you are summing differences of large numbers. If further there is little difference between samples, and the discrepancy from the mean is small, then you are prone to precision errors which accumulate to destructive effect if the number of samples is large. In contrast, B.P.Welford's algorithm does not suffer from this, it is stable and accurate. =item 3. B.P.Welford's online statistics algorithm is quite a revolutionary idea and why is not an obligatory subject in first-year programming courses is beyond comprehension. Here is a way to decrease those CO2 emissions. =back The basis for the code in this module comes from L =head1 EXPORT Nothing, this is an Object Oriented module. Once you instantiate an object all its methods are yours. =head1 SUBROUTINES/METHODS =head2 new Constructor, initialises internal variables. =head2 add Update our statistics after one more data point/sample (or an array of them) is presented to us. my $ru1 = Statistics::Running->new(); for(1..100){ $ru1->add(rand()); print $ru1."\n"; } Input can be a single data point (a scalar) or a reference to an array of data points. =head2 copy_from Copy state of input object into current effectively making us like them. Our previous state is forgotten. After that adding a new data point into us will be with the new state copied. my $ru1 = Statistics::Running->new(); for(1..100){ $ru1->add(rand()); } my $ru2 = Statistics::Running->new(); for(1..100){ $ru2->add(rand()); } # copy the state of ru1 into ru2. state of ru1 is forgotten. $ru2->copy_from($ru1); =head2 clone Clone state of our object into a newly created object which is returned. Our object and returned object are identical at the time of cloning. my $ru1 = Statistics::Running->new(); for(1..100){ $ru1->add(rand()); } my $ru2 = $ru1->clone(); =head2 clear Clear our internal state as if no data points have ever been added into us. As if we were just created. All state is forgotten and reset to zero, including histogram. =head2 mean Returns the mean of all the data pushed in us =head2 sum Returns the sum of all the data pushed in us (algebraic sum, not absolute sum) =head2 abs_sum Returns the sum of the absolute value of all the data pushed in us (this is not algebraic sum) =head2 min Returns the minimum data sample added in us =head2 max Returns the maximum data sample added in us =head2 get_N Returns the number of data points/samples inserted, and had their descriptive statistics calculated, so far. =head2 variance Returns the variance of the data points/samples added onto us so far. =head2 standard_deviation Returns the standard deviation of the data points/samples added onto us so far. This is the square root of the variance. =head2 skewness Returns the skewness of the data points/samples added onto us so far. =head2 kurtosis Returns the kurtosis of the data points/samples added onto us so far. =head2 concatenate Concatenates our state with the input object's state and returns a newly created object with the combined state. Our object and input object are not modified. The overloaded symbol '+' points to this sub. =head2 append Appends input object's state into ours. Our state is modified. (input object's state is not modified) The overloaded symbol '+=' points to this sub. =head2 histogram_enable Enables histogram logging by creating a histogram with specified parameters. These parameters can be of different formats: my $ru1 = Statistics::Running->new(); $ru1->histogram_enable({ 'num-bins' => 10, 'bin-width' => 0.01, 'left-boundary' => 0 }); # or, 2 bins: 0-1 and 1-2 $ru1->histogram_enable({ '0:1' => 0, '1:2' => 1, }); # or, 2 bins: 0-1 and 1-2 $ru1->histogram_enable([0,1,2]); =over 3 =item 1. by specifying the number of bins, bin-width and left boundary as a parameters hash, e.g. C< $ru->enable_histogram({'num-bins'=>5, 'bin-width'=>1, 'left-boundary'=>-2}); > =item 2. by specifying a HASH where keys are 'FROM:TO' and values are the bin counts, which can be zero, or even a positive integer if you want to start with some counts already. =item 3. ARRAY of bin boundaries of the form C< [histo-left-boundary, bin1_right_boundary, bin2_right_boundary, ... binN-1_right_boundary, histo-right-boundary] > It follows that the number of bins will be 1 less than the length of this array. =back =head2 histogram_disable Disable histogram logging, all existing histogram data is erased. Number of bins is forgotten, along with bin boundaries, etc. =head2 histogram_reset Set existing histogram to zero counts. =head2 histogram_count Returns the count in bin specified by bin index (which is 0 to number-of-bins - 1) =head2 equals Check if our state (number of samples and all internal state) is the same with input object's state. Equality here implies that ALL statistics are equal (within a small number Statistics::Running::SMALL_NUMBER_FOR_EQUALITY) =head2 equals_statistics Check if our statistics only (and not sample size) are the same with input object. E.g. it checks mean, variance etc. but not sample size (as with the real equals()). It returns 0 on non-equality. 1 if equal. =head2 equals_histograms Check if our histogram only (and not statitstics) are the same with input object. It returns 0 on non-equality. 1 if equal. =head2 stringify Returns a string description of descriptive statistics we know about (mean, standard deviation, kurtosis, skewness) as well as the number of data points/samples added onto us so far. Note that this method is not necessary because stringification is overloaded and the follow C< print $stats_obj."\n" > is equivalent to C< print $stats_obj->stringify()."\n" > =head1 Overloaded functionality =over 3 =item 1. Addition of two statistics objects: C< my $ru3 = $ru1 + $ru2 > =item 2. Test for equality: C< if( $ru2 == $ru3 ){ ... } > =item 3. Stringification: C< print $ru1."\n" > =back =head1 Testing for Equality In testing if two objects are the same, their means, standard deviations etc. are compared. This is done using C< if( ($self->mean() - $other->mean()) < Statistics::Running::SMALL_NUMBER_FOR_EQUALITY ){ ... } > =head1 BENCHMARKS Run C< make bench > for benchmarks which report the maximum number of data points inserted per second (in your system). =head1 SEE ALSO =over 4 =item 1. L =item 2. L was used both as inspiration and as the basis for the formulas for C< kurtosis() > and C< skewness() > =item 3. L This module is equivalent but it does not provide C< kurtosis() > and C< skewness() > which current module does. Additionally, current module builds a Histogram for inserted data as a discrete approximation of the Probability Distribution data comes from. =item 4. L This is the exact same module but without histogram capabilities. That makes it a bit faster than current module only when data is inserted. Space-wise, the histogram does not take much space. It is just an array of bins and the number of items (not the original data items themselves!) it contains. Run C< make bench > to get a report on the maximum number of data point insertions per unit time in your system. L is approximately 25% faster than this module. =back =head1 AUTHOR Andreas Hadjiprocopis, C<< >> =head1 BUGS Please report any bugs or feature requests to C, or through the web interface at L. I will be notified, and then you'll automatically be notified of progress on your bug as I make changes. =head1 SUPPORT You can find documentation for this module with the perldoc command. perldoc Statistics::Running You can also look for information at: =over 4 =item * RT: CPAN's request tracker (report bugs here) L =item * AnnoCPAN: Annotated CPAN documentation L =item * CPAN Ratings L =item * Search CPAN L =back =head1 DEDICATIONS Almaz =head1 ACKNOWLEDGEMENTS B.P.Welford, John Cook. =head1 LICENSE AND COPYRIGHT Copyright 2018-2019 Andreas Hadjiprocopis. This program is free software; you can redistribute it and/or modify it under the terms of the the Artistic License (2.0). You may obtain a copy of the full license at: L Any use, modification, and distribution of the Standard or Modified Versions is governed by this Artistic License. By using, modifying or distributing the Package, you accept this license. Do not use, modify, or distribute the Package, if you do not accept this license. If your Modified Version has been derived from a Modified Version made by someone other than you, you are nevertheless required to ensure that your Modified Version complies with the requirements of this license. This license does not grant you the right to use any trademark, service mark, tradename, or logo of the Copyright Holder. This license includes the non-exclusive, worldwide, free-of-charge patent license to make, have made, use, offer to sell, sell, import and otherwise transfer the Package with respect to any patent claims licensable by the Copyright Holder that are necessarily infringed by the Package. If you institute patent litigation (including a cross-claim or counterclaim) against any party alleging that the Package constitutes direct or contributory patent infringement, then this Artistic License to you shall terminate on the date that such litigation is filed. Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES. THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =cut