package Statistics::Histogram;
# ABSTRACT: Create a standard histogram for command-line display

use strict;
use warnings;

use Carp;
use Statistics::Descriptive;

use parent qw( Exporter );

our @EXPORT = qw( &get_histogram );
our $VERSION = '0.1';

use constant DEFAULT_BINS => 10;
use constant CHART_WIDTH  => 80;

##############################################################################
# get_histogram( $data, $num_bins, $use_linear_axes )
#
# - $data: Required, arrayref of numbers to chart
# - $num_bins: Optional integer, defaults to 10 bins
# - $use_linear_axes: Optional boolean, defaults to false for logarithmic axes
# - $use_integral_bins: Optional boolean, forces bins to be integers and linear axes
#
# - Returns a multiline string containing user-readable ascii histogram
#

sub get_histogram {
    my ($data, $num_bins, $use_linear_axes, $use_integral_bins) = @_;

    unless ( $use_integral_bins ) {
        $num_bins ||= DEFAULT_BINS;
    }

    croak "Can't create histogram: no data\n" unless @$data;

    my $stats = Statistics::Descriptive::Full->new();
    $stats->add_data(@$data);

    my $return = '';

    # Display some useful statistics at the top of the chart.

    $return .= "Count: " . $stats->count . "\n";
    $return .= sprintf "Range: %6.3f - %6.3f; Mean: %6.3f; Median: %6.3f; Stddev: %6.3f\n",
                $stats->min,
                $stats->max,
                $stats->mean,
                $stats->median,
                $stats->standard_deviation;

    $return .= sprintf "Percentiles:  90th: %6.3f; 95th: %6.3f; 99th: %6.3f\n",
                scalar($stats->percentile(90)),
                scalar($stats->percentile(95)),
                scalar($stats->percentile(99));


    # Calculate the histogram data. If the caller wants logarithmic axes,
    # first calculate the natural log of each value (+1 to work around 
    # zero values.)
    
    my %hist;

    if ( $use_integral_bins ) {
        $use_linear_axes = 1;
        my $min = $stats->min;
        my $max = $stats->max;
        my @bins;
        if ( !defined $num_bins ) {
            @bins = ( $min .. $max );
        }
        else {
            my $step_size = int( ($max-$min+1) / $num_bins );
            for ( my $i=$min; $i<$max; $i += $step_size ) {
                push @bins, $i;
            }
            push @bins, $max;
        }
        %hist = $stats->frequency_distribution(\@bins);
    }
    elsif ( $use_linear_axes ) {
        %hist = $stats->frequency_distribution($num_bins);
    }
    else {
        my $stats_log = Statistics::Descriptive::Full->new();
        $stats_log->add_data(map { log (1+$_) } grep { $_ > 0 } @$data);
        %hist = $stats_log->frequency_distribution($num_bins);
    }

    # Generate the chart
    
    $return .= print_histogram(
        hist  => \%hist, 
        x_min => $stats->min, 
        use_linear_axes => $use_linear_axes, 
        use_integral_bins => $use_integral_bins,
        chart_width => (CHART_WIDTH)[0],
    );

    return $return;
}

##############################################################################
# print_histogram( %args )
#
# - hist: Required hashref of histogram data from frequency_distribution()
# - x_min: Required value of lowest X value in original data, used for label
#           on first bin
# - use_linear_axes: Required boolean to choose linear vs logarithmic axes
# - use_integral_bins: Force bins to be integers and axes to be linear.
# - chart_width: Optional integer for max width of chart in characters, 
#                 defaults to 80.
#

sub print_histogram {
    my %args = (
        use_linear_axes => 0,
        use_integral_bins => 0,
        chart_width => 80,
        @_
    );

    my $hist = $args{hist};

    my @bins = sort { $a <=> $b } keys %$hist;

    my $ymax = 0;
    foreach my $bin (@bins) {
        $ymax = $hist->{$bin} if $ymax < $hist->{$bin};
    }

    if ($ymax == 0) {
        croak "Can't create histogram: no data\n";
    }

    # Max bar width is 27 characters less than chart width, for labels.
    $args{chart_width} = 28 if $args{chart_width} < 28;
    my $y_scale = ($args{chart_width} - 27) / $ymax;

    my $return = '';

    for my $i (0 .. $#bins) {

        my $y = $hist->{$bins[$i]} * $y_scale + 0.5;

        my $bar;
        if ($y < 0.001) {
            $bar = '';
        } elsif ($y < 1) {
            $bar = '|';
        } else {
            $bar = '#' x $y;
        }

        my ($x_low, $x_high);

        if ( $args{use_linear_axes} ) {
            $x_low = ( $i == 0 ? $args{x_min} : $bins[$i-1] );
            $x_high = $bins[$i];
        }
        else {
            # Subtract 1 from each exp() because we added 1 when generating 
            # the logarithmic data
            $x_low = ($i == 0 ? $args{x_min} : (exp $bins[$i-1])-1);
            $x_high = (exp $bins[$i])-1;
        }

        my $num_f = $args{use_integral_bins} ? '%8d' : '%8.3f';
        my $epsilon = $args{use_integral_bins} ? 1.001 : 0.001;

        if ( ( $x_high-$x_low ) < $epsilon ) {
            $return .= sprintf "           $num_f: %5d %s\n",
                        $x_high,
                        $hist->{$bins[$i]},
                        $bar;
        }
        else {
            $return .= sprintf "$num_f - $num_f: %5d %s\n",
                        $x_low,
                        $x_high,
                        $hist->{$bins[$i]},
                        $bar;
        }
    }

    return $return;
}

1;



=pod

=head1 NAME

Statistics::Histogram - Create a standard histogram for command-line display

=head1 VERSION

version 0.2

=head1 SYNOPSIS

  use Statistics::Histogram;

  my @data = <>;
  chomp @data;

  print get_histogram(\@data);

=head1 DESCRIPTION

This module exports a single routine, get_histogram, which expects
an array reference as its only required argument. The array should contain
a sequence of numbers, and the response will be an ascii-formatted
histogram, including some header lines providing statistics.

=head1 METHODS

=head2 get_histogram

  print get_histogram($array_ref);
  print get_histogram($array_ref, $num_bins, $use_linear_axes, $use_integral_bins);

There are three optional arguments: $num_bins, $use_linear_axes, and use_integral_bins.

=over 4

=item num_bins

$num_bins defaults to 10, and controls the maximum number of bins in the chart. Depending on the data, there may be fewer bins if there are fewer than $num_bins unique values.

=item use_linear_axes

$use_linear_axes defaults to false, which will create a chart with logarithmic axes. This is most useful for data derived from software timing metrics, which tend to be non-Normal and biased towards the axes. 

=item use_integral_bins

$use_integral_bins defaults to false. If true it forces use_linear_axes and makes the bins fall on integral values. This is good for plotting time-series data, like the number of events in each hour of the day.

=back

=head1 SEE ALSO

=over 4

=item *

L<Statistics::Descriptive>

=back

=head1 AUTHOR

Douglas Webb <doug.webb@gmail.com>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2005 by Douglas Webb.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut


__END__