package Text::BibTeX::Validate;

use strict;
use warnings;

# ABSTRACT: validator for BibTeX format
our $VERSION = '0.3.0'; # VERSION

use Algorithm::CheckDigits;
use Data::Validate::Email qw( is_email_rfc822 );
use Data::Validate::URI qw( is_uri );
use Scalar::Util qw( blessed );
use Text::BibTeX::Validate::Warning;

require Exporter;
our @ISA = qw( Exporter );
our @EXPORT_OK = qw(
    clean_BibTeX
    shorten_DOI
    validate_BibTeX
);

my @months = qw(
    january
    february
    march
    april
    may
    june
    july
    august
    september
    october
    november
    december
);

=head1 NAME

Text::BibTeX::Validate - validator for BibTeX format

=head1 SYNOPSIS

    use Text::BibTeX;
    use Text::BibTeX::Validate qw( validate_BibTeX );

    my $bibfile = Text::BibTeX::File->new( 'bibliography.bib' );
    while( my $entry = Text::BibTeX::Entry->new( $bibfile ) ) {
        for my $warning (validate_BibTeX( $entry )) {
            print STDERR "$warning\n";
        }
    }

=head1 DESCRIPTION

Text::BibTeX::Validate checks the standard fields of BibTeX entries for
their compliance with their format. In particular, value of C<email> is
checked against RFC 822 mandated email address syntax, value of C<doi>
is checked to start with C<10.> and contain at least one C</> and so on.
Some nonstandard fields as C<isbn>, C<issn> and C<url> are also checked.
Failures of checks are returned as instances of
L<Text::BibTeX::Validate::Warning|Text::BibTeX::Validate::Warning>.

=head1 METHODS

=cut

sub shorten_DOI($);

=head2 validate_BibTeX( $what )

Takes plain Perl hash reference containing BibTeX fields and their
values, as well as L<Text::BibTeX::Entry|Text::BibTeX::Entry> instances
and returns an array of validation messages as instances of
L<Text::BibTeX::Validate::Warning|Text::BibTeX::Validate::Warning>.

=cut

sub validate_BibTeX
{
    my( $what ) = @_;
    my $entry = _convert( $what );

    my @warnings;

    # Report and remove empty keys
    for my $key (sort keys %$entry) {
        next if defined $entry->{$key};
        push @warnings,
             _warn_value( 'undefined value', $entry, $key );
        delete $entry->{$key};
    }

    if( exists $entry->{email} &&
        !defined is_email_rfc822 $entry->{email} ) {
        push @warnings,
             _warn_value( 'value \'%(value)s\' does not look like valid ' .
                          'email address',
                          $entry,
                          'email' );
    }

    if( exists $entry->{doi} ) {
        my $doi = $entry->{doi};
        my $doi_now = shorten_DOI $doi;

        if( $doi_now !~ m|^10\.[^/]+/| ) {
            push @warnings,
                 _warn_value( 'value \'%(value)s\' does not look like valid DOI',
                              $entry,
                              'doi' );
        } elsif( $doi ne $doi_now ) {
            push @warnings,
                 _warn_value( 'value \'%(value)s\' is better written as \'%(suggestion)s\'',
                              $entry,
                              'doi',
                              { suggestion => $doi_now } );
        }
    }

    # Validated according to BibTeX recommendations
    if( exists $entry->{month} ) {
        if( $entry->{month} =~ /^0?[1-9]|1[12]$/ ) {
            push @warnings,
                 _warn_value( 'value \'%(value)s\' is better written as \'%(suggestion)s\'',
                              $entry,
                              'month',
                              { suggestion => ucfirst substr( $months[$entry->{month}-1], 0, 3 ) } );
        } elsif( grep { lc $entry->{month} eq $_ && length $_ > 3 } @months ) {
            push @warnings,
                 _warn_value( 'value \'%(value)s\' is better written as \'%(suggestion)s\'',
                              $entry,
                              'month',
                              { suggestion => ucfirst substr( $entry->{month}, 0, 3 ) } );
        } elsif( !(grep { lc $entry->{month} eq substr( $_, 0, 3 ) ||
                          lc $entry->{month} eq substr( $_, 0, 3 ) . '.' } @months) ) {
            push @warnings,
                 _warn_value( 'value \'%(value)s\' does not look like valid month',
                              $entry,
                              'month' );
        }
    }

    if( exists $entry->{year} ) {
        # Sometimes bibliographies list the next year to show that they
        # are going to be published soon.
        my @localtime = localtime;
        if( $entry->{year} !~ /^[0-9]{4}$/ ) {
            push @warnings,
                 _warn_value( 'value \'%(value)s\' does not look like valid year',
                              $entry,
                              'year' );
        } elsif( $entry->{year} > $localtime[5] + 1901 ) {
            push @warnings,
                 _warn_value( 'value \'%(value)s\' is too far in the future',
                              $entry,
                              'year' );
        }
    }

    # Both keys are nonstandard
    for my $key ('isbn', 'issn') {
        next if !exists $entry->{$key};
        my $check = CheckDigits( $key );
        if( $key eq 'isbn' ) {
            my $value = $entry->{$key};
            $value =~ s/-//g;
            if( length $value == 13 ) {
                $check = CheckDigits( 'isbn13' );
            }
        }
        next if $check->is_valid( $entry->{$key} );
        push @warnings,
             _warn_value( 'value \'%(value)s\' does not look like valid %(FIELD)s',
                          $entry,
                          $key,
                          { FIELD => uc $key } );
    }

    # Both keys are nonstandard
    for my $key ('eprint', 'url') {
        next if !exists $entry->{$key};
        next if defined is_uri $entry->{$key};

        if( $entry->{$key} =~ /^(.*)\n$/ && defined is_uri $1 ) {
            # BibTeX converted from YAML (i.e., Debian::DEP12) might
            # have trailing newline character attached.
            push @warnings,
                 _warn_value( 'URL has trailing newline character',
                              $entry,
                              $key,
                              { suggestion => $1 } );
            next;
        }

        push @warnings,
             _warn_value( 'value \'%(value)s\' does not look like valid URL',
                          $entry,
                          $key );
    }

    # Nonstandard
    if( exists $entry->{pmid} ) {
        if( $entry->{pmid} =~ /^PMC[0-9]{7}$/ ) {
            push @warnings,
                 _warn_value( 'PMCID \'%(value)s\' is provided instead of PMID',
                              $entry,
                              'pmid' );
        } elsif( $entry->{pmid} !~ /^[1-9][0-9]*$/ ) {
            push @warnings,
                 _warn_value( 'value \'%(value)s\' does not look like valid PMID',
                              $entry,
                              'pmid' );
        }
    }

    return @warnings;
}

=head2 clean_BibTeX( $what )

Takes the same input as C<validate_BibTeX> and attempts to reconcile
trivial issues like dropping the resolver URL part of DOIs (see
C<shorten_DOI> method) and converting month numbers into three-letter
abbreviations.

=cut

sub clean_BibTeX
{
    my( $what ) = @_;
    my $entry = _convert( $what );

    # Deleting undefined values prior to the validation
    for (keys %$entry) {
        delete $entry->{$_} if !defined $entry->{$_};
    }

    my @warnings = validate_BibTeX( $entry );
    my @suggestions = grep { $_->{suggestion} } @warnings;

    for my $suggestion (@suggestions) {
        $entry->{$suggestion->{field}} = $suggestion->{suggestion};
    }

    return $entry;
}

=head2 shorten_DOI( $doi )

Remove the resolver URL part, as well as C<doi:> prefixes, from DOIs.

=cut

sub shorten_DOI($)
{
    my( $doi ) = @_;

    return $doi if $doi =~ s|^https?://(dx\.)?doi\.org/||;
    return $doi if $doi =~ s|^doi:||;
    return $doi;
}

sub _convert
{
    my( $what ) = @_;

    if( blessed $what && $what->isa( 'Text::BibTeX::Entry' ) ) {
        $what = { map { $_ => $what->get($_) } $what->fieldlist };
    }

    # TODO: check for duplicated keys
    return { map { lc $_ => $what->{$_} } keys %$what };
}

sub _warn_value
{
    my( $message, $entry, $field, $extra ) = @_;
    $extra = {} unless $extra;
    return Text::BibTeX::Validate::Warning->new(
            $message,
            { field => $field,
              value => $entry->{$field},
              %$extra } );
}

=head1 AUTHORS

Andrius Merkys, E<lt>merkys@cpan.orgE<gt>

=cut

1;