use strict;
use warnings;
package Email::Folder::Mbox;
{
  $Email::Folder::Mbox::VERSION = '0.860';
}
# ABSTRACT: reads raw RFC822 mails from an mbox file

use Carp;
use IO::File;
use Email::Folder::Reader;
use parent 'Email::Folder::Reader';


sub defaults {
    ( eol => "\n")
}

sub _open_it {
    my $self = shift;
    my $file = $self->{_file};
    my $fh = $self->{fh};

    unless ($file eq "FH" and $fh) {
        # sanity checking
        croak "$file does not exist" unless (-e $file);
        croak "$file is a directory" if (-d $file);

        local $/ = $self->{eol};
        $fh = $self->_get_fh($file);
    }

    if (seek $fh, tell($fh), 0) {
        # Enable using seek only if $fh is seekable
        $self->{seekable} = 1;
    } else {
        # Otherwise use cache for simulating backward seeks
        $self->{cache} = [];
    }

    if ($self->{seek_to}) {
        unless ($self->{seekable}) {
            croak "$file is not seekable but seek_to was set";
        }
        # we were told to seek.  hope it all goes well
        seek $fh, $self->{seek_to}, 0;
    }
    else {
        local $/ = $self->{eol};
        my $firstline = <$fh>;
        if ($firstline) {
            croak "$file is not an mbox file" unless $firstline =~ /^From /;
        }
        $self->{from} = $firstline;
    }

    $self->{_fh} = $fh;
}

sub _get_fh {
    my $self = shift;
    my $file = shift;
    my $fh = IO::File->new($file) or croak "Cannot open $file";
    binmode($fh);
    return $fh;
}

sub _read_nextline {
    my $self = shift;
    if (not $self->{seekable} and @{$self->{cache}}) {
        return shift @{$self->{cache}};
    }
    my $fh = $self->{_fh};
    return <$fh>;
}

use constant debug => 0;
my $count;

sub next_from {
    my $self = shift;
    $self->_open_it unless $self->{_fh};
    return $self->{from};
}

sub next_messageref {
    my $self = shift;

    my $fh = $self->{_fh} || $self->_open_it;
    local $/ = $self->{eol};

    $self->{messageid} = undef;

    my $mail = '';
    my $prev = '';
    my $last;
    my $inheaders = 1;
    ++$count;
    print "$count starting scanning at line $.\n" if debug;

    while (my $line = _read_nextline($self)) {
        if ($line eq $/ && $inheaders) { # end of headers
            print "$count end of headers at line $.\n" if debug;
            $inheaders = 0; # stop looking for the end of headers
            my $pos; # where to go back to if it goes wrong
            $pos = tell $fh if $self->{seekable};

            # look for a content length header, and try to use that
            if ($mail =~ m/^Content-Length:\s*(\d+)$/mi) {
                my @cache;
                $mail .= $prev;
                $prev = '';
                my $length = $1;
                print " Content-Length: $length\n" if debug;
                my $read = '';
                while (my $bodyline = _read_nextline($self)) {
                    push @cache, $bodyline unless $self->{seekable};
                    last if length $read >= $length;
                    # unescape From_
                    $bodyline =~ s/^>(>*From )/$1/ if $self->{unescape};
                    $read .= $bodyline;
                }
                # grab the next line (should be /^From / or undef)
                my $next = _read_nextline($self);
                if (!defined $next || $next =~ /^From /) {
                    $self->{from} = $next;
                    $mail .= "$/$read";
                    return \$mail;
                }
                push @cache, $next unless $self->{seekable};
                # seek back and scan line-by-line like the header
                # wasn't here
                print " Content-Length assertion failed '$next'\n" if debug;
                if ($self->{seekable}) {
                    seek $fh, $pos, 0;
                }
                else {
                    unshift @{$self->{cache}}, @cache;
                }
            }

            # much the same, but with Lines:
            if ($mail =~ m/^Lines:\s*(\d+)$/mi) {
                my @cache;
                $mail .= $prev;
                $prev = '';
                my $lines = $1;
                print " Lines: $lines\n" if debug;
                my $read = '';
                for (1 .. $lines) {
                    my $bodyline = _read_nextline($self);
                    last unless defined $bodyline;
                    push @cache, $bodyline unless $self->{seekable};
                    # unescape From_
                    $bodyline =~ s/^>(>*From )/$1/ if $self->{unescape};
                    $read .= $bodyline;
                }
                my $ign = _read_nextline($self); # trailing newline
                my $next = _read_nextline($self);
                if (!defined $next || $next =~ /^From /) {
                    $self->{from} = $next;
                    $mail .= "$/$read";
                    return \$mail;
                }
                push @cache, $ign, $next unless $self->{seekable};
                # seek back and scan line-by-line like the header
                # wasn't here
                print " Lines assertion failed '$next'\n" if debug;
                if ($self->{seekable}) {
                    seek $fh, $pos, 0;
                }
                else {
                    unshift @{$self->{cache}}, @cache;
                }
            }
        }

        if ($prev eq $/ && ($line =~ $self->_from_line_re)) {
            $last = $line;
            last;
        }

        if ($inheaders && !defined $self->{messageid} && ($line =~ /^Message-Id:\s*(.+)/i)) {
            $self->{messageid} = $1;
        }

        $mail .= $prev;
        $prev = $line;

        # unescape From_
        $prev =~ s/^>(>*From )/$1/ if $self->{unescape};
    }
    $mail .= $prev;
    print "$count end of message line $.\n" if debug;
    $self->{from} = $last;
    return unless $mail;
    return \$mail;
}

sub next_message {
    my $self = shift;
    my $ref = $self->next_messageref;
    return unless $ref;
    return ${$ref};
}

my @FROM_RE;
BEGIN {
  @FROM_RE = (
    # according to mutt:
    #   A valid message separator looks like:
    #   From [ <return-path> ] <weekday> <month> <day> <time> [ <tz> ] <year>
    qr/^From (?:\S+\s+)?(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)/,

    # though, as jwz rants, only this is reliable and portable
    qr/^From /,
  );
}

sub _from_line_re {
  return $FROM_RE[ $_[0]->{jwz_From_} ? 1 : 0 ];
}

sub tell {
    my $self = shift;
    return tell $self->{_fh};
}

sub messageid {
    my $self = shift;
    return $self->{messageid};
}

1;

__END__
=pod

=encoding UTF-8

=head1 NAME

Email::Folder::Mbox - reads raw RFC822 mails from an mbox file

=head1 VERSION

version 0.860

=head1 SYNOPSIS

This isa Email::Folder::Reader - read about its API there.

=head1 DESCRIPTION

Does exactly what it says on the tin - fetches raw RFC822 mails from an
mbox.

The mbox format is described at http://www.qmail.org/man/man5/mbox.html

We attempt to read an mbox as through it's the mboxcl2 variant,
falling back to regular mbox mode if there is no C<Content-Length>
header to be found.

=head2 OPTIONS

The new constructor takes extra options.

=over

=item C<fh>

When filename is set to C<"FH"> than Email::Folder::Mbox will read mbox
archive from filehandle C<fh> instead from disk file C<filename>.

=item C<eol>

This indicates what the line-ending style is to be.  The default is
C<"\n">, but for handling files with mac line-endings you would want
to specify C<eol =E<gt> "\x0d">

=item C<jwz_From_>

The value is taken as a boolean that governs what is used match as a
message separator.

If false we use the mutt style

 /^From \S+\s+(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)/
 /^From (?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)/;

If true we use

 /^From /

In deference to this extract from L<http://www.jwz.org/doc/content-length.html>

  Essentially the only safe way to parse that file format is to
  consider all lines which begin with the characters ``From ''
  (From-space), which are preceded by a blank line or
  beginning-of-file, to be the division between messages.  That is, the
  delimiter is "\n\nFrom .*\n" except for the very first message in the
  file, where it is "^From .*\n".

  Some people will tell you that you should do stricter parsing on
  those lines: check for user names and dates and so on.  They are
  wrong.  The random crap that has traditionally been dumped into that
  line is without bound; comparing the first five characters is the
  only safe and portable thing to do. Usually, but not always, the next
  token on the line after ``From '' will be a user-id, or email
  address, or UUCP path, and usually the next thing on the line will be
  a date specification, in some format, and usually there's nothing
  after that.  But you can't rely on any of this.

Defaults to false.

=item C<unescape>

This boolean value indicates whenever lines which starts with

 /^>+From /

should be unescaped (= removed leading '>' char). This is needed for
mboxrd and mboxcl variants. But there is no way to detect for used mbox
variant, so default value is false.

=item C<seek_to>

Seek to an offset when opening the mbox.  When used in combination with
->tell you may be able to resume reading, with a trailing wind.

=item C<next_message>

This returns next message as string

=item C<next_messageref>

This returns next message as ref to string

=item C<tell>

This returns the current filehandle position in the mbox.

=item C<next_from>

This returns the From_ line for next message. Call it before ->next_message.

=item C<messageid>

This returns the messageid of last read message. Call if after ->next_message.

=back

=head1 AUTHORS

=over 4

=item *

Simon Wistow <simon@thegestalt.org>

=item *

Richard Clamp <richardc@unixbeard.net>

=item *

Pali <pali@cpan.org>

=back

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2006 by Simon Wistow.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut