package Text::Original;
use 5.006; use strict; use warnings;
use Memoize;

=head1 NAME

Text::Original - Find original, non-quoted text in a message


    use Text::Original;
    my $sentence = first_sentence($email->body);



our @ISA = qw(Exporter);
our %EXPORT_TAGS = ( 'all' => [ qw( first_lines first_paragraph first_sentence) ] ); 
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
our @EXPORT = ( @{ $EXPORT_TAGS{'all'} } );

our $VERSION = '1.5';

=head2 first_lines

    first_lines($text, 20);

Returns the a number of lines after the first non blank, none quoted
line of the body of the email.

It will guess at attribution lines and skip them as well.

It will return super cited lines. This is the super-citers'
fault, not ours.

It won't catch all types of attribution lines;

It can optionally be passed a number of lines to get.


sub first_lines {
    my $text = shift;
    my $num  = shift || 1;

    return _significant_signal($text, lines => $num);

=head2 first_paragraph

Returns the first original paragraph of the message


sub first_paragraph {
    return _significant_signal(shift, para => 1);

=head2 first_sentence

Returns the first original sentence of the message


sub first_sentence {
    my $text = first_paragraph(shift);
    $text =~ s/(\p{STerm}) .*/$1/s; 
    return $text;

# Kudos to Damian Conway for this bit.
my $quotechar = qq{[!#%=|:]};
my $quotechunk = qq{(?:$quotechar(?![a-z])|[a-z]*>+)};
my $quoter = qq{(?:(?i)(?:$quotechunk(?:[ \\t]*$quotechunk)*))};

sub _significant_signal {
    my $text = shift;
    my %opts = @_;

    my $return = "";
    my $lines  = 0;

    # get all the lines from the main part of the body
    my @lines = split /$/m, $text;

    # right, find the start of the original content or quoted
    # content (i.e. skip past the attributation)
    my $not_started = 1;
    while (@lines && $not_started) {
        # next line
        local $_ = shift @lines;
        #print "}}$_";

        # blank lines, euurgh
        next if /^\s*$/;
        # quotes (we don't count quoted From's)
        next if /^\s*>(?!From)/;
        # Other kinds of quoter:
        next if /^\s*$quoter/;
        # skip obvious attribution
        next if /^\s*On (Mon|Tue|Wed|Thu|Fri|Sat|Sun)/i;
        next if /\d{4}-?\w{2,3}-?\d{2}.*\d+:\d+:\d+/i; # Looks like a date
        next if /^\w+(\s\w+)?:$/; # lathos' minimalist attributions. :)
        next if /^\s*.+=? wrote:/i;

        # skip signed messages
        next if /^\s*-----/;
        next if /^Hash:/;

        # annoying hi messages (this won't work with i18n)
        next if /^\s*(?:hello|hi|hey|greetings|salut
                        |good (?:morning|afternoon|day|evening))

        # snips
        next if m~\s*                          # whitespace
                  [<.=-_*+({\[]*?              # opening bracket
                  (?:snip|cut|delete|deleted)  # snip?
                  [^>}\]]*?                    # some words?
                  [>.=-_*+)}\]]*?              # closing bracket
                 \s*$                          # end of the line

        # [.. foo ..] or or so on
        next if m~\s*\[?\.\..*?\.\.]?\s*$~;

        # ... or [...]
        next if m~\s*\[?\.\.\.]?\s*$~;

        # if we got this far then we've probably got past the
        # attibutation lines
        unshift @lines, $_;  # undo the shift
        undef $not_started;  # and say we've started.

    # okay, let's _try_ to build up some content then
    foreach (@lines) {
        # are we at the end of a paragraph?
        last if (defined $opts{'para'}  # paragraph mode?
                 && $opts{'para'}==1
                 && $lines>0            # got some lines aready?
                 && /^\s*$/);           # and now we've found a gap?

        # blank lines, euurgh
        next if /^\s*$/;
        # quotes (we don't count quoted From's)
        next if /^\s*>(?!From)/;

        # if we got this far then the line was a useful one

        # sort of munged Froms
        $return .= "\n" if $lines>1;
        $return .= $_;
        last if (defined $opts{'lines'} && $opts{'lines'}==$lines);
    return $return;



=head1 EXPORTS

All of the above.

=head1 AUTHOR

Simon Wistow and the Mariachi project.

Packaged by Simon Cozens

Currently maintained by Simon Wistow <>


Copyright 2004 The Siesta Project

This library is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.