use strict;
use warnings;
package HTML::SummaryBasic;
our $VERSION = 0.2;
=head1 NAME
HTML::SummaryBasic - Basic summary info from HTML.
=head1 SYNOPSIS
use HTML::SummaryBasic;
my $p = new HTML::SummaryBasic {
PATH => "input.html",
# or HTML => '<html>...</html>',
NOT_AVAILABLE => undef,
};
foreach (keys %{$p->{SUMMARY}}){
warn "$_ ... $p->{SUMMARY}->{$_}\n";
}
=head1 DEPENDENCIES
use HTML::TokeParser;
use HTML::HeadParser;
=cut
use Carp;
use HTML::TokeParser;
=head1 DESCRIPTION
From a file or string of HTML, creates a hash of useful summary information from C<meta> and C<body> elements of an HTML document.
=head1 GLOBAL VARIABLE
=item $NOT_AVAILABLE
Value for empty fields. Default is C<[Not Available]>. May be over-ridden directly by supplying the constructor with a field of the same name.
See L<THE SUMMARY STRUCTURE>.
=cut
our $NOT_AVAILABLE = '[Not available]';
=head1 CONSTRUCTOR (new)
Accepts a hash-like structure...
=over 4
=item HTML or PATH
Ref to a scalar of HTML, or plain string that is the path to an HTML file to process.
=item SUMMARY
Filled after C<get_summary> is called (see L<METHOD get_summary> and
L<THE SUMMARY STRUCTURE>).
=item FIELDS
An array of C<meta> tag C<name>s whose C<content> value should be
placed into the respective slots of the C<SUMMARY> field after
C<get_summary> has been called.
=back
=head2 THE SUMMARY STRUCTURE
A field of the object which is a hash, with key/values as follows:
=over 4
=item AUTHOR
HTML C<meta> tag C<X-META-AUTHOR>.
=item TITLE
Text of the element of the same name.
=item DESCRIPTION
Content of the C<meta> tag named C<X-META-DESCRIPTION>.
=item LAST_MODIFIED_META, LAST_MODIFIED_FILE
Time since of the modification of the file,
respectively according to any C<meta> tag of the same name,
with a C<X-META-> prefix; failing that, according to the file system.
=item CREATED_META, CREATED_FILE
As above, but relating to the creation date of the file.
=item FIRST_PARA
The first HTML C<p> element of the document.
=item HEADLINE
The first C<h1> tag; failing that, the first C<h2>; failing that,
the value of C<$NOT_AVAILABLE>.
=item PLUS...
Any meta-fields specified in the C<FIELDS> field.
=back
=cut
sub new {
my $class = shift;
$class = ref($class)? ref($class) : $class;
my $self = bless {}, $class;
my $args = ref($_[0])? shift : {@_};
# Defaults
$self->{SUMMARY} = {};
# Load parameters
$self->{uc $_} = $args->{$_} foreach keys %$args;
croak "Required parameter field missing : $_" if not $self->{PATH};
$self->_get_summary();
return $self;
}
sub _get_summary {
my ($self,$path) = @_;
my ($p,$token, $html);
if (defined $path){
if (ref $path){
$html = $path;
delete $self->{PATH};
} else {
$self->{PATH} = $path;
}
}
if ($self->{PATH}){
$html = $self->_load_file()
or return undef;
}
# Get first para
if (not $p = new HTML::TokeParser( $html ) ){
warn "HTML::TokeParser could not initiate: $!";
return undef;
}
if ($token = $p->get_tag('h1')){
$self->{SUMMARY}->{HEADLINE} = $p->get_trimmed_text;
} else {
$p = new HTML::TokeParser( $html );
if ($token = $p->get_tag('h2')){
$self->{SUMMARY}->{HEADLINE} = $p->get_trimmed_text;
} else {
$self->{SUMMARY}->{HEADLINE} = $self->{NOT_AVAILABLE};
}
}
if (not $p = new HTML::TokeParser( $html ) ){
warn "HTML::TokeParser could not initiate: $!";
return undef;
}
if ($token = $p->get_tag('p')){
$self->{SUMMARY}->{FIRST_PARA} = $p->get_trimmed_text;
} else {
$self->{SUMMARY}->{FIRST_PARA} = $self->{NOT_AVAILABLE}
}
$p = new HTML::TokeParser( $html );
$p->get_tag('title');
$self->{SUMMARY}->{TITLE} = $p->get_text('/title') || $self->{NOT_AVAILABLE};
{
my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
$atime,$mtime,$ctime,$blksize,$blocks) = stat $self->{PATH};
$self->{SUMMARY}->{LAST_MODIFIED_FILE} = scalar localtime ( $mtime ) || $self->{NOT_AVAILABLE};
$self->{SUMMARY}->{LAST_MODIFIED_FILE} =~ s/\s+/ /g;
$self->{SUMMARY}->{CREATED_FILE} = scalar localtime ( $ctime ) || $self->{NOT_AVAILABLE};
$self->{SUMMARY}->{CREATED_FILE} =~ s/\s+/ /g;
}
my $collect = {
map {$_=>1} (
keys %{$self->{FIELDS}},
qw(
AUTHOR DESCRIPTION
LAST-MODIFIED CREATED
)
)
};
$self->{SUMMARY}->{$_} = $self->{NOT_AVAILABLE} foreach keys %$collect;
$p = new HTML::TokeParser( $html );
while (my $tag = $p->get_tag('meta') ){
my $name = uc $tag->[1]->{name};
I:
for my $i (1..2){
$name =~ s/^X-META-//i if $i == 2;
if (exists $collect->{$name} ){
$self->{SUMMARY}->{$name} = $tag->[1]->{content};
last I;
}
}
}
$self->{SUMMARY}->{LAST_MODIFIED_META} = delete $self->{SUMMARY}->{"LAST-MODIFIED"};
$self->{SUMMARY}->{CREATED_META} = delete $self->{SUMMARY}->{"CREATED"};
return 1;
}
# Return a reference to a scalar of HTML, or C<undef> on failure, setting C<$!> with an error message.
sub _load_file {
my ($self,$path) = @_;
local *IN;
return $path if ref $path;
if (defined $path){
$self->{PATH} = $path
}
elsif (not $self->{PATH}){
warn "load_file requires a path argument, or that the PATH field be set";
return undef;
}
if (not open IN, $self->{PATH}){
warn "load_file could not open $self->{PATH}";
return undef;
}
read IN, $_, -s IN;
close IN;
return \$_;
}
1;
=head1 TODO
Maybe work on URI as well as file paths.
=head1 SEE ALSO
L<HTML::TokeParser>, L<HTML::HeadParser>.
=head1 AUTHOR
Lee Goddard (LGoddard@CPAN.org)
=head1 COPYRIGHT
Copyright 2000-2001 Lee Goddard.
This library is free software; you may use and redistribute it or modify it
undef the same terms as Perl itself.