package Dezi::Aggregator;
use Moose;
with 'Dezi::Role';
use Carp;
use Types::Standard qw( Bool Str InstanceOf Int Object );
use Dezi::Types qw( DeziFileOrCodeRef DeziEpoch );
use Dezi::Utils;
use SWISH::Filter;
use Dezi::Indexer::Doc;
use Scalar::Util qw( blessed );
use Data::Dump qw( dump );
use namespace::autoclean;
our $VERSION = '0.016';
has 'set_parser_from_type' => ( is => 'rw', isa => Bool, default => sub {1} );
has 'indexer' => (
is => 'rw',
isa => InstanceOf ['Dezi::Indexer'],
);
has 'doc_class' => (
is => 'rw',
isa => Str,
required => 1,
default => sub {'Dezi::Indexer::Doc'},
);
has 'swish_filter_obj' => (
is => 'rw',
isa => InstanceOf ['SWISH::Filter'],
default => sub { SWISH::Filter->new }
);
has 'test_mode' => ( is => 'rw', isa => Bool, default => sub {0} );
has 'filter' => ( is => 'rw', isa => DeziFileOrCodeRef, coerce => 1, );
has 'ok_if_newer_than' => ( is => 'rw', isa => DeziEpoch );
has 'progress' => ( is => 'rw', isa => Object ); # Term::ProgressBar
has 'count' => ( is => 'ro', isa => Int );
=pod
=head1 NAME
Dezi::Aggregator - document aggregation base class
=head1 SYNOPSIS
package MyAggregator;
use Moose;
extends 'Dezi::Aggregator';
sub get_doc {
my ($self, $url) = @_;
# do something to create a Dezi::Indexer::Doc object from $url
return $doc;
}
sub crawl {
my ($self, @where) = @_;
foreach my $place (@where) {
# do something to search $place for docs to pass to get_doc()
}
}
1;
=head1 DESCRIPTION
Dezi::Aggregator is a base class that defines the basic API for writing
an aggregator. Only two methods are required: get_doc() and crawl(). See
the SYNOPSIS for the prototypes.
See Dezi::Aggregator::FS and Dezi::Aggregator::Spider for examples
of aggregators that crawl the filesystem and web, respectively.
=head1 METHODS
=head2 BUILD
Set object flags per Dezi::Class API. These are also accessors,
and include:
=over
=item set_parser_from_type
This will set the parser() value in swish_filter() based on the
MIME type of the doc_class() object.
=item indexer
A Dezi::Indexer object.
=item doc_class
The name of the Dezi::Indexer::Doc-derived class to use in get_doc().
Default is Dezi::Indexer::Doc.
=item swish_filter_obj
A SWISH::Filter object. If not passed in new() one is created for you.
=item test_mode
Dry run mode, just prints info on stderr but does not
build index.
=item filter
Value should be a CODE ref. This is passed through to set_filter()
internally at BUILD() time. If you need to adjust the filter
after the Aggregator object is created, use set_filter().
=item ok_if_newer_than
Value should be a Unix timestamp (epoch seconds). Default is undef.
If set, aggregators should skip files that have a modification time
older than the timestamp.
You may get/set the ok_if_newer_than value with the ok_if_newer_than()
attribute method, but use set_ok_if_newer_than() to include validation
of the supplied I<timestamp> value.
=item progress( I<Term::ProgressBar object> )
Get/set a progress object. The default used in the examples/swish3
script is Term::ProgressBar. If set, it will be incremented
just like count() is.
=back
=cut
sub BUILD {
my $self = shift;
$self->{__progress_so_far} = 0;
$self->{__progress_next} = 0;
if ( $self->filter ) {
$self->set_filter( $self->filter );
}
}
=head2 config
Returns the Dezi::Indexer::Config object from the Indexer
being used. This is a read-only method (accessor not mutator).
=cut
sub config {
return shift->indexer->config;
}
=head2 count
Returns the total number of doc_class() objects returned by get_doc().
=cut
=head2 crawl( I<@where> )
Override this method in your subclass. It does the aggregation,
and passes each doc_class() object from get_doc() to indexer->process().
=cut
sub crawl {
my $self = shift;
confess ref($self) . " does not implement crawl()";
}
=head2 get_doc( I<url> )
Override this method in your subclass. Should return a doc_class()
object.
=cut
sub get_doc {
my $self = shift;
confess ref($self) . " does not implement get_doc()";
}
=head2 swish_filter( I<doc_class_object> )
Passes the content() of the I<doc_class_object> through SWISH::Filter
and transforms it to something index-able. Returns
the I<doc_class_object>, filtered.
B<NOTE:> This method should be called by all aggregators after
get_doc() and before passing to the indexer().
See the SWISH::Filter documentation.
=cut
sub swish_filter {
my $self = shift;
my $doc = shift;
unless ( $doc && blessed($doc) && $doc->isa('Dezi::Indexer::Doc') ) {
croak "Dezi::Indexer::Doc-derived object required";
}
if ( $self->debug ) {
warn "checking filter for " . $doc->url;
}
unless ( defined $doc->parser ) {
if ( $self->set_parser_from_type ) {
my $type = $doc->type || 'default';
$doc->parser(
Dezi::Utils->get_parser_for_mime(
$type,
( $self->indexer ? $self->indexer->swish3 : undef ),
)
);
}
}
my $sfo = $self->swish_filter_obj;
if ( $sfo->can_filter( $doc->type ) ) {
if ( $self->debug ) {
warn sprintf
"debug=%d can_filter true for %s with parser %s for type %s",
$self->debug, $doc->url,
$doc->parser, $doc->type;
}
my $content = $doc->content;
my $url = $doc->url;
my $type = $doc->type;
my $f = $sfo->convert(
document => \$content,
content_type => $type,
name => $url
);
if ( !$f
|| !$f->was_filtered
|| $f->is_binary ) # is is_binary necessary?
{
warn "skipping $url - filtering error\n";
return;
}
if ( $self->debug > 1 ) {
warn "$url [$type] was filtered\n";
if ( $doc->content ne ${ $f->fetch_doc } ) {
warn sprintf "content changed:'%s'\n", ${ $f->fetch_doc };
}
}
$doc->content( ${ $f->fetch_doc } );
# leave type and parser as-is
# since we want to store original mime in indexer.
# TODO test this.
# what about parser?
# since type will have changed ( $f->content_type ) from original
# the parser type might also have changed?
$doc->parser( $f->swish_parser_type ) if $self->set_parser_from_type;
}
else {
if ( $self->debug ) {
warn sprintf(
"No filter applied to %s - cannot filter %s (parser %s)\n",
$doc->url, $doc->type, $doc->parser, );
warn sprintf( " available filter: %s\n", $_ )
for $sfo->filter_list;
}
}
}
=head2 set_filter( I<code_ref> )
Use I<code_ref> as the C<doc_class> filter. This method called by BUILD() if
C<filter> param set in constructor.
=cut
sub set_filter {
my $self = shift;
my $filter = shift;
unless ( ref($filter) eq 'CODE' ) {
croak "filter must be a CODE ref";
}
# cheat a little by using this code instead of the default
# method in doc_class
{
no strict 'refs';
no warnings 'redefine';
#warn "setting filter as method: " . $self->{doc_class} . '::filter';
*{ $self->{doc_class} . '::filter' } = $filter;
}
}
=head2 set_ok_if_newer_than( I<timestamp> )
Set the ok_if_newer_than attribute. I<timestamp> should be a Unix
epoch value.
=cut
sub set_ok_if_newer_than {
my $self = shift;
my $ts = shift || 0;
if ( $ts =~ m/\D/ ) {
croak "timestamp should be an integer";
}
$self->ok_if_newer_than($ts);
}
#
# private methods
#
sub _increment_count {
my $self = shift;
my $count = shift || 1;
$self->{count} += $count;
if ( $self->{progress} ) {
$self->{__progress_so_far} += $count;
if ( $self->{__progress_so_far} >= $self->{__progress_next} ) {
$self->{__progress_next}
= $self->{progress}->update( $self->{__progress_so_far} );
}
}
return $self;
}
sub _apply_file_rules {
my ( $self, $file, $file_rules ) = @_;
if ( !$file_rules
&& !exists $self->{_file_rules}
&& $self->config->FileRules )
{
# cache obj
$self->{_file_rules} = File::Rules->new( $self->config->FileRules );
}
if ( $file_rules or exists $self->{_file_rules} ) {
$self->debug and warn "$file [applying DeziFileRules]\n";
my $rules = $file_rules || $self->{_file_rules};
my $match = $rules->match($file);
return $match;
}
return 0; # no rules
}
sub _apply_file_match {
my ( $self, $file ) = @_;
# TODO
return 0; # no-op for now
}
__PACKAGE__->meta->make_immutable;
1;
__END__
=head1 AUTHOR
Peter Karman, E<lt>perl@peknet.comE<gt>
=head1 BUGS
Please report any bugs or feature requests to C<bug-swish-prog at rt.cpan.org>, or through
the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Dezi-App>.
I will be notified, and then you'll
automatically be notified of progress on your bug as I make changes.
=head1 SUPPORT
You can find documentation for this module with the perldoc command.
perldoc Dezi
You can also look for information at:
=over 4
=item * Mailing list
L<http://lists.swish-e.org/listinfo/users>
=item * RT: CPAN's request tracker
L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Dezi-App>
=item * AnnoCPAN: Annotated CPAN documentation
L<http://annocpan.org/dist/Dezi-App>
=item * CPAN Ratings
L<http://cpanratings.perl.org/d/Dezi-App>
=item * Search CPAN
L<http://search.cpan.org/dist/Dezi-App/>
=back
=head1 COPYRIGHT AND LICENSE
Copyright 2008-2018 by Peter Karman
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=head1 SEE ALSO
L<http://swish-e.org/>