package App::Wallflower;
$App::Wallflower::VERSION = '1.015';
use strict;
use warnings;
use Getopt::Long qw( GetOptionsFromArray );
use Pod::Usage;
use Carp;
use Plack::Util ();
use URI;
use Wallflower;
use Wallflower::Util qw( links_from );
use List::Util qw( uniqstr max );
use Path::Tiny;
sub _default_options {
return (
follow => 1,
environment => 'deployment',
host => ['localhost'],
verbose => 1,
errors => 1,
);
}
# [ activating option, coderef ]
my @callbacks = (
[
errors => sub {
my ( $url, $response ) = @_;
my ( $status, $headers, $file ) = @$response;
return if $status == 200;
if ( $status == 301 ) {
my $i = 0;
$i += 2
while $i < @$headers && lc( $headers->[$i] ) ne 'location';
printf "$status %s -> %s\n", $url->path, $headers->[ $i + 1 ] || '?';
}
else {
printf "$status %s\n", $url->path;
}
},
],
[
verbose => sub {
my ( $url, $response ) = @_;
my ( $status, $headers, $file ) = @$response;
return if $status != 200;
printf "$status %s%s\n", $url->path,
$file && " => $file [${\-s $file}]";
},
],
[
tap => sub {
my ( $url, $response ) = @_;
my ( $status, $headers, $file ) = @$response;
if ( $status == 301 ) {
my $i = 0;
$i += 2
while $i < @$headers && lc( $headers->[$i] ) ne 'location';
note( "$url => " . ( $headers->[ $i + 1 ] || '?' ) );
}
elsif ( $status == 304 ) {
SKIP: { skip( $url, 1 ); }
}
else {
is( $status, 200, $url->path );
}
},
],
);
sub new_with_options {
my ( $class, $args ) = @_;
my $input = (caller)[1];
$args ||= [];
# save previous configuration
my $save = Getopt::Long::Configure();
# ensure we use Getopt::Long's default configuration
Getopt::Long::ConfigDefaults();
# get the command-line options (modifies $args)
my %option = _default_options();
GetOptionsFromArray(
$args, \%option,
'application=s', 'destination|directory=s',
'index=s', 'environment=s',
'follow!', 'filter|files|F',
'quiet', 'include|INC=s@',
'verbose!', 'errors!', 'tap!',
'host=s@',
'url|uri=s',
'parallel=i',
'help', 'manual',
'tutorial', 'version',
) or pod2usage(
-input => $input,
-verbose => 1,
-exitval => 2,
);
# restore Getopt::Long configuration
Getopt::Long::Configure($save);
# simple on-line help
pod2usage( -verbose => 1, -input => $input ) if $option{help};
pod2usage( -verbose => 2, -input => $input ) if $option{manual};
pod2usage(
-verbose => 2,
-input => do {
require Pod::Find;
Pod::Find::pod_where( { -inc => 1 }, 'Wallflower::Tutorial' );
},
) if $option{tutorial};
print "wallflower version $Wallflower::VERSION\n" and exit
if $option{version};
# application is required
pod2usage(
-input => $input,
-verbose => 1,
-exitval => 2,
-message => 'Missing required option: application'
) if !exists $option{application};
# create the object
return $class->new(
option => \%option,
args => $args,
);
}
sub new {
my ( $class, %args ) = @_;
my %option = ( _default_options(), %{ $args{option} || {} } );
my $args = $args{args} || [];
my @cb = @{ $args{callbacks} || [] };
# application is required
croak "Option application is required" if !exists $option{application};
# setup TAP
if ( $option{tap} ) {
require Test::More;
import Test::More;
if ( $option{parallel} ) {
my $tb = Test::Builder->new;
$tb->no_plan;
$tb->use_numbers(0);
}
$option{quiet} = 1; # --tap = --quiet
if ( !exists $option{destination} ) {
$option{destination} = Path::Tiny->tempdir( CLEANUP => 1 );
}
}
# --quiet = --no-verbose --no-errors
$option{verbose} = $option{errors} = 0 if $option{quiet};
# add the hostname passed via --url to the list built with --host
push @{ $option{host} }, URI->new( $option{url} )->host
if $option{url};
# pre-defined callbacks
push @cb, map $_->[1], grep $option{ $_->[0] }, @callbacks;
# include option
my $path_sep = $Config::Config{path_sep} || ';';
$option{inc} = [ split /\Q$path_sep\E/, join $path_sep,
@{ $option{include} || [] } ];
local $ENV{PLACK_ENV} = $option{environment};
local @INC = ( @{ $option{inc} }, @INC );
my $self = {
option => \%option,
args => $args,
callbacks => \@cb,
seen => {}, # keyed on $url->path
todo => [],
wallflower => Wallflower->new(
application => ref $option{application}
? $option{application}
: Plack::Util::load_psgi( $option{application} ),
( destination => $option{destination} )x!! $option{destination},
( index => $option{index} )x!! $option{index},
( url => $option{url} )x!! $option{url},
),
};
# setup parallel processing
if ( $self->{option}{parallel} ) {
require Fcntl;
import Fcntl qw( :seek :flock );
$self->{_parent_} = $$;
$self->{_forked_} = 0;
$self->{_ipc_dir_} = Path::Tiny->tempdir(
CLEANUP => 1,
TEMPLATE => 'wallflower-XXXX'
);
}
return bless $self, $class;
}
sub run {
my ($self) = @_;
( my $args, $self->{args} ) = ( $self->{args}, [] );
my $method = $self->{option}{filter} ? '_process_args' : '_process_queue';
$self->$method(@$args);
if ( $self->{option}{parallel} ) { $self->_wait_for_kids; }
elsif ( $self->{option}{tap} ) { done_testing(); }
}
sub _push_todo {
my ( $self, @items ) = @_;
my $seen = $self->{seen};
my $todo = $self->{todo};
my $host_ok = $self->_host_regexp;
# add to the to-do list
@items = uniqstr # unique
grep !$seen->{$_}, # not already seen
map ref() ? $_->path : $_, # paths
grep !ref || !$_->scheme # from URI
|| eval { $_->host =~ $host_ok }, # pointing only to expected hosts
@items;
push @$todo, @items;
if ( $self->{option}{parallel} ) {
if ( $self->{_parent_} == $$ ) { $self->_aggregate_todo(@items); }
else { $self->_save_todo; }
}
}
sub _aggregate_todo {
my ( $self, @items ) = @_;
my $TODO = $self->{_ipc_dir_}->child('__TODO__');
my $latest = ( stat $TODO )[9] || 0;
# aggregate all child todo into ours and save it as __TODO__
local *ARGV;
@ARGV = ( $TODO, glob $self->{_ipc_dir_}->child('todo-*') );
no warnings 'inplace'; # some files may already be gone
my $fh = File::Temp->new(
TEMPLATE => "__TODO__-XXXX",
DIR => $self->{_ipc_dir_},
);
print $fh uniqstr @ARGV ? <> : (), map "$_\n", @items;
close $fh;
rename "$fh", $TODO
or die "Can't rename $fh to $TODO: $!";
# the parent to-do list is always empty
$self->{todo} = [];
# fork all kids
if ( !$self->{_forked_} ) {
for ( 1 .. $self->{option}{parallel} ) {
if ( not my $pid = fork ) {
$self->{_pidfile_} = Path::Tiny->tempfile(
TEMPLATE => "pid-$$-XXXX",
DIR => $self->{_ipc_dir_},
);
delete $self->{_seen_fh_}; # will reopen
return;
}
elsif ( !defined $pid ) {
warn "Couldn't fork: $!";
}
else {
$self->{_forked_}++;
}
}
sleep 1; # give them time to settle
}
}
sub _save_todo {
my ($self) = @_;
# save the child todo
my $fh = File::Temp->new(
TEMPLATE => "todo-$$-XXXX",
DIR => $self->{_ipc_dir_},
);
print $fh map "$_\n", @{ $self->{todo} };
close $fh;
$self->{_todo_fh_} = $fh; # deletes previous one
}
# returns a boolean indicating if the update can be trusted
sub _update_todo {
my ($self) = @_;
my $todo = $self->{todo};
my $TODO = $self->{_ipc_dir_}->child('__TODO__');
my $SEEN = $self->{_ipc_dir_}->child('__SEEN__');
return if !-e $TODO;
my $certainty = # this update can be trusted if __TODO__ is the
( stat $TODO )[9] > max( 0, map +(stat)[9] || 0, # most recent
$SEEN, glob $self->{_ipc_dir_}->child('todo-*')); # file of all
# read from the shared todo
open my $fh, '<', $TODO or die "Can't open $TODO: $!";
@$todo = <$fh>;
chomp(@$todo);
return $certainty;
}
sub _next_todo {
my ($self) = @_;
my $seen = $self->{seen};
my $todo = $self->{todo};
my $next;
if ( $self->{option}{parallel} ) {
# in parallel mode, the parent does not render anything
return if $self->{_parent_} == $$;
TODO:
# read from the shared seen file
my $SEEN = $self->{_ipc_dir_}->child('__SEEN__');
my $seen_fh = $self->{_seen_fh_} ||= do {
open my $fh, -e $SEEN ? '+<' : '+>', $SEEN
or die "Can't open $SEEN in read-write mode: $!";
$fh->autoflush(1);
$fh;
};
flock( $seen_fh, LOCK_EX() ) or die "Cannot lock $SEEN: $!\n";
seek( $seen_fh, 0, SEEK_CUR() );
while (<$seen_fh>) { chomp; $seen->{$_}++; }
# find a todo item not seen
( $next, @$todo ) = uniqstr grep !$seen->{$_}, @$todo;
# or update todo and try again
if ( !defined $next ) {
my $certain = $self->_update_todo;
( $next, @$todo ) = uniqstr grep !$seen->{$_}, @$todo;
# if we can't trust the update, try the entire thing again
if ( !defined $next && !$certain ) {
flock( $seen_fh, LOCK_UN() ) or die "Cannot unlock $SEEN: $!\n";
sleep 1;
goto TODO;
}
}
# write to the shared seen file
if ( defined $next ) { # /!\ NOT ELSE /!\
seek( $seen_fh, 0, SEEK_END() );
print $seen_fh "$next\n";
}
flock( $seen_fh, LOCK_UN() ) or die "Cannot unlock $SEEN: $!\n";
}
else {
( $next, @$todo ) = uniqstr grep !$seen->{$_}, @$todo;
}
# nothing to do
return undef if !defined $next;
$seen->{$next}++;
return URI->new($next);
}
sub _wait_for_kids {
my ($self) = @_;
return if $self->{_parent_} != $$;
while ( @{ [ glob( $self->{_ipc_dir_}->child('pid-*') ) ] } ) {
$self->_aggregate_todo;
sleep 1;
}
if ( $self->{option}{tap} ) {
my $count;
my $SEEN = $self->{_ipc_dir_}->child( '__SEEN__' );
open my $fh, '<', $SEEN or die "Can't open $SEEN: $!";
seek $fh, 0, SEEK_SET();
$count++ while <$fh>;
my $tb = Test::Builder->new;
$tb->no_ending(1);
$tb->done_testing($count);
}
}
sub _process_args {
my $self = shift;
local *ARGV;
@ARGV = @_;
while (<>) {
# ignore blank lines and comments
next if /^\s*(#|$)/;
chomp;
$self->_process_queue("$_");
# child processes should not process the filter input
last if $self->{option}{parallel} && $self->{_parent_} != $$;
}
}
sub _process_queue {
my ( $self, @queue ) = @_;
my ( $wallflower, $seen ) = @{$self}{qw( wallflower seen )};
my $follow = $self->{option}{follow};
# I'm just hanging on to my friend's purse
local $ENV{PLACK_ENV} = $self->{option}{environment};
local @INC = ( @{ $self->{option}{inc} }, @INC );
$self->_push_todo( @queue ? @queue : ('/') );
while ( my $url = $self->_next_todo ) {
# get the response
my $response = $wallflower->get($url);
# run the callbacks
$_->( $url => $response ) for @{ $self->{callbacks} };
# obtain links to resources
my ( $status, $headers, $file ) = @$response;
if ( ( $status == 200 || $status == 304 ) && $follow ) {
$self->_push_todo( links_from( $response => $url ) );
}
# follow 301 Moved Permanently
elsif ( $status == 301 ) {
require HTTP::Headers;
my $l = HTTP::Headers->new(@$headers)->header('Location');
$self->_push_todo($l) if $l;
}
}
}
sub _host_regexp {
my ($self) = @_;
my $re = join '|',
map { s/\./\\./g; s/\*/.*/g; $_ }
@{ $self->{option}{host} };
return qr{^(?:$re)$};
}
1;
__END__
=pod
=head1 NAME
App::Wallflower - Class performing the moves for the wallflower program
=head1 VERSION
version 1.015
=head1 SYNOPSIS
# this is the actual code for wallflower
use App::Wallflower;
App::Wallflower->new_with_options( \@ARGV )->run;
=head1 DESCRIPTION
L<App::Wallflower> is a container for functions for the L<wallflower>
program.
=head2 new_with_options
App::Wallflower->new_with_options( \@ARGV );
Process options in the provided array reference (modifying it),
and return a object ready to be C<run()>.
See L<wallflower> for the list of options and their usage.
=head2 new
App::Wallflower->new( option => \%option, args => \@args );
Create an object ready to be C<run()>.
C<option> is a hashref of options as produced by L<Getopt::Long>, and
C<args> is an array ref of optional arguments to be processed by C<run()>
=head2 run
Make L<wallflower> dance.
Process the remaining arguments according to the options,
i.e. either as URLs to save or as files containing lists of URLs to save.
=head1 AUTHOR
Philippe Bruhat (BooK) <book@cpan.org>
=head1 COPYRIGHT AND LICENSE
Copyright 2012-2018 by Philippe Bruhat (BooK).
This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.
=cut