package Scrappy::Scraper;

BEGIN {
    $Scrappy::Scraper::VERSION = '0.94112090';
}

# load OO System
use Moose;

# load other libraries
use Carp qw(croak);
use Data::Dumper;
use File::Util;
use Scrappy::Logger;
use Scrappy::Plugin;
use Scrappy::Queue;
use Scrappy::Scraper::Control;
use Scrappy::Scraper::Parser;
use Scrappy::Scraper::UserAgent;
use Scrappy::Session;
use Try::Tiny;
use URI;
use Web::Scraper;
use WWW::Mechanize;

# html content attribute (holds the last response/content, '' when none)
has 'content' => (
    is  => 'rw',
    isa => 'Any'
);

# access control object
has 'control' => (
    is      => 'ro',
    isa     => 'Scrappy::Scraper::Control',
    default => sub {
        Scrappy::Scraper::Control->new;
    }
);

# debug attribute (when false, log() is a no-op)
has 'debug' => (
    is      => 'rw',
    isa     => 'Bool',
    default => 1
);

# log object
has 'logger' => (
    is      => 'ro',
    isa     => 'Scrappy::Logger',
    default => sub {
        Scrappy::Logger->new;
    }
);

# parser object
has 'parser' => (
    is      => 'ro',
    isa     => 'Scrappy::Scraper::Parser',
    default => sub {
        Scrappy::Scraper::Parser->new;
    }
);

# plugins object
has 'plugins' => (
    is      => 'ro',
    isa     => 'Any',
    default => sub {
        Scrappy::Plugin->new;
    }
);

# queue object
has 'queue' => (
    is      => 'ro',
    isa     => 'Scrappy::Queue',
    default => sub {
        Scrappy::Queue->new;
    }
);

# session object
has 'session' => (
    is      => 'ro',
    isa     => 'Scrappy::Session',
    default => sub {
        Scrappy::Session->new;
    }
);

# user-agent object
has 'user_agent' => (
    is      => 'ro',
    isa     => 'Scrappy::Scraper::UserAgent',
    default => sub {
        Scrappy::Scraper::UserAgent->new;
    }
);

# www-mechanize object (does most of the heavy lifting, gets passed around
# a lot)
has 'worker' => (
    is      => 'ro',
    isa     => 'WWW::Mechanize',
    default => sub {
        WWW::Mechanize->new;
    }
);

# ---------------------------------------------------------------------------
# private helpers
# ---------------------------------------------------------------------------

# Adds the configured User-Agent header to the worker when a name is set.
sub _apply_user_agent {
    my $self = shift;
    my $name = $self->user_agent->name;

    $self->worker->add_header("User-Agent" => $name) if defined $name;

    return $self;
}

# Appends $url to the navigation history kept in the stash.
sub _record_history {
    my ($self, $url) = @_;

    # push autovivifies the history array on first use
    push @{$self->stash->{history}}, $url;

    return $self;
}

# Copies every cookie held by the worker's cookie jar into the session stash
# (keyed by domain, then cookie name) and writes the session afterwards.
sub _sync_cookies {
    my $self     = shift;
    my $recorded = 0;

    $self->worker->{cookie_jar}->scan(
        sub {
            my ($version, $key,    $val,     $path,
                $domain,  $port,   $path_spec, $secure,
                $expires, $discard, $hash
            ) = @_;

            $self->session->stash('cookies' => {})
              unless defined $self->session->stash('cookies');

            $self->session->stash->{'cookies'}->{$domain}->{$key} = {
                version   => $version,
                key       => $key,
                val       => $val,
                path      => $path,
                domain    => $domain,
                port      => $port,
                path_spec => $path_spec,
                secure    => $secure,
                expires   => $expires,
                discard   => $discard,
                hash      => $hash
            };

            $recorded++;
        }
    );

    # One write after the scan instead of one per cookie.
    # NOTE(review): assumes Scrappy::Session::write persists the whole stash,
    # so the final state is the same -- confirm against Scrappy::Session.
    $self->session->write if $recorded;

    return $self;
}

# Stores the query-string parameters of $url on the worker as a hash.
sub _sync_params {
    my ($self, $url) = @_;

    # query_form() without arguments is a pure getter returning key/value
    # pairs. Calling it with a single argument is a *setter* that wipes the
    # query of $url (which is also the object stored in the history).
    $self->worker->{params} = +{$url->query_form};

    return $self;
}

# Returns a random six character file name used when the response does not
# suggest one.
sub _random_filename {
    my @chars = ('a' .. 'z', 'A' .. 'Z', 0 .. 9);

    # the extension spelling (sic) is kept as-is so existing files and any
    # callers matching on it keep working
    return join('', map { $chars[rand(@chars)] } 1 .. 6) . '.downlaod';
}

# ---------------------------------------------------------------------------
# public methods
# ---------------------------------------------------------------------------

# Navigates the worker back one page.
# Returns the current URL on success, nothing when no content is available.
sub back {
    my $self = shift;

    # specify user-agent
    $self->_apply_user_agent;

    # set html response
    $self->content('');

    try {
        $self->worker->back;
        $self->content($self->worker->content);
    }
    catch {
        $self->log("error", "navigating to the previous page failed");
    };

    return unless $self->content;

    $self->log("info",
        "navigated back to " . $self->url . " successfully");

    $self->_record_history($self->url);
    $self->_sync_cookies;

    return $self->url;
}

# Gets/sets the worker's cookie jar. A defined argument replaces the jar.
sub cookies {
    my $self = shift;

    $self->worker->{cookie_jar} = $_[0] if defined $_[0];

    return $self->worker->{cookie_jar};
}

# Returns the host part of the worker's base URI.
sub domain {
    return shift->worker->base->host;
}

# Fetches $url and stores the response on disk, then navigates back.
#
#   download($url, $dir, $file)  -> saved as $dir/$file
#   download($url, $dir)         -> saved as $dir/<response filename>
#   download($url)               -> saved under a directory derived from the
#                                   URL path (created when missing)
#
# Returns $self, or nothing when get() reports the URL as prohibited.
# Croaks when no URL is given.
sub download {
    my $self = shift;
    my ($url, $dir, $file) = @_;

    # only the first argument is the URL; passing all of @_ would hand the
    # directory to URI->new as its scheme argument
    $url = URI->new($url);

    # specify user-agent
    $self->_apply_user_agent;

    croak(  "To download data from a URI you must supply at least a valid URI "
          . "and download directory path")
      unless $url;

    my $target;

    if ($dir && $file) {
        $dir =~ s/[\\\/]+$//;

        return unless $self->get($url);

        $target = join '/', $dir, $file;
    }
    elsif ($dir) {
        $dir =~ s/[\\\/]+$//;

        return unless $self->get($url);

        my $filename =
          $self->worker->response->filename || $self->_random_filename;

        $target = join '/', $dir, $filename;
    }
    else {
        return unless $self->get($url);

        my $filename =
          $self->worker->response->filename || $self->_random_filename;

        # derive a relative directory from the URL path
        $dir = $url->path;
        $dir =~ s/^\///g;
        $dir =~ s/\/\Q$filename\E$//;    # file names contain regex meta chars

        File::Util->new->make_dir($dir) unless -d $dir || !$dir;

        # an empty directory means "current directory", not the fs root
        $target = $dir ? join('/', $dir, $filename) : $filename;
    }

    $self->store($target);
    $self->log("info", "$url was downloaded to $target successfully");
    $self->back;

    $self->_record_history($url);
    $self->_sync_params($url);

    sleep $self->pause;

    return $self;
}

# Thin wrapper around Data::Dumper::Dumper for the given arguments.
sub dumper {
    shift;
    return Data::Dumper::Dumper(@_);
}

# Submits a form on the current page; arguments are passed straight to the
# worker's submit_form(). Returns $self, or 0 when access control rejects the
# response.
sub form {
    my $self = shift;
    my @args = @_;
    my $url  = URI->new($self->url);

    # TODO: need to figure out how to determine the form action before submit

    # specify user-agent
    $self->_apply_user_agent;

    # set html response
    $self->content('');

    # errors are swallowed here on purpose; an empty content is reported below
    try {
        $self->content($self->worker->submit_form(@args));
    };

    if ($self->content) {

        # access control (same polarity as get(): reject when NOT allowed)
        unless ($self->control->is_allowed($self->content)) {
            $self->log("warn", "$url was not fetched, the url is prohibited");
            return 0;
        }

        $self->log("info", "form posted from $url successfully", @args);
    }
    else {
        $self->log("error", "error POSTing form from $url", @args);
    }

    #$self->stash->{history} = [] unless defined $self->stash->{history};
    #push @{$self->stash->{history}}, $url;

    $self->_sync_cookies;
    $self->_sync_params($url);

    sleep $self->pause;

    return $self;
}

# Performs a GET request; arguments are passed to URI->new.
# Returns $self, or 0 when access control rejects the response.
sub get {
    my $self = shift;
    my $url  = URI->new(@_);

    # specify user-agent
    $self->_apply_user_agent;

    # set html response
    $self->content('');

    # errors are swallowed here on purpose; an empty content is reported below
    try {
        $self->content($self->worker->get($url));
    };

    if ($self->content) {

        # access control
        unless ($self->control->is_allowed($self->content)) {
            $self->log("warn", "$url was not fetched, the url is prohibited");
            return 0;
        }

        $self->log("info", "$url was fetched successfully");
    }
    else {
        $self->log("error", "error GETing $url");
    }

    $self->_record_history($url);

    # cookies are only mirrored into the session when it is file-backed
    $self->_sync_cookies if $self->session->file;

    $self->_sync_params($url);

    sleep $self->pause;

    return $self;
}

# Gets the page content, optionally replacing the page HTML first.
#
#   odd number of arguments  -> first is the new HTML, the rest are passed
#                               to the worker's content()
#   exactly two arguments    -> both are passed to the worker's content()
#   any other even count     -> first (if any) is the new HTML
sub page_data {
    my $self = shift;
    my ($data, @args);

    if (@_ % 2) {
        ($data, @args) = @_;
    }
    elsif (@_ == 2) {
        @args = @_;
    }
    else {
        $data = shift;
    }

    $self->worker->update_html($data) if $data;

    return $self->worker->content(@args);
}

# Returns the worker's content type for the current page.
sub page_content_type {
    return shift->worker->content_type;
}

# Returns the worker's is_html flag for the current page.
sub page_ishtml {
    return shift->worker->is_html;
}

# Returns the worker's success flag for the last request.
sub page_loaded {
    return shift->worker->success;
}

# Matches a URL path against a route pattern.
#
#   $pattern  string pattern ("/blog/:year", "/blog/{year:\d{4}}", "/a/*")
#             or a compiled regexp
#   $url      URL to test, defaults to the current URL
#   $options  hashref; "host" (string or regexp) and "on_match" (callback
#             receiving the scraper and the match) are honoured
#
# Compiled routes are cached in the stash per pattern. Returns a hashref of
# captures (plus "params" and, when present, "splat") or 0 when nothing
# matched or on_match returned false. Croaks without a pattern.
sub page_match {
    my $self    = shift;
    my $pattern = shift;
    my $url     = shift || $self->url;
    $url = URI->new($url);
    my $options = shift || {};

    croak("route can't be defined without a valid URL pattern")
      unless $pattern;

    my $route = $self->stash->{patterns}->{$pattern} || {};

    # does route definition already exist?
    unless (keys %{$route}) {

        $route->{on_match} = $options->{on_match};

        # define options
        if (my $host = $options->{host}) {
            $route->{host} = $host;
            $route->{host_re} = ref $host ? $host : qr(^\Q$host\E$);
        }

        # remember the untouched pattern; it is rewritten in place below
        $route->{pattern} = $pattern;

        # compile pattern
        my @capture;

        if (ref $pattern) {
            $route->{_regexp_capture} = 1;
            $route->{pattern_re}      = $pattern;
        }
        else {
            # defined() rather than truthiness so a capture named "0" works
            $pattern =~ s!
                \{((?:\{[0-9,]+\}|[^{}]+)+)\} | # /blog/{year:\d{4}}
                :([A-Za-z0-9_]+)              | # /blog/:year
                (\*)                          | # /blog/*/*
                ([^{:*]+)                       # normal string
            !
                if (defined $1) {
                    my ($name, $constraint) = split /:/, $1, 2;
                    push @capture, $name;
                    $constraint ? "($constraint)" : "([^/]+)";
                }
                elsif (defined $2) {
                    push @capture, $2;
                    "([^/]+)";
                }
                elsif (defined $3) {
                    push @capture, '__splat__';
                    "(.+)";
                }
                else {
                    quotemeta($4);
                }
            !gex;

            $route->{pattern_re} = qr{^$pattern$};
        }

        $route->{capture} = \@capture;
        $self->stash->{patterns}->{$route->{pattern}} = $route;
    }

    # match
    if ($route->{host_re}) {
        return 0 unless $url->host =~ $route->{host_re};
    }

    if (my @captured = ($url->path =~ $route->{pattern_re})) {
        my %args;
        my @splat;

        if ($route->{_regexp_capture}) {
            push @splat, @captured;
        }
        else {
            my @names = @{$route->{capture}};

            for my $i (0 .. $#names) {
                if ($names[$i] eq '__splat__') {
                    push @splat, $captured[$i];
                }
                else {
                    $args{$names[$i]} = $captured[$i];
                }
            }
        }

        my $match = +{
            (label => $route->{label}),
            %args,
            (@splat ? (splat => \@splat) : ())
        };

        if ($route->{on_match}) {
            my $ret = $route->{on_match}->($self, $match);
            return 0 unless $ret;
        }

        $match->{params} = {%args};
        $match->{params}->{splat} = \@splat if @splat;

        return $match;
    }

    return 0;
}

# Reloads the current page. Always returns $self; the outcome is logged.
sub page_reload {
    my $self = shift;

    # specify user-agent
    $self->_apply_user_agent;

    # set html response
    $self->content('');

    # errors are swallowed here on purpose; an empty content is reported below
    try {
        $self->content($self->worker->reload);
    };

    if ($self->content) {
        $self->log("info", "page reloaded successfully");
    }
    else {
        $self->log("error", "error reloading page");
    }

    $self->_record_history($self->url);
    $self->_sync_cookies;

    return $self;
}

# Returns the worker's status for the last request.
sub page_status {
    return shift->worker->status;
}

# Returns the page content requested with format => 'text'.
sub page_text {
    return shift->page_data(format => 'text');
}

# Returns the worker's title for the current page.
sub page_title {
    return shift->worker->title;
}

# Loads each named plugin through the plugins object and applies the result
# with Moose's with(). Returns $self.
sub plugin {
    my ($self, @plugins) = @_;

    for my $name (@plugins) {
        with($self->plugins->load_plugin($name));
    }

    return $self;
}

# Performs a POST request; arguments are passed straight to the worker's
# post(), the first one being the URL. Returns $self, or 0 when access control
# rejects the URL or the response.
sub post {
    my $self = shift;
    my @args = @_;
    my $url  = URI->new($args[0]);

    # access control
    unless ($self->control->is_allowed($url)) {
        $self->log("warn", "$url was not fetched, the url is prohibited");
        return 0;
    }

    # specify user-agent
    $self->_apply_user_agent;

    # set html response
    $self->content('');

    # errors are swallowed here on purpose; an empty content is reported below
    try {
        $self->content($self->worker->post(@args));
    };

    if ($self->content) {

        # access control (same polarity as get(): reject when NOT allowed)
        unless ($self->control->is_allowed($self->content)) {
            $self->log("warn", "$url was not fetched, the url is prohibited");
            return 0;
        }

        $self->log("info", "posted data to $args[0] successfully", @args);
    }
    else {
        $self->log("error", "error POSTing data to $args[0]", @args);
    }

    $self->_record_history($url);
    $self->_sync_cookies;
    $self->_sync_params($url);

    sleep $self->pause;

    return $self;
}

# Configures a proxy: the last argument is the proxy URL, everything before it
# is the list of protocols it applies to. Returns $self.
sub proxy {
    my $self     = shift;
    my $proxy    = pop @_;
    my @protocol = @_;

    $self->worker->proxy([@protocol], $proxy);
    $self->log("info",
        "Set proxy $proxy using protocol(s) " . join ' and ', @protocol);

    return $self;
}

# Returns 1 when the current URL differs from the last URL recorded in the
# history, 0 otherwise (including when nothing has been recorded yet).
sub request_denied {
    my $self    = shift;
    my $history = $self->stash->{history};

    # nothing requested yet: dereferencing undef would die under strict refs
    return 0 unless ref $history eq 'ARRAY' && @{$history};

    my $last = $history->[-1];

    return 1 if ($self->url ne $last);
    return 0;
}

# Runs $selector through a fresh parser against $html, or against the current
# content when no HTML is given.
sub select {
    my ($self, $selector, $html) = @_;

    my $parser = Scrappy::Scraper::Parser->new;
    $parser->html($html ? $html : $self->content);

    return $parser->select($selector);
}

# Forwards a message to the logger when debugging is enabled.
# Known types are 'info', 'warn' and 'error'; anything else is emitted with
# warn() and logged as a custom event. Returns 1 when logged, 0 otherwise.
sub log {
    my $self = shift;
    my $type = shift;
    my @args = @_;

    return 0 unless $self->debug;

    if ($type eq 'info') {
        $self->logger->info(@args);
    }
    elsif ($type eq 'warn') {
        $self->logger->warn(@args);
    }
    elsif ($type eq 'error') {
        $self->logger->error(@args);
    }
    else {
        warn $type;
        $self->logger->event($type, @args);
    }

    return 1;
}

# Gets or sets the pause (in seconds) applied after requests.
#
#   pause($seconds)     fixed pause; 0 disables pausing
#   pause($from, $to)   random pause, re-drawn after every read
#   pause()             returns the current pause and draws the next one when
#                       a range is configured
sub pause {
    my $self = shift;
    my ($from, $to) = @_;
    my $worker = $self->worker;

    if (defined $from) {
        if ($to) {
            my @range = (($from < $to ? $from : 0) .. $to);
            $worker->{pause_range} = [$from, $to];
            $worker->{pause}       = $range[rand(@range)];
        }
        elsif ($from) {
            $worker->{pause} = $from;

            # drop any earlier range, otherwise the next read would replace
            # this fixed pause with a random value from the stale range
            delete $worker->{pause_range};
        }
        else {
            $worker->{pause}       = $from;
            $worker->{pause_range} = [0, 0];
        }

        return $worker->{pause};
    }

    my $interval = $worker->{pause} || 0;

    # select the next random pause value from the range
    if (defined $worker->{pause_range}) {
        my @range = @{$worker->{pause_range}};
        $self->pause(@range) if @range == 2;
    }

    $self->log("info", "processing was halted for $interval seconds")
      if $interval > 0;

    return $interval;
}

# Returns the worker's last response object.
sub response {
    return shift->worker->response;
}

# Per-object key/value store.
#
#   stash()              returns the whole stash hashref
#   stash($key)          returns the value stored under $key
#   stash(\%pairs)       merges the pairs, returns the whole stash
#   stash(k => v, ...)   merges the pairs, returns the whole stash
sub stash {
    my $self = shift;

    $self->{stash} = {} unless defined $self->{stash};

    return $self->{stash} unless @_;

    my $given = @_ > 1 ? {@_} : $_[0];

    if ($given) {
        return $self->{stash}->{$given} unless ref $given eq 'HASH';

        $self->{stash}->{$_} = $given->{$_} for keys %{$given};
    }

    return $self->{stash};
}

# Writes the content of the worker's current response to $filename.
# Text-like content types are decoded first; everything else is written in
# binary mode. Failures are reported through the worker's die().
# Returns $self.
sub store {

    # return shift->worker->save_content(@_);
    # oh no i didnt just rewrite www:mech save_content, oh yes i did
    # ... in hope to avoid content encoding issues

    my $self     = shift;
    my $filename = shift;
    my $worker   = $self->worker;

    open(my $fh, '>', $filename)
      or $worker->die("Unable to create $filename: $!");

    my $is_text = $worker->content_type =~ m{^text/}
      || $worker->content_type
      =~ m{^application/(atom|css|javascript|json|rss|xml)};

    if ($is_text) {

        # text
        $worker->response->decode;
    }
    else {

        # binary
        binmode $fh;
    }

    print {$fh} $worker->response->content
      or $worker->die("Unable to write to $filename: $!");

    close $fh
      or $worker->die("Unable to close $filename: $!");

    return $self;
}

# Returns the worker's current URI when there is content.
sub url {
    return $_[0]->worker->uri if $_[0]->content;
}

1;