package Scrappy::Project;

BEGIN {
    $Scrappy::Project::VERSION = '0.94112090';
}

# Moose::Role turns on strict and warnings for this package.
use Carp;
use File::Find::Rule;
use Scrappy;
use Moose::Role;

# app
#
# Built during object construction.  The default creates a fresh Scrappy
# object and stores it in the `scraper` attribute, then returns the result
# of the consumer's `setup` method when the metaclass reports one, or the
# object itself otherwise.
#
# NOTE: this default is intentionally NOT lazy -- `crawl` relies on
# `scraper` having been populated by the time the constructor returns.
has app => (
    is      => 'ro',
    isa     => 'Any',
    default => sub {
        my $self = shift;

        $self->scraper(Scrappy->new);

        return $self->meta->has_method('setup') ? $self->setup : $self;
    },
);

# parsers
#
# Array reference of module names found beneath the consuming class's
# namespace.  For a consumer named My::Project, every "*.pm" file under a
# "My/Project" directory of any @INC entry is reported, e.g.
# "My::Project::Foo".  Each module name appears once, even when it is
# installed under more than one @INC directory.
has parsers => (
    is      => 'ro',
    isa     => 'Any',
    default => sub {
        my $self = shift;

        # "My::Project" -> "My/Project"
        (my $class_dir = ref($self)) =~ s{::}{/}g;

        my %seen;
        my @parsers;

        for my $inc (@INC) {

            # @INC may hold hooks (code/array/object refs); those are not
            # directories and cannot be searched.
            next if ref $inc;

            my $root = "$inc/$class_dir";
            next unless -d $root;

            my @files = File::Find::Rule->file->name('*.pm')->in($root);

            for my $file (@files) {

                # Drop the @INC prefix where possible so a library path that
                # happens to contain the class path itself (for example
                # /opt/My/Project/lib/My/Project/Foo.pm) cannot leak into
                # the derived module name.
                my $relative =
                  index($file, "$inc/") == 0
                  ? substr($file, length($inc) + 1)
                  : $file;

                my ($plug) = $relative =~ m{(\Q$class_dir\E/.*)\.pm\z};
                next unless $plug;

                $plug =~ s{/}{::}g;

                # uniqueness is decided on the module name, not the file path
                push @parsers, $plug unless $seen{$plug}++;
            }
        }

        return [@parsers];
    },
);

# registry
#
# Hash reference mapping every parser module name, and its lower-cased
# spelling, to the canonical module name.
#
# Lazy because the default reads the `parsers` attribute; a non-lazy
# default would depend on the order in which attributes are initialised.
has registry => (
    is      => 'ro',
    isa     => 'HashRef',
    lazy    => 1,
    default => sub {
        my $self = shift;

        # map parsers
        my %registry;
        for my $parser (@{ $self->parsers }) {
            $registry{$parser}    = $parser;
            $registry{lc $parser} = $parser;
        }

        return \%registry;
    },
);

# records: route => array reference of values returned by that route's parser
has records => (is => 'rw', isa => 'HashRef', default => sub { {} });

# routes: route => options hash (always holds `parser`, plus any extras)
has routes => (is => 'rw', isa => 'HashRef', default => sub { {} });

# scraper: the Scrappy object used for fetching and page matching
has scraper => (is => 'rw', isa => 'Scrappy');

# route
#
# Registers a route and the parser that handles it.  Two call styles:
#
#   $self->route($route => $parser);                        # basic
#   $self->route($route, parser => $parser, key => $value); # with options
#
# The parser may be a fully-qualified module name living under the
# consuming class (left untouched) or a shortcut that is expanded beneath
# the consuming class's namespace:
#
#   list      -> <Class>::List
#   foo-bar   -> <Class>::Foo::Bar
#   foo_bar   -> <Class>::FooBar
#
# Croaks when no route or no parser was supplied.  Returns $self.
sub route {
    my ($self, @args) = @_;

    my $options = {};

    if (@args == 2) {

        # basic definition
        @{$options}{qw(route parser)} = @args;
    }
    elsif (@args % 2) {

        # odd definition: a route followed by key/value options
        my ($route, %named) = @args;
        $options = {%named};
        $options->{route} = $route;
    }

    # check route and parser spec
    croak("Error defining route, must have a route and parser assignment")
      unless $options->{route} && $options->{parser};

    # convert parser from shortcut if used
    #
    # The pattern is built explicitly: `!~` binds tighter than `.`, so
    # writing `$name !~ ref($self) . "::"` yields a string that is always
    # true and would prefix names that are already fully qualified.
    my $class = ref($self);
    if ($options->{parser} !~ m{\A\Q$class\E::}) {

        # make fully-qualified parser name
        my $parser = ucfirst $options->{parser};

        $parser = join('::', map { ucfirst } split /-/, $parser)
          if $parser =~ /-/;
        $parser = join('', map { ucfirst } split /_/, $parser)
          if $parser =~ /_/;

        $options->{parser} = $class . "::$parser";
    }

    # find action if not specified
    #unless ( defined $options->{action} ) {
    #    my ($action) = $options->{parser} =~ /\#(.*)$/;
    #    $options->{parser} =~ s/\#(.*)$//;
    #    $options->{action} = $action;
    #}

    # the route is the key of the routes hash, so it is not kept in the value
    my $route = delete $options->{route};
    $self->routes->{$route} = $options;

    return $self;
}

# parse_document
#
# Matches $url against the registered routes using the scraper's
# page_match.  For the first route that matches, the route's parser module
# is loaded and instantiated, handed the scraper, and its `parse` method is
# called with the match result.  The value returned by `parse` is appended
# to the records of that route and returned.
#
# Croaks when $url is false or when no routes are registered.
# Returns 0 when no route matches.
sub parse_document {
    my ($self, $url) = @_;

    my $scraper = $self->scraper;

    croak("Unable to fetch document, URL is not defined") unless $url;
    croak("Can't parse document, No routes defined")
      unless keys %{ $self->routes };

    # try to match against route(s); the keys are sorted so that the winning
    # route is repeatable when more than one route matches the same URL
    for my $route (sort keys %{ $self->routes }) {
        my $this = $scraper->page_match($route, $url);
        next unless $this;

        my $parser = $self->routes->{$route}->{parser};

        #my $action = $self->routes->{$route}->{action};

        no warnings 'redefine';

        # "My::Project::Foo" -> "My/Project/Foo.pm"
        (my $module = "$parser.pm") =~ s{::}{/}g;
        require $module;

        my $new = $parser->new;
        $new->scraper($scraper);

        $self->records->{$route} = []
          unless defined $self->records->{$route};

        my $record = $new->parse($this);
        push @{ $self->records->{$route} }, $record;

        return $record;
    }

    return 0;
}

# crawl
#
# May be called on a class (an object is constructed first) or on an
# existing object.  Adds $starting_url to the scraper's queue, then fetches
# every queued URL in turn and passes it to parse_document when the page
# loaded, is HTML, and has status 200.
#
# Croaks when $starting_url is false.  Returns the records hash reference.
sub crawl {
    my ($class, $starting_url) = @_;

    my $self = ref $class ? $class : $class->new;

    croak("Error, can't execute without a starting url")
      unless $starting_url;

    my $queue = $self->scraper->queue;
    $queue->add($starting_url);

    while (my $url = $queue->next) {

        # parse document data
        $self->scraper->get($url);

        my $parseable =
             $self->scraper->page_loaded
          && $self->scraper->page_ishtml
          && $self->scraper->page_status == 200;

        $self->parse_document($url) if $parseable;
    }

    return $self->records;
}

1;