package WWW::Search::TheITJobBoard; use strict; use warnings; our $VERSION = '0.04'; our $DEBUG = 0; our $MAINTAINER = 'Lee Goddard new('TheITJobBoard', _debug => undef); $oSearch->native_query( WWW::Search::escape_query("perl"), jobtype => WWW::Search::TheITJobBoard::CONTRACT, ); while (my $oResult = $oSearch->next_result){ warn Dumper $oResultr; } =head1 DEPENDENCIES L, L. =head1 DESCRIPTION Gets jobs from the UK IT job site, I. A sub-class of L that uses L to return C objects for each result found when querying C. One frustrating aspect of I is that, unlike I (L, it doesn't provide an option to list jobs with full descriptions. So this module offers the ability to create such a list: provide the constructor parameter C, with a value of C or C to get details as HTML or plain text. This will extend the C objects' C field, and also add a C
key, which is itself a hash with interesting keys such as C and C - those keys come directly from the HTML page, so YMMV. At the time of writing, valid options for The IT Job Board search are as below. These should be passed to C, as shown in the example. =over 4 =item keywords THe keywords your target job description should contain. Default is C, of course. =item jobtype Valid values are: C<1> for contract (our default), C<2> for permenant, and C<0> for either You may use constants: C, C, C. =cut use constant ANY => 0; use constant CONTRACT => 1; use constant PERM => 2; =item days The age of the posting, in days, according to the site's records. A value of C<0> represents any age. Our default is C<1>. =item orderby Not especially relevant for us: valid values are C<1> to order by relevance to the keywords; C<2> to order by date posted; C<3> orders by salary; C<4> puts non-agency jobs first, which is the default. You may use constants: C, C, C, C =cut use constant RELEVANCE => 1; use constant DATE => 2; use constant SALARY => 3; use constant NONAGENCY => 4; =item locations[] This ugly-named entity limits the search by location. The default is to return all jobs, regardless. Valid values are: C to return all jobs; C<180> for UK, C<124> for Netherlands, C<93> for Germany, C<69> for France, C<308> for Switzerland, C<170> for Republic Of Ireland, C<3> for Austria, C<301> for 'the rest Of the world,' C<254> for 'other European.' You may supply C instead of C, but you will still have to access the value you set via the latter. =item currpage The number of the page to start at, indexed from C<1>. =item lang Defaults to C for English, but you could try other two-letter ISO codes. =back =cut # "native_setup_search()" is invoked before the search. It is passed a # single argument: the escaped, native version of the query. # # Blx, it takes opts too sub native_query { my ($self, $native_query, $opts) = (shift, shift, ref($_[0])? shift : {@_}); return $self->SUPER::native_query( $native_query, $opts); } # =head2 METHOD native_setup_search # # Sets up default search parameters. B that the first argument is the same as the option C, above. Unlike other modules in this namespace, you may provide options, after the query, # as either a reference to a hash, or as a list. # # =cut sub native_setup_search { my ($self, $native_query, $opts) = (shift, shift, ref($_[0])? shift : {@_}); $self->user_agent('non-robot'); $self->{_hits_per_page} = 100; $self->{_next_to_retrieve} = 1; $self->{search_base_url} ||= ''; $self->{search_base_path} ||= '/index.php'; $self->{search_url} ||= $self->{search_base_url} . $self->{search_base_path}; $self->{_options}->{keywords} ||= $native_query || 'PERL'; $self->{_options}->{task} ||= 'JobSearch'; $self->{_options}->{xc} ||= '0'; $self->{_options}->{lang} ||= 'en'; if ($self->{_options}->{locations}){ $self->{_options}->{'locations[]'} = $self->{_options}->{locations}; delete $self->{_options}->{locations}; } $self->{_options}->{'locations[]'} ||= undef; # 180=UK $self->{_options}->{jobtype} ||= '1'; # 0=any, 1=contract, 2=perm $self->{_options}->{days} ||= '1'; # 0=all, otherwise literal $self->{_options}->{orderby} ||= '4'; # 1=relevance, 2=date posted, 3=salary, 4=non-agency $self->{_options}->{currpage} ||= 1; # Current page of results $self->{_debug} ||= $DEBUG; if ($opts){ foreach my $i (keys %$opts){ $self->{_options}->{$i} = $opts->{$i}; } } $self->{_next_url} = $self->{search_url} .'?' . $self->hash_to_cgi_string($self->{_options}); } =head2 METHOD native_retrieve_some Retrieve some results. Returns the number of results found. Will make as many HTTP requests as necessary to get to the maximum results you specify. =cut # After WWW::Search::Yahoo::Advanced sub native_retrieve_some { my $self = shift; $self->{state} = WWW::Search::SEARCH_BEFORE; $self->{number_retrieved} ||= 0; $self->{_done_uris} = {}; # printf STDERR (" + %s::native_retrieve_some()\n", __PACKAGE__) if $self->{_debug}; # Initially _next_url is preset; we then undef it and try to get it from the HTML returned by the query: HTTP_REQUEST: while (defined $self->{_next_url} and not exists $self->{_done_uris}->{$self->{_next_url}}){ # fast exit if already done # If this is not the first page of results, sleep so as to not overload the server: $self->user_agent_delay if 1 < $self->{_next_to_retrieve}; $self->{state} = WWW::Search::SEARCH_RETRIEVING; # Get one page of results: print STDERR " + submitting URL (", $self->{_next_url}, ")\n" if $self->{_debug}; $self->{response} = $self->http_request($self->http_method, $self->{_next_url}); print STDERR " + got response\n", $self->{response}->headers->as_string, "\n" if 2 <= $self->{_debug}; $self->{_prev_url} = $self->{_next_url}; $self->{_done_uris}->{$self->{_next_url}} ++; $self->{state} = WWW::Search::SEARCH_UNDERWAY; # Assume there are no more results, unless we find out otherwise when we parse the html: $self->{_next_url} = undef; print STDERR " --- HTTP response is:\n", $self->{response}->as_string if 4 < $self->{_debug}; if (! $self->{response}->is_success) { if ($self->{_debug}) { print STDERR " --- HTTP request failed, response is:\n", $self->{response}->as_string; } return undef; } my $html = $self->{response}->content; # Parse the output: # No matches: if ($html =~ /There are no vacancies matching your search criteria/g){ if ($self->{_debug}) { print STDERR " --- search failed: there are no vacancies matching your search criteria"; } return undef; } $self->{_parser} = HTML::TokeParser->new(\$html); # Nice ppl they are, they provide a
containing lots of
# so loop over div class jobdet: PARSE_PAGE: while (my $token = $self->{_parser}->get_tag("div")) { # A 'div' with a 'class' attribute: if ($token->[1] and defined $token->[1]->{class}){ # Found a job description: use Data::Dumper; die Dumper $token if not $token->[1]->{class}; if ($token->[1]->{class} eq 'jobdet'){ my $xtoken_a = $self->{_parser}->get_tag("a"); if (not $xtoken_a){ print STDERR " --- unexpected result format (code 'a'): please inform author"; return undef; } my $title = $self->{_parser}->get_trimmed_text("/a"); if (not $title){ print STDERR " --- unexpected result format (code '/a'): please inform author"; return undef; } my $token_br = $self->{_parser}->get_tag("br"); if (not $token_br){ print STDERR " --- unexpected result format (code 'br'): please inform author"; return undef; } my $all = $self->{_parser}->get_trimmed_text("/p"); if (not $token_br){ print STDERR " --- unexpected result format (code '/p'): please inform author"; return undef; } my ($last_posted) = $all =~ /LAST POSTED: (\d{2}\/\d{2}\/\d{4} \d{2}:\d{2}:\d{2})/g; my $hit = WWW::SearchResult->new; $hit->add_url( ($xtoken_a->[1]->{href} =~ /^\//? $self->{search_base_url} : '') . $xtoken_a->[1]->{href} ); $hit->title( $title ); $hit->description( $all ); $hit->change_date( $last_posted ); # Get the details, if user desires: if ($self->{detailed}){ $hit->{details} = {} if not exists $hit->{details}; my $response = $self->http_request('GET', $hit->url); print STDERR " + got response for detail\n", $response->headers->as_string, "\n" if 2 <= $self->{_debug}; print STDERR " --- HTTP response for detail is:\n", $response->as_string if 4 < $self->{_debug}; if (! $response->is_success) { if ($self->{_debug}) { print STDERR " --- HTTP request for detail failed, response is:\n", $response->as_string; } return undef; } my $p = HTML::TokeParser->new(\$response->content); # Get to the div id "jobdescription" while (my $xtoken = $p->get_tag("div")){ last if $xtoken->[1]->{id} and $xtoken->[1]->{id} eq 'jobdescription'; } # Just get text? if (lc $self->{detailed} ne 'html'){ $hit->{description} = $p->get_trimmed_text('/div') } else { $hit->{description} = ''; while ($token = $p->get_token){ # reuse $token if ($token->[0] eq 'S'){ $hit->{description} .= $token->[4]; } elsif ($token->[0] eq 'S'){ $hit->{description} .= $token->[2]; } elsif ($token->[0] eq 'T'){ $hit->{description} .= $token->[2]; } } } # Get to the table after the div/id=jobparticulars and div/id=jobs while (my $xtoken = $p->get_tag("div")){ last if $xtoken->[1]->{id} and $xtoken->[1]->{id} eq 'jobs'; } $p->get_tag("table"); # use the table labels as hash keys - naughty...but nice! my $key; while (my $xtoken = $p->get_token){ last if $xtoken->[0] eq 'E' and $xtoken->[1] eq 'table'; next if $xtoken->[0] ne 'S' or $xtoken->[1] ne'td'; # Set the key if (exists $xtoken->[2]->{class} and $xtoken->[2]->{class} eq 'jobslabel'){ $key = $p->get_trimmed_text('/td'); $key =~ s/\s+/_/sg; $key =~ s/[^\w\d_]+//sg; $key = lc $key; } # Set the value elsif ($key) { $hit->{details}->{$key} = $p->get_trimmed_text('/td'); undef $key; } } # Next token of details' table } # End getting details push @{$self->{cache}}, $hit; $self->{_num_hits} ++; } # End found job } # End found div/class # Links to more results elsif ($token->[1]->{id} and $token->[1]->{id} eq 'prevnextpage'){ while (my $xtoken_a = $self->{_parser}->get_tag("a")){ next unless $xtoken_a->[1]->{title} and $xtoken_a->[1]->{title} eq 'Next'; if ($WIN){ # Weird bug in the latest (3.19) HTML::Parser.ppm TokeParser binary from AS # So have to do this: ($xtoken_a->[1]->{href}) = $xtoken_a->[3] =~ /href=([^\s>]+)/; ($xtoken_a->[1]->{href}) = $xtoken_a->[1]->{href} =~ /^.(.*).$/; } my $uri = ($xtoken_a->[1]->{href} =~ /^\//? $self->{search_base_url} : '') . $xtoken_a->[1]->{href}; # IF we've not done it: if (not exists $self->{_done_uris}->{$uri}){ $self->{_next_url} = $uri; print STDERR " + next URI: ".$self->{_next_uri} if $self->{_debug}; last PARSE_PAGE; } # else { it's the end of the result set } } } # End found more results } # Next PARSE_PAGE $self->{_options}->{currpage} ++; last HTTP_REQUEST if $self->{number_retrieved} + $self->{_num_hits} >= $self->{maximum_to_retrieve}; } # Next HTTP_REQUEST $self->{state} = WWW::Search::SEARCH_DONE; return $self->{_num_hits}; } 1; __END__ =head1 BUGS Posibly. Please use to report them. =head1 SEE ALSO This module was composed after reading L, L, L and L. If this module is useful to you, check out the latter too. =head1 COPYRIGHT Copyright (C) Lee Goddard, 2006. Some Rights Reserved. =head1 LICENCE This work is licensed under a I: L. =for html Creative Commons License
This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 2.0 England & Wales License. =end html =for xml =end xml =cut