=head1 NAME

KinoSearch::Docs::Tutorial - sample indexing and search applications

=head1 DESCRIPTION

The following sample code for invindexer.plx and search.cgi can be used to
create a simple search engine. It requires the html presentation of the US
Constitution included in the distribution for KinoSearch, under
C<t/us_constitution>.

Note that a proper indexer for html documents would not rely on quick-n-dirty
regular expressions for stripping tags, as this one does for the sake of
brevity -- it would use a dedicated parsing module such as L<HTML::Parser>.

=head2 invindexer.plx

    #!/usr/bin/perl
    use strict;
    use warnings;

    use File::Spec;
    use KinoSearch::InvIndexer;
    use KinoSearch::Analysis::PolyAnalyzer;

    ### In order for invindexer.plx to work correctly, you must modify
    ### $source_dir, $path_to_invindex, and possibly $base_url.
    ###
    ### $source_dir must lead to the directory containing the US
    ### Constitution html files.
    ###
    ### $path_to_invindex is the future location of the invindex.
    ###
    ### $base_url should reflect the location of the us_constitution directory
    ### when accessed via a web browser.
    my $source_dir       = '';
    my $path_to_invindex = '';
    my $base_url         = '/us_constitution';

    opendir( my $source_dh, $source_dir )
        or die "Couldn't opendir '$source_dir': $!";
    # Anchor the match so that only names ending in ".html" are picked up
    # (and not, say, "foo.html.bak").
    my @filenames = grep {/\.html\z/} readdir $source_dh;
    closedir $source_dh or die "Couldn't closedir '$source_dir': $!";

    ### STEP 1: Choose an Analyzer.
    my $analyzer = KinoSearch::Analysis::PolyAnalyzer->new(
        language => 'en',
    );

    ### STEP 2: Create an InvIndexer object.
    my $invindexer = KinoSearch::InvIndexer->new(
        analyzer => $analyzer,
        invindex => $path_to_invindex,
        create   => 1,
    );

    ### STEP 3: Define fields.
    $invindexer->spec_field( name => 'title' );
    $invindexer->spec_field(
        name       => 'bodytext',
        vectorized => 1,
    );
    $invindexer->spec_field(
        name    => 'url',
        indexed => 0,
    );

    foreach my $filename (@filenames) {
        # The table of contents page is not a document worth indexing.
        next if $filename eq 'index.html';
        my $filepath = File::Spec->catfile( $source_dir, $filename );
        open( my $fh, '<', $filepath )
            or die "couldn't open file '$filepath': $!";
        # Slurp the whole file in one read.
        my $content = do { local $/; <$fh> };

        ### STEP 4: Start a new document.
        my $doc = $invindexer->new_doc;

        $content =~ m#<title>(.*?)</title>#s
            or die "couldn't isolate title in '$filepath'";
        my $title = $1;
        $content =~ m#<div id="bodytext">(.*?)</div><!--bodytext-->#s
            or die "couldn't isolate bodytext in '$filepath'";
        my $bodytext = $1;
        $bodytext =~ s/<.*?>/ /gsm;    # quick and dirty tag stripping

        ### STEP 5: Set the value for each field.
        $doc->set_value( url      => "$base_url/$filename" );
        $doc->set_value( title    => $title );
        $doc->set_value( bodytext => $bodytext );

        ### STEP 6: Add the document to the invindex.
        $invindexer->add_doc($doc);

        ### STEP 7: Repeat steps 4-6 for each document in the collection.
    }

    ### STEP 8: Finalize the invindex.
    $invindexer->finish;

=head2 search.cgi

    #!/usr/bin/perl -T
    use strict;
    use warnings;

    use CGI;
    use List::Util qw( max min );
    use POSIX qw( ceil );
    use KinoSearch::Searcher;
    use KinoSearch::Analysis::PolyAnalyzer;
    use KinoSearch::Highlight::Highlighter;

    my $cgi           = CGI->new;
    my $q             = $cgi->param('q');
    my $offset        = $cgi->param('offset');
    my $hits_per_page = 10;
    $q = '' unless defined $q;
    # Accept only a non-negative integer offset (untainting it in the
    # process); anything else restarts paging at the first hit.
    $offset = ( defined $offset && $offset =~ /\A(\d+)\z/ ) ? $1 : 0;

    ### In order for search.cgi to work, $path_to_invindex must be modified so
    ### that it points to the invindex created by invindexer.plx, and
    ### $base_url may have to change to reflect where a web-browser should
    ### look for the us_constitution directory.
    my $path_to_invindex = '';
    my $base_url         = '/us_constitution';

    ### STEP 1: Specify the same Analyzer used to create the invindex.
    my $analyzer = KinoSearch::Analysis::PolyAnalyzer->new(
        language => 'en',
    );

    ### STEP 2: Create a Searcher object.
    my $searcher = KinoSearch::Searcher->new(
        invindex => $path_to_invindex,
        analyzer => $analyzer,
    );

    ### STEP 3: Feed a query to the Searcher object.
    my $hits = $searcher->search($q);

    ### STEP 4: Arrange for highlighted excerpts to be created.
    my $highlighter = KinoSearch::Highlight::Highlighter->new(
        excerpt_field => 'bodytext'
    );
    $hits->create_excerpts( highlighter => $highlighter );

    ### STEP 5: Process the search.
    $hits->seek( $offset, $hits_per_page );

    ### STEP 6: Format the results however you like.

    # create result list
    my $report = '';
    while ( my $hit = $hits->fetch_hit_hashref ) {
        my $score = sprintf( "%0.3f", $hit->{score} );
        $report .= qq|
            <p>
                <a href="$hit->{url}"><strong>$hit->{title}</strong></a>
                <em>$score</em>
                <br>
                $hit->{excerpt}
                <br>
                <span class="excerptURL">$hit->{url}</span>
            </p>
            |;
    }

    # The query is echoed back into the page below, both as text and inside
    # an attribute value, so escape all HTML metacharacters -- escaping only
    # double quotes would let markup embedded in the query through.
    $q = $cgi->escapeHTML($q);

    # display info about the number of hits, paging links
    my $total_hits = $hits->total_hits;
    my $num_hits_info;
    if ( !length $q ) {
        # no query, no display
        $num_hits_info = '';
    }
    elsif ( $total_hits == 0 ) {
        # alert the user that their search failed
        $num_hits_info = qq|
            <p class="no-matches">
                No matches for <strong>$q</strong>
            </p>
            |;
    }
    else {
        # calculate the nums for the first and last hit to display
        my $last_result = min( ( $offset + $hits_per_page ), $total_hits );
        my $first_result = min( ( $offset + 1 ), $last_result );

        # display the result nums, start paging info
        $num_hits_info = qq|
            <p>
                Results <strong>$first_result-$last_result</strong>
                of <strong>$total_hits</strong> for <strong>$q</strong>.
            </p>
            <p>
                Results Page:
            |;

        # calculate first and last hits pages to display / link to
        my $current_page = int( $first_result / $hits_per_page ) + 1;
        my $last_page    = ceil( $total_hits / $hits_per_page );
        my $first_page   = max( 1, ( $current_page - 9 ) );
        $last_page = min( $last_page, ( $current_page + 10 ) );

        # create a url for use in paging links
        my $href = $cgi->url( -relative => 1 ) . "?" . $cgi->query_string;
        $href .= ";offset=0" unless $href =~ /offset=/;

        # generate the "Prev" link;
        if ( $current_page > 1 ) {
            my $new_offset = ( $current_page - 2 ) * $hits_per_page;
            $href =~ s/(?<=offset=)\d+/$new_offset/;
            $num_hits_info .= qq|<a href="$href">&lt;= Prev</a>\n|;
        }

        # generate paging links
        for my $page_num ( $first_page .. $last_page ) {
            if ( $page_num == $current_page ) {
                # the current page is shown as plain text, not as a link
                $num_hits_info .= qq|$page_num \n|;
            }
            else {
                my $new_offset = ( $page_num - 1 ) * $hits_per_page;
                $href =~ s/(?<=offset=)\d+/$new_offset/;
                $num_hits_info .= qq|<a href="$href">$page_num</a>\n|;
            }
        }

        # generate the "Next" link
        if ( $current_page != $last_page ) {
            my $new_offset = $current_page * $hits_per_page;
            $href =~ s/(?<=offset=)\d+/$new_offset/;
            $num_hits_info .= qq|<a href="$href">Next =&gt;</a>\n|;
        }

        # finish paging links
        $num_hits_info .= "</p>\n";
    }

    # blast it all out
    print "Content-type: text/html\n\n";
    print <<END_HTML;
    <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        "http://www.w3.org/TR/html4/loose.dtd">
    <html>
    <head>
        <title>KinoSearch: $q</title>
    </head>
    <body>
        <form action="">
            <input type="text" name="q" value="$q">
            <input type="submit" value="Search">
        </form>

        $report

        $num_hits_info

        <p>
            <em>Powered by KinoSearch</em>
        </p>
    </body>
    </html>
    END_HTML

=head1 COPYRIGHT

Copyright 2005-2006 Marvin Humphrey

=head1 LICENSE, DISCLAIMER, BUGS, etc.

See L<KinoSearch> version 0.15.