#!/usr/bin/perl -w
#	$Id: oofilesearch 0.006 2005-02-07$

=head1	NAME

	oofilesearch - File selection by keywords


=head1	SYNOPSIS

	oofilesearch -R "D:\Documents\*.sxw" openoffice desktop XML

	produces the list of the OOo-Writer documents present in the given
	directory and its subdirectories, and containing the words
	"openoffice", "desktop" AND "XML"

	oofilesearch -command "rm -f %f" "*.sxc *.sxw" lost dismiss cancel

	executes the "rm -f filename" (i.e. deletes the file in a Unix system)
	for each OOo-Writer or OOo-Calc file present in the current directory
	and containing the words "lost", "dismiss" AND "cancel"

=head1	USAGE

	oofilesearch [-options] <file filter> <keyword list>


	This utility allows the user to retrieve a list of files matching
	a given set of keywords or regular expressions. A file is selected
	when it contains, in its text and/or in its metadata (title, subject,
	keywords or description), all the given search strings.

	The selected files are echoed to the standard output (one file per
	line), so this utility can be used as a filter piping its results
	to another program. Alternatively, a given shell command can be
	launched by the script each time a file matches, allowing on-the-fly
	processing of the selected documents.

	The file filter may contain one or more space-separated paths.
	Each path may contain wildcards, so it's possible to explore several
	directories and/or several filename patterns.

	All the arguments after the file filter are processed as search
	strings.

=head1	OPTIONS

	-R -recursive
		include the subdirectories of each given search directory
	-verbose -trace -debug
		echo some processing comments
	-warnings
		activate the warning messages of the OpenOffice::OODoc API
	-log <file>
		like -verbose, but then messages are sent to the given file
		and don't pollute the standard output
	-result <file>
	-output <file>
		send the list of matching files to the given file and not
		to the standard output
	-criteria <file>
		get search criteria from a file (one per line); the loaded
		search keywords may be combined with additional criteria
		passed with the command line, if any.
	-command <command> -exec <command>
		execute a shell command for each matching file ; if the
		command string contains "%f", this substring is replaced
		with the name of the selected file ; if this option is
		provided, the selection list is not echoed to the standard
		output ; if -verbose is on, the value returned by the
		command is echoed
	-encoding <encoding>
		selects the user's character set ; this option is mandatory
		if one or more search criteria contain characters not
		belonging to the default character set

=cut


use	OpenOffice::OODoc	1.301;
use	Getopt::Long;

our	$VERSION		= 0.006;

# Command-line settings; each one stays undefined until GetOptions() sees
# the corresponding switch.
my	$recursive	= undef;	# -R / -recursive
my	$verbose	= undef;	# -verbose / -trace / -debug (also forced by -log)
my	$warnings	= undef;	# -warnings
my	$command	= undef;	# -command / -exec
my	$result		= undef;	# -result / -output : file name for the match list
my	$log		= undef;	# -log : file name for the trace messages
my	$list		= undef;	# -criteria : file holding one keyword per line
my	$character_set	= undef;	# -encoding

# Output handles: trace messages go to $LOG, matching file names to $RESULT.
# Both default to the standard output.
my	$LOG		= *STDOUT;
my	$RESULT		= *STDOUT;

# Parse the switches; the recognised options are removed from @ARGV, leaving
# the file filter and the keywords behind.
GetOptions
	(
	'R|recursive'			=> \$recursive,
	'verbose|trace|debug'		=> \$verbose,
	'warnings'			=> \$warnings,
	'log=s'				=> \$log,
	'result|output=s'		=> \$result,
	'command|exec=s'		=> \$command,
	'criteria=s'			=> \$list,
	'encoding=s'			=> \$character_set
	);

# Search criteria (from -criteria and/or the command line) and the number of
# files selected so far.
my	@keywords	= ();
my	$count		= 0;


# Return the current local time formatted as "[DD/MM/YYYY HH:MM:SS] "
# (with a trailing space), ready to be prepended to a trace message.
sub	horodate {
	my @d = localtime();
	# localtime() months are 0-based and years are counted from 1900.
	return sprintf(
		"[%02d/%02d/%04d %02d:%02d:%02d] ",
		$d[3], $d[4] + 1, $d[5] + 1900, $d[2], $d[1], $d[0]
	);
}

# Write a time-stamped trace line to $LOG, followed by one tab-indented
# line per additional argument. Does nothing unless $verbose is set.
#
#   message($text, @details)
sub	message {
	my $text = shift;
	return unless ($verbose);
	print {$LOG} horodate() . "$text\n";
	print {$LOG} "\t$_\n" for @_;
	return;
}


# Tell whether a document contains every search string.
#
#   matching_file($file, @words)
#
# The searchable text is built from the document metadata (title, keywords,
# subject, description) and from its text content, both obtained through
# ooFile/ooMeta/ooText (not defined in this file; presumably exported by
# OpenOffice::OODoc).
# Each word is applied as a case-insensitive regular expression; empty
# words are ignored.
#
# Returns 1 when all the words match; returns nothing (false) when the file
# can't be opened as a document, when it yields no text, or when at least
# one word doesn't match.
sub	matching_file {
	my $file	= shift;
	my @words	= @_;
	my @parts	= ();

	my $oof	= ooFile($file);
	unless ($oof) {
		message("$file doesn't look like an OpenOffice.org file");
		return;
	}

	my $meta	= ooMeta(archive => $oof)
		or message("$file doesn't contain metadata");
	if ($meta) {
		my $title = $meta->title;
		if ($title) {
			message("Title: \"$title\"");
			push @parts, $title;
		}
		else {
			message("Title: <UNTITLED>");
		}
		push @parts, ($meta->keywords || "");
		push @parts, ($meta->subject || "");
		push @parts, ($meta->description || "");
	}

	my $content	= ooText(archive => $oof)
		or message("$file doesn't have a regular content");
	if ($content) {
		push @parts, ($content->getTextContent || "");
	}

	# Keep the fields on separate lines, so that a search string can't
	# accidentally match across the end of one field and the beginning
	# of the next one.
	my $text = join "\n", grep { length } @parts;
	return unless length $text;

	foreach my $word (@words) {
		# Skip empty criteria only; "0" is a legitimate search string.
		next unless defined $word && length $word;
		# $word is deliberately interpolated as a regular expression.
		return unless $text =~ /$word/i;
	}
	return 1;
}


# Check every path of the given list against @keywords.
#
#   file_selection(@paths)
#
# Unreadable paths, symbolic links, empty files and non-regular files are
# skipped (with a trace message). When $recursive is set, a directory is
# explored by calling file_selection() again on its entries.
#
# For each matching file, $count is incremented, then either the -command
# string is run through the shell (each "%f" replaced by the file name), or,
# when no command was given, the file name is printed to $RESULT.
sub	file_selection {
	my @list = @_;
	my $number = scalar @list;

	message("$number file(s) in the search list");
	FILE: foreach my $file (@list) {
		unless (-r $file) {
			message("$file : unreadable");
			next FILE;
		}
		if (-l $file) {
			message("$file : symbolic link, ignored");
			next FILE;
		}
		if ((-d $file) && $recursive) {
			message("Searching in $file");
			if (opendir my $dh, $file) {
				my @names = readdir $dh;
				closedir $dh;
				# Like a shell "*" pattern, ignore the dot
				# entries (including "." and "..").
				my @entries = map { "$file/$_" }
					grep { !/^\./ } sort @names;
				file_selection(@entries);
			}
			else {
				message("$file : unreadable");
			}
			next FILE;
		}
		unless (-s $file) {
			message("$file : empty");
			next FILE;
		}
		unless (-f $file) {
			message("$file is not a regular file");
			next FILE;
		}
		message("Processing $file");
		if (matching_file($file, @keywords)) {
			message("OK! $file matches all the criteria");
			$count++;
			if ($command) {
				# NOTE(review): the file name is inserted
				# unquoted in a shell command line; names
				# with spaces or shell metacharacters need
				# quoting in the -command string itself.
				my $cmd = $command;
				$cmd =~ s/\%f/$file/g;
				message("Executing command: $cmd");
				my $r = system $cmd;
				message("Command result is $r");
			}
			else {
				# Fall back to the standard output if no
				# result handle has been set up.
				print { $RESULT || \*STDOUT } "$file\n";
			}
		}
		else {
			message("file $file doesn't match");
		}
	}
	return;
}

# main program

# Matching file names go to the standard output unless -result/-output
# redirects them to a file.
my $result_fh;
$RESULT = *STDOUT;
if ($result) {
	open $result_fh, ">", $result
		or die "output file $result is unwritable\n";
	$RESULT = $result_fh;
}

# -log implies -verbose and sends the trace messages to the log file
# (opened in append mode) instead of the standard output.
my $log_fh;
if ($log) {
	open $log_fh, ">>", $log
		or die "log file $log is unwritable\n";
	$LOG = $log_fh;
	$verbose = 1;
}

# Optional criteria file: one search string per line.
if ($list) {
	message("Loading a keyword list from $list");
	if (open my $list_fh, "<", $list) {
		while (my $m = <$list_fh>) {
			chomp $m;
			push @keywords, $m;
		}
		close $list_fh;
	}
	else {
		# Not fatal: criteria may still come from the command line.
		warn "file $list is unreadable\n";
	}
}

localEncoding($character_set)	if $character_set;

die "Usage: oofilesearch [-options] <filefilter> [keywords]\n"
		unless (defined $ARGV[0] && length $ARGV[0]);

message("Starting the search...");

# The first remaining argument is the file filter, the others are keywords.
my $filter = shift @ARGV;
push @keywords, @ARGV;
die "Empty keword list.\n" unless @keywords;
message("Keyword list:", @keywords);

# Silence the warnings (including those of the document API) unless
# -warnings was given.
unless ($warnings) {
	$SIG{'__WARN__'} = sub {};
}

# The filter may hold several space-separated patterns; glob() expands
# each of them.
file_selection(glob($filter));

message("Finished - $count file(s) selected");

# Buffered write errors only show up when the handle is closed.
if ($result_fh) {
	close $result_fh
		or die "output file $result could not be closed: $!\n";
}
close $log_fh if $log_fh;