#!/usr/local/bin/perl -w

# ----------------------------------------------------------------------
# urifind - find URIs in a document and dump them to STDOUT.
# Copyright (C) 2003 darren chamberlain <darren@cpan.org>
# ----------------------------------------------------------------------

use strict;

our $VERSION = 20160806;

use File::Basename qw(basename);
use Getopt::Long qw(GetOptions);
use IO::File;
use URI::Find;
use URI::Find::Schemeless;

# What to do, and how
my $help = 0;
my $version = 0;
my $sort = 0;
my $reverse = 0;
my $unique = 0;
my $prefix = 0;
my $noprefix = 0;
my @pats = ();
my @schemes = ();
my $dump = 0;
my $schemeless = 0;

Getopt::Long::Configure(qw{no_ignore_case bundling});
GetOptions(
    's!'   => \$sort,
    'u!'   => \$unique,
    'p!'   => \$prefix,
    'n!'   => \$noprefix,
    'r!'   => \$reverse,
    'h!'   => \$help,
    'v!'   => \$version,
    'd!'   => sub { $dump = 1 },
    'D!'   => sub { $dump = 2 },
    'P=s@' => \@pats,
    'S=s@' => \@schemes,
    'schemeless!' => \$schemeless,
);

if ($help || $version) {
    my $prog = basename($0);

    if ($help) {
        print <<HELP;
$prog - find URIs in a document and dump them to STDOUT.

    $prog [OPTIONS] file1 [file2[, file3[, ...]]]

Options:

    -s          Sort results.
    -r          Reverse sort results (implies -s).
    -u          Return unique results only.
    -n          Don't include filename in output.
    -p          Include filename in output (0 by default, but 1 if
                multiple files are included on the command line).
    -P \$re      Print only lines matching regex '\$re' 
                (may be specified multiple times).
    -S \$scheme  Only this scheme (may be specified multiple times).
    -h          This help screen.
    -v          Display version and exit.
    -d          Dump compiled regexes and continue.
    -D          Dump compiled regexes and exit.

HELP
    }
    else {
        printf "$prog v.%.02f\n", $VERSION;
    }

    exit(0);
}

my (@uris, $count);
unshift @ARGV, \*STDIN unless @ARGV;

if (($prefix + $noprefix) > 1) {
    my $prog = basename $0;
    die "Can't specify -p and -n at the same time; try $prog -h\n";
}


# Print filename with matches?  -p / -n
# If there is more than one file, then show filenames by
# default, unless explicitly asked not to (-n)
if (@ARGV > 1) {
    $prefix = 1 unless $noprefix;
}
else {
    $prefix = 0 unless $prefix;
}

# Add schemes to the list of regexen
if (@schemes) {
    unshift @pats => sprintf '^(\b%s\b):' => join '\b|\b' => @schemes;
}

# If we are dumping (-d, -D), then dump.  Exit if -D.
if ($dump) {
    print STDERR "\$scheme = '" . (defined $pats[0] ? $pats[0] : '') . "'\n";
    print STDERR "\@pats = ('" . join("', '", @pats) . "')\n";
    exit if $dump == 2;
}

# Find the URIs
for my $argv (@ARGV) {
    my ($name, $fh, $data);

    $argv = \*STDIN if ($argv eq '-');

    if (ref $argv eq 'GLOB') {
        local $/;
        $data = <$argv>;
        $name = '<stdin>'
    }
    else {
        local $/;
        $fh = IO::File->new($argv) or die "Can't open $argv: $!";
        $data = <$fh>;
        $name = $argv;
    }

    my $class = $schemeless ? "URI::Find::Schemeless" : "URI::Find";
    my $finder = $class->new(sub { push @uris => [ $name, $_[0] ] });
    $finder->find(\$data);
}

# Apply patterns, in @pats
for my $pat (@pats) {
    @uris = grep { $_->[1] =~ /$pat/ } @uris;
}

# Remove redundant links
if ($unique) {
    my %unique;
    @uris = grep { ++$unique{$_->[1]} == 1 } @uris;
}

# Sort links, possibly in reverse
if ($sort || $reverse) {
    if ($reverse) {
        @uris = sort { $b->[1] cmp $a->[1] } @uris;
    }
    else {
        @uris = sort { $a->[1] cmp $b->[1] } @uris;
    }
}

# Flatten the arrayrefs
if ($prefix) {
    @uris = map { join ': ' => @$_ } @uris;
}
else {
    @uris = map { $_->[1] } @uris;
}

print map { "$_\n" } @uris;

exit 0;

__END__

=head1 NAME

urifind - find URIs in a document and dump them to STDOUT.

=head1 SYNOPSIS

    $ urifind file

=head1 DESCRIPTION

F<urifind> is a simple script that finds URIs in one or more files
(using C<URI::Find>), and outputs them to to STDOUT.  That's it.

To find all the URIs in F<file1>, use:

    $ urifind file1

To find the URIs in multiple files, simply list them as arguments:

    $ urifind file1 file2 file3

F<urifind> will read from C<STDIN> if no files are given or if a
filename of C<-> is specified:

    $ wget http://www.boston.com/ -O - | urifind

When multiple files are listed, F<urifind> prefixes each found URI
with the file from which it came:

    $ urifind file1 file2
    file1: http://www.boston.com/index.html
    file2: http://use.perl.org/

This can be turned on for single files with the C<-p> ("prefix") switch:

    $urifind -p file3
    file1: http://fsck.com/rt/

It can also be turned off for multiple files with the C<-n> ("no
prefix") switch:

    $ urifind -n file1 file2
    http://www.boston.com/index.html
    http://use.perl.org/

By default, URIs will be displayed in the order found; to sort them
ascii-betically, use the C<-s> ("sort") option.  To reverse sort them,
use the C<-r> ("reverse") flag (C<-r> implies C<-s>).

    $ urifind -s file1 file2
    http://use.perl.org/
    http://www.boston.com/index.html
    mailto:webmaster@boston.com

    $ urifind -r file1 file2
    mailto:webmaster@boston.com
    http://www.boston.com/index.html
    http://use.perl.org/

Finally, F<urifind> supports limiting the returned URIs by scheme or
by arbitrary pattern, using the C<-S> option (for schemes) and the
C<-P> option.  Both C<-S> and C<-P> can be specified multiple times:

    $ urifind -S mailto file1
    mailto:webmaster@boston.com

    $ urifind -S mailto -S http file1
    mailto:webmaster@boston.com
    http://www.boston.com/index.html

C<-P> takes an arbitrary Perl regex.  It might need to be protected
from the shell:

    $ urifind -P 's?html?' file1
    http://www.boston.com/index.html

    $ urifind -P '\.org\b' -S http file4
    http://www.gnu.org/software/wget/wget.html

Add a C<-d> to have F<urifind> dump the refexen generated from C<-S>
and C<-P> to C<STDERR>.  C<-D> does the same but exits immediately:

    $ urifind -P '\.org\b' -S http -D 
    $scheme = '^(\bhttp\b):'
    @pats = ('^(\bhttp\b):', '\.org\b')

To remove duplicates from the results, use the C<-u> ("unique")
switch.

=head1 OPTION SUMMARY

=over 4

=item -s

Sort results.

=item -r

Reverse sort results (implies -s).

=item -u

Return unique results only.

=item -n

Don't include filename in output.

=item -p

Include filename in output (0 by default, but 1 if multiple files are
included on the command line).

=item -P $re

Print only lines matching regex '$re' (may be specified multiple times).

=item -S $scheme

Only this scheme (may be specified multiple times).

=item -h

Help summary.

=item -v

Display version and exit.

=item -d

Dump compiled regexes for C<-S> and C<-P> to C<STDERR>.

=item -D

Same as C<-d>, but exit after dumping.

=back

=head1 AUTHOR

darren chamberlain E<lt>darren@cpan.orgE<gt>

=head1 COPYRIGHT

(C) 2003 darren chamberlain

This library is free software; you may distribute it and/or modify it
under the same terms as Perl itself.

=head1 SEE ALSO

L<URI::Find>