#!/usr/bin/perl
#
# 1999@mei@websec.org
# mini spider with a
# couple of special
# features
#
#
# todo:
# -
#
#
##############################################################
use LWP;
use Getopt::Std;
use HTTP::Request::Common;
use HTTP::Response;
use HTML::TokeParser;
use IO::Scalar;
use URI::Find;
use vars qw($opt_u $opt_l $opt_p $opt_b $opt_q $opt_d);
getopts("u:l:p:b:qd");
## vardecs
##
my $url = $opt_u;
my $logfile = $opt_l;
my $proxy = $opt_p;
my $backupfile = $opt_b;
my $query = ($opt_q ? 1 : 0);
my @base;
## check that
##
if (!$url)
{
print "\nusage: $0 -u [URL]\n\t-p [proxyServer]\n\t-l [logFile]";
print "\n\t -d [considerDirectories]\n\t-q [accept queries(?a=b&c=d)]\n\n";
exit 11;
}
## a bit of input-validation :-)
##
($url = "http://".$url) if ($url !~ /http:\/\// && $url !~ /https:\/\//);
my $origin = $base[0] = $url;
my $user_agent = new LWP::UserAgent;
$user_agent->agent("Mozilla/4.0(compatible;MSIE 6.0;Windows NT 5.0)");
if ($proxy ne '')
{
$proxy = "http://".$proxy if ($proxy !~ /^.[^:\/]+:\/\//);
$user_agent->proxy('http', $proxy) if($url =~ /^http:\/\/.*$/);
$user_agent->proxy('https', $proxy) if($url =~ /^https:\/\/.*$/);
}
my (@alle, @alldirs);
my ($single, $sidir);
## cool recursive function
## checks all links of
## given url
##
&getall($user_agent, $origin, $url, \@alle, $query);
## remove doubles
## from all entries
##
@alle = sort keys %{{ map {$_,1} @alle }};
if ($logfile ne '')
{
open (RF, "> $logfile") || die "\ncant open $logfile !?!\n";
open (RFD, "> $logfile.dirs") || die "\ncant open $logfile.dirs !?!\n";
}
## all the entries
##
foreach $single (@alle)
{
$single =~ /(^.+\/)(.[^\/]*$)/;
push (@alldirs, $1);
print RF "$single\n" if ($logfile ne '');
}
## all the dirs
## remove doubles first
##
@alldirs = sort keys %{{ map {$_,1} @alldirs }};
foreach $sidir (@alldirs)
{
print STDERR "$sidir\n";
print RFD "$sidir\n" if ($logfile ne '');
}
if ($logfile ne '')
{
close (RF);
close (RFD);
}
print "\ndone\n";
exit 1;
####################
## sub extract_links
##
sub extract_links
{
my $ua = shift;
my $inc = shift;
my $url = shift;
my @links;
my $sili = "";
my $response = $ua->request(GET "$url");
## check for:
## frame-src, a-href, form-action, img-src
## todo: java-script adresses ?!?!
if ($response->is_success)
{
my $c = $response->content();
push (@links, &parseit($c, $inc, "frame", "src"));
push (@links, &parseit($c, $inc, "iframe", "src"));
push (@links, &parseit($c, $inc, "a", "href"));
push (@links, &parseit($c, $inc, "a", "name"));
push (@links, &parseit($c, $inc, "form", "action"));
push (@links, &parseit($c, $inc, "img", "src"));
push (@links, &parseit($c, $inc, "object", "data"));
push (@links, &parseit($c, $inc, "applet", "code"));
push (@links, &parseit($c, $inc, "embed", "src"));
}
## done
##
return @links;
}
## parseit
## parses html-content
## and returns array containing links
##
sub parseit
{
my $c = shift;
my $inc = shift;
my $first = shift;
my $second = shift;
my $fcont = new IO::Scalar \$c;
my @links;
## in case of frames
##
$f_page = HTML::TokeParser->new($fcont);
while (my $token = $f_page->get_tag("$first"))
{
my $link = $token->[1]{"$second"} || "";
push(@links,$link);
}
$fcont->close;
## the supertrick parse all content for URI like strings
## applets flash pdfs etc
##
my $sl1 = $sl2 = "";
my $finder = URI::Find->new( sub{ $sl1 = @_[0]; $sl2 = @_[1]; } );
$finder->find(\$c);
push(@links,$sl1) if ($sl1 ne "");
return @links;
}
## get_all
## nice and recursive
## todo: create proper urls :-)
##
sub getall
{
my $ua = shift;
my $inc = shift;
my $tmp_uri = shift;
my $all_links = shift;
my $q = shift;
if ( ($tmp_uri =~ /^$inc/) || ($tmp_uri !~ /^.[^:\/]+:\/\// && $tmp_uri !~ /mailto:/) )
{
## try to create a usable url
##
my $uri = &clean_uri($tmp_uri, $inc, $q);
## only if not existing
##
if ( !search_array($all_links, $uri) )
{
print STDERR "$uri\n";
## extract links into temp array
##
my @temp = &extract_links($ua, $inc, $uri);
push(@{$all_links}, $uri);
foreach my $single (@temp)
{
&getall($ua, $inc, $single, $all_links, $q)
}
}
}
}
## search_array
## check if given entry(needle) is
## in array
## todo: find a faster algorythm :-)
##
sub search_array
{
$array = shift;
$needle = shift;
for (my $c=0; $c<=$#{$array}; $c++)
{
return 1 if ($needle eq @{$array}[$c]);
}
return 0;
}
## create proper uri second try
## todo: remove ".." relative signatures ?!
##
sub clean_uri
{
my $tmp_uri = shift;
my $inc = shift;
my $q = shift;
my($iprot, $idom, $ipath, $iquery, $ifrags) =
$inc =~ m|^(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;
my($prot, $dom, $path, $query, $frags) =
$tmp_uri =~ m|^(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;
## clean a bit
##
($path = "/".$path) if($path !~ /^\//);
## the ugly dot dot problem
## is not solved by the URI-module
## therefore ...
##
#my $ret = $iprot."://".$idom.$path;
$query = "?".$query if ($query ne '');
my $ret = $iprot."://".$idom.$path.($q > 0 ? $query.$frags : '');
$ret =~ s/\/\.\//\//g;
while ($ret =~ /$iprot:\/\/$idom\/\.\.\//)
{
$ret =~ s/($iprot:\/\/$idom\/)\.\.\//$1/;
}
if ($ret =~ /\.\./ && $ret !~ /$iprot:\/\/$idom\/\.\.\//)
{
$ret =~ s/\/.[^\/]+\/\.\.\///;
}
return $ret;
}