Home  Services  Contact  Projects  Whitepapers  Tools  Partners 


#!/usr/bin/perl
#
# 1999@mei@websec.org
# mini spider with a
# couple of special
# features
#
#
# todo:
# -
#
#
##############################################################

use LWP;
use Getopt::Std;
use HTTP::Request::Common;
use HTTP::Response;
use HTML::TokeParser;
use IO::Scalar;
use URI::Find;

use vars qw($opt_u $opt_l $opt_p $opt_b $opt_q $opt_d);
getopts("u:l:p:b:qd");

## vardecs
##
my $url = $opt_u;
my $logfile = $opt_l;
my $proxy = $opt_p;
my $backupfile = $opt_b;
my $query = ($opt_q ? 1 : 0);
my @base;

## check that
##
if (!$url)
{
print "\nusage: $0 -u [URL]\n\t-p [proxyServer]\n\t-l [logFile]";
print "\n\t -d [considerDirectories]\n\t-q [accept queries(?a=b&c=d)]\n\n";
exit 11;
}


## a bit of input-validation :-)
##
($url = "http://".$url) if ($url !~ /http:\/\// && $url !~ /https:\/\//);

my $origin = $base[0] = $url;

my $user_agent = new LWP::UserAgent;
$user_agent->agent("Mozilla/4.0(compatible;MSIE 6.0;Windows NT 5.0)");

if ($proxy ne '')
{
$proxy = "http://".$proxy if ($proxy !~ /^.[^:\/]+:\/\//);
$user_agent->proxy('http', $proxy) if($url =~ /^http:\/\/.*$/);
$user_agent->proxy('https', $proxy) if($url =~ /^https:\/\/.*$/);
}

my (@alle, @alldirs);
my ($single, $sidir);

## cool recursive function
## checks all links of
## given url
##
&getall($user_agent, $origin, $url, \@alle, $query);

## remove doubles
## from all entries
##
@alle = sort keys %{{ map {$_,1} @alle }};

if ($logfile ne '')
{
open (RF, "> $logfile") || die "\ncant open $logfile !?!\n";
open (RFD, "> $logfile.dirs") || die "\ncant open $logfile.dirs !?!\n";
}

## all the entries
##
foreach $single (@alle)
{
$single =~ /(^.+\/)(.[^\/]*$)/;
push (@alldirs, $1);
print RF "$single\n" if ($logfile ne '');
}

## all the dirs
## remove doubles first
##
@alldirs = sort keys %{{ map {$_,1} @alldirs }};

foreach $sidir (@alldirs)
{
print STDERR "$sidir\n";
print RFD "$sidir\n" if ($logfile ne '');
}

if ($logfile ne '')
{
close (RF);
close (RFD);
}

print "\ndone\n";
exit 1;


####################
## sub extract_links
##
sub extract_links
{
my $ua = shift;
my $inc = shift;
my $url = shift;
my @links;
my $sili = "";

my $response = $ua->request(GET "$url");

## check for:
## frame-src, a-href, form-action, img-src
## todo: java-script adresses ?!?!
if ($response->is_success)
{
my $c = $response->content();
push (@links, &parseit($c, $inc, "frame", "src"));
push (@links, &parseit($c, $inc, "iframe", "src"));
push (@links, &parseit($c, $inc, "a", "href"));
push (@links, &parseit($c, $inc, "a", "name"));
push (@links, &parseit($c, $inc, "form", "action"));
push (@links, &parseit($c, $inc, "img", "src"));
push (@links, &parseit($c, $inc, "object", "data"));
push (@links, &parseit($c, $inc, "applet", "code"));
push (@links, &parseit($c, $inc, "embed", "src"));
}

## done
##
return @links;
}


## parseit
## parses html-content
## and returns array containing links
##

sub parseit
{
my $c = shift;
my $inc = shift;
my $first = shift;
my $second = shift;
my $fcont = new IO::Scalar \$c;
my @links;

## in case of frames
##
$f_page = HTML::TokeParser->new($fcont);
while (my $token = $f_page->get_tag("$first"))
{
my $link = $token->[1]{"$second"} || "";
push(@links,$link);
}
$fcont->close;

## the supertrick parse all content for URI like strings
## applets flash pdfs etc
##
my $sl1 = $sl2 = "";
my $finder = URI::Find->new( sub{ $sl1 = @_[0]; $sl2 = @_[1]; } );
$finder->find(\$c);
push(@links,$sl1) if ($sl1 ne "");

return @links;
}


## get_all
## nice and recursive
## todo: create proper urls :-)
##
sub getall
{
my $ua = shift;
my $inc = shift;
my $tmp_uri = shift;
my $all_links = shift;
my $q = shift;

if ( ($tmp_uri =~ /^$inc/) || ($tmp_uri !~ /^.[^:\/]+:\/\// && $tmp_uri !~ /mailto:/) )
{
## try to create a usable url
##
my $uri = &clean_uri($tmp_uri, $inc, $q);

## only if not existing
##
if ( !search_array($all_links, $uri) )
{
print STDERR "$uri\n";
## extract links into temp array
##
my @temp = &extract_links($ua, $inc, $uri);
push(@{$all_links}, $uri);

foreach my $single (@temp)
{
&getall($ua, $inc, $single, $all_links, $q)
}
}
}
}


## search_array
## check if given entry(needle) is
## in array
## todo: find a faster algorythm :-)
##
sub search_array
{
$array = shift;
$needle = shift;

for (my $c=0; $c<=$#{$array}; $c++)
{
return 1 if ($needle eq @{$array}[$c]);
}

return 0;
}

## create proper uri second try
## todo: remove ".." relative signatures ?!
##
sub clean_uri
{
my $tmp_uri = shift;
my $inc = shift;
my $q = shift;

my($iprot, $idom, $ipath, $iquery, $ifrags) =
$inc =~ m|^(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;
my($prot, $dom, $path, $query, $frags) =
$tmp_uri =~ m|^(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;

## clean a bit
##
($path = "/".$path) if($path !~ /^\//);

## the ugly dot dot problem
## is not solved by the URI-module
## therefore ...
##
#my $ret = $iprot."://".$idom.$path;
$query = "?".$query if ($query ne '');
my $ret = $iprot."://".$idom.$path.($q > 0 ? $query.$frags : '');

$ret =~ s/\/\.\//\//g;

while ($ret =~ /$iprot:\/\/$idom\/\.\.\//)
{
$ret =~ s/($iprot:\/\/$idom\/)\.\.\//$1/;
}

if ($ret =~ /\.\./ && $ret !~ /$iprot:\/\/$idom\/\.\.\//)
{
$ret =~ s/\/.[^\/]+\/\.\.\///;
}

return $ret;
}

Home  Services  Contact  Projects  Whitepapers  Tools  Partners