lnkcheck.pl

Lines 12 and 13 show an example. $files should contain in each line either a local file name or a URL.

    1 | #!/usr/bin/perl
    2 | # lnkcheck.pl -- This scripts tests pages locally or given by URL
    3 | # for correctness of absoulte links.
    4 | # by Boris 'pi' Piwinger <3.14@piology.org>
    5 | use LWP::UserAgent;
    6 | $| = 1;
    7 | undef($/); # Read files in one step
    8 | my$ua = new LWP::UserAgent;
    9 | $ua->agent("lnkcheck/3.14\@piology.org");
   10 | $ua->timeout(300);
   11 | 
   12 | my$files= `find /home/httpd/html \\( -path '/home/httpd/html/manual' -prune \\) -or -type f -iname \"*html\" -print \\) | sort`;
   13 | $files .= "http://piology.org/perl/\n";
   14 | 
   15 | my($fail,%hostname,%urls);
   16 | my$temperrors="time(?:d )?out|Connection reset by peer|No route to host|unreachable";
   17 | 
   18 | # Run thru the entries in $files
   19 | while ($files =~ /([^\n]+)/g) {
   20 | 
   21 | # Fetch page from web or locally
   22 | my$file = $1;
   23 | my$page;
   24 | if ($file =~ m!^http://!) {
   25 |   undef($ua->max_size(0)); # We have to get the full page here
   26 |   my$response = $ua->request(HTTP::Request->new("GET",$file));
   27 |   next unless ($response->is_success);
   28 |   $page = $response->content;
   29 | } else {
   30 |   if (open PAGE, "<$file") {
   31 |     $page = <PAGE>;
   32 |     close PAGE;
   33 |   } else {
   34 |     print "Cannot read $file.\n";
   35 |     next;
   36 |   }
   37 | }
   38 | 
   39 | # Reduce server load
   40 | $ua->max_size(0);
   41 | 
   42 | # Scan page for links and verify 'em
   43 | $fail = "$file\n";
   44 | while ($page =~ /href[\s="]+(https?:[^">\s]+)/gi) {
   45 |   my$url = $1;
   46 |   $url =~ s/&#38;/&/g;
   47 |   $url =~ s/&amp;/&/g;
   48 |   $fail .= "  $url\n    see above\n" if ($urls{$url} eq "a-oh");
   49 |   next if (defined($urls{$url}));
   50 |   $urls{$url}="OK";
   51 |   my$error = "  $url\n";
   52 |   my$response = $ua->simple_request(HTTP::Request->new("HEAD",$url));
   53 |   unless ($response->is_success) {
   54 |     unless ($response->message =~ /$temperrors/i || $response->code == 401 || $response->code == 405 || $response->code == 302) {
   55 |       $error .= "    HEAD: " . $response->code . " " . $response->message;
   56 |       $error =~ s/\n*$/\n/s;
   57 |     }
   58 |     if ($error =~ /\(Bad hostname '([^']+)/) {
   59 |       $hostname{$1}="a-oh";
   60 |       $fail.=$error;
   61 |       $urls{$url}="a-oh";
   62 |     } else {
   63 |       $response = $ua->simple_request(HTTP::Request->new("GET",$url));
   64 |       unless ($response->is_success) {
   65 |         unless ($response->message =~ /$temperrors/i || $response->code == 401 || $response->code == 302) {
   66 |           $error .= "    GET: " . $response->code . " " . $response->message;
   67 |           $error =~ s/\n*$/\n/s;
   68 |         }
   69 |         if ($error =~ /GET:/) {
   70 |           $fail .= $error;
   71 |           $urls{$url}="a-oh";
   72 |         }
   73 |       }
   74 |     }
   75 |   }
   76 | }
   77 | print $fail if ($fail =~ /GET|see above|Bad hostname/);
   78 | 
   79 | }
   80 | 
   81 | # Now lookup all "bad hostnames" from authoratative name servers
   82 | @_=(sort keys %hostname);
   83 | print "\nanslookup.pl @_\n";
   84 | print `anslookup.pl @_`;

Valid CSS!Valid HTML 4.01!
© Boris 'pi' Piwinger, August 11, 2002