Annotation of /mitgcm.org/devel/buildweb/pkg/swish-e/prog-bin/spider.pl


Revision 1.1 - Fri Sep 20 19:47:30 2002 UTC by adcroft
Branch point for: Import, MAIN
File MIME type: text/plain
Initial revision

1 adcroft 1.1 #!/usr/local/bin/perl -w
2     use strict;
3    
4    
5     # $Id: spider.pl,v 1.42 2002/08/13 23:54:49 whmoseley Exp $
6     #
7     # "prog" document source for spidering web servers
8     #
9     # For documentation, type:
10     #
11     # perldoc spider.pl
12     #
13     # Apr 7, 2001 -- added quite a bit of bulk for easier debugging
14     #
15     # Nov 19, 2001 -- to do, build a server object so we are not using the passed in hash,
16     # and so we can warn on invalid config settings.
17    
18     $HTTP::URI_CLASS = "URI"; # prevent loading default URI::URL
19     # so we don't store long list of base items
20     # and eat up memory with >= URI 1.13
21     use LWP::RobotUA;
22     use HTML::LinkExtor;
23     use HTML::Tagset;
24    
25     use vars '$VERSION';
26     $VERSION = sprintf '%d.%02d', q$Revision: 1.42 $ =~ /: (\d+)\.(\d+)/;
27    
28     use vars '$bit';
29     use constant DEBUG_ERRORS => $bit = 1; # program errors
30     use constant DEBUG_URL => $bit <<= 1; # print out every URL processed
31     use constant DEBUG_HEADERS => $bit <<= 1; # prints the response headers
32     use constant DEBUG_FAILED => $bit <<= 1; # failed to return a 200
33     use constant DEBUG_SKIPPED => $bit <<= 1; # didn't index for some reason
34     use constant DEBUG_INFO => $bit <<= 1; # more verbose
35     use constant DEBUG_LINKS => $bit <<= 1; # prints links as they are extracted
36    
37     my %DEBUG_MAP = (
38     errors => DEBUG_ERRORS,
39     url => DEBUG_URL,
40     headers => DEBUG_HEADERS,
41     failed => DEBUG_FAILED,
42     skipped => DEBUG_SKIPPED,
43     info => DEBUG_INFO,
44     links => DEBUG_LINKS,
45     );
46    
47    
48    
49     use constant MAX_SIZE => 5_000_000; # Max size of document to fetch
50     use constant MAX_WAIT_TIME => 30; # max seconds to wait for request data
51    
52     #Can't locate object method "host" via package "URI::mailto" at ../prog-bin/spider.pl line 473.
53     #sub URI::mailto::host { return '' };
54    
55    
56     # This is not the right way to do this.
57     sub UNIVERSAL::host { '' };
58     sub UNIVERSAL::port { '' };
59     sub UNIVERSAL::host_port { '' };
60    
61    
62    
63     #-----------------------------------------------------------------------
64    
65     use vars '@servers';
66    
67     my $config = shift || 'SwishSpiderConfig.pl';
68    
69     if ( lc( $config ) eq 'default' ) {
70     @servers = default_urls();
71     } else {
72     do $config or die "Failed to read $0 configuration parameters '$config' $! $@";
73     }
74    
75    
76     print STDERR "$0: Reading parameters from '$config'\n" unless $ENV{SPIDER_QUIET};
77    
78     my $abort;
79     local $SIG{HUP} = sub { warn "Caught SIGHUP\n"; $abort++ } unless $^O =~ /Win32/i;
80    
81     my %visited; # global -- I suppose would be smarter to localize it per server.
82    
83     my %validated;
84     my %bad_links;
85    
86     for my $s ( @servers ) {
87     if ( !$s->{base_url} ) {
88     die "You must specify 'base_url' in your spider config settings\n";
89     }
90    
91     my @urls = ref $s->{base_url} eq 'ARRAY' ? @{$s->{base_url}} :( $s->{base_url});
92     for my $url ( @urls ) {
93     $s->{base_url} = $url;
94     process_server( $s );
95     }
96     }
97    
98    
99     if ( %bad_links ) {
100     print STDERR "\nBad Links:\n\n";
101     foreach my $page ( sort keys %bad_links ) {
102     print STDERR "On page: $page\n";
103     printf(STDERR " %-40s %s\n", $_, $validated{$_} ) for @{$bad_links{$page}};
104     print STDERR "\n";
105     }
106     }
107    
108    
109     #-----------------------------------------------------------------------
110    
111    
112     sub process_server {
113     my $server = shift;
114    
115     # set defaults
116    
117     if ( $ENV{SPIDER_DEBUG} ) {
118     $server->{debug} = 0;
119    
120     $server->{debug} |= (exists $DEBUG_MAP{lc $_} ? $DEBUG_MAP{lc $_} : die "Bad debug setting passed in environment '$_'\nOptions are: " . join( ', ', keys %DEBUG_MAP) ."\n")
121     for split /\s*,\s*/, $ENV{SPIDER_DEBUG};
122    
123     } else {
124     $server->{debug} ||= 0;
125     die "debug parameter '$server->{debug}' must be a number\n" unless $server->{debug} =~ /^\d+$/;
126     }
127    
128     $server->{quiet} ||= $ENV{SPIDER_QUIET} || 0;
129    
130    
131     $server->{max_size} ||= MAX_SIZE;
132     die "max_size parameter '$server->{max_size}' must be a number\n" unless $server->{max_size} =~ /^\d+$/;
133    
134    
135     $server->{max_wait_time} ||= MAX_WAIT_TIME;
136     die "max_wait_time parameter '$server->{max_wait_time}' must be a number\n" if $server->{max_wait_time} !~ /^\d+/;
137    
138    
139    
140     $server->{link_tags} = ['a'] unless ref $server->{link_tags} eq 'ARRAY';
141     $server->{link_tags_lookup} = { map { lc, 1 } @{$server->{link_tags}} };
142    
143     die "max_depth parameter '$server->{max_depth}' must be a number\n" if defined $server->{max_depth} && $server->{max_depth} !~ /^\d+/;
144    
145    
146     for ( qw/ test_url test_response filter_content/ ) {
147     next unless $server->{$_};
148     $server->{$_} = [ $server->{$_} ] unless ref $server->{$_} eq 'ARRAY';
149     my $n;
150     for my $sub ( @{$server->{$_}} ) {
151     $n++;
152     die "Entry number $n in $_ is not a code reference\n" unless ref $sub eq 'CODE';
153     }
154     }
155    
156    
157    
158     my $start = time;
159    
160     if ( $server->{skip} ) {
161     print STDERR "Skipping: $server->{base_url}\n" unless $server->{quiet};
162     return;
163     }
164    
165     require "HTTP/Cookies.pm" if $server->{use_cookies};
166     require "Digest/MD5.pm" if $server->{use_md5};
167    
168    
169     # set starting URL, and remove any specified fragment
170     my $uri = URI->new( $server->{base_url} );
171     $uri->fragment(undef);
172    
173     if ( $uri->userinfo ) {
174     die "Can't specify parameter 'credentials' because base_url defines them\n"
175     if $server->{credentials};
176     $server->{credentials} = $uri->userinfo;
177     $uri->userinfo( undef );
178     }
179    
180    
181     print STDERR "\n -- Starting to spider: $uri --\n" if $server->{debug};
182    
183    
184    
185     # set the starting server name (including port) -- will only spider on server:port
186    
187    
188     # All URLs will end up with this host:port
189     $server->{authority} = $uri->canonical->authority;
190    
191     # All URLs must match this scheme ( Jan 22, 2002 - spot by Darryl Friesen )
192     $server->{scheme} = $uri->scheme;
193    
194    
195    
196     # Now, set the OK host:port names
197     $server->{same} = [ $uri->canonical->authority ];
198    
199     push @{$server->{same}}, @{$server->{same_hosts}} if ref $server->{same_hosts};
200    
201     $server->{same_host_lookup} = { map { $_, 1 } @{$server->{same}} };
202    
203    
204    
205    
206     # set time to end
207    
208     $server->{max_time} = $server->{max_time} * 60 + time
209     if $server->{max_time};
210    
211    
212     # set default agent for log files
213    
214     $server->{agent} ||= 'swish-e spider 2.2 http://swish-e.org/';
215    
216    
217     # get a user agent object
218    
219    
220     my $ua;
221    
222     if ( $server->{ignore_robots_file} ) {
223     $ua = LWP::UserAgent->new;
224     return unless $ua;
225     $ua->agent( $server->{agent} );
226     $ua->from( $server->{email} );
227    
228     } else {
229     $ua = LWP::RobotUA->new( $server->{agent}, $server->{email} );
230     return unless $ua;
231     $ua->delay( $server->{delay_min} || 0.1 );
232     }
233    
234     # Set the LWP timeout only on Windows (elsewhere alarm() handles timeouts).
235     $ua->timeout( $server->{max_wait_time} ) if $^O =~ /Win32/i;
236    
237    
238     $server->{ua} = $ua; # save it for fun.
239     # $ua->parse_head(0); # Don't parse the content
240    
241     $ua->cookie_jar( HTTP::Cookies->new ) if $server->{use_cookies};
242    
243     if ( $server->{keep_alive} ) {
244    
245     if ( $ua->can( 'conn_cache' ) ) {
246     my $keep_alive = $server->{keep_alive} =~ /^\d+$/ ? $server->{keep_alive} : 1;
247     $ua->conn_cache( { total_capacity => $keep_alive } );
248    
249     } else {
250     warn "Can't use keep-alive: conn_cache method not available\n";
251     }
252     }
253    
254    
255     # uri, parent, depth
256     eval { spider( $server, $uri ) };
257     print STDERR $@ if $@;
258    
259    
260     # provide a way to call a function in the config file when all done
261     check_user_function( 'spider_done', undef, $server );
262    
263    
264     return if $server->{quiet};
265    
266    
267     $start = time - $start;
268     $start++ unless $start;
269    
270     my $max_width = 0;
271     my $max_num = 0;
272     for ( keys %{$server->{counts}} ) {
273     $max_width = length if length > $max_width;
274     my $val = commify( $server->{counts}{$_} );
275     $max_num = length $val if length $val > $max_num;
276     }
277    
278    
279     printf STDERR "\nSummary for: $server->{base_url}\n";
280    
281     for ( sort keys %{$server->{counts}} ) {
282     printf STDERR "%${max_width}s: %${max_num}s (%0.1f/sec)\n",
283     $_,
284     commify( $server->{counts}{$_} ),
285     $server->{counts}{$_}/$start;
286     }
287     }
288    
289    
290     #-----------------------------------------------------------------------
291     # Deal with Basic Authentication
292    
293    
294    
295     # Thanks Gisle!
296     sub get_basic_credentials {
297     my($uri, $server, $realm ) = @_;
298     my $netloc = $uri->canonical->host_port;
299    
300     my ($user, $password);
301    
302     eval {
303     local $SIG{ALRM} = sub { die "timed out\n" };
304     alarm( $server->{credential_timeout} || 30 ) unless $^O =~ /Win32/i;
305    
306     if ( $uri->userinfo ) {
307     print STDERR "\nSorry: invalid username/password\n";
308     $uri->userinfo( undef );
309     }
310    
311    
312     print STDERR "Need Authentication for $uri at realm '$realm'\n(<Enter> skips)\nUsername: ";
313     $user = <STDIN>;
314     chomp($user);
315     die "No Username specified\n" unless length $user;
316    
317     alarm( $server->{credential_timeout} || 30 ) unless $^O =~ /Win32/i;
318    
319     print STDERR "Password: ";
320     system("stty -echo");
321     $password = <STDIN>;
322     system("stty echo");
323     print STDERR "\n"; # because we disabled echo
324     chomp($password);
325    
326     alarm( 0 ) unless $^O =~ /Win32/i;
327     };
328    
329     return if $@;
330    
331     return join ':', $user, $password;
332    
333    
334     }
335    
336    
337    
338    
339     #----------- Non recursive spidering ---------------------------
340    
341     sub spider {
342     my ( $server, $uri ) = @_;
343    
344     # Validate the first link, just in case
345     return unless check_link( $uri, $server, '', '(Base URL)' );
346    
347     my @link_array = [ $uri, '', 0 ];
348    
349     while ( @link_array ) {
350    
351     die if $abort || $server->{abort};
352    
353     my ( $uri, $parent, $depth ) = @{shift @link_array};
354    
355     my $new_links = process_link( $server, $uri, $parent, $depth );
356    
357     push @link_array, map { [ $_, $uri, $depth+1 ] } @$new_links if $new_links;
358    
359     }
360     }
361    
362    
363     #----------- Process a url and return links -----------------------
364     sub process_link {
365     my ( $server, $uri, $parent, $depth ) = @_;
366    
367    
368     $server->{counts}{'Unique URLs'}++;
369    
370     die "$0: Max files Reached\n"
371     if $server->{max_files} && $server->{counts}{'Unique URLs'} > $server->{max_files};
372    
373     die "$0: Time Limit Exceeded\n"
374     if $server->{max_time} && $server->{max_time} < time;
375    
376    
377    
378     # make request
379     my $ua = $server->{ua};
380     my $request = HTTP::Request->new('GET', $uri );
381    
382    
383     my $content = '';
384    
385     # Really should just subclass the response object!
386     $server->{no_contents} = 0;
387     $server->{no_index} = 0;
388     $server->{no_spider} = 0;
389    
390    
391     # Set basic auth if defined - use URI specific first, then credentials
392     if ( my ( $user, $pass ) = split /:/, ( $uri->userinfo || $server->{credentials} || '' ) ) {
393     $request->authorization_basic( $user, $pass );
394     }
395    
396    
397    
398    
399     my $been_here;
400     my $callback = sub {
401    
402     # Reset alarm;
403     alarm( $server->{max_wait_time} ) unless $^O =~ /Win32/i;
404    
405    
406     # Cache user/pass
407     if ( $server->{cur_realm} && $uri->userinfo ) {
408     my $key = $uri->canonical->host_port . ':' . $server->{cur_realm};
409     $server->{auth_cache}{$key} = $uri->userinfo;
410     }
411    
412     $uri->userinfo( undef ) unless $been_here;
413    
414     die "test_response" if !$been_here++ && !check_user_function( 'test_response', $uri, $server, $_[1], \$_[0] );
415    
416    
417     if ( length( $content ) + length( $_[0] ) > $server->{max_size} ) {
418     print STDERR "-Skipped $uri: Document exceeded $server->{max_size} bytes\n" if $server->{debug}&DEBUG_SKIPPED;
419     die "too big!\n";
420     }
421    
422     $content .= $_[0];
423    
424     };
425    
426     my $response;
427    
428     eval {
429     local $SIG{ALRM} = sub { die "timed out\n" };
430     alarm( $server->{max_wait_time} ) unless $^O =~ /Win32/i;
431     $response = $ua->simple_request( $request, $callback, 4096 );
432     alarm( 0 ) unless $^O =~ /Win32/i;
433     };
434    
435    
436     return if $server->{abort};
437    
438    
439     if ( $response && $response->code == 401 && $response->header('WWW-Authenticate') && $response->header('WWW-Authenticate') =~ /realm="([^"]+)"/i ) {
440     my $realm = $1;
441    
442     my $user_pass;
443    
444     # Do we have a cached user/pass for this realm?
445     my $key = $uri->canonical->host_port . ':' . $realm;
446    
447     if ( $user_pass = $server->{auth_cache}{$key} ) {
448    
449     # If we didn't just try it, try again
450     unless( $uri->userinfo && $user_pass eq $uri->userinfo ) {
451     $uri->userinfo( $user_pass );
452     return process_link( $server, $uri, $parent, $depth );
453     }
454     }
455    
456     # otherwise, prompt:
457    
458    
459     if ( $user_pass = get_basic_credentials( $uri, $server, $realm ) ) {
460     $uri->userinfo( $user_pass );
461    
462     $server->{cur_realm} = $realm; # save so we can cache
463     my $links = process_link( $server, $uri, $parent, $depth );
464     delete $server->{cur_realm};
465    
466     return $links;
467     }
468     print STDERR "Skipping $uri\n";
469     }
470    
471     $uri->userinfo( undef );
472    
473    
474    
475     # Log the response
476    
477     if ( ( $server->{debug} & DEBUG_URL ) || ( $server->{debug} & DEBUG_FAILED && !$response->is_success) ) {
478     print STDERR '>> ',
479     join( ' ',
480     ( $response->is_success ? '+Fetched' : '-Failed' ),
481     $depth,
482     "Cnt: $server->{counts}{'Unique URLs'}",
483     $uri,
484     ( $response->status_line || $response->status || 'unknown status' ),
485     ( $response->content_type || 'Unknown content type'),
486     ( $response->content_length || '???' ),
487     "parent:$parent",
488     ),"\n";
489     }
490    
491    
492    
493    
494     # If the LWP callback aborts
495    
496     if ( $response->header('client-aborted') ) {
497     $server->{counts}{Skipped}++;
498     return;
499     }
500    
501     $response->request->uri->userinfo( undef ) if $response->request;
502    
503    
504     # skip excluded by robots.txt
505    
506     if ( !$response->is_success && $response->status_line =~ 'robots.txt' ) {
507     print STDERR "-Skipped $depth $uri: ", $response->status_line,"\n" if $server->{debug}&DEBUG_SKIPPED;
508     $server->{counts}{'robots.txt'}++;
509     return;
510     }
511    
512    
513     # Report bad links (excluding those skipped by robots.txt)
514    
515     if ( $server->{validate_links} && !$response->is_success ) {
516     validate_link( $server, $uri, $parent, $response );
517     }
518    
519    
520    
521     # And check for meta robots tag
522     # -- should probably be done in request sub to avoid fetching docs that are not needed
523    
524     unless ( $server->{ignore_robots_file} ) {
525     if ( my $directives = $response->header('X-Meta-ROBOTS') ) {
526     my %settings = map { lc $_, 1 } split /\s*,\s*/, $directives;
527     $server->{no_contents}++ if exists $settings{nocontents}; # an extension for swish
528     $server->{no_index}++ if exists $settings{noindex};
529     $server->{no_spider}++ if exists $settings{nofollow};
530     }
531     }
532    
533    
534    
535    
536     print STDERR "\n----HEADERS for $uri ---\n", $response->headers_as_string,"-----END HEADERS----\n\n"
537     if $server->{debug} & DEBUG_HEADERS;
538    
539    
540     unless ( $response->is_success ) {
541    
542     # look for redirect
543     if ( $response->is_redirect && $response->header('location') ) {
544     my $u = URI->new_abs( $response->header('location'), $response->base );
545    
546     if ( $u->canonical eq $uri->canonical ) {
547     print STDERR "Warning: $uri redirects to itself!.\n";
548     return;
549     }
550    
551     return [$u] if check_link( $u, $server, $response->base, '(redirect)','Location' );
552     }
553     return;
554     }
555    
556     return unless $content; # $$$ any reason to index empty files?
557    
558    
559     # make sure content is unique - probably better to chunk into an MD5 object above
560    
561     if ( $server->{use_md5} ) {
562     my $digest = Digest::MD5::md5($content);
563    
564     if ( $visited{ $digest } ) {
565    
566     print STDERR "-Skipped $uri has same digest as $visited{ $digest }\n"
567     if $server->{debug} & DEBUG_SKIPPED;
568    
569     $server->{counts}{Skipped}++;
570     $server->{counts}{'MD5 Duplicates'}++;
571     return;
572     }
573     $visited{ $digest } = $uri;
574     }
575    
576    
577     # Extract out links (if not too deep)
578    
579     my $links_extracted = extract_links( $server, \$content, $response )
580     unless defined $server->{max_depth} && $depth >= $server->{max_depth};
581    
582    
583     # Index the file
584    
585     if ( $server->{no_index} ) {
586     $server->{counts}{Skipped}++;
587     print STDERR "-Skipped indexing $uri some callback set 'no_index' flag\n" if $server->{debug}&DEBUG_SKIPPED;
588    
589     } else {
590     return $links_extracted unless check_user_function( 'filter_content', $uri, $server, $response, \$content );
591    
592     output_content( $server, \$content, $uri, $response )
593     unless $server->{no_index};
594     }
595    
596    
597    
598     return $links_extracted;
599     }
600    
601     #===================================================================================================
602     # Calls a user-defined function
603     #
604     #---------------------------------------------------------------------------------------------------
605    
606     sub check_user_function {
607     my ( $fn, $uri, $server ) = ( shift, shift, shift );
608    
609     return 1 unless $server->{$fn};
610    
611     my $tests = ref $server->{$fn} eq 'ARRAY' ? $server->{$fn} : [ $server->{$fn} ];
612    
613     my $cnt;
614    
615     for my $sub ( @$tests ) {
616     $cnt++;
617     print STDERR "?Testing '$fn' user supplied function #$cnt '$uri'\n" if $server->{debug} & DEBUG_INFO;
618    
619     my $ret;
620    
621     eval { $ret = $sub->( $uri, $server, @_ ) };
622    
623     if ( $@ ) {
624     print STDERR "-Skipped $uri due to '$fn' user supplied function #$cnt death '$@'\n" if $server->{debug} & DEBUG_SKIPPED;
625     $server->{counts}{Skipped}++;
626     return;
627     }
628    
629     next if $ret;
630    
631     print STDERR "-Skipped $uri due to '$fn' user supplied function #$cnt\n" if $server->{debug} & DEBUG_SKIPPED;
632     $server->{counts}{Skipped}++;
633     return;
634     }
635     print STDERR "+Passed all $cnt tests for '$fn' user supplied function\n" if $server->{debug} & DEBUG_INFO;
636     return 1;
637     }
638    
639    
640     #==============================================================================================
641     # Extract links from a text/html page
642     #
643     # Call with:
644     # $server - server object
645     # $content - ref to content
646     # $response - response object
647     #
648     #----------------------------------------------------------------------------------------------
649    
650     sub extract_links {
651     my ( $server, $content, $response ) = @_;
652    
653     return unless $response->header('content-type') &&
654     $response->header('content-type') =~ m[^text/html];
655    
656     # allow skipping.
657     if ( $server->{no_spider} ) {
658     print STDERR '-Links not extracted: ', $response->request->uri->canonical, " some callback set 'no_spider' flag\n" if $server->{debug}&DEBUG_SKIPPED;
659     return;
660     }
661    
662     $server->{Spidered}++;
663    
664     my @links;
665    
666    
667     my $base = $response->base;
668    
669     print STDERR "\nExtracting links from ", $response->request->uri, ":\n" if $server->{debug} & DEBUG_LINKS;
670    
671     my $p = HTML::LinkExtor->new;
672     $p->parse( $$content );
673    
674     my %skipped_tags;
675    
676     for ( $p->links ) {
677     my ( $tag, %attr ) = @$_;
678    
679     # which tags to use ( not reported in debug )
680    
681     my $attr = join ' ', map { qq[$_="$attr{$_}"] } keys %attr;
682    
683     print STDERR "\nLooking at extracted tag '<$tag $attr>'\n" if $server->{debug} & DEBUG_LINKS;
684    
685     unless ( $server->{link_tags_lookup}{$tag} ) {
686    
687     # each tag is reported only once per page
688     print STDERR
689     " <$tag> skipped because not one of (",
690     join( ',', @{$server->{link_tags}} ),
691     ")\n" if $server->{debug} & DEBUG_LINKS && !$skipped_tags{$tag}++;
692    
693     if ( $server->{validate_links} && $tag eq 'img' && $attr{src} ) {
694     my $img = URI->new_abs( $attr{src}, $base );
695     validate_link( $server, $img, $base );
696     }
697    
698     next;
699     }
700    
701     # Grab which attribute(s) might contain links for this tag
702     my $links = $HTML::Tagset::linkElements{$tag};
703     $links = [$links] unless ref $links;
704    
705    
706     my $found;
707    
708    
709     # Now, check each attribute to see if a link exists
710    
711     for my $attribute ( @$links ) {
712     if ( $attr{ $attribute } ) { # ok tag
713    
714     # Create a URI object
715    
716     my $u = URI->new_abs( $attr{$attribute},$base );
717    
718     next unless check_link( $u, $server, $base, $tag, $attribute );
719    
720     push @links, $u;
721     print STDERR qq[ $attribute="$u" Added to list of links to follow\n] if $server->{debug} & DEBUG_LINKS;
722     $found++;
723     }
724     }
725    
726    
727     if ( !$found && $server->{debug} & DEBUG_LINKS ) {
728     print STDERR " tag did not include any links to follow or is a duplicate\n";
729     }
730    
731     }
732    
733     print STDERR "! Found ", scalar @links, " links in ", $response->base, "\n\n" if $server->{debug} & DEBUG_INFO;
734    
735    
736     return \@links;
737     }
738    
739    
740    
741    
742     #=============================================================================
743     # This function checks if a link should be added to the list to spider
744     #
745     # Pass:
746     # $u - URI object
747     # $server - the server hash
748     # $base - the base or parent of the link
749     #
750     # Returns true if a valid link
751     #
752     # Calls the user function "test_url". Link rewriting before spider
753     # can be done here.
754     #
755     #------------------------------------------------------------------------------
756     sub check_link {
757     my ( $u, $server, $base, $tag, $attribute ) = @_;
758    
759     $tag ||= '';
760     $attribute ||= '';
761    
762    
763     # Kill the fragment
764     $u->fragment( undef );
765    
766    
767     # This should not happen, but make sure we have a host to check against
768    
769     unless ( $u->host ) {
770     print STDERR qq[ ?? <$tag $attribute="$u"> skipped because missing host name\n] if $server->{debug} & DEBUG_LINKS;
771     return;
772     }
773    
774    
775     # Here we make sure we are looking at a link pointing to the correct (or equivalent) host
776    
777     unless ( $server->{scheme} eq $u->scheme && $server->{same_host_lookup}{$u->canonical->authority} ) {
778    
779     print STDERR qq[ ?? <$tag $attribute="$u"> skipped because different host\n] if $server->{debug} & DEBUG_LINKS;
780     $server->{counts}{'Off-site links'}++;
781     validate_link( $server, $u, $base ) if $server->{validate_links};
782     return;
783     }
784    
785     $u->host_port( $server->{authority} ); # Force all the same host name
786    
787     # Allow rejection of this URL by user function
788    
789     return unless check_user_function( 'test_url', $u, $server );
790    
791    
792     # Don't add the link if already seen - these are so common that we don't report
793    
794     if ( $visited{ $u->canonical }++ ) {
795     #$server->{counts}{Skipped}++;
796     $server->{counts}{Duplicates}++;
797    
798    
799     # Just so it's reported for all pages
800     if ( $server->{validate_links} && $validated{$u->canonical} ) {
801     push @{$bad_links{ $base->canonical }}, $u->canonical;
802     }
803    
804     return;
805     }
806    
807     return 1;
808     }
809    
810    
811     #=============================================================================
812     # This function is used to validate links that are off-site.
813     #
814     # It's just a very basic link check routine that lets you validate the
815     # off-site links at the same time as indexing. Just because we can.
816     #
817     #------------------------------------------------------------------------------
818     sub validate_link {
819     my ($server, $uri, $base, $response ) = @_;
820    
821     # Already checked?
822    
823     if ( exists $validated{ $uri->canonical } )
824     {
825     # Add it to the list of bad links on that page if it's a bad link.
826     push @{$bad_links{ $base->canonical }}, $uri->canonical
827     if $validated{ $uri->canonical };
828    
829     return;
830     }
831    
832     $validated{ $uri->canonical } = 0; # mark as checked and ok.
833    
834     unless ( $response ) {
835     my $ua = LWP::UserAgent->new;
836     my $request = HTTP::Request->new('HEAD', $uri->canonical );
837    
838     eval {
839     local $SIG{ALRM} = sub { die "timed out\n" };
840     alarm( $server->{max_wait_time} ) unless $^O =~ /Win32/i;
841     $response = $ua->simple_request( $request );
842     alarm( 0 ) unless $^O =~ /Win32/i;
843     };
844    
845     if ( $@ ) {
846     $server->{counts}{'Bad Links'}++;
847     my $msg = $@;
848     $msg =~ tr/\n//s;
849     $validated{ $uri->canonical } = $msg;
850     push @{$bad_links{ $base->canonical }}, $uri->canonical;
851     return;
852     }
853     }
854    
855     return if $response->is_success;
856    
857     my $error = $response->status_line || $response->status || 'unknown status';
858    
859     $error .= ' ' . URI->new_abs( $response->header('location'), $response->base )->canonical
860     if $response->is_redirect && $response->header('location');
861    
862     $validated{ $uri->canonical } = $error;
863     push @{$bad_links{ $base->canonical }}, $uri->canonical;
864     }
865    
866    
867     sub output_content {
868     my ( $server, $content, $uri, $response ) = @_;
869    
870     $server->{indexed}++;
871    
872     unless ( length $$content ) {
873     print STDERR "Warning: document '", $response->request->uri, "' has no content\n";
874     $$content = ' ';
875     }
876    
877    
878     $server->{counts}{'Total Bytes'} += length $$content;
879     $server->{counts}{'Total Docs'}++;
880    
881    
882     my $headers = join "\n",
883     'Path-Name: ' . $uri,
884     'Content-Length: ' . length $$content,
885     '';
886    
887     $headers .= 'Last-Mtime: ' . $response->last_modified . "\n"
888     if $response->last_modified;
889    
890    
891     $headers .= "No-Contents: 1\n" if $server->{no_contents};
892     print "$headers\n$$content";
893    
894     die "$0: Max indexed files Reached\n"
895     if $server->{max_indexed} && $server->{counts}{'Total Docs'} >= $server->{max_indexed};
896     }
897    
898    
899    
900     sub commify {
901     local $_ = shift;
902     1 while s/^([-+]?\d+)(\d{3})/$1,$2/;
903     return $_;
904     }
905    
906     sub default_urls {
907    
908     my $validate = 0;
909     if ( $ARGV[0] eq 'validate' ) {
910     shift @ARGV;
911     $validate = 1;
912     }
913    
914     die "$0: Must list URLs when using 'default'\n" unless @ARGV;
915    
916    
917     my @content_types = qw{ text/html text/plain };
918    
919     return map {
920     {
921     #debug => DEBUG_HEADERS,
922     #debug => DEBUG_URL|DEBUG_SKIPPED|DEBUG_INFO,
923     base_url => \@ARGV,
924     email => 'swish@domain.invalid',
925     delay_min => .0001,
926     link_tags => [qw/ a frame /],
927     test_url => sub { $_[0]->path !~ /\.(?:gif|jpeg|png)$/i },
928    
929     test_response => sub {
930     my $content_type = $_[2]->content_type;
931     my $ok = grep { $_ eq $content_type } @content_types;
932     return 1 if $ok;
933     print STDERR "$_[0] $content_type != (@content_types)\n";
934     return;
935     },
936     validate_links => $validate,
937    
938     }
939     } @ARGV;
940     }
941    
942    
943    
944     __END__
945    
946     =head1 NAME
947    
948     spider.pl - Example Perl program to spider web servers
949    
950     =head1 SYNOPSIS
951    
952     swish.config:
953     IndexDir ./spider.pl
954     SwishProgParameters spider.config
955     # other swish-e settings
956    
957     spider.config:
958     @servers = (
959     {
960     base_url => 'http://myserver.com/',
961     email => 'me@myself.com',
962     # other spider settings described below
963     },
964     );
965    
966     Begin indexing:
967    
968     swish-e -S prog -c swish.config
969    
970     Note: When running on some versions of Windows (e.g. Win ME and Win 98 SE)
971     you may need to index using the command:
972    
973     perl spider.pl | swish-e -S prog -c swish.conf -i stdin
974    
975     This pipes the output of the spider directly into swish.
976    
977    
978     =head1 DESCRIPTION
979    
980     This is a swish-e "prog" document source program for spidering
981     web servers. It can be used instead of the C<http> method for
982     spidering with swish.
983    
984     The spider typically uses a configuration
985     file that lists the URL(s) to spider, and configuration parameters that control
986     the behavior of the spider. In addition, you may define I<callback> perl functions
987     in the configuration file that can dynamically change the behavior of the spider
988     based on URL, HTTP response headers, or the content of the fetched document. These
989     callback functions can also be used to filter or convert documents (e.g. PDF, gzip, MS Word)
990     into a format that swish-e can parse. Some examples are provided.
991    
992     You define "servers" to spider, set a few parameters,
993     create callback routines, and start indexing as the synopsis above shows.
994     The spider requires its own configuration file (unless you want the default values). This
995     is NOT the same configuration file that swish-e uses.
996    
997     The example configuration file C<SwishSpiderConfig.pl> is
998     included in the C<prog-bin> directory along with this script. Please just use it as an
999     example, as it contains more settings than you probably want to use. Start with a tiny config file
1000     and add settings as required by your situation.
1001    
1002     The available configuration parameters are discussed below.
1003    
1004     If all that sounds confusing, then you can run the spider with default settings. In fact, you can
1005     run the spider without using swish just to make sure it works. Just run
1006    
1007     ./spider.pl default http://someserver.com/sometestdoc.html
1008    
1009     And you should see F<sometestdoc.html> dumped to your screen. Be ready to kill the script
1010     if the file you request contains links, as the output from every fetched page will be displayed.
1011    
1012     ./spider.pl default http://someserver.com/sometestdoc.html > output.file
1013    
1014     might be more friendly.
1015    
1016     If the first parameter passed to the spider is the word "default" (as in the preceding example)
1017     then the spider uses the default parameters,
1018     and the following parameter(s) are expected to be URL(s) to spider.
1019     Otherwise, the first parameter is considered to be the name of the configuration file (as described
1020     below). When using C<-S prog>, the swish-e configuration setting
1021     C<SwishProgParameters> is used to pass parameters to the program specified
1022     with C<IndexDir> or the C<-i> switch.
1023    
1024     If you do not specify any parameters the program will look for the file
1025    
1026     SwishSpiderConfig.pl
1027    
1028     in the current directory.
1029    
1030     The spider does require Perl's LWP library and a few other reasonably common modules.
1031     Most well maintained systems should have these modules installed. If not, start here:
1032    
1033     http://search.cpan.org/search?dist=libwww-perl
1034     http://search.cpan.org/search?dist=HTML-Parser
1035    
1036     See more below in C<REQUIREMENTS>. It's a good idea to check that you are running
1037     a current version of these modules.
1038    
1039     =head2 Robots Exclusion Rules and being nice
1040    
1041     This script will not spider files blocked by F<robots.txt>. In addition,
1042     the script will check for <meta name="robots"..> tags, which allows finer
1043     control over what files are indexed and/or spidered.
1044     See http://www.robotstxt.org/wc/exclusion.html for details.
1045    
1046     This spider provides an extension to the <meta> tag exclusion, by adding a
1047     C<NOCONTENTS> attribute. This attribute turns on the C<no_contents> setting, which
1048     asks swish-e to only index the document's title (or file name if no title is found).
1049    
1050     For example:
1051    
1052     <META NAME="ROBOTS" CONTENT="NOCONTENTS, NOFOLLOW">
1053    
1054     says to just index the document's title, but don't index its contents, and don't follow
1055     any links within the document. Granted, it's unlikely that this feature will ever be used...
1056    
1057     If you are indexing your own site, and know what you are doing, you can disable robot exclusion by setting
1058     the C<ignore_robots_file> configuration parameter, described below. This disables both F<robots.txt>
1059     and the meta tag parsing.
1060    
1061     This script only spiders one file at a time, so load on the web server is not that great.
1062     And with libwww-perl-5.53_91 HTTP/1.1 keep alive requests can reduce the load on
1063     the server even more (and potentially reduce spidering time considerably!)
1064    
1065     Still, discuss spidering with a site's administrator before beginning.
1066     Use the C<delay_min> to adjust how fast the spider fetches documents.
1067     Consider running a second web server with a limited number of children if you really
1068     want to fine tune the resources used by spidering.
1069    
1070     =head2 Duplicate Documents
1071    
1072     The spider program keeps track of URLs visited, so a document is only indexed
1073     one time.
1074    
1075     The Digest::MD5 module can be used to create a "fingerprint" of every page
1076     indexed and this fingerprint is used in a hash to find duplicate pages.
1077     For example, MD5 will prevent indexing these as two different documents:
1078    
1079     http://localhost/path/to/some/index.html
1080     http://localhost/path/to/some/
1081    
1082     But note that this may have side effects you don't want. If you want this
1083     file indexed under this URL:
1084    
1085     http://localhost/important.html
1086    
1087     But the spider happens to find the exact content in this file first:
1088    
1089     http://localhost/development/test/todo/maybeimportant.html
1090    
1091     Then only that URL will be indexed.
1092    
1093     MD5 may slow down indexing a tiny bit, so test with and without if speed is an
1094     issue (which it probably isn't since you are spidering in the first place).
1095     This feature will also use more memory.
1096    
1097     Note: the "prog" document source in swish bypasses many swish-e configuration settings.
1098     For example, you cannot use the C<IndexOnly> directive with the "prog" document
1099     source. This is by design to limit the overhead when using an external program
1100     for providing documents to swish; after all, with "prog", if you don't want to index a file, then
1101     don't give it to swish to index in the first place.
1102    
1103     So, for spidering, if you do not wish to index images, for example, you will
1104     need to either filter by the URL or by the content-type returned from the web
1105     server. See L<CALLBACK FUNCTIONS|CALLBACK FUNCTIONS> below for more information.
1106    
1107     =head1 REQUIREMENTS
1108    
1109     Perl 5 (hopefully at least 5.00503) or later.
1110    
1111     You must have the LWP Bundle on your computer. Load the LWP::Bundle via the CPAN.pm shell,
1112     or download libwww-perl-x.xx from CPAN (or via ActiveState's ppm utility).
1113     Also required is the HTML-Parser-x.xx bundle of modules, also from CPAN
1114     (and from ActiveState for Windows).
1115    
1116     http://search.cpan.org/search?dist=libwww-perl
1117     http://search.cpan.org/search?dist=HTML-Parser
1118    
1119     You will also need Digest::MD5 if you wish to use the MD5 feature.
1120     HTML::Tagset is also required.
1121     Other modules may be required (for example, the pod2xml.pm module
1122     has its own requirements -- see perldoc pod2xml for info).
1123    
1124     The spider.pl script, like everyone else, expects perl to live in /usr/local/bin.
1125     If this is not the case then either add a symlink at /usr/local/bin/perl
1126     to point to where perl is installed
1127     or modify the shebang (#!) line at the top of the spider.pl program.
1128    
1129     Note that the libwww-perl package does not support SSL (Secure Sockets Layer) (https)
1130     by default. See F<README.SSL> included in the libwww-perl package for information on
1131     installing SSL support.
1132    
1133     =head1 CONFIGURATION FILE
1134    
1135     Configuration is not very fancy. The spider.pl program simply does a
1136     C<do "path";> to read in the parameters and create the callback subroutines.
1137     The C<path> is the first parameter passed to the spider script, which is set
1138     by the Swish-e configuration setting C<SwishProgParameters>.
1139    
1140     For example, if in your swish-e configuration file you have
1141    
1142     SwishProgParameters /path/to/config.pl
1143     IndexDir /home/moseley/swish-e/prog-bin/spider.pl
1144    
1145     And then run swish as
1146    
1147     swish-e -c swish.config -S prog
1148    
1149     swish will run C</home/moseley/swish-e/prog-bin/spider.pl> and the spider.pl
1150     program will receive as its first parameter C</path/to/config.pl>, and
1151     spider.pl will read C</path/to/config.pl> to get the spider configuration
1152     settings. If C<SwishProgParameters> is not set, the program will try to
1153     use C<SwishSpiderConfig.pl> by default.
1154    
1155     There is a special case of:
1156    
1157     SwishProgParameters default http://www.mysite/index.html ...
1158    
1159     Where default parameters are used. This will only index documents of type
1160     C<text/html> or C<text/plain>, and will skip any file with an extension that matches
1161     the pattern:
1162    
1163     /\.(?:gif|jpeg|png)$/i
1164    
1165     This can be useful for indexing just your web documents, but you will probably want finer
1166     control over your spidering by using a configuration file.
1167    
1168     The configuration file must set a global variable C<@servers> (in package main).
1169     Each element in C<@servers> is a reference to a hash. The elements of the hash
1170     are described next. More than one server hash may be defined -- each server
1171     will be spidered in the order listed in C<@servers>, although currently a I<global> hash is
1172     used to prevent spidering the same URL twice.
1173    
1174     Examples:
1175    
1176     my %serverA = (
1177     base_url => 'http://swish-e.org/',
1178     same_hosts => [ qw/www.swish-e.org/ ],
1179     email => 'my@email.address',
1180     );
1181     my %serverB = (
1182     ...
1183     ...
1184     );
1185     @servers = ( \%serverA, \%serverB, );
1186    
1187     =head1 CONFIGURATION OPTIONS
1188    
1189     This describes the required and optional keys in the server configuration hash, in random order...
1190    
1191     =over 4
1192    
1193     =item base_url
1194    
1195     This required setting is the starting URL for spidering.
1196    
1197     Typically, you will just list one URL for the base_url. You may specify more than one
1198     URL as a reference to a list
1199    
1200     base_url => [qw! http://swish-e.org/ http://othersite.org/other/index.html !],
1201    
1202     You may specify a username and password:
1203    
1204     base_url => 'http://user:pass@swish-e.org/index.html',
1205    
1206     but you may find that to be a security issue. If a URL is protected by Basic Authentication
1207     you will be prompted for a username and password. This might be a slightly safer way to go.
1208    
1209     The parameter C<max_wait_time> controls how long to wait for user entry before skipping the
1210     current URL.
1211    
1212     See also C<credentials> below.
1213    
1214    
1215     =item same_hosts
1216    
1217     This optional key sets equivalent B<authority> name(s) for the site you are spidering.
1218     For example, if your site is C<www.mysite.edu> but also can be reached by
1219     C<mysite.edu> (with or without C<www>) and also C<web.mysite.edu> then:
1220    
1221    
1222     Example:
1223    
1224     $serverA{base_url} = 'http://www.mysite.edu/index.html';
1225     $serverA{same_hosts} = ['mysite.edu', 'web.mysite.edu'];
1226    
1227     Now, if a link is found while spidering of:
1228    
1229     http://web.mysite.edu/path/to/file.html
1230    
1231     it will be considered on the same site, and will actually be spidered and indexed
1232     as:
1233    
1234     http://www.mysite.edu/path/to/file.html
1235    
1236     Note: This should probably be called B<same_host_port> because it compares the URI C<host:port>
1237     against the list of host names in C<same_hosts>. So, if you specify a port in your C<base_url> you will
1238     want to specify the port in the list of hosts in C<same_hosts>:
1239    
1240     my %serverA = (
1241     base_url => 'http://sunsite.berkeley.edu:4444/',
1242     same_hosts => [ qw/www.sunsite.berkeley.edu:4444/ ],
1243     email => 'my@email.address',
1244     );
1245    
1246    
1247     =item email
1248    
1249     This required key sets the email address for the spider. Set this to
1250     your email address.
1251    
1252     =item agent
1253    
1254     This optional key sets the name of the spider.
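
For example (the agent string and addresses below are placeholders, not values the
spider requires):

    my %serverA = (
        base_url => 'http://www.mysite.edu/',
        email    => 'webmaster@mysite.edu',  # required contact address
        agent    => 'mysite test spider',    # optional name shown in server logs
    );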
1255    
1256     =item link_tags
1257    
1258     This optional key is a reference to an array of tag names. Only links found in these tags will be extracted.
1259     The default is to only extract links from C<a> tags.
1260    
1261     For example, to extract links from C<a> tags and from C<frame> tags:
1262    
1263     my %serverA = (
1264     base_url => 'http://sunsite.berkeley.edu:4444/',
1265     same_hosts => [ qw/www.sunsite.berkeley.edu:4444/ ],
1266     email => 'my@email.address',
1267     link_tags => [qw/ a frame /],
1268     );
1269    
1270    
1271     =item delay_min
1272    
1273     This optional key sets the delay in minutes to wait between requests. See the
1274     LWP::RobotUA man page for more information. The default is 0.1 (6 seconds),
1275     but in general you will probably want it much smaller. But, check with
1276     the webmaster before using too small a number.
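
As a sketch, to wait roughly one second between requests (the value is in minutes,
so a fractional value is used here purely as an illustration):

    my %serverA = (
        base_url  => 'http://www.mysite.edu/',
        email     => 'my@email.address',
        delay_min => 1/60,    # about one second between requests
    );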
1277    
1278     =item max_wait_time
1279    
1280     This setting is the number of seconds to wait for data to be returned from
1281     the request. Data is returned in chunks to the spider, and the timer is reset each time
1282     a new chunk is reported. Therefore, documents (requests) that take longer than this setting
1283     should not be aborted as long as some data is received every max_wait_time seconds.
1284     The default is 30 seconds.
1285    
1286     NOTE: This option has no effect on Windows.
1287    
1288     =item max_time
1289    
1290     This optional key will set the max minutes to spider. Spidering
1291     for this host will stop after C<max_time> minutes, and move on to the
1292     next server, if any. The default is to not limit by time.
1293    
1294     =item max_files
1295    
1296     This optional key sets the max number of files to spider before aborting.
1297     The default is to not limit by number of files. This is the number of requests
1298     made to the remote server, not the total number of files to index (see C<max_indexed>).
1299     This count is displayed at the end of indexing as C<Unique URLs>.
1300    
1301     This feature can (and perhaps should) be used when spidering a web site where dynamic
1302     content may generate unique URLs to prevent run-away spidering.
1303    
1304     =item max_indexed
1305    
1306     This optional key sets the max number of files that will be indexed.
1307     The default is to not limit. This is the number of files sent to
1308     swish for indexing (and is reported by C<Total Docs> when spidering ends).
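
As an illustration only (the numbers are arbitrary), the time and file limits can be
combined in a single server hash:

    my %serverA = (
        base_url    => 'http://www.mysite.edu/',
        email       => 'my@email.address',
        max_time    => 60,      # stop spidering this server after 60 minutes
        max_files   => 5_000,   # stop after 5,000 unique URLs are requested
        max_indexed => 2_000,   # stop after 2,000 documents are sent to swish
    );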
1309    
1310     =item max_size
1311    
1312     This optional key sets the max size of a file read from the web server.
1313     This B<defaults> to 5,000,000 bytes. If the size is exceeded the resource is
1314     skipped and a message is written to STDERR if the DEBUG_SKIPPED debug flag is set.
1315    
1316     =item keep_alive
1317    
1318     This optional parameter will enable keep alive requests. This can dramatically speed
1319     up spidering and reduce the load on the server being spidered. The default is to not use
1320     keep alives, although enabling it will probably be the right thing to do.
1321    
1322     To get the most out of keep alives, you may want to set up your web server to
1323     allow a lot of requests per single connection (i.e. MaxKeepAliveRequests on Apache).
1324     Apache's default is 100, which should be good. (But, in general, don't enable KeepAlives
1325     on a mod_perl server.)
1326    
1327     Note: try to filter as many documents as possible B<before> making the request to the server. In
1328     other words, use C<test_url> to look for files ending in C<.html> instead of using C<test_response> to look
1329     for a content type of C<text/html> if possible.
1330     Do note that aborting a request from C<test_response> will break the
1331     current keep alive connection.
1332    
1333     Note: you must have at least libwww-perl-5.53_90 installed to use this feature.
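
A sketch of a server hash that enables keep alives and filters by URL before the
request is made (the extension list is only an example):

    my %serverA = (
        base_url   => 'http://www.mysite.edu/',
        email      => 'my@email.address',
        keep_alive => 1,    # reuse connections (needs libwww-perl-5.53_90 or later)
        # Reject obvious non-HTML URLs before fetching so an aborted
        # request does not break the keep-alive connection.
        test_url   => sub { $_[0]->path !~ /\.(?:gif|jpeg|png|pdf|zip)$/i },
    );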
1334    
1335     =item skip
1336    
1337     This optional key can be used to skip the current server. Its only purpose
1338     is to make it easy to disable a server in a configuration file.
1339    
1340     =item debug
1341    
1342     Set this to a number to display different amounts of info while spidering. Writes info
1343     to STDERR. Zero/undefined is no debug output.
1344    
1345     The following constants are defined for debugging. They may be or'ed together to
1346     get the individual debugging of your choice.
1347    
1348     Here are basically the levels:
1349    
1350     DEBUG_ERRORS general program errors (not used at this time)
1351     DEBUG_URL print out every URL processed
1352     DEBUG_HEADERS prints the response headers
1353     DEBUG_FAILED failed to return a 200
1354     DEBUG_SKIPPED didn't index for some reason
1355     DEBUG_INFO more verbose
1356     DEBUG_LINKS prints links as they are extracted
1357    
1358     For example, to display the URLs processed, failed, and skipped, use:
1359    
1360     debug => DEBUG_URL | DEBUG_FAILED | DEBUG_SKIPPED,
1361    
1362     To display the returned headers
1363    
1364     debug => DEBUG_HEADERS,
1365    
1366     You can easily run the spider without using swish for debugging purposes:
1367    
1368     ./spider.pl test.config > spider.out
1369    
1370     And you will see debugging info as it runs, and the fetched documents will be saved
1371     in the C<spider.out> file.
1372    
1373     Debugging can also be set by an environment variable when running swish. This will
1374     override any setting in the configuration file. Set the variable SPIDER_DEBUG when running
1375     the spider. You can specify any of the above debugging options, separated by a comma.
1376    
1377     For example with Bourne type shell:
1378    
1379     SPIDER_DEBUG=url,links
1380    
1381     =item quiet
1382    
1383     If this is true then normal, non-error messages will be suppressed. Quiet mode can also
1384     be set by setting the environment variable SPIDER_QUIET to any true value.
1385    
1386     SPIDER_QUIET=1
1387    
1388     =item max_depth
1389    
1390     The C<max_depth> parameter can be used to limit how deeply to recurse a web site.
1391     The depth is just a count of levels of web pages descended, and not related to
1392     the number of path elements in a URL.
1393    
1394     A max_depth of zero says to only spider the page listed as the C<base_url>. A max_depth of one will
1395     spider the C<base_url> page, plus all links on that page, and no more. The default is to spider all
1396     pages.
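
For example, a minimal sketch that spiders only the C<base_url> page and the pages
it links to directly:

    my %serverA = (
        base_url  => 'http://www.mysite.edu/',
        email     => 'my@email.address',
        max_depth => 1,    # the base_url page plus the pages it links to
    );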
1397    
1398    
1399     =item ignore_robots_file
1400    
1401     If this is set to true then the robots.txt file will not be checked when spidering
1402     this server. Don't use this option unless you know what you are doing.
1403    
1404     =item use_cookies
1405    
1406     If this is set then a "cookie jar" will be maintained while spidering. Some
1407     (poorly written ;) sites require cookies to be enabled on clients.
1408    
1409     This requires the HTTP::Cookies module.
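
A minimal example:

    my %serverA = (
        base_url    => 'http://www.mysite.edu/',
        email       => 'my@email.address',
        use_cookies => 1,    # requires the HTTP::Cookies module
    );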
1410    
1411     =item use_md5
1412    
1413     If this setting is true, then an MD5 digest "fingerprint" will be made from the content of every
1414     spidered document. This digest number will be used as a hash key to prevent
1415     indexing the same content more than once. This is helpful if different URLs
1416     generate the same content.
1417    
1418     An obvious example: these two documents will only be indexed one time:
1419    
1420     http://localhost/path/to/index.html
1421     http://localhost/path/to/
1422    
1423     This option requires the Digest::MD5 module. Spidering with this option might
1424     be a tiny bit slower.
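
For example, to enable the digest check:

    my %serverA = (
        base_url => 'http://www.mysite.edu/',
        email    => 'my@email.address',
        use_md5  => 1,    # requires the Digest::MD5 module
    );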
1425    
1426     =item validate_links
1427    
1428     Just a hack. If you set this true the spider will do HEAD requests on all links (including off-site links), just
1429     to make sure that all your links work.
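
For example, to check links and have broken ones reported under "Bad Links" when
the spider finishes:

    my %serverA = (
        base_url       => 'http://www.mysite.edu/',
        email          => 'my@email.address',
        validate_links => 1,    # verify links (off-site links get a HEAD request)
    );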
1430    
1431     =item credentials
1432    
1433     You may specify a username and password to be used automatically when spidering:
1434    
1435     credentials => 'username:password',
1436    
1437     A username and password supplied in a URL will override this setting.
1438    
1439     =item credential_timeout
1440    
1441     Sets the number of seconds to wait for user input when prompted for a username or password.
1442     The default is 30 seconds.
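
For example (the username and password are placeholders):

    my %serverA = (
        base_url           => 'http://www.mysite.edu/private/',
        email              => 'my@email.address',
        credentials        => 'username:password',
        credential_timeout => 60,    # wait up to a minute at the password prompt
    );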
1443    
1444     =back
1445    
1446     =head1 CALLBACK FUNCTIONS
1447    
1448     Three callback functions can be defined in your parameter hash.
1449     These optional settings are I<callback> subroutines that are called while
1450     processing URLs.
1451    
1452     A little perl discussion is in order:
1453    
1454     In perl, a scalar variable can contain a reference to a subroutine. The config example above shows
1455     that the configuration parameters are stored in a perl I<hash>.
1456    
1457     my %serverA = (
1458     base_url => 'http://sunsite.berkeley.edu:4444/',
1459     same_hosts => [ qw/www.sunsite.berkeley.edu:4444/ ],
1460     email => 'my@email.address',
1461     link_tags => [qw/ a frame /],
1462     );
1463    
1464     There are two ways to add a reference to a subroutine to this hash:
1465    
1466     sub foo {
1467     return 1;
1468     }
1469    
1470     my %serverA = (
1471     base_url => 'http://sunsite.berkeley.edu:4444/',
1472     same_hosts => [ qw/www.sunsite.berkeley.edu:4444/ ],
1473     email => 'my@email.address',
1474     link_tags => [qw/ a frame /],
1475     test_url => \&foo, # a reference to a named subroutine
1476     );
1477    
1478     Or the subroutine can be coded right in place:
1479    
1480     my %serverA = (
1481     base_url => 'http://sunsite.berkeley.edu:4444/',
1482     same_hosts => [ qw/www.sunsite.berkeley.edu:4444/ ],
1483     email => 'my@email.address',
1484     link_tags => [qw/ a frame /],
1485     test_url => sub { return 1; },
1486     );
1487    
1488     The above example is not very useful as it just creates a user callback function that
1489     always returns a true value (the number 1). But, it's just an example.
1490    
1491     The function calls are wrapped in an eval, so calling die (or doing something that dies) will just cause
1492     that URL to be skipped. If you really want to stop processing you need to set $server->{abort} in your
1493     subroutine (or send a kill -HUP to the spider).
1494    
1495     The first two parameters passed are a URI object (to have access to the current URL), and
1496     a reference to the current server hash. The C<server> hash is just a global hash for holding data, and
1497     useful for setting flags as described below.
1498    
1499     Other parameters may also be passed, as described below.
1500     In perl parameters are passed in an array called "@_". The first element (first parameter) of
1501     that array is $_[0], and the second is $_[1], and so on. Depending on how complicated your
1502     function is you may wish to shift your parameters off of the @_ list to make working with them
1503     easier. See the examples below.
1504    
1505    
1506     To make use of these routines you need to understand when they are called, and what changes
1507     you can make in your routines. Each routine deals with a given step, and returning false from
1508     your routine will stop processing for the current URL.
1509    
1510     =over 4
1511    
1512     =item test_url
1513    
1514     C<test_url> allows you to skip processing of URLs based on the URL before the request
1515     to the server is made. This function is called for the C<base_url> links (links you define in
1516     the spider configuration file) and for every link extracted from a fetched web page.
1517    
1518     This function is a good place to skip links that you are not interested in following. For example,
1519     if you know there's no point in requesting images then you can exclude them like:
1520    
1521     test_url => sub {
1522     my $uri = shift;
1523     return 0 if $uri->path =~ /\.(gif|jpeg|png)$/;
1524     return 1;
1525     },
1526    
1527     Or to write it another way:
1528    
1529     test_url => sub { $_[0]->path !~ /\.(gif|jpeg|png)$/ },
1530    
1531     Another use is when your web server's path names are
1532     NOT case sensitive (e.g. Windows). In that situation you can normalize all links
1533     using something like:
1534    
1535     test_url => sub {
1536     my $uri = shift;
1537     return 0 if $uri->path =~ /\.(gif|jpeg|png)$/;
1538    
1539     $uri->path( lc $uri->path ); # make all path names lowercase
1540     return 1;
1541     },
1542    
1543     The important thing about C<test_url> (compared to the other callback functions) is that
1544     it is called while I<extracting> links, not while actually fetching that page from the web
1545     server. Returning false from C<test_url> simply tells the spider not to add the URL to the list of links to
1546     spider.
1547    
1548     You may set a flag in the server hash (second parameter) to tell the spider to abort processing.
1549    
1550     test_url => sub {
1551     my $server = $_[1];
1552     $server->{abort}++ if $_[0]->path =~ /foo\.html/;
1553     return 1;
1554     },
1555    
1556     In C<test_url> you cannot use the server flags:
1557    
1558     no_contents
1559     no_index
1560     no_spider
1561    
1562    
1563     These flags are discussed below.
1564    
1565     =item test_response
1566    
1567     This function allows you to filter based on the response from the
1568     remote server (such as by content-type). This function is called while the
1569     web page is being fetched from the remote server, typically after just enough
1570     data has been returned to read the response from the web server.
1571    
1572     The spider requests a document in "chunks" of 4096 bytes. 4096 is only a suggestion
1573     of how many bytes to return in each chunk. The C<test_response> routine is
1574     called only when the first chunk is received. This allows ignoring (aborting)
1575     the reading of a very large file, for example, without having to read the entire file.
1576     Although of limited use, a reference to this first chunk is passed as the fourth parameter.
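
For example (a sketch; the size limit is arbitrary), you could skip documents the server
reports as very large without reading them in full:

    test_response => sub {
        my ( $uri, $server, $response, $chunk_ref ) = @_;
        # skip anything reported as larger than about 1 MB
        my $length = $response->content_length || 0;
        return 0 if $length > 1_000_000;
        return 1;
    },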
1577    
1578     Web servers use a Content-Type: header to define the type of data returned from the server.
1579     On a web server you could have a .jpeg file be a web page -- file extensions may not always
1580     indicate the type of the file. The third parameter ($_[2]) passed is a reference to an
1581     HTTP::Response object:
1582    
1583     For example, to only index true HTML (text/html) pages:
1584    
1585     test_response => sub {
1586     my $content_type = $_[2]->content_type;
1587     return $content_type =~ m!text/html!;
1588     },
1589    
1590     You can also set flags in the server hash (the second parameter) to control indexing:
1591    
1592     no_contents -- index only the title (or file name), and not the contents
1593     no_index -- do not index this file, but continue to spider if HTML
1594     no_spider -- index, but do not spider this file for links to follow
1595     abort -- stop spidering any more files
1596    
1597     For example, to avoid indexing the contents of "private.html", yet still follow any links
1598     in that file:
1599    
1600     test_response => sub {
1601     my $server = $_[1];
1602     $server->{no_index}++ if $_[0]->path =~ /private\.html$/;
1603     return 1;
1604     },
1605    
1606     Note: Do not modify the URI object in this callback function.
1607    
1608    
1609     =item filter_content
1610    
1611     This callback function is called right before sending the content to swish.
1612     Like the other callback functions, returning false will cause the URL to be skipped.
1613     Setting the C<abort> server flag and returning false will abort spidering.
1614    
1615     You can also set the C<no_contents> flag.
1616    
1617     This callback function is passed four parameters:
1618     the URI object, the server hash, the HTTP::Response object,
1619     and a reference to the content.
1620    
1621     You can modify the content as needed. For example you might not like upper case:
1622    
1623     filter_content => sub {
1624     my $content_ref = $_[3];
1625    
1626     $$content_ref = lc $$content_ref;
1627     return 1;
1628     },
1629    
1630     A more realistic example would be converting PDF or MS Word documents for parsing by swish.
1631     Examples of this are provided in the F<prog-bin> directory of the swish-e distribution.
1632    
1633     You may also modify the URI object to change the path name passed to swish for indexing.
1634    
1635     filter_content => sub {
1636     my $uri = $_[0];
1637     $uri->host('www.other.host') ;
1638     return 1;
1639     },
1640    
1641     Swish-e's ReplaceRules feature can also be used for modifying the path name indexed.
1642    
1643     Here's a bit more advanced example of indexing text/html and PDF files only:
1644    
1645     use pdf2xml; # included example pdf converter module
1646     $server{filter_content} = sub {
1647     my ( $uri, $server, $response, $content_ref ) = @_;
1648    
1649     return 1 if $response->content_type eq 'text/html';
1650     return 0 unless $response->content_type eq 'application/pdf';
1651    
1652     # for logging counts
1653     $server->{counts}{'PDF transformed'}++;
1654    
1655     $$content_ref = ${pdf2xml( $content_ref )};
1656     return 1;
1657     }
1658    
1659    
1660    
1661     =back
1662    
1663     Note that you can create your own counters to display in the summary list when spidering
1664     is finished by adding a value to the hash pointed to by C<< $server->{counts} >>.
1665    
1666     test_response => sub {
1667     my $server = $_[1];
1668     $server->{no_index}++ if $_[0]->path =~ /private\.html$/;
1669     $server->{counts}{'Private Files'}++ if $_[0]->path =~ /private\.html$/;
1670     return 1;
1671     },
1672    
1673    
1674     Each callback function B<must> return true to continue processing the URL. Returning false will
1675     cause processing of I<the current> URL to be skipped.
1676    
1677     =head2 More on setting flags
1678    
1679     Swish (not this spider) has a configuration directive C<NoContents> that will instruct swish to
1680     index only the title (or file name), and not the contents. This is often used when
1681     indexing binary files such as image files, but can also be used with html
1682     files to index only the document titles.
1683    
1684     As shown above, you can turn this feature on for specific documents by setting a flag in
1685     the server hash passed into the C<test_response> or C<filter_content> subroutines.
1686     For example, in your configuration file you might have the C<test_response> callback set
1687     as:
1688    
1689    
1690     test_response => sub {
1691     my ( $uri, $server, $response ) = @_;
1692     # tell swish not to index the contents if this is of type image
1693     $server->{no_contents} = $response->content_type =~ m[^image/];
1694     return 1; # ok to index and spider this document
1695     }
1696    
1697     The entire contents of the resource are still read from the web server, and passed
1698     on to swish, but swish will also be passed a C<No-Contents> header which tells
1699     swish to enable the NoContents feature for this document only.
1700    
1701     Note: Swish will index the path name only when C<NoContents> is set, unless the document's
1702     type (as set by the swish configuration settings C<IndexContents> or C<DefaultContents>) is
1703     HTML I<and> a title is found in the html document.
1704    
1705     Note: In most cases you probably would not want to send a large binary file to swish, just
1706     to be ignored. Therefore, it would be smart to use a C<filter_content> callback routine to
1707     replace the contents with a single character (you cannot use the empty string at this time).
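
Here is a sketch of that idea (it reuses the image-type test from the example above):

    filter_content => sub {
        my ( $uri, $server, $response, $content_ref ) = @_;
        if ( $response->content_type =~ m[^image/] ) {
            $server->{no_contents}++;   # index only the title/path name
            $$content_ref = ' ';        # don't send the binary data to swish
        }
        return 1;
    },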
1708    
1709     A similar flag may be set to prevent indexing a document at all, but still allow spidering.
1710     In general, if you want to skip a file completely you return false from one of the
1711     callback routines (C<test_url>, C<test_response>, or C<filter_content>). Returning false from any of those
1712     three callbacks will stop processing of that file, and the file will B<not> be spidered.
1713    
1714     But there may be some cases where you still want to spider (extract links), yet not index the file. An example
1715     might be where you wish to index only PDF files, but you still need to spider all HTML files to find
1716     the links to the PDF files.
1717    
1718     $server{test_response} = sub {
1719     my ( $uri, $server, $response ) = @_;
1720     $server->{no_index} = $response->content_type ne 'application/pdf';
1721     return 1; # ok to spider, but don't index
1722     }
1723    
1724     So, the difference between C<no_contents> and C<no_index> is that C<no_contents> will still index the file
1725     name, just not the contents. C<no_index> will still spider the file (if it's C<text/html>) but the
1726     file will not be processed by swish at all.
1727    
1728     B<Note:> If C<no_index> is set in a C<test_response> callback function then
1729     the document I<will not be filtered>. That is, your C<filter_content>
1730     callback function will not be called.
1731    
1732     The C<no_spider> flag can be set to avoid spidering an HTML file. The file will still be indexed unless
1733     C<no_index> is also set. But if you want neither to index nor to spider a file, then simply return false from one of the three
1734     callback functions.
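
For example (a sketch; the path test is only an illustration), to index pages under a
given directory without following the links found in them:

    test_response => sub {
        my ( $uri, $server, $response ) = @_;
        # index these pages, but do not follow their links
        $server->{no_spider}++ if $uri->path =~ m[^/archive/];
        return 1;
    },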
1735    
1736    
1737     =head1 SIGNALS
1738    
1739     Sending a SIGHUP to the running spider will cause it to stop spidering. This is a good way to abort spidering, but
1740     let swish index the documents retrieved so far.
1741    
1742     =head1 SEE ALSO
1743    
1744     L<URI> L<LWP::RobotUA> L<WWW::RobotRules> L<Digest::MD5>
1745    
1746     =head1 COPYRIGHT
1747    
1748     Copyright 2001 Bill Moseley
1749    
1750     This program is free software; you can redistribute it and/or modify
1751     it under the same terms as Perl itself.
1752    
1753     =head1 SUPPORT
1754    
1755     Send all questions to the SWISH-E discussion list.
1756    
1757     See http://sunsite.berkeley.edu/SWISH-E.
1758    
1759     =cut
1760    
