1 #!/usr/local/bin/perl -w
2 use strict;
3
4
5 # $Id: spider.pl,v 1.42 2002/08/13 23:54:49 whmoseley Exp $
6 #
7 # "prog" document source for spidering web servers
8 #
9 # For documentation, type:
10 #
11 # perldoc spider.pl
12 #
13 # Apr 7, 2001 -- added quite a bit of bulk for easier debugging
14 #
15 # Nov 19, 2001 -- to do, build a server object so we are not using the passed in hash,
16 # and so we can warn on invalid config settings.
17
18 $HTTP::URI_CLASS = "URI"; # prevent loading default URI::URL
19 # so we don't store long list of base items
20 # and eat up memory with >= URI 1.13
21 use LWP::RobotUA;
22 use HTML::LinkExtor;
23 use HTML::Tagset;
24
25 use vars '$VERSION';
26 $VERSION = sprintf '%d.%02d', q$Revision: 1.42 $ =~ /: (\d+)\.(\d+)/;
27
28 use vars '$bit';
29 use constant DEBUG_ERRORS => $bit = 1; # program errors
30 use constant DEBUG_URL => $bit <<= 1; # print out every URL processed
31 use constant DEBUG_HEADERS => $bit <<= 1; # prints the response headers
32 use constant DEBUG_FAILED => $bit <<= 1; # failed to return a 200
33 use constant DEBUG_SKIPPED => $bit <<= 1; # didn't index for some reason
34 use constant DEBUG_INFO => $bit <<= 1; # more verbose
35 use constant DEBUG_LINKS => $bit <<= 1; # prints links as they are extracted
36
37 my %DEBUG_MAP = (
38 errors => DEBUG_ERRORS,
39 url => DEBUG_URL,
40 headers => DEBUG_HEADERS,
41 failed => DEBUG_FAILED,
42 skipped => DEBUG_SKIPPED,
43 info => DEBUG_INFO,
44 links => DEBUG_LINKS,
45 );
46
47
48
49 use constant MAX_SIZE => 5_000_000; # Max size of document to fetch
50 use constant MAX_WAIT_TIME => 30; # max seconds to wait for data during a request
51
52 #Can't locate object method "host" via package "URI::mailto" at ../prog-bin/spider.pl line 473.
53 #sub URI::mailto::host { return '' };
54
55
56 # This is not the right way to do this.
57 sub UNIVERSAL::host { '' };
58 sub UNIVERSAL::port { '' };
59 sub UNIVERSAL::host_port { '' };
60
61
62
63 #-----------------------------------------------------------------------
64
65 use vars '@servers';
66
67 my $config = shift || 'SwishSpiderConfig.pl';
68
69 if ( lc( $config ) eq 'default' ) {
70 @servers = default_urls();
71 } else {
72 do $config or die "Failed to read $0 configuration parameters '$config' $! $@";
73 }
74
75
76 print STDERR "$0: Reading parameters from '$config'\n" unless $ENV{SPIDER_QUIET};
77
78 my $abort;
79 local $SIG{HUP} = sub { warn "Caught SIGHUP\n"; $abort++ } unless $^O =~ /Win32/i;
80
81 my %visited; # global -- I suppose would be smarter to localize it per server.
82
83 my %validated;
84 my %bad_links;
85
86 for my $s ( @servers ) {
87 if ( !$s->{base_url} ) {
88 die "You must specify 'base_url' in your spider config settings\n";
89 }
90
91 my @urls = ref $s->{base_url} eq 'ARRAY' ? @{$s->{base_url}} :( $s->{base_url});
92 for my $url ( @urls ) {
93 $s->{base_url} = $url;
94 process_server( $s );
95 }
96 }
97
98
99 if ( %bad_links ) {
100 print STDERR "\nBad Links:\n\n";
101 foreach my $page ( sort keys %bad_links ) {
102 print STDERR "On page: $page\n";
103 printf(STDERR " %-40s %s\n", $_, $validated{$_} ) for @{$bad_links{$page}};
104 print STDERR "\n";
105 }
106 }
107
108
109 #-----------------------------------------------------------------------
110
111
112 sub process_server {
113 my $server = shift;
114
115 # set defaults
116
117 if ( $ENV{SPIDER_DEBUG} ) {
118 $server->{debug} = 0;
119
120 $server->{debug} |= (exists $DEBUG_MAP{lc $_} ? $DEBUG_MAP{lc $_} : die "Bad debug setting passed in environment '$_'\nOptions are: " . join( ', ', keys %DEBUG_MAP) ."\n")
121 for split /\s*,\s*/, $ENV{SPIDER_DEBUG};
122
123 } else {
124 $server->{debug} ||= 0;
125 die "debug parameter '$server->{debug}' must be a number\n" unless $server->{debug} =~ /^\d+$/;
126 }
127
128 $server->{quiet} ||= $ENV{SPIDER_QUIET} || 0;
129
130
131 $server->{max_size} ||= MAX_SIZE;
132 die "max_size parameter '$server->{max_size}' must be a number\n" unless $server->{max_size} =~ /^\d+$/;
133
134
135 $server->{max_wait_time} ||= MAX_WAIT_TIME;
136 die "max_wait_time parameter '$server->{max_wait_time}' must be a number\n" if $server->{max_wait_time} !~ /^\d+/;
137
138
139
140 $server->{link_tags} = ['a'] unless ref $server->{link_tags} eq 'ARRAY';
141 $server->{link_tags_lookup} = { map { lc, 1 } @{$server->{link_tags}} };
142
143 die "max_depth parameter '$server->{max_depth}' must be a number\n" if defined $server->{max_depth} && $server->{max_depth} !~ /^\d+/;
144
145
146 for ( qw/ test_url test_response filter_content/ ) {
147 next unless $server->{$_};
148 $server->{$_} = [ $server->{$_} ] unless ref $server->{$_} eq 'ARRAY';
149 my $n;
150 for my $sub ( @{$server->{$_}} ) {
151 $n++;
152 die "Entry number $n in $_ is not a code reference\n" unless ref $sub eq 'CODE';
153 }
154 }
155
156
157
158 my $start = time;
159
160 if ( $server->{skip} ) {
161 print STDERR "Skipping: $server->{base_url}\n" unless $server->{quiet};
162 return;
163 }
164
165 require "HTTP/Cookies.pm" if $server->{use_cookies};
166 require "Digest/MD5.pm" if $server->{use_md5};
167
168
169 # set starting URL, and remove any specified fragment
170 my $uri = URI->new( $server->{base_url} );
171 $uri->fragment(undef);
172
173 if ( $uri->userinfo ) {
174 die "Can't specify parameter 'credentials' because base_url defines them\n"
175 if $server->{credentials};
176 $server->{credentials} = $uri->userinfo;
177 $uri->userinfo( undef );
178 }
179
180
181 print STDERR "\n -- Starting to spider: $uri --\n" if $server->{debug};
182
183
184
185 # set the starting server name (including port) -- will only spider on server:port
186
187
188 # All URLs will end up with this host:port
189 $server->{authority} = $uri->canonical->authority;
190
191 # All URLs must match this scheme ( Jan 22, 2002 - spot by Darryl Friesen )
192 $server->{scheme} = $uri->scheme;
193
194
195
196 # Now, set the OK host:port names
197 $server->{same} = [ $uri->canonical->authority ];
198
199 push @{$server->{same}}, @{$server->{same_hosts}} if ref $server->{same_hosts};
200
201 $server->{same_host_lookup} = { map { $_, 1 } @{$server->{same}} };
202
203
204
205
206 # set time to end
207
208 $server->{max_time} = $server->{max_time} * 60 + time
209 if $server->{max_time};
210
211
212 # set default agent for log files
213
214 $server->{agent} ||= 'swish-e spider 2.2 http://swish-e.org/';
215
216
217 # get a user agent object
218
219
220 my $ua;
221
222 if ( $server->{ignore_robots_file} ) {
223 $ua = LWP::UserAgent->new;
224 return unless $ua;
225 $ua->agent( $server->{agent} );
226 $ua->from( $server->{email} );
227
228 } else {
229 $ua = LWP::RobotUA->new( $server->{agent}, $server->{email} );
230 return unless $ua;
231 $ua->delay( $server->{delay_min} || 0.1 );
232 }
233
234 # Set the timeout on the user agent -- only needed on Windows, since alarm() is used elsewhere.
235 $ua->timeout( $server->{max_wait_time} ) if $^O =~ /Win32/i;
236
237
238 $server->{ua} = $ua; # save it for fun.
239 # $ua->parse_head(0); # Don't parse the content
240
241 $ua->cookie_jar( HTTP::Cookies->new ) if $server->{use_cookies};
242
243 if ( $server->{keep_alive} ) {
244
245 if ( $ua->can( 'conn_cache' ) ) {
246 my $keep_alive = $server->{keep_alive} =~ /^\d+$/ ? $server->{keep_alive} : 1;
247 $ua->conn_cache( { total_capacity => $keep_alive } );
248
249 } else {
250 warn "Can't use keep-alive: conn_cache method not available\n";
251 }
252 }
253
254
255 # uri, parent, depth
256 eval { spider( $server, $uri ) };
257 print STDERR $@ if $@;
258
259
260 # provide a way to call a function in the config file when all done
261 check_user_function( 'spider_done', undef, $server );
262
263
264 return if $server->{quiet};
265
266
267 $start = time - $start;
268 $start++ unless $start;
269
270 my $max_width = 0;
271 my $max_num = 0;
272 for ( keys %{$server->{counts}} ) {
273 $max_width = length if length > $max_width;
274 my $val = commify( $server->{counts}{$_} );
275 $max_num = length $val if length $val > $max_num;
276 }
277
278
279 printf STDERR "\nSummary for: $server->{base_url}\n";
280
281 for ( sort keys %{$server->{counts}} ) {
282 printf STDERR "%${max_width}s: %${max_num}s (%0.1f/sec)\n",
283 $_,
284 commify( $server->{counts}{$_} ),
285 $server->{counts}{$_}/$start;
286 }
287 }
288
289
290 #-----------------------------------------------------------------------
291 # Deal with Basic Authentication
292
293
294
295 # Thanks Gisle!
296 sub get_basic_credentials {
297 my($uri, $server, $realm ) = @_;
298 my $netloc = $uri->canonical->host_port;
299
300 my ($user, $password);
301
302 eval {
303 local $SIG{ALRM} = sub { die "timed out\n" };
304 alarm( $server->{credential_timeout} || 30 ) unless $^O =~ /Win32/i;
305
306 if ( $uri->userinfo ) {
307 print STDERR "\nSorry: invalid username/password\n";
308 $uri->userinfo( undef );
309 }
310
311
312 print STDERR "Need Authentication for $uri at realm '$realm'\n(<Enter> skips)\nUsername: ";
313 $user = <STDIN>;
314 chomp($user);
315 die "No Username specified\n" unless length $user;
316
317 alarm( $server->{credential_timeout} || 30 ) unless $^O =~ /Win32/i;
318
319 print STDERR "Password: ";
320 system("stty -echo");
321 $password = <STDIN>;
322 system("stty echo");
323 print STDERR "\n"; # because we disabled echo
324 chomp($password);
325
326 alarm( 0 ) unless $^O =~ /Win32/i;
327 };
328
329 return if $@;
330
331 return join ':', $user, $password;
332
333
334 }
335
336
337
338
339 #----------- Non recursive spidering ---------------------------
340
341 sub spider {
342 my ( $server, $uri ) = @_;
343
344 # Validate the first link, just in case
345 return unless check_link( $uri, $server, '', '(Base URL)' );
346
347 my @link_array = [ $uri, '', 0 ];
348
349 while ( @link_array ) {
350
351 die if $abort || $server->{abort};
352
353 my ( $uri, $parent, $depth ) = @{shift @link_array};
354
355 my $new_links = process_link( $server, $uri, $parent, $depth );
356
357 push @link_array, map { [ $_, $uri, $depth+1 ] } @$new_links if $new_links;
358
359 }
360 }
361
362
363 #----------- Process a url and return links -----------------------
364 sub process_link {
365 my ( $server, $uri, $parent, $depth ) = @_;
366
367
368 $server->{counts}{'Unique URLs'}++;
369
370 die "$0: Max files Reached\n"
371 if $server->{max_files} && $server->{counts}{'Unique URLs'} > $server->{max_files};
372
373 die "$0: Time Limit Exceeded\n"
374 if $server->{max_time} && $server->{max_time} < time;
375
376
377
378 # make request
379 my $ua = $server->{ua};
380 my $request = HTTP::Request->new('GET', $uri );
381
382
383 my $content = '';
384
385 # Really should just subclass the response object!
386 $server->{no_contents} = 0;
387 $server->{no_index} = 0;
388 $server->{no_spider} = 0;
389
390
391 # Set basic auth if defined - use URI specific first, then credentials
392 if ( my ( $user, $pass ) = split /:/, ( $uri->userinfo || $server->{credentials} || '' ) ) {
393 $request->authorization_basic( $user, $pass );
394 }
395
396
397
398
399 my $been_here;
400 my $callback = sub {
401
402 # Reset alarm;
403 alarm( $server->{max_wait_time} ) unless $^O =~ /Win32/i;
404
405
406 # Cache user/pass
407 if ( $server->{cur_realm} && $uri->userinfo ) {
408 my $key = $uri->canonical->host_port . ':' . $server->{cur_realm};
409 $server->{auth_cache}{$key} = $uri->userinfo;
410 }
411
412 $uri->userinfo( undef ) unless $been_here;
413
414 die "test_response" if !$been_here++ && !check_user_function( 'test_response', $uri, $server, $_[1], \$_[0] );
415
416
417 if ( length( $content ) + length( $_[0] ) > $server->{max_size} ) {
418 print STDERR "-Skipped $uri: Document exceeded $server->{max_size} bytes\n" if $server->{debug}&DEBUG_SKIPPED;
419 die "too big!\n";
420 }
421
422 $content .= $_[0];
423
424 };
425
426 my $response;
427
428 eval {
429 local $SIG{ALRM} = sub { die "timed out\n" };
430 alarm( $server->{max_wait_time} ) unless $^O =~ /Win32/i;
431 $response = $ua->simple_request( $request, $callback, 4096 );
432 alarm( 0 ) unless $^O =~ /Win32/i;
433 };
434
435
436 return if $server->{abort};
437
438
439 if ( $response && $response->code == 401 && $response->header('WWW-Authenticate') && $response->header('WWW-Authenticate') =~ /realm="([^"]+)"/i ) {
440 my $realm = $1;
441
442 my $user_pass;
443
444 # Do we have a cached user/pass for this realm?
445 my $key = $uri->canonical->host_port . ':' . $realm;
446
447 if ( $user_pass = $server->{auth_cache}{$key} ) {
448
449 # If we didn't just try it, try again
450 unless( $uri->userinfo && $user_pass eq $uri->userinfo ) {
451 $uri->userinfo( $user_pass );
452 return process_link( $server, $uri, $parent, $depth );
453 }
454 }
455
456 # otherwise, prompt:
457
458
459 if ( $user_pass = get_basic_credentials( $uri, $server, $realm ) ) {
460 $uri->userinfo( $user_pass );
461
462 $server->{cur_realm} = $realm; # save so we can cache
463 my $links = process_link( $server, $uri, $parent, $depth );
464 delete $server->{cur_realm};
465
466 return $links;
467 }
468 print STDERR "Skipping $uri\n";
469 }
470
471 $uri->userinfo( undef );
472
473
474
475 # Log the response
476
477 if ( ( $server->{debug} & DEBUG_URL ) || ( $server->{debug} & DEBUG_FAILED && !$response->is_success) ) {
478 print STDERR '>> ',
479 join( ' ',
480 ( $response->is_success ? '+Fetched' : '-Failed' ),
481 $depth,
482 "Cnt: $server->{counts}{'Unique URLs'}",
483 $uri,
484 ( $response->status_line || $response->status || 'unknown status' ),
485 ( $response->content_type || 'Unknown content type'),
486 ( $response->content_length || '???' ),
487 "parent:$parent",
488 ),"\n";
489 }
490
491
492
493
494 # If the LWP callback aborts
495
496 if ( $response->header('client-aborted') ) {
497 $server->{counts}{Skipped}++;
498 return;
499 }
500
501 $response->request->uri->userinfo( undef ) if $response->request;
502
503
504 # skip excluded by robots.txt
505
506 if ( !$response->is_success && $response->status_line =~ 'robots.txt' ) {
507 print STDERR "-Skipped $depth $uri: ", $response->status_line,"\n" if $server->{debug}&DEBUG_SKIPPED;
508 $server->{counts}{'robots.txt'}++;
509 return;
510 }
511
512
513 # Report bad links (excluding those skipped by robots.txt)
514
515 if ( $server->{validate_links} && !$response->is_success ) {
516 validate_link( $server, $uri, $parent, $response );
517 }
518
519
520
521 # And check for meta robots tag
522 # -- should probably be done in request sub to avoid fetching docs that are not needed
523
524 unless ( $server->{ignore_robots_file} ) {
525 if ( my $directives = $response->header('X-Meta-ROBOTS') ) {
526 my %settings = map { lc $_, 1 } split /\s*,\s*/, $directives;
527 $server->{no_contents}++ if exists $settings{nocontents}; # an extension for swish
528 $server->{no_index}++ if exists $settings{noindex};
529 $server->{no_spider}++ if exists $settings{nofollow};
530 }
531 }
532
533
534
535
536 print STDERR "\n----HEADERS for $uri ---\n", $response->headers_as_string,"-----END HEADERS----\n\n"
537 if $server->{debug} & DEBUG_HEADERS;
538
539
540 unless ( $response->is_success ) {
541
542 # look for redirect
543 if ( $response->is_redirect && $response->header('location') ) {
544 my $u = URI->new_abs( $response->header('location'), $response->base );
545
546 if ( $u->canonical eq $uri->canonical ) {
547 print STDERR "Warning: $uri redirects to itself!.\n";
548 return;
549 }
550
551 return [$u] if check_link( $u, $server, $response->base, '(redirect)','Location' );
552 }
553 return;
554 }
555
556 return unless $content; # $$$ any reason to index empty files?
557
558
559 # make sure content is unique - probably better to chunk into an MD5 object above
560
561 if ( $server->{use_md5} ) {
562 my $digest = Digest::MD5::md5($content);
563
564 if ( $visited{ $digest } ) {
565
566 print STDERR "-Skipped $uri has same digest as $visited{ $digest }\n"
567 if $server->{debug} & DEBUG_SKIPPED;
568
569 $server->{counts}{Skipped}++;
570 $server->{counts}{'MD5 Duplicates'}++;
571 return;
572 }
573 $visited{ $digest } = $uri;
574 }
575
576
577 # Extract out links (if not too deep)
578
579 my $links_extracted = extract_links( $server, \$content, $response )
580 unless defined $server->{max_depth} && $depth >= $server->{max_depth};
581
582
583 # Index the file
584
585 if ( $server->{no_index} ) {
586 $server->{counts}{Skipped}++;
587 print STDERR "-Skipped indexing $uri some callback set 'no_index' flag\n" if $server->{debug}&DEBUG_SKIPPED;
588
589 } else {
590 return $links_extracted unless check_user_function( 'filter_content', $uri, $server, $response, \$content );
591
592 output_content( $server, \$content, $uri, $response )
593 unless $server->{no_index};
594 }
595
596
597
598 return $links_extracted;
599 }
600
601 #===================================================================================================
602 # Calls a user-defined function
603 #
604 #---------------------------------------------------------------------------------------------------
605
606 sub check_user_function {
607 my ( $fn, $uri, $server ) = ( shift, shift, shift );
608
609 return 1 unless $server->{$fn};
610
611 my $tests = ref $server->{$fn} eq 'ARRAY' ? $server->{$fn} : [ $server->{$fn} ];
612
613 my $cnt;
614
615 for my $sub ( @$tests ) {
616 $cnt++;
617 print STDERR "?Testing '$fn' user supplied function #$cnt '$uri'\n" if $server->{debug} & DEBUG_INFO;
618
619 my $ret;
620
621 eval { $ret = $sub->( $uri, $server, @_ ) };
622
623 if ( $@ ) {
624 print STDERR "-Skipped $uri due to '$fn' user supplied function #$cnt death '$@'\n" if $server->{debug} & DEBUG_SKIPPED;
625 $server->{counts}{Skipped}++;
626 return;
627 }
628
629 next if $ret;
630
631 print STDERR "-Skipped $uri due to '$fn' user supplied function #$cnt\n" if $server->{debug} & DEBUG_SKIPPED;
632 $server->{counts}{Skipped}++;
633 return;
634 }
635 print STDERR "+Passed all $cnt tests for '$fn' user supplied function\n" if $server->{debug} & DEBUG_INFO;
636 return 1;
637 }
638
639
640 #==============================================================================================
641 # Extract links from a text/html page
642 #
643 # Call with:
644 # $server - server object
645 # $content - ref to content
646 # $response - response object
647 #
648 #----------------------------------------------------------------------------------------------
649
650 sub extract_links {
651 my ( $server, $content, $response ) = @_;
652
653 return unless $response->header('content-type') &&
654 $response->header('content-type') =~ m[^text/html];
655
656 # allow skipping.
657 if ( $server->{no_spider} ) {
658 print STDERR '-Links not extracted: ', $response->request->uri->canonical, " some callback set 'no_spider' flag\n" if $server->{debug}&DEBUG_SKIPPED;
659 return;
660 }
661
662 $server->{Spidered}++;
663
664 my @links;
665
666
667 my $base = $response->base;
668
669 print STDERR "\nExtracting links from ", $response->request->uri, ":\n" if $server->{debug} & DEBUG_LINKS;
670
671 my $p = HTML::LinkExtor->new;
672 $p->parse( $$content );
673
674 my %skipped_tags;
675
676 for ( $p->links ) {
677 my ( $tag, %attr ) = @$_;
678
679 # which tags to use ( not reported in debug )
680
681 my $attr = join ' ', map { qq[$_="$attr{$_}"] } keys %attr;
682
683 print STDERR "\nLooking at extracted tag '<$tag $attr>'\n" if $server->{debug} & DEBUG_LINKS;
684
685 unless ( $server->{link_tags_lookup}{$tag} ) {
686
687 # each tag is reported only once per page
688 print STDERR
689 " <$tag> skipped because not one of (",
690 join( ',', @{$server->{link_tags}} ),
691 ")\n" if $server->{debug} & DEBUG_LINKS && !$skipped_tags{$tag}++;
692
693 if ( $server->{validate_links} && $tag eq 'img' && $attr{src} ) {
694 my $img = URI->new_abs( $attr{src}, $base );
695 validate_link( $server, $img, $base );
696 }
697
698 next;
699 }
700
701 # Grab the attribute(s) which might contain links for this tag
702 my $links = $HTML::Tagset::linkElements{$tag};
703 $links = [$links] unless ref $links;
704
705
706 my $found;
707
708
709 # Now, check each attribute to see if a link exists
710
711 for my $attribute ( @$links ) {
712 if ( $attr{ $attribute } ) { # ok tag
713
714 # Create a URI object
715
716 my $u = URI->new_abs( $attr{$attribute},$base );
717
718 next unless check_link( $u, $server, $base, $tag, $attribute );
719
720 push @links, $u;
721 print STDERR qq[ $attribute="$u" Added to list of links to follow\n] if $server->{debug} & DEBUG_LINKS;
722 $found++;
723 }
724 }
725
726
727 if ( !$found && $server->{debug} & DEBUG_LINKS ) {
728 print STDERR " tag did not include any links to follow or is a duplicate\n";
729 }
730
731 }
732
733 print STDERR "! Found ", scalar @links, " links in ", $response->base, "\n\n" if $server->{debug} & DEBUG_INFO;
734
735
736 return \@links;
737 }
738
739
740
741
742 #=============================================================================
743 # This function checks if a link should be added to the list to spider
744 #
745 # Pass:
746 # $u - URI object
747 # $server - the server hash
748 # $base - the base or parent of the link
749 #
750 # Returns true if a valid link
751 #
752 # Calls the user function "test_url". Link rewriting before spider
753 # can be done here.
754 #
755 #------------------------------------------------------------------------------
756 sub check_link {
757 my ( $u, $server, $base, $tag, $attribute ) = @_;
758
759 $tag ||= '';
760 $attribute ||= '';
761
762
763 # Kill the fragment
764 $u->fragment( undef );
765
766
767 # This should not happen, but make sure we have a host to check against
768
769 unless ( $u->host ) {
770 print STDERR qq[ ?? <$tag $attribute="$u"> skipped because missing host name\n] if $server->{debug} & DEBUG_LINKS;
771 return;
772 }
773
774
775 # Here we make sure we are looking at a link pointing to the correct (or equivalent) host
776
777 unless ( $server->{scheme} eq $u->scheme && $server->{same_host_lookup}{$u->canonical->authority} ) {
778
779 print STDERR qq[ ?? <$tag $attribute="$u"> skipped because different host\n] if $server->{debug} & DEBUG_LINKS;
780 $server->{counts}{'Off-site links'}++;
781 validate_link( $server, $u, $base ) if $server->{validate_links};
782 return;
783 }
784
785 $u->host_port( $server->{authority} ); # Force all the same host name
786
787 # Allow rejection of this URL by user function
788
789 return unless check_user_function( 'test_url', $u, $server );
790
791
792 # Don't add the link if already seen - these are so common that we don't report
793
794 if ( $visited{ $u->canonical }++ ) {
795 #$server->{counts}{Skipped}++;
796 $server->{counts}{Duplicates}++;
797
798
799 # Just so it's reported for all pages
800 if ( $server->{validate_links} && $validated{$u->canonical} ) {
801 push @{$bad_links{ $base->canonical }}, $u->canonical;
802 }
803
804 return;
805 }
806
807 return 1;
808 }
809
810
811 #=============================================================================
812 # This function is used to validate links that are off-site.
813 #
814 # It's just a very basic link check routine that lets you validate the
815 # off-site links at the same time as indexing. Just because we can.
816 #
817 #------------------------------------------------------------------------------
818 sub validate_link {
819 my ($server, $uri, $base, $response ) = @_;
820
821 # Already checked?
822
823 if ( exists $validated{ $uri->canonical } )
824 {
825 # Add it to the list of bad links on that page if it's a bad link.
826 push @{$bad_links{ $base->canonical }}, $uri->canonical
827 if $validated{ $uri->canonical };
828
829 return;
830 }
831
832 $validated{ $uri->canonical } = 0; # mark as checked and ok.
833
834 unless ( $response ) {
835 my $ua = LWP::UserAgent->new;
836 my $request = HTTP::Request->new('HEAD', $uri->canonical );
837
838 eval {
839 local $SIG{ALRM} = sub { die "timed out\n" };
840 alarm( $server->{max_wait_time} ) unless $^O =~ /Win32/i;
841 $response = $ua->simple_request( $request );
842 alarm( 0 ) unless $^O =~ /Win32/i;
843 };
844
845 if ( $@ ) {
846 $server->{counts}{'Bad Links'}++;
847 my $msg = $@;
848 $msg =~ tr/\n//s;
849 $validated{ $uri->canonical } = $msg;
850 push @{$bad_links{ $base->canonical }}, $uri->canonical;
851 return;
852 }
853 }
854
855 return if $response->is_success;
856
857 my $error = $response->status_line || $response->status || 'unknown status';
858
859 $error .= ' ' . URI->new_abs( $response->header('location'), $response->base )->canonical
860 if $response->is_redirect && $response->header('location');
861
862 $validated{ $uri->canonical } = $error;
863 push @{$bad_links{ $base->canonical }}, $uri->canonical;
864 }
865
866
867 sub output_content {
868 my ( $server, $content, $uri, $response ) = @_;
869
870 $server->{indexed}++;
871
872 unless ( length $$content ) {
873 print STDERR "Warning: document '", $response->request->uri, "' has no content\n";
874 $$content = ' ';
875 }
876
877
878 $server->{counts}{'Total Bytes'} += length $$content;
879 $server->{counts}{'Total Docs'}++;
880
881
882 my $headers = join "\n",
883 'Path-Name: ' . $uri,
884 'Content-Length: ' . length $$content,
885 '';
886
887 $headers .= 'Last-Mtime: ' . $response->last_modified . "\n"
888 if $response->last_modified;
889
890
891 $headers .= "No-Contents: 1\n" if $server->{no_contents};
892 print "$headers\n$$content";
893
894 die "$0: Max indexed files Reached\n"
895 if $server->{max_indexed} && $server->{counts}{'Total Docs'} >= $server->{max_indexed};
896 }
897
898
899
900 sub commify {
901 local $_ = shift;
902 1 while s/^([-+]?\d+)(\d{3})/$1,$2/;
903 return $_;
904 }
905
906 sub default_urls {
907
908 my $validate = 0;
909 if ( $ARGV[0] eq 'validate' ) {
910 shift @ARGV;
911 $validate = 1;
912 }
913
914 die "$0: Must list URLs when using 'default'\n" unless @ARGV;
915
916
917 my @content_types = qw{ text/html text/plain };
918
919 return map {
920 {
921 #debug => DEBUG_HEADERS,
922 #debug => DEBUG_URL|DEBUG_SKIPPED|DEBUG_INFO,
923 base_url => \@ARGV,
924 email => 'swish@domain.invalid',
925 delay_min => .0001,
926 link_tags => [qw/ a frame /],
927 test_url => sub { $_[0]->path !~ /\.(?:gif|jpeg|png)$/i },
928
929 test_response => sub {
930 my $content_type = $_[2]->content_type;
931 my $ok = grep { $_ eq $content_type } @content_types;
932 return 1 if $ok;
933 print STDERR "$_[0] $content_type != (@content_types)\n";
934 return;
935 },
936 validate_links => $validate,
937
938 }
939 } @ARGV;
940 }
941
942
943
944 __END__
945
946 =head1 NAME
947
948 spider.pl - Example Perl program to spider web servers
949
950 =head1 SYNOPSIS
951
952 swish.config:
953 IndexDir ./spider.pl
954 SwishProgParameters spider.config
955 # other swish-e settings
956
957 spider.config:
958 @servers = (
959 {
960 base_url => 'http://myserver.com/',
961 email => 'me@myself.com',
962 # other spider settings described below
963 },
964 );
965
966 Begin indexing:
967
968 swish-e -S prog -c swish.config
969
970 Note: When running on some versions of Windows (e.g. Win ME and Win 98 SE)
971 you may need to index using the command:
972
973 perl spider.pl | swish-e -S prog -c swish.conf -i stdin
974
975 This pipes the output of the spider directly into swish.
976
977
978 =head1 DESCRIPTION
979
980 This is a swish-e "prog" document source program for spidering
981 web servers. It can be used instead of the C<http> method for
982 spidering with swish.
983
984 The spider typically uses a configuration
985 file that lists the URL(s) to spider, and configuration parameters that control
986 the behavior of the spider. In addition, you may define I<callback> perl functions
987 in the configuration file that can dynamically change the behavior of the spider
988 based on URL, HTTP response headers, or the content of the fetched document. These
989 callback functions can also be used to filter or convert documents (e.g. PDF, gzip, MS Word)
990 into a format that swish-e can parse. Some examples are provided.
991
992 You define "servers" to spider, set a few parameters,
993 create callback routines, and start indexing as the synopsis above shows.
994 The spider requires its own configuration file (unless you want the default values). This
995 is NOT the same configuration file that swish-e uses.
996
997 The example configuration file C<SwishSpiderConfig.pl> is
998 included in the C<prog-bin> directory along with this script. Please just use it as an
999 example, as it contains more settings than you probably want to use. Start with a tiny config file
1000 and add settings as required by your situation.
1001
1002 The available configuration parameters are discussed below.
1003
1004 If all that sounds confusing, then you can run the spider with default settings. In fact, you can
1005 run the spider without using swish just to make sure it works. Just run
1006
1007 ./spider.pl default http://someserver.com/sometestdoc.html
1008
1009 And you should see F<sometestdoc.html> dumped to your screen. Get ready to kill the script
1010 if the file you request contains links as the output from the fetched pages will be displayed.
1011
1012 ./spider.pl default http://someserver.com/sometestdoc.html > output.file
1013
1014 might be more friendly.
1015
1016 If the first parameter passed to the spider is the word "default" (as in the preceding example)
1017 then the spider uses the default parameters,
1018 and the following parameter(s) are expected to be URL(s) to spider.
1019 Otherwise, the first parameter is considered to be the name of the configuration file (as described
1020 below). When using C<-S prog>, the swish-e configuration setting
1021 C<SwishProgParameters> is used to pass parameters to the program specified
1022 with C<IndexDir> or the C<-i> switch.
1023
1024 If you do not specify any parameters the program will look for the file
1025
1026 SwishSpiderConfig.pl
1027
1028 in the current directory.
1029
1030 The spider does require Perl's LWP library and a few other reasonably common modules.
1031 Most well maintained systems should have these modules installed. If not, start here:
1032
1033 http://search.cpan.org/search?dist=libwww-perl
1034 http://search.cpan.org/search?dist=HTML-Parser
1035
1036 See more below in C<REQUIREMENTS>. It's a good idea to check that you are running
1037 a current version of these modules.
1038
1039 =head2 Robots Exclusion Rules and being nice
1040
1041 This script will not spider files blocked by F<robots.txt>. In addition,
1042 the script will check for <meta name="robots"..> tags, which allows finer
1043 control over what files are indexed and/or spidered.
1044 See http://www.robotstxt.org/wc/exclusion.html for details.
1045
1046 This spider provides an extension to the <meta> tag exclusion, by adding a
1047 C<NOCONTENTS> attribute. This attribute turns on the C<no_contents> setting, which
1048 asks swish-e to only index the document's title (or file name if no title is found).
1049
1050 For example:
1051
1052 <META NAME="ROBOTS" CONTENT="NOCONTENTS, NOFOLLOW">
1053
1054 says to just index the document's title, but don't index its contents, and don't follow
1055 any links within the document. Granted, it's unlikely that this feature will ever be used...
1056
1057 If you are indexing your own site, and know what you are doing, you can disable robot exclusion by
1058 the C<ignore_robots_file> configuration parameter, described below. This disables both F<robots.txt>
1059 and the meta tag parsing.
1060
1061 This script only spiders one file at a time, so load on the web server is not that great.
1062 And with libwww-perl-5.53_91 HTTP/1.1 keep alive requests can reduce the load on
1063 the server even more (and potentially reduce spidering time considerably!)
1064
1065 Still, discuss spidering with a site's administrator before beginning.
1066 Use the C<delay_min> to adjust how fast the spider fetches documents.
1067 Consider running a second web server with a limited number of children if you really
1068 want to fine tune the resources used by spidering.
1069
1070 =head2 Duplicate Documents
1071
1072 The spider program keeps track of URLs visited, so a document is only indexed
1073 one time.
1074
1075 The Digest::MD5 module can be used to create a "fingerprint" of every page
1076 indexed and this fingerprint is used in a hash to find duplicate pages.
1077 For example, MD5 will prevent indexing these as two different documents:
1078
1079 http://localhost/path/to/some/index.html
1080 http://localhost/path/to/some/
1081
1082 But note that this may have side effects you don't want. If you want this
1083 file indexed under this URL:
1084
1085 http://localhost/important.html
1086
1087 But the spider happens to find the exact content in this file first:
1088
1089 http://localhost/developement/test/todo/maybeimportant.html
1090
1091 Then only that URL will be indexed.
1092
1093 MD5 may slow down indexing a tiny bit, so test with and without if speed is an
1094 issue (which it probably isn't since you are spidering in the first place).
1095 This feature will also use more memory.
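
The idea, as a rough sketch (an illustration of the mechanism with a hypothetical
C<is_duplicate> helper, not the spider's exact code), is simply to use the digest
as a hash key:

    use Digest::MD5 qw( md5 );

    my %seen;   # digest => first URL seen with that content

    sub is_duplicate {
        my ( $url, $content ) = @_;
        my $digest = md5( $content );
        return 1 if $seen{ $digest };   # same content seen before -- skip it
        $seen{ $digest } = $url;        # remember where this content was first seen
        return 0;
    }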
1096
1097 Note: the "prog" document source in swish bypasses many swish-e configuration settings.
1098 For example, you cannot use the C<IndexOnly> directive with the "prog" document
1099 source. This is by design to limit the overhead when using an external program
1100 for providing documents to swish; after all, with "prog", if you don't want to index a file, then
1101 don't give it to swish to index in the first place.
1102
1103 So, for spidering, if you do not wish to index images, for example, you will
1104 need to either filter by the URL or by the content-type returned from the web
1105 server. See L<CALLBACK FUNCTIONS|CALLBACK FUNCTIONS> below for more information.
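
For example (an illustrative fragment -- see L<CALLBACK FUNCTIONS|CALLBACK FUNCTIONS>
below for what these callbacks receive):

    test_url      => sub { $_[0]->path !~ /\.(?:gif|jpeg|png)$/i },
    test_response => sub { $_[2]->content_type =~ m[^text/] },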
1106
1107 =head1 REQUIREMENTS
1108
1109 Perl 5 (hopefully at least 5.00503) or later.
1110
1111 You must have the LWP Bundle on your computer. Load the LWP::Bundle via the CPAN.pm shell,
1112 or download libwww-perl-x.xx from CPAN (or via ActiveState's ppm utility).
1113 Also required is the HTML-Parser-x.xx bundle of modules, also from CPAN
1114 (and from ActiveState for Windows).
1115
1116 http://search.cpan.org/search?dist=libwww-perl
1117 http://search.cpan.org/search?dist=HTML-Parser
1118
1119 You will also need Digest::MD5 if you wish to use the MD5 feature.
1120 HTML::Tagset is also required.
1121 Other modules may be required (for example, the pod2xml.pm module
1122 has its own requirements -- see perldoc pod2xml for info).
1123
1124 The spider.pl script, like everyone else, expects perl to live in /usr/local/bin.
1125 If this is not the case then either add a symlink at /usr/local/bin/perl
1126 to point to where perl is installed
1127 or modify the shebang (#!) line at the top of the spider.pl program.
1128
1129 Note that the libwww-perl package does not support SSL (Secure Sockets Layer) (https)
1130 by default. See F<README.SSL> included in the libwww-perl package for information on
1131 installing SSL support.
1132
1133 =head1 CONFIGURATION FILE
1134
1135 Configuration is not very fancy. The spider.pl program simply does a
1136 C<do "path";> to read in the parameters and create the callback subroutines.
1137 The C<path> is the first parameter passed to the spider script, which is set
1138 by the Swish-e configuration setting C<SwishProgParameters>.
1139
1140 For example, if in your swish-e configuration file you have
1141
1142 SwishProgParameters /path/to/config.pl
1143 IndexDir /home/moseley/swish-e/prog-bin/spider.pl
1144
1145 And then run swish as
1146
1147 swish-e -c swish.config -S prog
1148
1149 swish will run C</home/moseley/swish-e/prog-bin/spider.pl> and the spider.pl
1150 program will receive as its first parameter C</path/to/config.pl>, and
1151 spider.pl will read C</path/to/config.pl> to get the spider configuration
1152 settings. If C<SwishProgParameters> is not set, the program will try to
1153 use C<SwishSpiderConfig.pl> by default.
1154
1155 There is a special case of:
1156
1157 SwishProgParameters default http://www.mysite/index.html ...
1158
1159 Where default parameters are used. This will only index documents of type
1160 C<text/html> or C<text/plain>, and will skip any file with an extension that matches
1161 the pattern:
1162
1163 /\.(?:gif|jpeg|png)$/i
1164
1165 This can be useful for indexing just your web documents, but you will probably want finer
1166 control over your spidering by using a configuration file.
1167
1168 The configuration file must set a global variable C<@servers> (in package main).
1169 Each element in C<@servers> is a reference to a hash. The elements of the hash
1170 are described next. More than one server hash may be defined -- each server
1171 will be spidered in the order listed in C<@servers>, although currently a I<global> hash is
1172 used to prevent spidering the same URL twice.
1173
1174 Examples:
1175
1176 my %serverA = (
1177 base_url => 'http://swish-e.org/',
1178 same_hosts => [ qw/www.swish-e.org/ ],
1179 email => 'my@email.address',
1180 );
1181 my %serverB = (
1182 ...
1183 ...
1184 );
1185 @servers = ( \%serverA, \%serverB, );
1186
1187 =head1 CONFIGURATION OPTIONS
1188
1189 This describes the required and optional keys in the server configuration hash, in random order...
1190
1191 =over 4
1192
1193 =item base_url
1194
1195 This required setting is the starting URL for spidering.
1196
1197 Typically, you will just list one URL for the base_url. You may specify more than one
1198 URL as a reference to a list
1199
1200 base_url => [qw! http://swish-e.org/ http://othersite.org/other/index.html !],
1201
1202 You may specify a username and password:
1203
1204 base_url => 'http://user:pass@swish-e.org/index.html',
1205
1206 but you may find that to be a security issue. If a URL is protected by Basic Authentication
1207 you will be prompted for a username and password. This might be a slightly safer way to go.
1208
1209 The parameter C<max_wait_time> controls how long to wait for user entry before skipping the
1210 current URL.
1211
1212 See also C<credentials> below.
1213
1214
1215 =item same_hosts
1216
1217 This optional key sets equivalent B<authority> name(s) for the site you are spidering.
1218 For example, if your site is C<www.mysite.edu> but also can be reached by
1219 C<mysite.edu> (with or without C<www>) and also C<web.mysite.edu> then:
1220
1221
1222 Example:
1223
1224 $serverA{base_url} = 'http://www.mysite.edu/index.html';
1225 $serverA{same_hosts} = ['mysite.edu', 'web.mysite.edu'];
1226
1227 Now, if a link such as the following is found while spidering:
1228
1229 http://web.mysite.edu/path/to/file.html
1230
1231 it will be considered on the same site, and will actually be spidered and indexed
1232 as:
1233
1234 http://www.mysite.edu/path/to/file.html
1235
1236 Note: This should probably be called B<same_host_port> because it compares the URI C<host:port>
1237 against the list of host names in C<same_hosts>. So, if you specify a port in C<base_url> you will
1238 want to include that port in the list of hosts in C<same_hosts>:
1239
1240 my %serverA = (
1241 base_url => 'http://sunsite.berkeley.edu:4444/',
1242 same_hosts => [ qw/www.sunsite.berkeley.edu:4444/ ],
1243 email => 'my@email.address',
1244 );
1245
1246
1247 =item email
1248
1249 This required key sets the email address for the spider. Set this to
1250 your email address.
1251
1252 =item agent
1253
1254 This optional key sets the name of the spider.
1255
1256 =item link_tags
1257
1258 This optional key is a reference to an array of tag names. Only links found in these tags will be extracted.
1259 The default is to only extract links from C<a> tags.
1260
1261 For example, to extract links from C<a> tags and from C<frame> tags:
1262
1263 my %serverA = (
1264 base_url => 'http://sunsite.berkeley.edu:4444/',
1265 same_hosts => [ qw/www.sunsite.berkeley.edu:4444/ ],
1266 email => 'my@email.address',
1267 link_tags => [qw/ a frame /],
1268 );
1269
1270
1271 =item delay_min
1272
1273 This optional key sets the delay in minutes to wait between requests. See the
1274 LWP::RobotUA man page for more information. The default is 0.1 (6 seconds),
1275 but in general you will probably want it much smaller. But, check with
1276 the webmaster before using too small a number.
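
For example, to wait roughly one second between requests (an illustrative value;
the unit is minutes):

    delay_min => 1/60,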
1277
1278 =item max_wait_time
1279
1280 This setting is the number of seconds to wait for data to be returned from
1281 the request. Data is returned in chunks to the spider, and the timer is reset each time
1282 a new chunk is reported. Therefore, documents (requests) that take longer than this setting
1283 should not be aborted as long as some data is received every max_wait_time seconds.
1284 The default is 30 seconds.
1285
1286 NOTE: This option has no effect on Windows.
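
For example (an illustrative value):

    max_wait_time => 10,   # give up if no new data arrives for 10 seconds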
1287
1288 =item max_time
1289
1290 This optional key will set the max minutes to spider. Spidering
1291 for this host will stop after C<max_time> minutes, and move on to the
1292 next server, if any. The default is to not limit by time.
1293
1294 =item max_files
1295
1296 This optional key sets the max number of files to spider before aborting.
1297 The default is to not limit by number of files. This is the number of requests
1298 made to the remote server, not the total number of files to index (see C<max_indexed>).
1299 This count is displayed at the end of indexing as C<Unique URLs>.
1300
1301 This feature can (and perhaps should) be used when spidering a web site where dynamic
1302 content may generate an endless number of unique URLs, to prevent run-away spidering.
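
For example, to put a rough ceiling on a single run (illustrative values):

    max_time  => 60,       # stop spidering this server after an hour
    max_files => 5_000,    # or after 5,000 requests, whichever comes first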
1303
1304 =item max_indexed
1305
1306 This optional key sets the max number of files that will be indexed.
1307 The default is to not limit. This is the number of files sent to
1308 swish for indexing (and is reported by C<Total Docs> when spidering ends).
1309
1310 =item max_size
1311
1312 This optional key sets the max size of a file read from the web server.
1313 This B<defaults> to 5,000,000 bytes. If the size is exceeded the resource is
1314 skipped and a message is written to STDERR if the DEBUG_SKIPPED debug flag is set.
1315
1316 =item keep_alive
1317
1318 This optional parameter will enable keep alive requests. This can dramatically speed
1319 up spidering and reduce the load on the server being spidered. The default is to not use
1320 keep alives, although enabling it will probably be the right thing to do.
1321
1322 To get the most out of keep alives, you may want to set up your web server to
1323 allow a lot of requests per single connection (i.e. MaxKeepAliveRequests on Apache).
1324 Apache's default is 100, which should be good. (But, in general, don't enable KeepAlives
1325 on a mod_perl server.)
1326
1327 Note: try to filter as many documents as possible B<before> making the request to the server. In
1328 other words, use C<test_url> to look for files ending in C<.html> instead of using C<test_response> to look
1329 for a content type of C<text/html> if possible.
1330 Do note that aborting a request from C<test_response> will break the
1331 current keep alive connection.
1332
1333 Note: you must have at least libwww-perl-5.53_90 installed to use this feature.
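
Enabling it is as simple as (any true value works; a number also sets the size of
the connection cache):

    keep_alive => 1,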
1334
1335 =item skip
1336
1337 This optional key can be used to skip the current server. Its only purpose
1338 is to make it easy to disable a server in a configuration file.
1339
1340 =item debug
1341
1342 Set this to a number to display different amounts of info while spidering. Writes info
1343 to STDERR. Zero/undefined is no debug output.
1344
1345 The following constants are defined for debugging. They may be or'ed together to
1346 get the individual debugging of your choice.
1347
1348 Here are basically the levels:
1349
1350 DEBUG_ERRORS general program errors (not used at this time)
1351 DEBUG_URL print out every URL processed
1352 DEBUG_HEADERS prints the response headers
1353 DEBUG_FAILED failed to return a 200
1354 DEBUG_SKIPPED didn't index for some reason
1355 DEBUG_INFO more verbose
1356 DEBUG_LINKS prints links as they are extracted
1357
1358 For example, to display the urls processed, failed, and skipped use:
1359
1360 debug => DEBUG_URL | DEBUG_FAILED | DEBUG_SKIPPED,
1361
1362 To display the returned headers
1363
1364 debug => DEBUG_HEADERS,
1365
1366 You can easily run the spider without using swish for debugging purposes:
1367
1368 ./spider.pl test.config > spider.out
1369
1370 And you will see debugging info as it runs, and the fetched documents will be saved
1371 in the C<spider.out> file.
1372
1373 Debugging can also be set by an environment variable when running swish. This will
1374 override any setting in the configuration file. Set the variable SPIDER_DEBUG when running
1375 the spider. You can specify any of the above debugging options, separated by a comma.
1376
1377 For example with Bourne type shell:
1378
1379 SPIDER_DEBUG=url,links
1380
1381 =item quiet
1382
1383 If this is true then normal, non-error messages will be suppressed. Quiet mode can also
1384 be set by setting the environment variable SPIDER_QUIET to any true value.
1385
1386 SPIDER_QUIET=1
1387
1388 =item max_depth
1389
1390 The C<max_depth> parameter can be used to limit how deeply to recurse a web site.
1391 The depth is just a count of levels of web pages descended, and not related to
1392 the number of path elements in a URL.
1393
1394 A max_depth of zero says to only spider the page listed as the C<base_url>. A max_depth of one will
1395 spider the C<base_url> page, plus all links on that page, and no more. The default is to spider all
1396 pages.
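
For example, to spider only the C<base_url> page and the pages it links to directly:

    max_depth => 1,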
1397
1398
1399 =item ignore_robots_file
1400
1401 If this is set to true then the robots.txt file will not be checked when spidering
1402 this server. Don't use this option unless you know what you are doing.
1403
1404 =item use_cookies
1405
1406 If this is set then a "cookie jar" will be maintained while spidering. Some
1407 (poorly written ;) sites require cookies to be enabled on clients.
1408
1409 This requires the HTTP::Cookies module.
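
For example (a minimal setting):

    use_cookies => 1,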
1410
1411 =item use_md5
1412
1413 If this setting is true, then an MD5 digest "fingerprint" will be made from the content of every
1414 spidered document. This digest number will be used as a hash key to prevent
1415 indexing the same content more than once. This is helpful if different URLs
1416 generate the same content.
1417
1418 An obvious example: these two documents will only be indexed one time:
1419
1420 http://localhost/path/to/index.html
1421 http://localhost/path/to/
1422
1423 This option requires the Digest::MD5 module. Spidering with this option might
1424 be a tiny bit slower.
1425
1426 =item validate_links
1427
1428 Just a hack. If you set this true the spider will do HEAD requests on all links (e.g. off-site links), just
1429 to make sure that all your links work.
1430
1431 =item credentials
1432
1433 You may specify a username and password to be used automatically when spidering:
1434
1435 credentials => 'username:password',
1436
1437 A username and password supplied in a URL will override this setting.
1438
1439 =item credential_timeout
1440
1441 Sets the number of seconds to wait for user input when prompted for a username or password.
1442 The default is 30 seconds.
1443
1444 =back
1445
1446 =head1 CALLBACK FUNCTIONS
1447
1448 Three callback functions can be defined in your parameter hash.
1449 These optional settings are I<callback> subroutines that are called while
1450 processing URLs.
1451
1452 A little perl discussion is in order:
1453
1454 In perl, a scalar variable can contain a reference to a subroutine. The config example above shows
1455 that the configuration parameters are stored in a perl I<hash>.
1456
1457 my %serverA = (
1458 base_url => 'http://sunsite.berkeley.edu:4444/',
1459 same_hosts => [ qw/www.sunsite.berkeley.edu:4444/ ],
1460 email => 'my@email.address',
1461 link_tags => [qw/ a frame /],
1462 );
1463
1464 There are two ways to add a reference to a subroutine to this hash:
1465
1466 sub foo {
1467 return 1;
1468 }
1469
1470 my %serverA = (
1471 base_url => 'http://sunsite.berkeley.edu:4444/',
1472 same_hosts => [ qw/www.sunsite.berkeley.edu:4444/ ],
1473 email => 'my@email.address',
1474 link_tags => [qw/ a frame /],
1475 test_url => \&foo, # a reference to a named subroutine
1476 );
1477
1478 Or the subroutine can be coded right in place:
1479
1480 my %serverA = (
1481 base_url => 'http://sunsite.berkeley.edu:4444/',
1482 same_hosts => [ qw/www.sunsite.berkeley.edu:4444/ ],
1483 email => 'my@email.address',
1484 link_tags => [qw/ a frame /],
1485 test_url => sub { return 1; },
1486 );
1487
1488 The above example is not very useful as it just creates a user callback function that
1489 always returns a true value (the number 1). But, it's just an example.
1490
1491 The function calls are wrapped in an eval, so calling die (or doing something that dies) will just cause
1492 that URL to be skipped. If you really want to stop processing you need to set $server->{abort} in your
1493 subroutine (or send a kill -HUP to the spider).
1494
1495 The first two parameters passed are a URI object (to have access to the current URL), and
1496 a reference to the current server hash. The C<server> hash is just a global hash for holding data, and
1497 useful for setting flags as described below.
1498
1499 Other parameters may also be passed, as described below.
1500 In perl parameters are passed in an array called "@_". The first element (first parameter) of
1501 that array is $_[0], and the second is $_[1], and so on. Depending on how complicated your
1502 function is you may wish to shift your parameters off of the @_ list to make working with them
1503 easier. See the examples below.
1504
1505
1506 To make use of these routines you need to understand when they are called, and what changes
1507 you can make in your routines. Each routine deals with a given step, and returning false from
1508 your routine will stop processing for the current URL.
1509
1510 =over 4
1511
1512 =item test_url
1513
1514 C<test_url> allows you to skip processing of urls based on the url before the request
1515 to the server is made. This function is called for the C<base_url> links (links you define in
1516 the spider configuration file) and for every link extracted from a fetched web page.
1517
1518 This function is a good place to skip links that you are not interested in following. For example,
1519 if you know there's no point in requesting images then you can exclude them like:
1520
1521 test_url => sub {
1522 my $uri = shift;
1523 return 0 if $uri->path =~ /\.(gif|jpeg|png)$/;
1524 return 1;
1525 },
1526
1527 Or to write it another way:
1528
1529 test_url => sub { $_[0]->path !~ /\.(gif|jpeg|png)$/ },
1530
1531 Another feature would be if you were using a web server where path names are
1532 NOT case sensitive (e.g. Windows). You can normalize all links in this situation
1533 using something like
1534
1535 test_url => sub {
1536 my $uri = shift;
1537 return 0 if $uri->path =~ /\.(gif|jpeg|png)$/;
1538
1539 $uri->path( lc $uri->path ); # make all path names lowercase
1540 return 1;
1541 },
1542
1543 The important thing about C<test_url> (compared to the other callback functions) is that
1544 it is called while I<extracting> links, not while actually fetching that page from the web
1545 server. Returning false from C<test_url> simply says to not add the URL to the list of links to
1546 spider.
1547
1548 You may set a flag in the server hash (second parameter) to tell the spider to abort processing.
1549
1550 test_url => sub {
1551 my $server = $_[1];
1552 $server->{abort}++ if $_[0]->path =~ /foo\.html/;
1553 return 1;
1554 },
1555
1556 You cannot use the server flags:
1557
1558 no_contents
1559 no_index
1560 no_spider
1561
1562
1563 This is discussed below.
1564
1565 =item test_response
1566
1567 This function allows you to filter based on the response from the
1568 remote server (such as by content-type). This function is called while the
1569 web page is being fetched from the remote server, typically after just enough
1570 data has been returned to read the response from the web server.
1571
1572 The spider requests a document in "chunks" of 4096 bytes. 4096 is only a suggestion
1573 of how many bytes to return in each chunk. The C<test_response> routine is
1574 called when the first chunk is received only. This allows ignoring (aborting)
1575 reading of a very large file, for example, without having to read the entire file.
1576 Although not of much use, a reference to this chunk is passed as the fourth parameter.
1577
1578 Web servers use a Content-Type: header to define the type of data returned from the server.
1579 On a web server you could have a .jpeg file be a web page -- file extensions may not always
1580 indicate the type of the file. The third parameter ($_[2]) returned is a reference to a
1581 HTTP::Response object:
1582
1583 For example, to only index true HTML (text/html) pages:
1584
1585 test_response => sub {
1586 my $content_type = $_[2]->content_type;
1587 return $content_type =~ m!text/html!;
1588 },
1589
1590 You can also set flags in the server hash (the second parameter) to control indexing:
1591
1592 no_contents -- index only the title (or file name), and not the contents
1593 no_index -- do not index this file, but continue to spider if HTML
1594 no_spider -- index, but do not spider this file for links to follow
1595 abort -- stop spidering any more files
1596
1597 For example, to avoid indexing the contents of "private.html", yet still follow any links
1598 in that file:
1599
1600 test_response => sub {
1601 my $server = $_[1];
1602 $server->{no_index}++ if $_[0]->path =~ /private\.html$/;
1603 return 1;
1604 },
1605
1606 Note: Do not modify the URI object in this callback function.
1607
1608
1609 =item filter_content
1610
1611 This callback function is called right before sending the content to swish.
1612 Like the other callback functions, returning false will cause the URL to be skipped.
1613 Setting the C<abort> server flag and returning false will abort spidering.
1614
1615 You can also set the C<no_contents> flag.
1616
1617 This callback function is passed four parameters.
1618 The URI object, server hash, the HTTP::Response object,
1619 and a reference to the content.
1620
1621 You can modify the content as needed. For example you might not like upper case:
1622
1623 filter_content => sub {
1624 my $content_ref = $_[3];
1625
1626 $$content_ref = lc $$content_ref;
1627 return 1;
1628 },
1629
1630 A more reasonable example would be converting PDF or MS Word documents for parsing by swish.
1631 Examples of this are provided in the F<prog-bin> directory of the swish-e distribution.
1632
1633 You may also modify the URI object to change the path name passed to swish for indexing.
1634
1635 filter_content => sub {
1636 my $uri = $_[0];
1637 $uri->host('www.other.host') ;
1638 return 1;
1639 },
1640
1641 Swish-e's ReplaceRules feature can also be used for modifying the path name indexed.
1642
1643 Here's a bit more advanced example of indexing text/html and PDF files only:
1644
1645 use pdf2xml; # included example pdf converter module
1646 $server{filter_content} = sub {
1647 my ( $uri, $server, $response, $content_ref ) = @_;
1648
1649 return 1 if $response->content_type eq 'text/html';
1650 return 0 unless $response->content_type eq 'application/pdf';
1651
1652 # for logging counts
1653 $server->{counts}{'PDF transformed'}++;
1654
1655 $$content_ref = ${pdf2xml( $content_ref )};
1656 return 1;
1657 }
1658
1659
1660
1661 =back
1662
1663 Note that you can create your own counters to display in the summary list when spidering
1664 is finished by adding a value to the hash pointed to by C<$server->{counts}>.
1665
1666 test_url => sub {
1667 my $server = $_[1];
1668 $server->{no_index}++ if $_[0]->path =~ /private\.html$/;
1669 $server->{counts}{'Private Files'}++;
1670 return 1;
1671 },
1672
1673
1674 Each callback function B<must> return true to continue processing the URL. Returning false will
1675 cause processing of I<the current> URL to be skipped.
1676
1677 =head2 More on setting flags
1678
1679 Swish (not this spider) has a configuration directive C<NoContents> that will instruct swish to
1680 index only the title (or file name), and not the contents. This is often used when
1681 indexing binary files such as image files, but can also be used with html
1682 files to index only the document titles.
1683
1684 As shown above, you can turn this feature on for specific documents by setting a flag in
1685 the server hash passed into the C<test_response> or C<filter_content> subroutines.
1686 For example, in your configuration file you might have the C<test_response> callback set
1687 as:
1688
1689
1690 test_response => sub {
1691 my ( $uri, $server, $response ) = @_;
1692 # tell swish not to index the contents if this is of type image
1693 $server->{no_contents} = $response->content_type =~ m[^image/];
1694 return 1; # ok to index and spider this document
1695 }
1696
1697 The entire contents of the resource is still read from the web server, and passed
1698 on to swish, but swish will also be passed a C<No-Contents> header which tells
1699 swish to enable the NoContents feature for this document only.
1700
1701 Note: Swish will index the path name only when C<NoContents> is set, unless the document's
1702 type (as set by the swish configuration settings C<IndexContents> or C<DefaultContents>) is
1703 HTML I<and> a title is found in the html document.
1704
1705 Note: In most cases you probably would not want to send a large binary file to swish, just
1706 to be ignored. Therefore, it would be smart to use a C<filter_content> callback routine to
1707 replace the contents with a single character (you cannot use the empty string at this time).
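
A rough sketch of that idea (illustrative only; it assumes a C<no_contents> flag was
set earlier, for example in C<test_response>):

    filter_content => sub {
        my ( $uri, $server, $response, $content_ref ) = @_;

        # If a previous callback decided not to index the contents, avoid
        # sending the whole (possibly large) document to swish.
        $$content_ref = ' ' if $server->{no_contents};
        return 1;
    },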
1708
1709 A similar flag may be set to prevent indexing a document at all, but still allow spidering.
1710 In general, if you want to completely skip spidering a file you return false from one of the
1711 callback routines (C<test_url>, C<test_response>, or C<filter_content>). Returning false from any of those
1712 three callbacks will stop processing of that file, and the file will B<not> be spidered.
1713
1714 But there may be some cases where you still want to spider (extract links) yet, not index the file. An example
1715 might be where you wish to index only PDF files, but you still need to spider all HTML files to find
1716 the links to the PDF files.
1717
1718 $server{test_response} = sub {
1719 my ( $uri, $server, $response ) = @_;
1720 $server->{no_index} = $response->content_type ne 'application/pdf';
1721 return 1; # ok to spider, but don't index
1722 }
1723
1724 So, the difference between C<no_contents> and C<no_index> is that C<no_contents> will still index the file
1725 name, just not the contents. C<no_index> will still spider the file (if it's C<text/html>) but the
1726 file will not be processed by swish at all.
1727
1728 B<Note:> If C<no_index> is set in a C<test_response> callback function then
1729 the document I<will not be filtered>. That is, your C<filter_content>
1730 callback function will not be called.
1731
1732 The C<no_spider> flag can be set to avoid spidering an HTML file. The file will still be indexed unless
1733 C<no_index> is also set. But if you do not want to index and spider, then simply return false from one of the three
1734 callback functions.
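
For example, to index pages under a (hypothetical) archive area without following
their links any further:

    $server{test_response} = sub {
        my ( $uri, $server, $response ) = @_;
        $server->{no_spider}++ if $uri->path =~ m[^/archive/];   # hypothetical path check
        return 1;   # index the page, but do not extract its links
    };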
1735
1736
1737 =head1 SIGNALS
1738
1739 Sending a SIGHUP to the running spider will cause it to stop spidering. This is a good way to abort spidering, but
1740 let swish index the documents retrieved so far.
1741
1742 =head1 SEE ALSO
1743
1744 L<URI> L<LWP::RobotUA> L<WWW::RobotRules> L<Digest::MD5>
1745
1746 =head1 COPYRIGHT
1747
1748 Copyright 2001 Bill Moseley
1749
1750 This program is free software; you can redistribute it and/or modify
1751 it under the same terms as Perl itself.
1752
1753 =head1 SUPPORT
1754
1755 Send all questions to the SWISH-E discussion list.
1756
1757 See http://sunsite.berkeley.edu/SWISH-E.
1758
1759 =cut
1760
