1 |
adcroft |
1.1 |
#!/usr/local/bin/perl -w |
2 |
|
|
use strict; |
3 |
|
|
|
4 |
|
|
|
5 |
|
|
# $Id: spider.pl,v 1.42 2002/08/13 23:54:49 whmoseley Exp $ |
6 |
|
|
# |
7 |
|
|
# "prog" document source for spidering web servers |
8 |
|
|
# |
9 |
|
|
# For documentation, type: |
10 |
|
|
# |
11 |
|
|
# perldoc spider.pl |
12 |
|
|
# |
13 |
|
|
# Apr 7, 2001 -- added quite a bit of bulk for easier debugging |
14 |
|
|
# |
15 |
|
|
# Nov 19, 2001 -- to do, build a server object so we are not using the passed in hash, |
16 |
|
|
# and so we can warn on invalid config settings. |
17 |
|
|
|
18 |
|
|
$HTTP::URI_CLASS = "URI"; # prevent loading default URI::URL |
19 |
|
|
# so we don't store long list of base items |
20 |
|
|
# and eat up memory with >= URI 1.13 |
21 |
|
|
use LWP::RobotUA; |
22 |
|
|
use HTML::LinkExtor; |
23 |
|
|
use HTML::Tagset; |
24 |
|
|
|
25 |
|
|
use vars '$VERSION'; |
26 |
|
|
$VERSION = sprintf '%d.%02d', q$Revision: 1.42 $ =~ /: (\d+)\.(\d+)/; |
27 |
|
|
|
28 |
|
|
use vars '$bit'; |
29 |
|
|
use constant DEBUG_ERRORS => $bit = 1; # program errors |
30 |
|
|
use constant DEBUG_URL => $bit <<= 1; # print out every URL processes |
31 |
|
|
use constant DEBUG_HEADERS => $bit <<= 1; # prints the response headers |
32 |
|
|
use constant DEBUG_FAILED => $bit <<= 1; # failed to return a 200 |
33 |
|
|
use constant DEBUG_SKIPPED => $bit <<= 1; # didn't index for some reason |
34 |
|
|
use constant DEBUG_INFO => $bit <<= 1; # more verbose |
35 |
|
|
use constant DEBUG_LINKS => $bit <<= 1; # prints links as they are extracted |
36 |
|
|
|
37 |
|
|
my %DEBUG_MAP = ( |
38 |
|
|
errors => DEBUG_ERRORS, |
39 |
|
|
url => DEBUG_URL, |
40 |
|
|
headers => DEBUG_HEADERS, |
41 |
|
|
failed => DEBUG_FAILED, |
42 |
|
|
skipped => DEBUG_SKIPPED, |
43 |
|
|
info => DEBUG_INFO, |
44 |
|
|
links => DEBUG_LINKS, |
45 |
|
|
); |
46 |
|
|
|
47 |
|
|
|
48 |
|
|
|
49 |
|
|
use constant MAX_SIZE => 5_000_000; # Max size of document to fetch |
50 |
|
|
use constant MAX_WAIT_TIME => 30; # request time. |
51 |
|
|
|
52 |
|
|
#Can't locate object method "host" via package "URI::mailto" at ../prog-bin/spider.pl line 473. |
53 |
|
|
#sub URI::mailto::host { return '' }; |
54 |
|
|
|
55 |
|
|
|
56 |
|
|
# This is not the right way to do this. |
57 |
|
|
sub UNIVERSAL::host { '' }; |
58 |
|
|
sub UNIVERSAL::port { '' }; |
59 |
|
|
sub UNIVERSAL::host_port { '' }; |
60 |
|
|
|
61 |
|
|
|
62 |
|
|
|
63 |
|
|
#----------------------------------------------------------------------- |
64 |
|
|
|
65 |
|
|
use vars '@servers'; |
66 |
|
|
|
67 |
|
|
my $config = shift || 'SwishSpiderConfig.pl'; |
68 |
|
|
|
69 |
|
|
if ( lc( $config ) eq 'default' ) { |
70 |
|
|
@servers = default_urls(); |
71 |
|
|
} else { |
72 |
|
|
do $config or die "Failed to read $0 configuration parameters '$config' $! $@"; |
73 |
|
|
} |
74 |
|
|
|
75 |
|
|
|
76 |
|
|
print STDERR "$0: Reading parameters from '$config'\n" unless $ENV{SPIDER_QUIET}; |
77 |
|
|
|
78 |
|
|
my $abort; |
79 |
|
|
local $SIG{HUP} = sub { warn "Caught SIGHUP\n"; $abort++ } unless $^O =~ /Win32/i; |
80 |
|
|
|
81 |
|
|
my %visited; # global -- I suppose would be smarter to localize it per server. |
82 |
|
|
|
83 |
|
|
my %validated; |
84 |
|
|
my %bad_links; |
85 |
|
|
|
86 |
|
|
for my $s ( @servers ) { |
87 |
|
|
if ( !$s->{base_url} ) { |
88 |
|
|
die "You must specify 'base_url' in your spider config settings\n"; |
89 |
|
|
} |
90 |
|
|
|
91 |
|
|
my @urls = ref $s->{base_url} eq 'ARRAY' ? @{$s->{base_url}} :( $s->{base_url}); |
92 |
|
|
for my $url ( @urls ) { |
93 |
|
|
$s->{base_url} = $url; |
94 |
|
|
process_server( $s ); |
95 |
|
|
} |
96 |
|
|
} |
97 |
|
|
|
98 |
|
|
|
99 |
|
|
if ( %bad_links ) { |
100 |
|
|
print STDERR "\nBad Links:\n\n"; |
101 |
|
|
foreach my $page ( sort keys %bad_links ) { |
102 |
|
|
print STDERR "On page: $page\n"; |
103 |
|
|
printf(STDERR " %-40s %s\n", $_, $validated{$_} ) for @{$bad_links{$page}}; |
104 |
|
|
print STDERR "\n"; |
105 |
|
|
} |
106 |
|
|
} |
107 |
|
|
|
108 |
|
|
|
109 |
|
|
#----------------------------------------------------------------------- |
110 |
|
|
|
111 |
|
|
|
112 |
|
|
sub process_server { |
113 |
|
|
my $server = shift; |
114 |
|
|
|
115 |
|
|
# set defaults |
116 |
|
|
|
117 |
|
|
if ( $ENV{SPIDER_DEBUG} ) { |
118 |
|
|
$server->{debug} = 0; |
119 |
|
|
|
120 |
|
|
$server->{debug} |= (exists $DEBUG_MAP{lc $_} ? $DEBUG_MAP{lc $_} : die "Bad debug setting passed in environment '$_'\nOptions are: " . join( ', ', keys %DEBUG_MAP) ."\n") |
121 |
|
|
for split /\s*,\s*/, $ENV{SPIDER_DEBUG}; |
122 |
|
|
|
123 |
|
|
} else { |
124 |
|
|
$server->{debug} ||= 0; |
125 |
|
|
die "debug parameter '$server->{debug}' must be a number\n" unless $server->{debug} =~ /^\d+$/; |
126 |
|
|
} |
127 |
|
|
|
128 |
|
|
$server->{quiet} ||= $ENV{SPIDER_QUIET} || 0; |
129 |
|
|
|
130 |
|
|
|
131 |
|
|
$server->{max_size} ||= MAX_SIZE; |
132 |
|
|
die "max_size parameter '$server->{max_size}' must be a number\n" unless $server->{max_size} =~ /^\d+$/; |
133 |
|
|
|
134 |
|
|
|
135 |
|
|
$server->{max_wait_time} ||= MAX_WAIT_TIME; |
136 |
|
|
die "max_wait_time parameter '$server->{max_wait_time}' must be a number\n" if $server->{max_wait_time} !~ /^\d+/; |
137 |
|
|
|
138 |
|
|
|
139 |
|
|
|
140 |
|
|
$server->{link_tags} = ['a'] unless ref $server->{link_tags} eq 'ARRAY'; |
141 |
|
|
$server->{link_tags_lookup} = { map { lc, 1 } @{$server->{link_tags}} }; |
142 |
|
|
|
143 |
|
|
die "max_depth parameter '$server->{max_depth}' must be a number\n" if defined $server->{max_depth} && $server->{max_depth} !~ /^\d+/; |
144 |
|
|
|
145 |
|
|
|
146 |
|
|
for ( qw/ test_url test_response filter_content/ ) { |
147 |
|
|
next unless $server->{$_}; |
148 |
|
|
$server->{$_} = [ $server->{$_} ] unless ref $server->{$_} eq 'ARRAY'; |
149 |
|
|
my $n; |
150 |
|
|
for my $sub ( @{$server->{$_}} ) { |
151 |
|
|
$n++; |
152 |
|
|
die "Entry number $n in $_ is not a code reference\n" unless ref $sub eq 'CODE'; |
153 |
|
|
} |
154 |
|
|
} |
155 |
|
|
|
156 |
|
|
|
157 |
|
|
|
158 |
|
|
my $start = time; |
159 |
|
|
|
160 |
|
|
if ( $server->{skip} ) { |
161 |
|
|
print STDERR "Skipping: $server->{base_url}\n" unless $server->{quiet}; |
162 |
|
|
return; |
163 |
|
|
} |
164 |
|
|
|
165 |
|
|
require "HTTP/Cookies.pm" if $server->{use_cookies}; |
166 |
|
|
require "Digest/MD5.pm" if $server->{use_md5}; |
167 |
|
|
|
168 |
|
|
|
169 |
|
|
# set starting URL, and remove any specified fragment |
170 |
|
|
my $uri = URI->new( $server->{base_url} ); |
171 |
|
|
$uri->fragment(undef); |
172 |
|
|
|
173 |
|
|
if ( $uri->userinfo ) { |
174 |
|
|
die "Can't specify parameter 'credentials' because base_url defines them\n" |
175 |
|
|
if $server->{credentials}; |
176 |
|
|
$server->{credentials} = $uri->userinfo; |
177 |
|
|
$uri->userinfo( undef ); |
178 |
|
|
} |
179 |
|
|
|
180 |
|
|
|
181 |
|
|
print STDERR "\n -- Starting to spider: $uri --\n" if $server->{debug}; |
182 |
|
|
|
183 |
|
|
|
184 |
|
|
|
185 |
|
|
# set the starting server name (including port) -- will only spider on server:port |
186 |
|
|
|
187 |
|
|
|
188 |
|
|
# All URLs will end up with this host:port |
189 |
|
|
$server->{authority} = $uri->canonical->authority; |
190 |
|
|
|
191 |
|
|
# All URLs must match this scheme ( Jan 22, 2002 - spot by Darryl Friesen ) |
192 |
|
|
$server->{scheme} = $uri->scheme; |
193 |
|
|
|
194 |
|
|
|
195 |
|
|
|
196 |
|
|
# Now, set the OK host:port names |
197 |
|
|
$server->{same} = [ $uri->canonical->authority ]; |
198 |
|
|
|
199 |
|
|
push @{$server->{same}}, @{$server->{same_hosts}} if ref $server->{same_hosts}; |
200 |
|
|
|
201 |
|
|
$server->{same_host_lookup} = { map { $_, 1 } @{$server->{same}} }; |
202 |
|
|
|
203 |
|
|
|
204 |
|
|
|
205 |
|
|
|
206 |
|
|
# set time to end |
207 |
|
|
|
208 |
|
|
$server->{max_time} = $server->{max_time} * 60 + time |
209 |
|
|
if $server->{max_time}; |
210 |
|
|
|
211 |
|
|
|
212 |
|
|
# set default agent for log files |
213 |
|
|
|
214 |
|
|
$server->{agent} ||= 'swish-e spider 2.2 http://swish-e.org/'; |
215 |
|
|
|
216 |
|
|
|
217 |
|
|
# get a user agent object |
218 |
|
|
|
219 |
|
|
|
220 |
|
|
my $ua; |
221 |
|
|
|
222 |
|
|
if ( $server->{ignore_robots_file} ) { |
223 |
|
|
$ua = LWP::UserAgent->new; |
224 |
|
|
return unless $ua; |
225 |
|
|
$ua->agent( $server->{agent} ); |
226 |
|
|
$ua->from( $server->{email} ); |
227 |
|
|
|
228 |
|
|
} else { |
229 |
|
|
$ua = LWP::RobotUA->new( $server->{agent}, $server->{email} ); |
230 |
|
|
return unless $ua; |
231 |
|
|
$ua->delay( $server->{delay_min} || 0.1 ); |
232 |
|
|
} |
233 |
|
|
|
234 |
|
|
# Set the timeout on the server and using Windows. |
235 |
|
|
$ua->timeout( $server->{max_wait_time} ) if $^O =~ /Win32/i; |
236 |
|
|
|
237 |
|
|
|
238 |
|
|
$server->{ua} = $ua; # save it for fun. |
239 |
|
|
# $ua->parse_head(0); # Don't parse the content |
240 |
|
|
|
241 |
|
|
$ua->cookie_jar( HTTP::Cookies->new ) if $server->{use_cookies}; |
242 |
|
|
|
243 |
|
|
if ( $server->{keep_alive} ) { |
244 |
|
|
|
245 |
|
|
if ( $ua->can( 'conn_cache' ) ) { |
246 |
|
|
my $keep_alive = $server->{keep_alive} =~ /^\d+$/ ? $server->{keep_alive} : 1; |
247 |
|
|
$ua->conn_cache( { total_capacity => $keep_alive } ); |
248 |
|
|
|
249 |
|
|
} else { |
250 |
|
|
warn "Can't use keep-alive: conn_cache method not available\n"; |
251 |
|
|
} |
252 |
|
|
} |
253 |
|
|
|
254 |
|
|
|
255 |
|
|
# uri, parent, depth |
256 |
|
|
eval { spider( $server, $uri ) }; |
257 |
|
|
print STDERR $@ if $@; |
258 |
|
|
|
259 |
|
|
|
260 |
|
|
# provide a way to call a function in the config file when all done |
261 |
|
|
check_user_function( 'spider_done', undef, $server ); |
262 |
|
|
|
263 |
|
|
|
264 |
|
|
return if $server->{quiet}; |
265 |
|
|
|
266 |
|
|
|
267 |
|
|
$start = time - $start; |
268 |
|
|
$start++ unless $start; |
269 |
|
|
|
270 |
|
|
my $max_width = 0; |
271 |
|
|
my $max_num = 0; |
272 |
|
|
for ( keys %{$server->{counts}} ) { |
273 |
|
|
$max_width = length if length > $max_width; |
274 |
|
|
my $val = commify( $server->{counts}{$_} ); |
275 |
|
|
$max_num = length $val if length $val > $max_num; |
276 |
|
|
} |
277 |
|
|
|
278 |
|
|
|
279 |
|
|
printf STDERR "\nSummary for: $server->{base_url}\n"; |
280 |
|
|
|
281 |
|
|
for ( sort keys %{$server->{counts}} ) { |
282 |
|
|
printf STDERR "%${max_width}s: %${max_num}s (%0.1f/sec)\n", |
283 |
|
|
$_, |
284 |
|
|
commify( $server->{counts}{$_} ), |
285 |
|
|
$server->{counts}{$_}/$start; |
286 |
|
|
} |
287 |
|
|
} |
288 |
|
|
|
289 |
|
|
|
290 |
|
|
#----------------------------------------------------------------------- |
291 |
|
|
# Deal with Basic Authen |
292 |
|
|
|
293 |
|
|
|
294 |
|
|
|
295 |
|
|
# Thanks Gisle! |
296 |
|
|
sub get_basic_credentials { |
297 |
|
|
my($uri, $server, $realm ) = @_; |
298 |
|
|
my $netloc = $uri->canonical->host_port; |
299 |
|
|
|
300 |
|
|
my ($user, $password); |
301 |
|
|
|
302 |
|
|
eval { |
303 |
|
|
local $SIG{ALRM} = sub { die "timed out\n" }; |
304 |
|
|
alarm( $server->{credential_timeout} || 30 ) unless $^O =~ /Win32/i; |
305 |
|
|
|
306 |
|
|
if ( $uri->userinfo ) { |
307 |
|
|
print STDERR "\nSorry: invalid username/password\n"; |
308 |
|
|
$uri->userinfo( undef ); |
309 |
|
|
} |
310 |
|
|
|
311 |
|
|
|
312 |
|
|
print STDERR "Need Authentication for $uri at realm '$realm'\n(<Enter> skips)\nUsername: "; |
313 |
|
|
$user = <STDIN>; |
314 |
|
|
chomp($user); |
315 |
|
|
die "No Username specified\n" unless length $user; |
316 |
|
|
|
317 |
|
|
alarm( $server->{credential_timeout} || 30 ) unless $^O =~ /Win32/i; |
318 |
|
|
|
319 |
|
|
print STDERR "Password: "; |
320 |
|
|
system("stty -echo"); |
321 |
|
|
$password = <STDIN>; |
322 |
|
|
system("stty echo"); |
323 |
|
|
print STDERR "\n"; # because we disabled echo |
324 |
|
|
chomp($password); |
325 |
|
|
|
326 |
|
|
alarm( 0 ) unless $^O =~ /Win32/i; |
327 |
|
|
}; |
328 |
|
|
|
329 |
|
|
return if $@; |
330 |
|
|
|
331 |
|
|
return join ':', $user, $password; |
332 |
|
|
|
333 |
|
|
|
334 |
|
|
} |
335 |
|
|
|
336 |
|
|
|
337 |
|
|
|
338 |
|
|
|
339 |
|
|
#----------- Non recursive spidering --------------------------- |
340 |
|
|
|
341 |
|
|
sub spider { |
342 |
|
|
my ( $server, $uri ) = @_; |
343 |
|
|
|
344 |
|
|
# Validate the first link, just in case |
345 |
|
|
return unless check_link( $uri, $server, '', '(Base URL)' ); |
346 |
|
|
|
347 |
|
|
my @link_array = [ $uri, '', 0 ]; |
348 |
|
|
|
349 |
|
|
while ( @link_array ) { |
350 |
|
|
|
351 |
|
|
die if $abort || $server->{abort}; |
352 |
|
|
|
353 |
|
|
my ( $uri, $parent, $depth ) = @{shift @link_array}; |
354 |
|
|
|
355 |
|
|
my $new_links = process_link( $server, $uri, $parent, $depth ); |
356 |
|
|
|
357 |
|
|
push @link_array, map { [ $_, $uri, $depth+1 ] } @$new_links if $new_links; |
358 |
|
|
|
359 |
|
|
} |
360 |
|
|
} |
361 |
|
|
|
362 |
|
|
|
363 |
|
|
#----------- Process a url and return links ----------------------- |
364 |
|
|
sub process_link { |
365 |
|
|
my ( $server, $uri, $parent, $depth ) = @_; |
366 |
|
|
|
367 |
|
|
|
368 |
|
|
$server->{counts}{'Unique URLs'}++; |
369 |
|
|
|
370 |
|
|
die "$0: Max files Reached\n" |
371 |
|
|
if $server->{max_files} && $server->{counts}{'Unique URLs'} > $server->{max_files}; |
372 |
|
|
|
373 |
|
|
die "$0: Time Limit Exceeded\n" |
374 |
|
|
if $server->{max_time} && $server->{max_time} < time; |
375 |
|
|
|
376 |
|
|
|
377 |
|
|
|
378 |
|
|
# make request |
379 |
|
|
my $ua = $server->{ua}; |
380 |
|
|
my $request = HTTP::Request->new('GET', $uri ); |
381 |
|
|
|
382 |
|
|
|
383 |
|
|
my $content = ''; |
384 |
|
|
|
385 |
|
|
# Really should just subclass the response object! |
386 |
|
|
$server->{no_contents} = 0; |
387 |
|
|
$server->{no_index} = 0; |
388 |
|
|
$server->{no_spider} = 0; |
389 |
|
|
|
390 |
|
|
|
391 |
|
|
# Set basic auth if defined - use URI specific first, then credentials |
392 |
|
|
if ( my ( $user, $pass ) = split /:/, ( $uri->userinfo || $server->{credentials} || '' ) ) { |
393 |
|
|
$request->authorization_basic( $user, $pass ); |
394 |
|
|
} |
395 |
|
|
|
396 |
|
|
|
397 |
|
|
|
398 |
|
|
|
399 |
|
|
my $been_here; |
400 |
|
|
my $callback = sub { |
401 |
|
|
|
402 |
|
|
# Reset alarm; |
403 |
|
|
alarm( $server->{max_wait_time} ) unless $^O =~ /Win32/i; |
404 |
|
|
|
405 |
|
|
|
406 |
|
|
# Cache user/pass |
407 |
|
|
if ( $server->{cur_realm} && $uri->userinfo ) { |
408 |
|
|
my $key = $uri->canonical->host_port . ':' . $server->{cur_realm}; |
409 |
|
|
$server->{auth_cache}{$key} = $uri->userinfo; |
410 |
|
|
} |
411 |
|
|
|
412 |
|
|
$uri->userinfo( undef ) unless $been_here; |
413 |
|
|
|
414 |
|
|
die "test_response" if !$been_here++ && !check_user_function( 'test_response', $uri, $server, $_[1], \$_[0] ); |
415 |
|
|
|
416 |
|
|
|
417 |
|
|
if ( length( $content ) + length( $_[0] ) > $server->{max_size} ) { |
418 |
|
|
print STDERR "-Skipped $uri: Document exceeded $server->{max_size} bytes\n" if $server->{debug}&DEBUG_SKIPPED; |
419 |
|
|
die "too big!\n"; |
420 |
|
|
} |
421 |
|
|
|
422 |
|
|
$content .= $_[0]; |
423 |
|
|
|
424 |
|
|
}; |
425 |
|
|
|
426 |
|
|
my $response; |
427 |
|
|
|
428 |
|
|
eval { |
429 |
|
|
local $SIG{ALRM} = sub { die "timed out\n" }; |
430 |
|
|
alarm( $server->{max_wait_time} ) unless $^O =~ /Win32/i; |
431 |
|
|
$response = $ua->simple_request( $request, $callback, 4096 ); |
432 |
|
|
alarm( 0 ) unless $^O =~ /Win32/i; |
433 |
|
|
}; |
434 |
|
|
|
435 |
|
|
|
436 |
|
|
return if $server->{abort}; |
437 |
|
|
|
438 |
|
|
|
439 |
|
|
if ( $response && $response->code == 401 && $response->header('WWW-Authenticate') && $response->header('WWW-Authenticate') =~ /realm="([^"]+)"/i ) { |
440 |
|
|
my $realm = $1; |
441 |
|
|
|
442 |
|
|
my $user_pass; |
443 |
|
|
|
444 |
|
|
# Do we have a cached user/pass for this realm? |
445 |
|
|
my $key = $uri->canonical->host_port . ':' . $realm; |
446 |
|
|
|
447 |
|
|
if ( $user_pass = $server->{auth_cache}{$key} ) { |
448 |
|
|
|
449 |
|
|
# If we didn't just try it, try again |
450 |
|
|
unless( $uri->userinfo && $user_pass eq $uri->userinfo ) { |
451 |
|
|
$uri->userinfo( $user_pass ); |
452 |
|
|
return process_link( $server, $uri, $parent, $depth ); |
453 |
|
|
} |
454 |
|
|
} |
455 |
|
|
|
456 |
|
|
# otherwise, prompt: |
457 |
|
|
|
458 |
|
|
|
459 |
|
|
if ( $user_pass = get_basic_credentials( $uri, $server, $realm ) ) { |
460 |
|
|
$uri->userinfo( $user_pass ); |
461 |
|
|
|
462 |
|
|
$server->{cur_realm} = $realm; # save so we can cache |
463 |
|
|
my $links = process_link( $server, $uri, $parent, $depth ); |
464 |
|
|
delete $server->{cur_realm}; |
465 |
|
|
|
466 |
|
|
return $links; |
467 |
|
|
} |
468 |
|
|
print STDERR "Skipping $uri\n"; |
469 |
|
|
} |
470 |
|
|
|
471 |
|
|
$uri->userinfo( undef ); |
472 |
|
|
|
473 |
|
|
|
474 |
|
|
|
475 |
|
|
# Log the response |
476 |
|
|
|
477 |
|
|
if ( ( $server->{debug} & DEBUG_URL ) || ( $server->{debug} & DEBUG_FAILED && !$response->is_success) ) { |
478 |
|
|
print STDERR '>> ', |
479 |
|
|
join( ' ', |
480 |
|
|
( $response->is_success ? '+Fetched' : '-Failed' ), |
481 |
|
|
$depth, |
482 |
|
|
"Cnt: $server->{counts}{'Unique URLs'}", |
483 |
|
|
$uri, |
484 |
|
|
( $response->status_line || $response->status || 'unknown status' ), |
485 |
|
|
( $response->content_type || 'Unknown content type'), |
486 |
|
|
( $response->content_length || '???' ), |
487 |
|
|
"parent:$parent", |
488 |
|
|
),"\n"; |
489 |
|
|
} |
490 |
|
|
|
491 |
|
|
|
492 |
|
|
|
493 |
|
|
|
494 |
|
|
# If the LWP callback aborts |
495 |
|
|
|
496 |
|
|
if ( $response->header('client-aborted') ) { |
497 |
|
|
$server->{counts}{Skipped}++; |
498 |
|
|
return; |
499 |
|
|
} |
500 |
|
|
|
501 |
|
|
$response->request->uri->userinfo( undef ) if $response->request; |
502 |
|
|
|
503 |
|
|
|
504 |
|
|
# skip excluded by robots.txt |
505 |
|
|
|
506 |
|
|
if ( !$response->is_success && $response->status_line =~ 'robots.txt' ) { |
507 |
|
|
print STDERR "-Skipped $depth $uri: ", $response->status_line,"\n" if $server->{debug}&DEBUG_SKIPPED; |
508 |
|
|
$server->{counts}{'robots.txt'}++; |
509 |
|
|
return; |
510 |
|
|
} |
511 |
|
|
|
512 |
|
|
|
513 |
|
|
# Report bad links (excluding those skipped by robots.txt |
514 |
|
|
|
515 |
|
|
if ( $server->{validate_links} && !$response->is_success ) { |
516 |
|
|
validate_link( $server, $uri, $parent, $response ); |
517 |
|
|
} |
518 |
|
|
|
519 |
|
|
|
520 |
|
|
|
521 |
|
|
# And check for meta robots tag |
522 |
|
|
# -- should probably be done in request sub to avoid fetching docs that are not needed |
523 |
|
|
|
524 |
|
|
unless ( $server->{ignore_robots_file} ) { |
525 |
|
|
if ( my $directives = $response->header('X-Meta-ROBOTS') ) { |
526 |
|
|
my %settings = map { lc $_, 1 } split /\s*,\s*/, $directives; |
527 |
|
|
$server->{no_contents}++ if exists $settings{nocontents}; # an extension for swish |
528 |
|
|
$server->{no_index}++ if exists $settings{noindex}; |
529 |
|
|
$server->{no_spider}++ if exists $settings{nofollow}; |
530 |
|
|
} |
531 |
|
|
} |
532 |
|
|
|
533 |
|
|
|
534 |
|
|
|
535 |
|
|
|
536 |
|
|
print STDERR "\n----HEADERS for $uri ---\n", $response->headers_as_string,"-----END HEADERS----\n\n" |
537 |
|
|
if $server->{debug} & DEBUG_HEADERS; |
538 |
|
|
|
539 |
|
|
|
540 |
|
|
unless ( $response->is_success ) { |
541 |
|
|
|
542 |
|
|
# look for redirect |
543 |
|
|
if ( $response->is_redirect && $response->header('location') ) { |
544 |
|
|
my $u = URI->new_abs( $response->header('location'), $response->base ); |
545 |
|
|
|
546 |
|
|
if ( $u->canonical eq $uri->canonical ) { |
547 |
|
|
print STDERR "Warning: $uri redirects to itself!.\n"; |
548 |
|
|
return; |
549 |
|
|
} |
550 |
|
|
|
551 |
|
|
return [$u] if check_link( $u, $server, $response->base, '(redirect)','Location' ); |
552 |
|
|
} |
553 |
|
|
return; |
554 |
|
|
} |
555 |
|
|
|
556 |
|
|
return unless $content; # $$$ any reason to index empty files? |
557 |
|
|
|
558 |
|
|
|
559 |
|
|
# make sure content is unique - probably better to chunk into an MD5 object above |
560 |
|
|
|
561 |
|
|
if ( $server->{use_md5} ) { |
562 |
|
|
my $digest = Digest::MD5::md5($content); |
563 |
|
|
|
564 |
|
|
if ( $visited{ $digest } ) { |
565 |
|
|
|
566 |
|
|
print STDERR "-Skipped $uri has same digest as $visited{ $digest }\n" |
567 |
|
|
if $server->{debug} & DEBUG_SKIPPED; |
568 |
|
|
|
569 |
|
|
$server->{counts}{Skipped}++; |
570 |
|
|
$server->{counts}{'MD5 Duplicates'}++; |
571 |
|
|
return; |
572 |
|
|
} |
573 |
|
|
$visited{ $digest } = $uri; |
574 |
|
|
} |
575 |
|
|
|
576 |
|
|
|
577 |
|
|
# Extract out links (if not too deep) |
578 |
|
|
|
579 |
|
|
my $links_extracted = extract_links( $server, \$content, $response ) |
580 |
|
|
unless defined $server->{max_depth} && $depth >= $server->{max_depth}; |
581 |
|
|
|
582 |
|
|
|
583 |
|
|
# Index the file |
584 |
|
|
|
585 |
|
|
if ( $server->{no_index} ) { |
586 |
|
|
$server->{counts}{Skipped}++; |
587 |
|
|
print STDERR "-Skipped indexing $uri some callback set 'no_index' flag\n" if $server->{debug}&DEBUG_SKIPPED; |
588 |
|
|
|
589 |
|
|
} else { |
590 |
|
|
return $links_extracted unless check_user_function( 'filter_content', $uri, $server, $response, \$content ); |
591 |
|
|
|
592 |
|
|
output_content( $server, \$content, $uri, $response ) |
593 |
|
|
unless $server->{no_index}; |
594 |
|
|
} |
595 |
|
|
|
596 |
|
|
|
597 |
|
|
|
598 |
|
|
return $links_extracted; |
599 |
|
|
} |
600 |
|
|
|
601 |
|
|
#=================================================================================================== |
602 |
|
|
# Calls a user-defined function |
603 |
|
|
# |
604 |
|
|
#--------------------------------------------------------------------------------------------------- |
605 |
|
|
|
606 |
|
|
sub check_user_function { |
607 |
|
|
my ( $fn, $uri, $server ) = ( shift, shift, shift ); |
608 |
|
|
|
609 |
|
|
return 1 unless $server->{$fn}; |
610 |
|
|
|
611 |
|
|
my $tests = ref $server->{$fn} eq 'ARRAY' ? $server->{$fn} : [ $server->{$fn} ]; |
612 |
|
|
|
613 |
|
|
my $cnt; |
614 |
|
|
|
615 |
|
|
for my $sub ( @$tests ) { |
616 |
|
|
$cnt++; |
617 |
|
|
print STDERR "?Testing '$fn' user supplied function #$cnt '$uri'\n" if $server->{debug} & DEBUG_INFO; |
618 |
|
|
|
619 |
|
|
my $ret; |
620 |
|
|
|
621 |
|
|
eval { $ret = $sub->( $uri, $server, @_ ) }; |
622 |
|
|
|
623 |
|
|
if ( $@ ) { |
624 |
|
|
print STDERR "-Skipped $uri due to '$fn' user supplied function #$cnt death '$@'\n" if $server->{debug} & DEBUG_SKIPPED; |
625 |
|
|
$server->{counts}{Skipped}++; |
626 |
|
|
return; |
627 |
|
|
} |
628 |
|
|
|
629 |
|
|
next if $ret; |
630 |
|
|
|
631 |
|
|
print STDERR "-Skipped $uri due to '$fn' user supplied function #$cnt\n" if $server->{debug} & DEBUG_SKIPPED; |
632 |
|
|
$server->{counts}{Skipped}++; |
633 |
|
|
return; |
634 |
|
|
} |
635 |
|
|
print STDERR "+Passed all $cnt tests for '$fn' user supplied function\n" if $server->{debug} & DEBUG_INFO; |
636 |
|
|
return 1; |
637 |
|
|
} |
638 |
|
|
|
639 |
|
|
|
640 |
|
|
#============================================================================================== |
641 |
|
|
# Extract links from a text/html page |
642 |
|
|
# |
643 |
|
|
# Call with: |
644 |
|
|
# $server - server object |
645 |
|
|
# $content - ref to content |
646 |
|
|
# $response - response object |
647 |
|
|
# |
648 |
|
|
#---------------------------------------------------------------------------------------------- |
649 |
|
|
|
650 |
|
|
sub extract_links { |
651 |
|
|
my ( $server, $content, $response ) = @_; |
652 |
|
|
|
653 |
|
|
return unless $response->header('content-type') && |
654 |
|
|
$response->header('content-type') =~ m[^text/html]; |
655 |
|
|
|
656 |
|
|
# allow skipping. |
657 |
|
|
if ( $server->{no_spider} ) { |
658 |
|
|
print STDERR '-Links not extracted: ', $response->request->uri->canonical, " some callback set 'no_spider' flag\n" if $server->{debug}&DEBUG_SKIPPED; |
659 |
|
|
return; |
660 |
|
|
} |
661 |
|
|
|
662 |
|
|
$server->{Spidered}++; |
663 |
|
|
|
664 |
|
|
my @links; |
665 |
|
|
|
666 |
|
|
|
667 |
|
|
my $base = $response->base; |
668 |
|
|
|
669 |
|
|
print STDERR "\nExtracting links from ", $response->request->uri, ":\n" if $server->{debug} & DEBUG_LINKS; |
670 |
|
|
|
671 |
|
|
my $p = HTML::LinkExtor->new; |
672 |
|
|
$p->parse( $$content ); |
673 |
|
|
|
674 |
|
|
my %skipped_tags; |
675 |
|
|
|
676 |
|
|
for ( $p->links ) { |
677 |
|
|
my ( $tag, %attr ) = @$_; |
678 |
|
|
|
679 |
|
|
# which tags to use ( not reported in debug ) |
680 |
|
|
|
681 |
|
|
my $attr = join ' ', map { qq[$_="$attr{$_}"] } keys %attr; |
682 |
|
|
|
683 |
|
|
print STDERR "\nLooking at extracted tag '<$tag $attr>'\n" if $server->{debug} & DEBUG_LINKS; |
684 |
|
|
|
685 |
|
|
unless ( $server->{link_tags_lookup}{$tag} ) { |
686 |
|
|
|
687 |
|
|
# each tag is reported only once per page |
688 |
|
|
print STDERR |
689 |
|
|
" <$tag> skipped because not one of (", |
690 |
|
|
join( ',', @{$server->{link_tags}} ), |
691 |
|
|
")\n" if $server->{debug} & DEBUG_LINKS && !$skipped_tags{$tag}++; |
692 |
|
|
|
693 |
|
|
if ( $server->{validate_links} && $tag eq 'img' && $attr{src} ) { |
694 |
|
|
my $img = URI->new_abs( $attr{src}, $base ); |
695 |
|
|
validate_link( $server, $img, $base ); |
696 |
|
|
} |
697 |
|
|
|
698 |
|
|
next; |
699 |
|
|
} |
700 |
|
|
|
701 |
|
|
# Grab which attribute(s) which might contain links for this tag |
702 |
|
|
my $links = $HTML::Tagset::linkElements{$tag}; |
703 |
|
|
$links = [$links] unless ref $links; |
704 |
|
|
|
705 |
|
|
|
706 |
|
|
my $found; |
707 |
|
|
|
708 |
|
|
|
709 |
|
|
# Now, check each attribut to see if a link exists |
710 |
|
|
|
711 |
|
|
for my $attribute ( @$links ) { |
712 |
|
|
if ( $attr{ $attribute } ) { # ok tag |
713 |
|
|
|
714 |
|
|
# Create a URI object |
715 |
|
|
|
716 |
|
|
my $u = URI->new_abs( $attr{$attribute},$base ); |
717 |
|
|
|
718 |
|
|
next unless check_link( $u, $server, $base, $tag, $attribute ); |
719 |
|
|
|
720 |
|
|
push @links, $u; |
721 |
|
|
print STDERR qq[ $attribute="$u" Added to list of links to follow\n] if $server->{debug} & DEBUG_LINKS; |
722 |
|
|
$found++; |
723 |
|
|
} |
724 |
|
|
} |
725 |
|
|
|
726 |
|
|
|
727 |
|
|
if ( !$found && $server->{debug} & DEBUG_LINKS ) { |
728 |
|
|
print STDERR " tag did not include any links to follow or is a duplicate\n"; |
729 |
|
|
} |
730 |
|
|
|
731 |
|
|
} |
732 |
|
|
|
733 |
|
|
print STDERR "! Found ", scalar @links, " links in ", $response->base, "\n\n" if $server->{debug} & DEBUG_INFO; |
734 |
|
|
|
735 |
|
|
|
736 |
|
|
return \@links; |
737 |
|
|
} |
738 |
|
|
|
739 |
|
|
|
740 |
|
|
|
741 |
|
|
|
742 |
|
|
#============================================================================= |
743 |
|
|
# This function check's if a link should be added to the list to spider |
744 |
|
|
# |
745 |
|
|
# Pass: |
746 |
|
|
# $u - URI object |
747 |
|
|
# $server - the server hash |
748 |
|
|
# $base - the base or parent of the link |
749 |
|
|
# |
750 |
|
|
# Returns true if a valid link |
751 |
|
|
# |
752 |
|
|
# Calls the user function "test_url". Link rewriting before spider |
753 |
|
|
# can be done here. |
754 |
|
|
# |
755 |
|
|
#------------------------------------------------------------------------------ |
756 |
|
|
sub check_link { |
757 |
|
|
my ( $u, $server, $base, $tag, $attribute ) = @_; |
758 |
|
|
|
759 |
|
|
$tag ||= ''; |
760 |
|
|
$attribute ||= ''; |
761 |
|
|
|
762 |
|
|
|
763 |
|
|
# Kill the fragment |
764 |
|
|
$u->fragment( undef ); |
765 |
|
|
|
766 |
|
|
|
767 |
|
|
# This should not happen, but make sure we have a host to check against |
768 |
|
|
|
769 |
|
|
unless ( $u->host ) { |
770 |
|
|
print STDERR qq[ ?? <$tag $attribute="$u"> skipped because missing host name\n] if $server->{debug} & DEBUG_LINKS; |
771 |
|
|
return; |
772 |
|
|
} |
773 |
|
|
|
774 |
|
|
|
775 |
|
|
# Here we make sure we are looking at a link pointing to the correct (or equivalent) host |
776 |
|
|
|
777 |
|
|
unless ( $server->{scheme} eq $u->scheme && $server->{same_host_lookup}{$u->canonical->authority} ) { |
778 |
|
|
|
779 |
|
|
print STDERR qq[ ?? <$tag $attribute="$u"> skipped because different host\n] if $server->{debug} & DEBUG_LINKS; |
780 |
|
|
$server->{counts}{'Off-site links'}++; |
781 |
|
|
validate_link( $server, $u, $base ) if $server->{validate_links}; |
782 |
|
|
return; |
783 |
|
|
} |
784 |
|
|
|
785 |
|
|
$u->host_port( $server->{authority} ); # Force all the same host name |
786 |
|
|
|
787 |
|
|
# Allow rejection of this URL by user function |
788 |
|
|
|
789 |
|
|
return unless check_user_function( 'test_url', $u, $server ); |
790 |
|
|
|
791 |
|
|
|
792 |
|
|
# Don't add the link if already seen - these are so common that we don't report |
793 |
|
|
|
794 |
|
|
if ( $visited{ $u->canonical }++ ) { |
795 |
|
|
#$server->{counts}{Skipped}++; |
796 |
|
|
$server->{counts}{Duplicates}++; |
797 |
|
|
|
798 |
|
|
|
799 |
|
|
# Just so it's reported for all pages |
800 |
|
|
if ( $server->{validate_links} && $validated{$u->canonical} ) { |
801 |
|
|
push @{$bad_links{ $base->canonical }}, $u->canonical; |
802 |
|
|
} |
803 |
|
|
|
804 |
|
|
return; |
805 |
|
|
} |
806 |
|
|
|
807 |
|
|
return 1; |
808 |
|
|
} |
809 |
|
|
|
810 |
|
|
|
811 |
|
|
#============================================================================= |
812 |
|
|
# This function is used to validate links that are off-site. |
813 |
|
|
# |
814 |
|
|
# It's just a very basic link check routine that lets you validate the |
815 |
|
|
# off-site links at the same time as indexing. Just because we can. |
816 |
|
|
# |
817 |
|
|
#------------------------------------------------------------------------------ |
818 |
|
|
sub validate_link { |
819 |
|
|
my ($server, $uri, $base, $response ) = @_; |
820 |
|
|
|
821 |
|
|
# Already checked? |
822 |
|
|
|
823 |
|
|
if ( exists $validated{ $uri->canonical } ) |
824 |
|
|
{ |
825 |
|
|
# Add it to the list of bad links on that page if it's a bad link. |
826 |
|
|
push @{$bad_links{ $base->canonical }}, $uri->canonical |
827 |
|
|
if $validated{ $uri->canonical }; |
828 |
|
|
|
829 |
|
|
return; |
830 |
|
|
} |
831 |
|
|
|
832 |
|
|
$validated{ $uri->canonical } = 0; # mark as checked and ok. |
833 |
|
|
|
834 |
|
|
unless ( $response ) { |
835 |
|
|
my $ua = LWP::UserAgent->new; |
836 |
|
|
my $request = HTTP::Request->new('HEAD', $uri->canonical ); |
837 |
|
|
|
838 |
|
|
eval { |
839 |
|
|
local $SIG{ALRM} = sub { die "timed out\n" }; |
840 |
|
|
alarm( $server->{max_wait_time} ) unless $^O =~ /Win32/i; |
841 |
|
|
$response = $ua->simple_request( $request ); |
842 |
|
|
alarm( 0 ) unless $^O =~ /Win32/i; |
843 |
|
|
}; |
844 |
|
|
|
845 |
|
|
if ( $@ ) { |
846 |
|
|
$server->{counts}{'Bad Links'}++; |
847 |
|
|
my $msg = $@; |
848 |
|
|
$msg =~ tr/\n//s; |
849 |
|
|
$validated{ $uri->canonical } = $msg; |
850 |
|
|
push @{$bad_links{ $base->canonical }}, $uri->canonical; |
851 |
|
|
return; |
852 |
|
|
} |
853 |
|
|
} |
854 |
|
|
|
855 |
|
|
return if $response->is_success; |
856 |
|
|
|
857 |
|
|
my $error = $response->status_line || $response->status || 'unknown status'; |
858 |
|
|
|
859 |
|
|
$error .= ' ' . URI->new_abs( $response->header('location'), $response->base )->canonical |
860 |
|
|
if $response->is_redirect && $response->header('location'); |
861 |
|
|
|
862 |
|
|
$validated{ $uri->canonical } = $error; |
863 |
|
|
push @{$bad_links{ $base->canonical }}, $uri->canonical; |
864 |
|
|
} |
865 |
|
|
|
866 |
|
|
|
867 |
|
|
sub output_content { |
868 |
|
|
my ( $server, $content, $uri, $response ) = @_; |
869 |
|
|
|
870 |
|
|
$server->{indexed}++; |
871 |
|
|
|
872 |
|
|
unless ( length $$content ) { |
873 |
|
|
print STDERR "Warning: document '", $response->request->uri, "' has no content\n"; |
874 |
|
|
$$content = ' '; |
875 |
|
|
} |
876 |
|
|
|
877 |
|
|
|
878 |
|
|
$server->{counts}{'Total Bytes'} += length $$content; |
879 |
|
|
$server->{counts}{'Total Docs'}++; |
880 |
|
|
|
881 |
|
|
|
882 |
|
|
my $headers = join "\n", |
883 |
|
|
'Path-Name: ' . $uri, |
884 |
|
|
'Content-Length: ' . length $$content, |
885 |
|
|
''; |
886 |
|
|
|
887 |
|
|
$headers .= 'Last-Mtime: ' . $response->last_modified . "\n" |
888 |
|
|
if $response->last_modified; |
889 |
|
|
|
890 |
|
|
|
891 |
|
|
$headers .= "No-Contents: 1\n" if $server->{no_contents}; |
892 |
|
|
print "$headers\n$$content"; |
893 |
|
|
|
894 |
|
|
die "$0: Max indexed files Reached\n" |
895 |
|
|
if $server->{max_indexed} && $server->{counts}{'Total Docs'} >= $server->{max_indexed}; |
896 |
|
|
} |
897 |
|
|
|
898 |
|
|
|
899 |
|
|
|
900 |
|
|
sub commify { |
901 |
|
|
local $_ = shift; |
902 |
|
|
1 while s/^([-+]?\d+)(\d{3})/$1,$2/; |
903 |
|
|
return $_; |
904 |
|
|
} |
905 |
|
|
|
906 |
|
|
sub default_urls { |
907 |
|
|
|
908 |
|
|
my $validate = 0; |
909 |
|
|
if ( $ARGV[0] eq 'validate' ) { |
910 |
|
|
shift @ARGV; |
911 |
|
|
$validate = 1; |
912 |
|
|
} |
913 |
|
|
|
914 |
|
|
die "$0: Must list URLs when using 'default'\n" unless @ARGV; |
915 |
|
|
|
916 |
|
|
|
917 |
|
|
my @content_types = qw{ text/html text/plain }; |
918 |
|
|
|
919 |
|
|
return map { |
920 |
|
|
{ |
921 |
|
|
#debug => DEBUG_HEADERS, |
922 |
|
|
#debug => DEBUG_URL|DEBUG_SKIPPED|DEBUG_INFO, |
923 |
|
|
base_url => \@ARGV, |
924 |
|
|
email => 'swish@domain.invalid', |
925 |
|
|
delay_min => .0001, |
926 |
|
|
link_tags => [qw/ a frame /], |
927 |
|
|
test_url => sub { $_[0]->path !~ /\.(?:gif|jpeg|png)$/i }, |
928 |
|
|
|
929 |
|
|
test_response => sub { |
930 |
|
|
my $content_type = $_[2]->content_type; |
931 |
|
|
my $ok = grep { $_ eq $content_type } @content_types; |
932 |
|
|
return 1 if $ok; |
933 |
|
|
print STDERR "$_[0] $content_type != (@content_types)\n"; |
934 |
|
|
return; |
935 |
|
|
}, |
936 |
|
|
validate_links => $validate, |
937 |
|
|
|
938 |
|
|
} |
939 |
|
|
} @ARGV; |
940 |
|
|
} |
941 |
|
|
|
942 |
|
|
|
943 |
|
|
|
944 |
|
|
__END__ |
945 |
|
|
|
946 |
|
|
=head1 NAME |
947 |
|
|
|
948 |
|
|
spider.pl - Example Perl program to spider web servers |
949 |
|
|
|
950 |
|
|
=head1 SYNOPSIS |
951 |
|
|
|
952 |
|
|
swish.config: |
953 |
|
|
IndexDir ./spider.pl |
954 |
|
|
SwishProgParameters spider.config |
955 |
|
|
# other swish-e settings |
956 |
|
|
|
957 |
|
|
spider.config: |
958 |
|
|
@servers = ( |
959 |
|
|
{ |
960 |
|
|
base_url => 'http://myserver.com/', |
961 |
|
|
email => 'me@myself.com', |
962 |
|
|
# other spider settings described below |
963 |
|
|
}, |
964 |
|
|
); |
965 |
|
|
|
966 |
|
|
Begin indexing: |
967 |
|
|
|
968 |
|
|
swish-e -S prog -c swish.config |
969 |
|
|
|
970 |
|
|
Note: When running on some versions of Windows (e.g. Win ME and Win 98 SE) |
971 |
|
|
you may need to index using the command: |
972 |
|
|
|
973 |
|
|
perl spider.pl | swish-e -S prog -c swish.conf -i stdin |
974 |
|
|
|
975 |
|
|
This pipes the output of the spider directly into swish. |
976 |
|
|
|
977 |
|
|
|
978 |
|
|
=head1 DESCRIPTION |
979 |
|
|
|
980 |
|
|
This is a swish-e "prog" document source program for spidering |
981 |
|
|
web servers. It can be used instead of the C<http> method for |
982 |
|
|
spidering with swish. |
983 |
|
|
|
984 |
|
|
The spider typically uses a configuration |
985 |
|
|
file that lists the URL(s) to spider, and configuration parameters that control |
986 |
|
|
the behavior of the spider. In addition, you may define I<callback> perl functions |
987 |
|
|
in the configuration file that can dynamically change the behavior of the spider |
988 |
|
|
based on URL, HTTP response headers, or the content of the fetched document. These |
989 |
|
|
callback functions can also be used to filter or convert documents (e.g. PDF, gzip, MS Word) |
990 |
|
|
into a format that swish-e can parse. Some examples are provided. |
991 |
|
|
|
992 |
|
|
You define "servers" to spider, set a few parameters, |
993 |
|
|
create callback routines, and start indexing as the synopsis above shows. |
994 |
|
|
The spider requires its own configuration file (unless you want the default values). This |
995 |
|
|
is NOT the same configuration file that swish-e uses. |
996 |
|
|
|
997 |
|
|
The example configuration file C<SwishSpiderConfig.pl> is |
998 |
|
|
included in the C<prog-bin> directory along with this script. Please just use it as an |
999 |
|
|
example, as it contains more settings than you probably want to use. Start with a tiny config file |
1000 |
|
|
and add settings as required by your situation. |
1001 |
|
|
|
1002 |
|
|
The available configuration parameters are discussed below. |
1003 |
|
|
|
1004 |
|
|
If all that sounds confusing, then you can run the spider with default settings. In fact, you can |
1005 |
|
|
run the spider without using swish just to make sure it works. Just run |
1006 |
|
|
|
1007 |
|
|
./spider.pl default http://someserver.com/sometestdoc.html |
1008 |
|
|
|
1009 |
|
|
And you should see F<sometestdoc.html> dumped to your screen. Get ready to kill the script |
1010 |
|
|
if the file you request contains links as the output from the fetched pages will be displayed. |
1011 |
|
|
|
1012 |
|
|
./spider.pl default http://someserver.com/sometestdoc.html > output.file |
1013 |
|
|
|
1014 |
|
|
might be more friendly. |
1015 |
|
|
|
1016 |
|
|
If the first parameter passed to the spider is the word "default" (as in the preceeding example) |
1017 |
|
|
then the spider uses the default parameters, |
1018 |
|
|
and the following parameter(s) are expected to be URL(s) to spider. |
1019 |
|
|
Otherwise, the first parameter is considered to be the name of the configuration file (as described |
1020 |
|
|
below). When using C<-S prog>, the swish-e configuration setting |
1021 |
|
|
C<SwishProgParameters> is used to pass parameters to the program specified |
1022 |
|
|
with C<IndexDir> or the C<-i> switch. |
1023 |
|
|
|
1024 |
|
|
If you do not specify any parameters the program will look for the file |
1025 |
|
|
|
1026 |
|
|
SwishSpiderConfig.pl |
1027 |
|
|
|
1028 |
|
|
in the current directory. |
1029 |
|
|
|
1030 |
|
|
The spider does require Perl's LWP library and a few other reasonably common modules. |
1031 |
|
|
Most well maintained systems should have these modules installed. If not, start here: |
1032 |
|
|
|
1033 |
|
|
http://search.cpan.org/search?dist=libwww-perl |
1034 |
|
|
http://search.cpan.org/search?dist=HTML-Parser |
1035 |
|
|
|
1036 |
|
|
See more below in C<REQUIREMENTS>. It's a good idea to check that you are running |
1037 |
|
|
a current version of these modules. |
1038 |
|
|
|
1039 |
|
|
=head2 Robots Exclusion Rules and being nice |
1040 |
|
|
|
1041 |
|
|
This script will not spider files blocked by F<robots.txt>. In addition, |
1042 |
|
|
The script will check for <meta name="robots"..> tags, which allows finer |
1043 |
|
|
control over what files are indexed and/or spidered. |
1044 |
|
|
See http://www.robotstxt.org/wc/exclusion.html for details. |
1045 |
|
|
|
1046 |
|
|
This spider provides an extension to the <meta> tag exclusion, by adding a |
1047 |
|
|
C<NOCONTENTS> attribute. This attribute turns on the C<no_contents> setting, which |
1048 |
|
|
asks swish-e to only index the document's title (or file name if not title is found). |
1049 |
|
|
|
1050 |
|
|
For example: |
1051 |
|
|
|
1052 |
|
|
<META NAME="ROBOTS" CONTENT="NOCONTENTS, NOFOLLOW"> |
1053 |
|
|
|
1054 |
|
|
says to just index the document's title, but don't index its contents, and don't follow |
1055 |
|
|
any links within the document. Granted, it's unlikely that this feature will ever be used... |
1056 |
|
|
|
1057 |
|
|
If you are indexing your own site, and know what you are doing, you can disable robot exclusion by |
1058 |
|
|
the C<ignore_robots_file> configuration parameter, described below. This disables both F<robots.txt> |
1059 |
|
|
and the meta tag parsing. |
1060 |
|
|
|
1061 |
|
|
This script only spiders one file at a time, so load on the web server is not that great. |
1062 |
|
|
And with libwww-perl-5.53_91 HTTP/1.1 keep alive requests can reduce the load on |
1063 |
|
|
the server even more (and potentially reduce spidering time considerably!) |
1064 |
|
|
|
1065 |
|
|
Still, discuss spidering with a site's administrator before beginning. |
1066 |
|
|
Use the C<delay_min> to adjust how fast the spider fetches documents. |
1067 |
|
|
Consider running a second web server with a limited number of children if you really |
1068 |
|
|
want to fine tune the resources used by spidering. |
1069 |
|
|
|
1070 |
|
|
=head2 Duplicate Documents |
1071 |
|
|
|
1072 |
|
|
The spider program keeps track of URLs visited, so a document is only indexed |
1073 |
|
|
one time. |
1074 |
|
|
|
1075 |
|
|
The Digest::MD5 module can be used to create a "fingerprint" of every page |
1076 |
|
|
indexed and this fingerprint is used in a hash to find duplicate pages. |
1077 |
|
|
For example, MD5 will prevent indexing these as two different documents: |
1078 |
|
|
|
1079 |
|
|
http://localhost/path/to/some/index.html |
1080 |
|
|
http://localhost/path/to/some/ |
1081 |
|
|
|
1082 |
|
|
But note that this may have side effects you don't want. If you want this |
1083 |
|
|
file indexed under this URL: |
1084 |
|
|
|
1085 |
|
|
http://localhost/important.html |
1086 |
|
|
|
1087 |
|
|
But the spider happens to find the exact content in this file first: |
1088 |
|
|
|
1089 |
|
|
http://localhost/developement/test/todo/maybeimportant.html |
1090 |
|
|
|
1091 |
|
|
Then only that URL will be indexed. |
1092 |
|
|
|
1093 |
|
|
MD5 may slow down indexing a tiny bit, so test with and without if speed is an |
1094 |
|
|
issue (which it probably isn't since you are spidering in the first place). |
1095 |
|
|
This feature will also use more memory. |
1096 |
|
|
|
1097 |
|
|
Note: the "prog" document source in swish bypasses many swish-e configuration settings. |
1098 |
|
|
For example, you cannot use the C<IndexOnly> directive with the "prog" document |
1099 |
|
|
source. This is by design to limit the overhead when using an external program |
1100 |
|
|
for providing documents to swish; after all, with "prog", if you don't want to index a file, then |
1101 |
|
|
don't give it to swish to index in the first place. |
1102 |
|
|
|
1103 |
|
|
So, for spidering, if you do not wish to index images, for example, you will |
1104 |
|
|
need to either filter by the URL or by the content-type returned from the web |
1105 |
|
|
server. See L<CALLBACK FUNCTIONS|CALLBACK FUNCTIONS> below for more information. |
1106 |
|
|
|
1107 |
|
|
=head1 REQUIREMENTS |
1108 |
|
|
|
1109 |
|
|
Perl 5 (hopefully at least 5.00503) or later. |
1110 |
|
|
|
1111 |
|
|
You must have the LWP Bundle on your computer. Load the LWP::Bundle via the CPAN.pm shell, |
1112 |
|
|
or download libwww-perl-x.xx from CPAN (or via ActiveState's ppm utility). |
1113 |
|
|
Also required is the the HTML-Parser-x.xx bundle of modules also from CPAN |
1114 |
|
|
(and from ActiveState for Windows). |
1115 |
|
|
|
1116 |
|
|
http://search.cpan.org/search?dist=libwww-perl |
1117 |
|
|
http://search.cpan.org/search?dist=HTML-Parser |
1118 |
|
|
|
1119 |
|
|
You will also need Digest::MD5 if you wish to use the MD5 feature. |
1120 |
|
|
HTML::Tagset is also required. |
1121 |
|
|
Other modules may be required (for example, the pod2xml.pm module |
1122 |
|
|
has its own requirementes -- see perldoc pod2xml for info). |
1123 |
|
|
|
1124 |
|
|
The spider.pl script, like everyone else, expects perl to live in /usr/local/bin. |
1125 |
|
|
If this is not the case then either add a symlink at /usr/local/bin/perl |
1126 |
|
|
to point to where perl is installed |
1127 |
|
|
or modify the shebang (#!) line at the top of the spider.pl program. |
1128 |
|
|
|
1129 |
|
|
Note that the libwww-perl package does not support SSL (Secure Sockets Layer) (https) |
1130 |
|
|
by default. See F<README.SSL> included in the libwww-perl package for information on |
1131 |
|
|
installing SSL support. |
1132 |
|
|
|
1133 |
|
|
=head1 CONFIGURATION FILE |
1134 |
|
|
|
1135 |
|
|
Configuration is not very fancy. The spider.pl program simply does a |
1136 |
|
|
C<do "path";> to read in the parameters and create the callback subroutines. |
1137 |
|
|
The C<path> is the first parameter passed to the spider script, which is set |
1138 |
|
|
by the Swish-e configuration setting C<SwishProgParameters>. |
1139 |
|
|
|
1140 |
|
|
For example, if in your swish-e configuration file you have |
1141 |
|
|
|
1142 |
|
|
SwishProgParameters /path/to/config.pl |
1143 |
|
|
IndexDir /home/moseley/swish-e/prog-bin/spider.pl |
1144 |
|
|
|
1145 |
|
|
And then run swish as |
1146 |
|
|
|
1147 |
|
|
swish-e -c swish.config -S prog |
1148 |
|
|
|
1149 |
|
|
swish will run C</home/moseley/swish-e/prog-bin/spider.pl> and the spider.pl |
1150 |
|
|
program will receive as its first parameter C</path/to/config.pl>, and |
1151 |
|
|
spider.pl will read C</path/to/config.pl> to get the spider configuration |
1152 |
|
|
settings. If C<SwishProgParameters> is not set, the program will try to |
1153 |
|
|
use C<SwishSpiderConfig.pl> by default. |
1154 |
|
|
|
1155 |
|
|
There is a special case of: |
1156 |
|
|
|
1157 |
|
|
SwishProgParameters default http://www.mysite/index.html ... |
1158 |
|
|
|
1159 |
|
|
Where default parameters are used. This will only index documents of type |
1160 |
|
|
C<text/html> or C<text/plain>, and will skip any file with an extension that matches |
1161 |
|
|
the pattern: |
1162 |
|
|
|
1163 |
|
|
/\.(?:gif|jpeg|.png)$/i |
1164 |
|
|
|
1165 |
|
|
This can be useful for indexing just your web documnts, but you will probably want finer |
1166 |
|
|
control over your spidering by using a configuration file. |
1167 |
|
|
|
1168 |
|
|
The configuration file must set a global variable C<@servers> (in package main). |
1169 |
|
|
Each element in C<@servers> is a reference to a hash. The elements of the has |
1170 |
|
|
are described next. More than one server hash may be defined -- each server |
1171 |
|
|
will be spidered in order listed in C<@servers>, although currently a I<global> hash is |
1172 |
|
|
used to prevent spidering the same URL twice. |
1173 |
|
|
|
1174 |
|
|
Examples: |
1175 |
|
|
|
1176 |
|
|
my %serverA = ( |
1177 |
|
|
base_url => 'http://swish-e.org/', |
1178 |
|
|
same_hosts => [ qw/www.swish-e.org/ ], |
1179 |
|
|
email => 'my@email.address', |
1180 |
|
|
); |
1181 |
|
|
my %serverB = ( |
1182 |
|
|
... |
1183 |
|
|
... |
1184 |
|
|
); |
1185 |
|
|
@servers = ( \%serverA, \%serverB, ); |
1186 |
|
|
|
1187 |
|
|
=head1 CONFIGURATION OPTIONS |
1188 |
|
|
|
1189 |
|
|
This describes the required and optional keys in the server configuration hash, in random order... |
1190 |
|
|
|
1191 |
|
|
=over 4 |
1192 |
|
|
|
1193 |
|
|
=item base_url |
1194 |
|
|
|
1195 |
|
|
This required setting is the starting URL for spidering. |
1196 |
|
|
|
1197 |
|
|
Typically, you will just list one URL for the base_url. You may specify more than one |
1198 |
|
|
URL as a reference to a list |
1199 |
|
|
|
1200 |
|
|
base_url => [qw! http://swish-e.org/ http://othersite.org/other/index.html !], |
1201 |
|
|
|
1202 |
|
|
You may specify a username and password: |
1203 |
|
|
|
1204 |
|
|
base_url => 'http://user:pass@swish-e.org/index.html', |
1205 |
|
|
|
1206 |
|
|
but you may find that to be a security issue. If a URL is protected by Basic Authentication |
1207 |
|
|
you will be prompted for a username and password. This might be a slighly safer way to go. |
1208 |
|
|
|
1209 |
|
|
The parameter C<max_wait_time> controls how long to wait for user entry before skipping the |
1210 |
|
|
current URL. |
1211 |
|
|
|
1212 |
|
|
See also C<credentials> below. |
1213 |
|
|
|
1214 |
|
|
|
1215 |
|
|
=item same_hosts |
1216 |
|
|
|
1217 |
|
|
This optional key sets equivalent B<authority> name(s) for the site you are spidering. |
1218 |
|
|
For example, if your site is C<www.mysite.edu> but also can be reached by |
1219 |
|
|
C<mysite.edu> (with or without C<www>) and also C<web.mysite.edu> then: |
1220 |
|
|
|
1221 |
|
|
|
1222 |
|
|
Example: |
1223 |
|
|
|
1224 |
|
|
$serverA{base_url} = 'http://www.mysite.edu/index.html'; |
1225 |
|
|
$serverA{same_hosts} = ['mysite.edu', 'web.mysite.edu']; |
1226 |
|
|
|
1227 |
|
|
Now, if a link is found while spidering of: |
1228 |
|
|
|
1229 |
|
|
http://web.mysite.edu/path/to/file.html |
1230 |
|
|
|
1231 |
|
|
it will be considered on the same site, and will actually spidered and indexed |
1232 |
|
|
as: |
1233 |
|
|
|
1234 |
|
|
http://www.mysite.edu/path/to/file.html |
1235 |
|
|
|
1236 |
|
|
Note: This should probably be called B<same_host_port> because it compares the URI C<host:port> |
1237 |
|
|
against the list of host names in C<same_hosts>. So, if you specify a port name in you will |
1238 |
|
|
want to specify the port name in the the list of hosts in C<same_hosts>: |
1239 |
|
|
|
1240 |
|
|
my %serverA = ( |
1241 |
|
|
base_url => 'http://sunsite.berkeley.edu:4444/', |
1242 |
|
|
same_hosts => [ qw/www.sunsite.berkeley.edu:4444/ ], |
1243 |
|
|
email => 'my@email.address', |
1244 |
|
|
); |
1245 |
|
|
|
1246 |
|
|
|
1247 |
|
|
=item email |
1248 |
|
|
|
1249 |
|
|
This required key sets the email address for the spider. Set this to |
1250 |
|
|
your email address. |
1251 |
|
|
|
1252 |
|
|
=item agent |
1253 |
|
|
|
1254 |
|
|
This optional key sets the name of the spider. |
1255 |
|
|
|
1256 |
|
|
=item link_tags |
1257 |
|
|
|
1258 |
|
|
This optional tag is a reference to an array of tags. Only links found in these tags will be extracted. |
1259 |
|
|
The default is to only extract links from C<a> tags. |
1260 |
|
|
|
1261 |
|
|
For example, to extract tags from C<a> tags and from C<frame> tags: |
1262 |
|
|
|
1263 |
|
|
my %serverA = ( |
1264 |
|
|
base_url => 'http://sunsite.berkeley.edu:4444/', |
1265 |
|
|
same_hosts => [ qw/www.sunsite.berkeley.edu:4444/ ], |
1266 |
|
|
email => 'my@email.address', |
1267 |
|
|
link_tags => [qw/ a frame /], |
1268 |
|
|
); |
1269 |
|
|
|
1270 |
|
|
|
1271 |
|
|
=item delay_min |
1272 |
|
|
|
1273 |
|
|
This optional key sets the delay in minutes to wait between requests. See the |
1274 |
|
|
LWP::RobotUA man page for more information. The default is 0.1 (6 seconds), |
1275 |
|
|
but in general you will probably want it much smaller. But, check with |
1276 |
|
|
the webmaster before using too small a number. |
1277 |
|
|
|
1278 |
|
|
=item max_wait_time |
1279 |
|
|
|
1280 |
|
|
This setting is the number of seconds to wait for data to be returned from |
1281 |
|
|
the request. Data is returned in chunks to the spider, and the timer is reset each time |
1282 |
|
|
a new chunk is reported. Therefore, documents (requests) that take longer than this setting |
1283 |
|
|
should not be aborted as long as some data is received every max_wait_time seconds. |
1284 |
|
|
The default it 30 seconds. |
1285 |
|
|
|
1286 |
|
|
NOTE: This option has no effect on Windows. |
1287 |
|
|
|
1288 |
|
|
=item max_time |
1289 |
|
|
|
1290 |
|
|
This optional key will set the max minutes to spider. Spidering |
1291 |
|
|
for this host will stop after C<max_time> minutes, and move on to the |
1292 |
|
|
next server, if any. The default is to not limit by time. |
1293 |
|
|
|
1294 |
|
|
=item max_files |
1295 |
|
|
|
1296 |
|
|
This optional key sets the max number of files to spider before aborting. |
1297 |
|
|
The default is to not limit by number of files. This is the number of requests |
1298 |
|
|
made to the remote server, not the total number of files to index (see C<max_indexed>). |
1299 |
|
|
This count is displayted at the end of indexing as C<Unique URLs>. |
1300 |
|
|
|
1301 |
|
|
This feature can (and perhaps should) be use when spidering a web site where dynamic |
1302 |
|
|
content may generate unique URLs to prevent run-away spidering. |
1303 |
|
|
|
1304 |
|
|
=item max_indexed |
1305 |
|
|
|
1306 |
|
|
This optional key sets the max number of files that will be indexed. |
1307 |
|
|
The default is to not limit. This is the number of files sent to |
1308 |
|
|
swish for indexing (and is reported by C<Total Docs> when spidering ends). |
1309 |
|
|
|
1310 |
|
|
=item max_size |
1311 |
|
|
|
1312 |
|
|
This optional key sets the max size of a file read from the web server. |
1313 |
|
|
This B<defaults> to 5,000,000 bytes. If the size is exceeded the resource is |
1314 |
|
|
skipped and a message is written to STDERR if the DEBUG_SKIPPED debug flag is set. |
1315 |
|
|
|
1316 |
|
|
=item keep_alive |
1317 |
|
|
|
1318 |
|
|
This optional parameter will enable keep alive requests. This can dramatically speed |
1319 |
|
|
up searching and reduce the load on server being spidered. The default is to not use |
1320 |
|
|
keep alives, although enabling it will probably be the right thing to do. |
1321 |
|
|
|
1322 |
|
|
To get the most out of keep alives, you may want to set up your web server to |
1323 |
|
|
allow a lot of requests per single connection (i.e MaxKeepAliveRequests on Apache). |
1324 |
|
|
Apache's default is 100, which should be good. (But, in general, don't enable KeepAlives |
1325 |
|
|
on a mod_perl server.) |
1326 |
|
|
|
1327 |
|
|
Note: try to filter as many documents as possible B<before> making the request to the server. In |
1328 |
|
|
other words, use C<test_url> to look for files ending in C<.html> instead of using C<test_response> to look |
1329 |
|
|
for a content type of C<text/html> if possible. |
1330 |
|
|
Do note that aborting a request from C<test_response> will break the |
1331 |
|
|
current keep alive connection. |
1332 |
|
|
|
1333 |
|
|
Note: you must have at least libwww-perl-5.53_90 installed to use this feature. |
1334 |
|
|
|
1335 |
|
|
=item skip |
1336 |
|
|
|
1337 |
|
|
This optional key can be used to skip the current server. It's only purpose |
1338 |
|
|
is to make it easy to disable a server in a configuration file. |
1339 |
|
|
|
1340 |
|
|
=item debug |
1341 |
|
|
|
1342 |
|
|
Set this to a number to display different amounts of info while spidering. Writes info |
1343 |
|
|
to STDERR. Zero/undefined is no debug output. |
1344 |
|
|
|
1345 |
|
|
The following constants are defined for debugging. They may be or'ed together to |
1346 |
|
|
get the individual debugging of your choice. |
1347 |
|
|
|
1348 |
|
|
Here are basically the levels: |
1349 |
|
|
|
1350 |
|
|
DEBUG_ERRORS general program errors (not used at this time) |
1351 |
|
|
DEBUG_URL print out every URL processes |
1352 |
|
|
DEBUG_HEADERS prints the response headers |
1353 |
|
|
DEBUG_FAILED failed to return a 200 |
1354 |
|
|
DEBUG_SKIPPED didn't index for some reason |
1355 |
|
|
DEBUG_INFO more verbose |
1356 |
|
|
DEBUG_LINKS prints links as they are extracted |
1357 |
|
|
|
1358 |
|
|
For example, to display the urls processed, failed, and skipped use: |
1359 |
|
|
|
1360 |
|
|
debug => DEBUG_URL | DEBUG_FAILED | DEBUG_SKIPPED, |
1361 |
|
|
|
1362 |
|
|
To display the returned headers |
1363 |
|
|
|
1364 |
|
|
debug => DEBUG_HEADERS, |
1365 |
|
|
|
1366 |
|
|
You can easily run the spider without using swish for debugging purposes: |
1367 |
|
|
|
1368 |
|
|
./spider.pl test.config > spider.out |
1369 |
|
|
|
1370 |
|
|
And you will see debugging info as it runs, and the fetched documents will be saved |
1371 |
|
|
in the C<spider.out> file. |
1372 |
|
|
|
1373 |
|
|
Debugging can be also be set by an environment variable when running swish. This will |
1374 |
|
|
override any setting in the configuration file. Set the variable SPIDER_DEBUG when running |
1375 |
|
|
the spider. You can specify any of the above debugging options, separated by a comma. |
1376 |
|
|
|
1377 |
|
|
For example with Bourne type shell: |
1378 |
|
|
|
1379 |
|
|
SPIDER_DEBUG=url,links |
1380 |
|
|
|
1381 |
|
|
=item quiet |
1382 |
|
|
|
1383 |
|
|
If this is true then normal, non-error messages will be supressed. Quiet mode can also |
1384 |
|
|
be set by setting the environment variable SPIDER_QUIET to any true value. |
1385 |
|
|
|
1386 |
|
|
SPIDER_QUIET=1 |
1387 |
|
|
|
1388 |
|
|
=item max_depth |
1389 |
|
|
|
1390 |
|
|
The C<max_depth> parameter can be used to limit how deeply to recurse a web site. |
1391 |
|
|
The depth is just a count of levels of web pages decended, and not related to |
1392 |
|
|
the number of path elements in a URL. |
1393 |
|
|
|
1394 |
|
|
A max_depth of zero says to only spider the page listed as the C<base_url>. A max_depth of one will |
1395 |
|
|
spider the C<base_url> page, plus all links on that page, and no more. The default is to spider all |
1396 |
|
|
pages. |
1397 |
|
|
|
1398 |
|
|
|
1399 |
|
|
=item ignore_robots_file

If this is set to true then the robots.txt file will not be checked when spidering
this server. Don't use this option unless you know what you are doing.

=item use_cookies

If this is set then a "cookie jar" will be maintained while spidering. Some
(poorly written ;) sites require cookies to be enabled on clients.

This requires the HTTP::Cookies module.

=item use_md5

If this setting is true, then an MD5 digest "fingerprint" will be made from the content of every
spidered document. This digest number will be used as a hash key to prevent
indexing the same content more than once. This is helpful if different URLs
generate the same content.

An obvious example is that these two URLs will be indexed only one time:

    http://localhost/path/to/index.html
    http://localhost/path/to/

This option requires the Digest::MD5 module. Spidering with this option might
be a tiny bit slower.

1426 |
|
|
=item validate_links

Just a hack. If you set this true the spider will issue HEAD requests for all links
(e.g. off-site links), just to make sure that all your links work.

=item credentials

You may specify a username and password to be used automatically when spidering:

    credentials => 'username:password',

A username and password supplied in a URL will override this setting.

=item credential_timeout

Sets the number of seconds to wait for user input when prompted for a username or password.
The default is 30 seconds.

=back

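To pull several of these settings together, here is a sketch of what a single server entry
might look like; the host name, credentials, and option values are only placeholders:

    my %serverA = (
        base_url           => 'http://www.example.com/',
        email              => 'my@email.address',
        link_tags          => [qw/ a frame /],
        debug              => DEBUG_URL | DEBUG_FAILED,
        max_depth          => 2,
        use_cookies        => 1,
        use_md5            => 1,
        credentials        => 'username:password',
        credential_timeout => 60,
    );
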
1446 |
|
|
=head1 CALLBACK FUNCTIONS

Three callback functions can be defined in your parameter hash.
These optional settings are I<callback> subroutines that are called while
processing URLs.

A little perl discussion is in order:

In perl, a scalar variable can contain a reference to a subroutine. The config example above shows
that the configuration parameters are stored in a perl I<hash>.

    my %serverA = (
        base_url    => 'http://sunsite.berkeley.edu:4444/',
        same_hosts  => [ qw/www.sunsite.berkeley.edu:4444/ ],
        email       => 'my@email.address',
        link_tags   => [qw/ a frame /],
    );

There are two ways to add a reference to a subroutine to this hash:

    sub foo {
        return 1;
    }

    my %serverA = (
        base_url    => 'http://sunsite.berkeley.edu:4444/',
        same_hosts  => [ qw/www.sunsite.berkeley.edu:4444/ ],
        email       => 'my@email.address',
        link_tags   => [qw/ a frame /],
        test_url    => \&foo,  # a reference to a named subroutine
    );

Or the subroutine can be coded right in place:

    my %serverA = (
        base_url    => 'http://sunsite.berkeley.edu:4444/',
        same_hosts  => [ qw/www.sunsite.berkeley.edu:4444/ ],
        email       => 'my@email.address',
        link_tags   => [qw/ a frame /],
        test_url    => sub { return 1; },
    );

1488 |
|
|
The above example is not very useful as it just creates a user callback function that
always returns a true value (the number 1). But it's just an example.

The function calls are wrapped in an eval, so calling die (or doing something that dies) will just cause
that URL to be skipped. If you really want to stop processing you need to set $server->{abort} in your
subroutine (or send a kill -HUP to the spider).

The first two parameters passed are a URI object (to have access to the current URL), and
a reference to the current server hash. The C<server> hash is just a global hash for holding data, and
is useful for setting flags as described below.

Other parameters may also be passed, as described below.
In perl, parameters are passed in an array called "@_". The first element (first parameter) of
that array is $_[0], the second is $_[1], and so on. Depending on how complicated your
function is, you may wish to shift your parameters off of the @_ list to make working with them
easier. See the examples below.

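As a quick illustration of those two styles, the two (purely illustrative) callbacks below
are equivalent:

    # Unpack the whole parameter list at once:
    test_url => sub {
        my ( $uri, $server ) = @_;
        return $uri->path !~ /\.pdf$/;
    },

    # Or shift parameters off @_ one at a time:
    test_url => sub {
        my $uri = shift;
        return $uri->path !~ /\.pdf$/;
    },
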
1506 |
|
|
To make use of these routines you need to understand when they are called, and what changes
you can make in your routines. Each routine deals with a given step, and returning false from
your routine will stop processing for the current URL.

=over 4

=item test_url

C<test_url> allows you to skip processing of URLs, based on the URL, before the request
to the server is made. This function is called for the C<base_url> links (links you define in
the spider configuration file) and for every link extracted from a fetched web page.

This function is a good place to skip links that you are not interested in following. For example,
if you know there's no point in requesting images then you can exclude them like:

    test_url => sub {
        my $uri = shift;
        return 0 if $uri->path =~ /\.(gif|jpeg|png)$/;
        return 1;
    },

Or to write it another way:

    test_url => sub { $_[0]->path !~ /\.(gif|jpeg|png)$/ },

Another use would be if you were using a web server where path names are
NOT case sensitive (e.g. Windows). You can normalize all links in this situation
using something like:

    test_url => sub {
        my $uri = shift;
        return 0 if $uri->path =~ /\.(gif|jpeg|png)$/;

        $uri->path( lc $uri->path );  # make all path names lowercase
        return 1;
    },

1543 |
|
|
The important thing about C<test_url> (compared to the other callback functions) is that
it is called while I<extracting> links, not while actually fetching that page from the web
server. Returning false from C<test_url> simply says not to add the URL to the list of links to
spider.

You may set a flag in the server hash (second parameter) to tell the spider to abort processing.

    test_url => sub {
        my $server = $_[1];
        $server->{abort}++ if $_[0]->path =~ /foo\.html/;
        return 1;
    },

You cannot use these server flags in C<test_url>:

    no_contents
    no_index
    no_spider

This is discussed below.

1565 |
|
|
=item test_response

This function allows you to filter based on the response from the
remote server (such as by content-type). This function is called while the
web page is being fetched from the remote server, typically after just enough
data has been returned to read the response from the web server.

The spider requests a document in "chunks" of 4096 bytes. 4096 is only a suggestion
of how many bytes to return in each chunk. The C<test_response> routine is
called only when the first chunk is received. This allows ignoring (aborting)
the reading of a very large file, for example, without having to read the entire file.
Although not of much use, a reference to this chunk is passed as the fourth parameter.

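For example, here is a sketch of a C<test_response> callback that skips any document the
server reports as larger than about one megabyte; the size limit is arbitrary, and documents
without a C<Content-Length> header are allowed through:

    test_response => sub {
        my ( $uri, $server, $response ) = @_;
        my $length = $response->content_length;
        return 1 unless defined $length;   # no Content-Length header -- keep going
        return $length <= 1_000_000;       # skip anything larger than ~1MB
    },
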
1578 |
|
|
Web servers use a Content-Type: header to define the type of data returned from the server.
On a web server you could have a .jpeg file be a web page -- file extensions may not always
indicate the type of the file. The third parameter ($_[2]) passed to this function is a reference to an
HTTP::Response object:

1583 |
|
|
For example, to only index true HTML (text/html) pages:

    test_response => sub {
        my $content_type = $_[2]->content_type;
        return $content_type =~ m!text/html!;
    },

You can also set flags in the server hash (the second parameter) to control indexing:

    no_contents -- index only the title (or file name), and not the contents
    no_index    -- do not index this file, but continue to spider if HTML
    no_spider   -- index, but do not spider this file for links to follow
    abort       -- stop spidering any more files

For example, to avoid indexing the contents of "private.html", yet still follow any links
in that file:

    test_response => sub {
        my $server = $_[1];
        $server->{no_index}++ if $_[0]->path =~ /private\.html$/;
        return 1;
    },

Note: Do not modify the URI object in this callback function.

1609 |
|
|
=item filter_content

This callback function is called right before sending the content to swish.
Like the other callback functions, returning false will cause the URL to be skipped.
Setting the C<abort> server flag and returning false will abort spidering.

You can also set the C<no_contents> flag.

This callback function is passed four parameters:
the URI object, the server hash, the HTTP::Response object,
and a reference to the content.

You can modify the content as needed. For example, you might not like upper case:

    filter_content => sub {
        my $content_ref = $_[3];

        $$content_ref = lc $$content_ref;
        return 1;
    },

A more reasonable example would be converting PDF or MS Word documents for parsing by swish.
Examples of this are provided in the F<prog-bin> directory of the swish-e distribution.

1633 |
|
|
You may also modify the URI object to change the path name passed to swish for indexing.

    filter_content => sub {
        my $uri = $_[0];
        $uri->host('www.other.host');
        return 1;
    },

Swish-e's ReplaceRules feature can also be used for modifying the path name indexed.

Here's a bit more advanced example of indexing text/html and PDF files only:

    use pdf2xml;  # included example pdf converter module
    $server{filter_content} = sub {
        my ( $uri, $server, $response, $content_ref ) = @_;

        return 1 if $response->content_type eq 'text/html';
        return 0 unless $response->content_type eq 'application/pdf';

        # for logging counts
        $server->{counts}{'PDF transformed'}++;

        $$content_ref = ${pdf2xml( $content_ref )};
        return 1;
    }

=back

1663 |
|
|
Note that you can create your own counters to display in the summary list when spidering
is finished by adding a value to the hash pointed to by C<< $server->{counts} >>.

    test_response => sub {
        my $server = $_[1];
        $server->{no_index}++ if $_[0]->path =~ /private\.html$/;
        $server->{counts}{'Private Files'}++;
        return 1;
    },

Each callback function B<must> return true to continue processing the URL. Returning false will
cause processing of I<the current> URL to be skipped.

1677 |
|
|
=head2 More on setting flags

Swish (not this spider) has a configuration directive C<NoContents> that will instruct swish to
index only the title (or file name), and not the contents. This is often used when
indexing binary files such as image files, but can also be used with html
files to index only the document titles.

As shown above, you can turn this feature on for specific documents by setting a flag in
the server hash passed into the C<test_response> or C<filter_content> subroutines.
For example, in your configuration file you might have the C<test_response> callback set
as:

    test_response => sub {
        my ( $uri, $server, $response ) = @_;
        # tell swish not to index the contents if this is of type image
        $server->{no_contents} = $response->content_type =~ m[^image/];
        return 1;  # ok to index and spider this document
    }

1697 |
|
|
The entire contents of the resource are still read from the web server, and passed
on to swish, but swish will also be passed a C<No-Contents> header which tells
swish to enable the NoContents feature for this document only.

Note: Swish will index the path name only when C<NoContents> is set, unless the document's
type (as set by the swish configuration settings C<IndexContents> or C<DefaultContents>) is
HTML I<and> a title is found in the html document.

Note: In most cases you probably would not want to send a large binary file to swish, just
to have it ignored. Therefore, it would be smart to use a C<filter_content> callback routine to
replace the contents with a single character (you cannot use the empty string at this time).

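Here is a sketch of such a callback; checking for an C<image/*> content type is just one
possible condition:

    filter_content => sub {
        my ( $uri, $server, $response, $content_ref ) = @_;
        # replace binary image data with a single space so swish has
        # essentially nothing to index (the empty string cannot be used)
        $$content_ref = ' ' if $response->content_type =~ m[^image/];
        return 1;
    },
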
1709 |
|
|
A similar flag may be set to prevent indexing a document at all, but still allow spidering.
In general, if you want to completely skip spidering a file you return false from one of the
callback routines (C<test_url>, C<test_response>, or C<filter_content>). Returning false from any of those
three callbacks will stop processing of that file, and the file will B<not> be spidered.

But there may be some cases where you still want to spider (extract links) yet not index the file. An example
might be where you wish to index only PDF files, but you still need to spider all HTML files to find
the links to the PDF files.

    $server{test_response} = sub {
        my ( $uri, $server, $response ) = @_;
        $server->{no_index} = $response->content_type ne 'application/pdf';
        return 1;  # ok to spider, but don't index
    }

1724 |
|
|
So, the difference between C<no_contents> and C<no_index> is that C<no_contents> will still index the file
name, just not the contents. C<no_index> will still spider the file (if it's C<text/html>) but the
file will not be processed by swish at all.

B<Note:> If C<no_index> is set in a C<test_response> callback function then
the document I<will not be filtered>. That is, your C<filter_content>
callback function will not be called.

The C<no_spider> flag can be set to avoid spidering an HTML file. The file will still be indexed unless
C<no_index> is also set. But if you do not want to index or spider, then simply return false from one of the three
callback functions.

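For example, here is a sketch that indexes pages under a hypothetical F</archive/> path but
does not follow the links found in them:

    $server{test_response} = sub {
        my ( $uri, $server, $response ) = @_;
        # index this page, but do not extract and follow its links
        $server->{no_spider}++ if $uri->path =~ m[^/archive/];
        return 1;
    };
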
1737 |
|
|
=head1 SIGNALS

Sending a SIGHUP to the running spider will cause it to stop spidering. This is a good way to abort spidering, but
still let swish index the documents retrieved so far.

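For example, if the spider's process id happens to be 12345 (use C<ps> to find the real one):

    kill -HUP 12345
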
1742 |
|
|
=head1 SEE ALSO

L<URI> L<LWP::RobotUA> L<WWW::RobotRules> L<Digest::MD5>

=head1 COPYRIGHT

Copyright 2001 Bill Moseley

This program is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=head1 SUPPORT

Send all questions to the SWISH-E discussion list.

See http://sunsite.berkeley.edu/SWISH-E.

=cut