1 |
adcroft |
1.1 |
=pod |
2 |
|
|
|
3 |
|
|
=head1 NAME |
4 |
|
|
|
5 |
|
|
SwishSpiderConfig.pl - Sample swish-e spider configuration |
6 |
|
|
|
7 |
|
|
=head1 DESCRIPTION |
8 |
|
|
|
9 |
|
|
This is a sample configuration file for the spider.pl program provided
with the swish-e distribution.
11 |
|
|
|
12 |
|
|
It contains settings for spidering three servers (two are the same server). |
13 |
|
|
All are disabled (skip => 1) to prevent every new swish user from spidering these sites. |
14 |
|
|
|
15 |
|
|
These are just examples. Please spider your own web site. |
16 |
|
|
|
17 |
|
|
**Also, please don't use this exact file as your configuration file.** |
18 |
|
|
|
19 |
|
|
Trim your file down to just the content you need, especially
if posting your config to the Swish-e list to ask for help. Remove these comments
and remove everything below that you are not using.
22 |
|
|
|
23 |
|
|
The first example is relatively simple. It just spiders any URL that
ends in C<.html>.
25 |
|
|
|
26 |
|
|
The second example is a bit more advanced and shows how to filter content. |
27 |
|
|
|
28 |
|
|
First, the spider doesn't request image files (files that end in .gif or .jpeg),
then it only indexes files with a content type of
C<text/html text/plain application/pdf application/msword>.
31 |
|
|
|
32 |
|
|
C<application/pdf> and C<application/msword> are then run through filters to extract |
33 |
|
|
out their content. The example filter subroutines are included below, as well. |
34 |
|
|
|
35 |
|
|
This config is set to only spider 100 URLs, or index 20 files, whichever comes first.
36 |
|
|
|
37 |
|
|
The third example shows more options (which are listed in C<perldoc spider.pl>), and how you might use |
38 |
|
|
subroutine calls for checking URLs, content, and filtering instead of inlined subroutines shown in |
39 |
|
|
the first two examples. |
40 |
|
|
|
41 |
|
|
|
42 |
|
|
Please see C<perldoc spider.pl> for more information. |
43 |
|
|
|
44 |
|
|
=cut |
45 |
|
|
|
46 |
|
|
#--------------------- Global Config ---------------------------- |
47 |
|
|
|
48 |
|
|
# @servers is a list of hashes -- so you can spider more than one site |
49 |
|
|
# in one run (or different parts of the same tree) |
50 |
|
|
# The main program expects to use this array (@SwishSpiderConfig::servers). |
51 |
|
|
|
52 |
|
|
### Please do not spider these examples -- spider your own servers, with permission #### |
53 |
|
|
|
54 |
|
|
@servers = (

    #=========================================================================
    # Example 1: a simple setup with a few limits.
    # Only URLs whose path ends in .htm or .html pass test_url
    # (probably a bit too restrictive).
    {
        skip        => 1,    # skip spidering this server

        base_url    => 'http://www.swish-e.org/index.html',
        same_hosts  => [ 'swish-e.org' ],
        agent       => 'swish-e spider http://swish-e.org/',
        email       => 'swish@domain.invalid',

        # limit to .htm / .html paths
        test_url    => sub {
            my ($uri) = @_;
            return $uri->path =~ /\.html?$/;
        },

        delay_min   => .0001,    # Delay in minutes between requests
        max_time    => 10,       # Max time to spider in minutes
        max_files   => 100,      # Max unique URLs to spider
        max_indexed => 20,       # Max number of files to send to swish for indexing
        keep_alive  => 1,        # enable keep-alive requests
    },

    #=========================================================================
    # Example 2: a more advanced setup.  It ignores some file extensions,
    # only indexes some content types, and filters PDF and MS Word docs.
    # The callback subroutines are explained a bit more below.
    {
        skip        => 1,            # skip spidering this server
        debug       => DEBUG_URL,    # print some debugging info to STDERR

        base_url    => 'http://www.swish-e.org/',
        email       => 'swish@domain.invalid',
        delay_min   => .0001,
        link_tags   => [ 'a', 'frame' ],
        max_files   => 50,
        max_indexed => 20,           # Max number of files to send to swish for indexing

        max_size    => 1_000_000,    # limit to 1MB file size
        max_depth   => 10,           # spider only ten levels deep
        keep_alive  => 1,

        # reject paths ending in .gif or .jpeg
        test_url => sub {
            my ($uri) = @_;
            return $uri->path !~ /\.(?:gif|jpeg)$/;
        },

        # accept only the listed content types; complain on STDERR otherwise
        test_response => sub {
            my ( $uri, undef, $response ) = @_;

            my $content_type = $response->content_type;
            my @accepted     = qw( text/html text/plain application/pdf application/msword );

            # This might be used if you only wanted to index PDF files,
            # yet still spider everything else.
            #$_[1]->{no_index} = $content_type ne 'application/pdf';

            if ( grep { $_ eq $content_type } @accepted ) {
                return 1;
            }

            print STDERR "$uri wrong content type ( $content_type )\n";
            return;
        },

        filter_content => [ \&pdf, \&doc ],
    },

    #=========================================================================
    # Example 3: just shows more settings.  See perldoc spider.pl for info.
    {
        skip               => 1,    # Flag to disable spidering this host.

        base_url           => 'http://swish-e.org/index.html',
        same_hosts         => [ 'www.swish-e.org' ],
        agent              => 'swish-e spider http://swish-e.org/',
        email              => 'swish@domain.invalid',
        delay_min          => .0001,    # Delay in minutes between requests
        max_time           => 10,       # Max time to spider in minutes
        max_files          => 20,       # Max files to spider
        ignore_robots_file => 0,        # Don't set that to one, unless you are sure.

        use_cookies        => 0,        # True will keep cookie jar
                                        # Some sites require cookies
                                        # Requires HTTP::Cookies

        use_md5            => 1,        # If true, this will use the Digest::MD5
                                        # module to create checksums on content.
                                        # This will very likely catch files
                                        # with different URLs that are the same
                                        # content.  Will trap / and /index.html,
                                        # for example.

        debug => DEBUG_URL | DEBUG_HEADERS,    # print some debugging info to STDERR

        # Hooks to callback routines that validate URLs and responses.
        # Probably a good idea to use them so you don't try to index
        # binary data.  Look at content-type headers!
        test_url       => \&test_url,
        test_response  => \&test_response,
        filter_content => \&filter_content,
    },

);
158 |
|
|
|
159 |
|
|
|
160 |
|
|
#---------------------- Public Functions ------------------------------ |
161 |
|
|
# Here are some examples of callback functions |
162 |
|
|
# |
163 |
|
|
# |
164 |
|
|
# Use these to adjust skip/ignore based on filename/content-type |
165 |
|
|
# Or to filter content (pdf -> text, for example) |
166 |
|
|
# |
167 |
|
|
# Remember to include the code references in the config, above. |
168 |
|
|
# |
169 |
|
|
#---------------------------------------------------------------------- |
170 |
|
|
|
171 |
|
|
|
172 |
|
|
# Callback that checks a URL before the document is requested from the
# server.  Return false to skip the link.
#
#   $uri    - object for the link; only its path() method is used here
#   $server - second callback argument; unused by this example
#
# This example accepts only URLs whose path ends in .htm or .html.

sub test_url {
    my ( $uri, $server ) = @_;

    # return 1;  # Ok to index/spider
    # return 0;  # No, don't index or spider

    my $path = $uri->path;
    return $path =~ /\.html?$/;
}
185 |
|
|
|
186 |
|
|
# Callback run when the *first* block of data comes back from the server.
# If it returns false no more content will be read from the server.
# $response is a HTTP::Response object.
#
# This example always returns true.  For any response whose content type
# does not start with text/html it increments $server->{no_contents}.

sub test_response {
    my ( $uri, $server, $response ) = @_;

    if ( $response->content_type !~ m[^text/html] ) {
        $server->{no_contents}++;
    }

    return 1;    # ok to index and spider
}
197 |
|
|
|
198 |
|
|
# Callback that can be used to filter content.
#
# $content_ref is a reference to the document content.  The content is
# replaced in place with whatever modify_content() returns for it.
# Always returns true.

sub filter_content {
    my ( $uri, $server, $response, $content_ref ) = @_;

    # overwrite the referenced content with the modified version
    ${$content_ref} = modify_content($content_ref);

    return 1;    # make sure you return true!
}
208 |
|
|
|
209 |
|
|
# Placeholder for real content changes -- maybe do something here ;)
#
# Takes a reference to the content and returns the content itself,
# unchanged.

sub modify_content {
    my ($content_ref) = @_;

    return ${$content_ref};
}
216 |
|
|
|
217 |
|
|
|
218 |
|
|
|
219 |
|
|
# Here are some real examples
220 |
|
|
|
221 |
|
|
# This converts PDF files into HTML.  The second parameter of
# pdf2html tells which PDF info field to set as <title>
223 |
|
|
|
224 |
|
|
use pdf2html; # included example pdf converter module |
225 |
|
|
# Content filter for PDF responses.
#
# Responses whose content type is not application/pdf are left alone.
# For PDFs, the 'PDF transformed' counter in $server->{counts} is bumped,
# the content is replaced with what pdf2html() returns (passing 'title' as
# its second argument), and runs of spaces are squeezed to a single space.
# Always returns true.
sub pdf {
    my ( $uri, $server, $response, $content_ref ) = @_;

    # not a PDF: nothing to do
    if ( $response->content_type ne 'application/pdf' ) {
        return 1;
    }

    # for logging counts
    $server->{counts}{'PDF transformed'}++;

    my $html_ref = pdf2html( $content_ref, 'title' );
    ${$content_ref} = ${$html_ref};
    ${$content_ref} =~ tr/ / /s;

    return 1;
}
237 |
|
|
|
238 |
|
|
use doc2txt;  # included example MS Word converter module
239 |
|
|
# Content filter for MS Word responses.
#
# Responses whose content type is not application/msword are left alone.
# For Word documents, the 'DOC transformed' counter in $server->{counts}
# is bumped, the content is replaced with what doc2txt() returns, and runs
# of spaces are squeezed to a single space.  Always returns true.
sub doc {
    my ( $uri, $server, $response, $content_ref ) = @_;

    # not a Word document: nothing to do
    if ( $response->content_type ne 'application/msword' ) {
        return 1;
    }

    # for logging counts
    $server->{counts}{'DOC transformed'}++;

    my $text_ref = doc2txt($content_ref);
    ${$content_ref} = ${$text_ref};
    ${$content_ref} =~ tr/ / /s;

    return 1;
}
251 |
|
|
|
252 |
|
|
# Must return true... |
253 |
|
|
|
254 |
|
|
1; |