/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/prog-bin/SwishSpiderConfig.pl


Revision 1.1.1.1 (vendor branch)
Fri Sep 20 19:47:30 2002 UTC by adcroft
Branch: Import, MAIN
CVS Tags: baseline, HEAD
Changes since 1.1: +0 -0 lines
File MIME type: text/plain
Importing web-site building process.

=pod

=head1 NAME

SwishSpiderConfig.pl - Sample swish-e spider configuration

=head1 DESCRIPTION

This is a sample configuration file for the spider.pl program provided
with the swish-e distribution.

It contains settings for spidering three servers (two are the same server).
All are disabled (skip => 1) to prevent every new swish user from spidering these sites.

These are just examples. Please spider your own web site.

**Also, please don't use this exact file as your configuration file.**

Trim your file down to just the content you need, especially
if you are posting your config to the Swish-e list to ask for help. Remove these comments
and remove everything below that you are not using.

The first example is relatively simple. It just spiders any URL that
ends in C<.html>.

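For instance, with the C<test_url> check used in that example,
C<< sub { $_[0]->path =~ /\.html?$/ } >>, some hypothetical URLs would be
handled like this:

    http://example.invalid/docs/page.html    spidered  (.html matches)
    http://example.invalid/docs/page.htm     spidered  (.htm matches too)
    http://example.invalid/images/logo.gif   skipped   (no match, so false is returned)
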
The second example is a bit more advanced and shows how to filter content.

First, the spider doesn't request image files (files that end in .gif or .jpeg),
and then it only indexes files with a content type of
C<text/html text/plain application/pdf application/msword>.

C<application/pdf> and C<application/msword> documents are then run through filters to
extract their content. The example filter subroutines are included below, as well.

This config is set to spider at most 50 URLs or index at most 20 files, whichever comes first.

The third example shows more options (which are listed in C<perldoc spider.pl>), and how you might use
subroutine calls for checking URLs, checking responses, and filtering content, instead of the inlined
subroutines shown in the first two examples.


Please see C<perldoc spider.pl> for more information.

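For orientation only, here is one common way a file like this is consumed
(the file names and paths below are assumptions -- check C<perldoc spider.pl>
and the swish-e documentation for the exact invocation on your installation).
The spider is run as a "prog" document source from the swish-e config:

    # hypothetical swish-e config snippet
    IndexDir            ./spider.pl
    SwishProgParameters SwishSpiderConfig.pl

    # then index with:
    #   swish-e -c swish.conf -S prog
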
=cut

#--------------------- Global Config ----------------------------

# @servers is a list of hashes -- so you can spider more than one site
# in one run (or different parts of the same tree)
# The main program expects to use this array (@SwishSpiderConfig::servers).

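# For illustration only (not executed as part of this config): a minimal
# sketch of how a driver script might pull in this array, assuming it
# evaluates this file inside the SwishSpiderConfig package, which is what
# the @SwishSpiderConfig::servers name above implies.
#
#   {
#       package SwishSpiderConfig;
#       do './SwishSpiderConfig.pl' or die "config failed: " . ( $@ || $! );
#   }
#   for my $server ( @SwishSpiderConfig::servers ) {
#       next if $server->{skip};          # honor the skip flag
#       print "would spider $server->{base_url}\n";
#   }
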
### Please do not spider these examples -- spider your own servers, with permission ####

@servers = (

    #=============================================================================
    # This is a simple example that includes a few limits.
    # Only files ending in .html will be spidered (probably a bit too restrictive).
    {
        skip        => 1,   # skip spidering this server

        base_url    => 'http://www.swish-e.org/index.html',
        same_hosts  => [ qw/swish-e.org/ ],
        agent       => 'swish-e spider http://swish-e.org/',
        email       => 'swish@domain.invalid',

        # limit to only .html files
        test_url    => sub { $_[0]->path =~ /\.html?$/ },

        delay_min   => .0001,   # Delay in minutes between requests
        max_time    => 10,      # Max time to spider in minutes
        max_files   => 100,     # Max unique URLs to spider
        max_indexed => 20,      # Max number of files to send to swish for indexing
        keep_alive  => 1,       # enable keep-alive requests
    },


    #=============================================================================
    # This is a more advanced example that uses more features,
    # such as ignoring some file extensions, and only indexing
    # some content-types, plus filters for PDF and MS Word docs.
    # The call-back subroutines are explained a bit more below.
    {
        skip        => 1,           # skip spidering this server
        debug       => DEBUG_URL,   # print some debugging info to STDERR

        base_url    => 'http://www.swish-e.org/',
        email       => 'swish@domain.invalid',
        delay_min   => .0001,
        link_tags   => [qw/ a frame /],
        max_files   => 50,
        max_indexed => 20,          # Max number of files to send to swish for indexing

        max_size    => 1_000_000,   # limit to 1MB file size
        max_depth   => 10,          # spider only ten levels deep
        keep_alive  => 1,

        test_url    => sub { $_[0]->path !~ /\.(?:gif|jpeg)$/ },

        test_response => sub {
            my $content_type = $_[2]->content_type;
            my $ok = grep { $_ eq $content_type } qw{ text/html text/plain application/pdf application/msword };

            # This might be used if you only wanted to index PDF files, yet still spider everything else.
            #$_[1]->{no_index} = $content_type ne 'application/pdf';

            return 1 if $ok;
            print STDERR "$_[0] wrong content type ( $content_type )\n";
            return;
        },

        filter_content => [ \&pdf, \&doc ],
    },


    #=============================================================================
    # This example just shows more settings. See perldoc spider.pl for info.

    {
        skip        => 1,   # Flag to disable spidering this host.

        base_url    => 'http://swish-e.org/index.html',
        same_hosts  => [ qw/www.swish-e.org/ ],
        agent       => 'swish-e spider http://swish-e.org/',
        email       => 'swish@domain.invalid',
        delay_min   => .0001,   # Delay in minutes between requests
        max_time    => 10,      # Max time to spider in minutes
        max_files   => 20,      # Max files to spider
        ignore_robots_file => 0,   # Don't set this to one unless you are sure.

        use_cookies => 0,   # True will keep a cookie jar
                            # Some sites require cookies
                            # Requires HTTP::Cookies

        use_md5     => 1,   # If true, this will use the Digest::MD5
                            # module to create checksums on content
                            # This will very likely catch files
                            # with different URLs that are the same
                            # content. Will trap / and /index.html,
                            # for example.

        debug       => DEBUG_URL | DEBUG_HEADERS,   # print some debugging info to STDERR


        # Here are hooks to callback routines to validate URLs and responses.
        # Probably a good idea to use them so you don't try to index
        # binary data. Look at content-type headers!

        test_url       => \&test_url,
        test_response  => \&test_response,
        filter_content => \&filter_content,

    },



);


#---------------------- Public Functions ------------------------------
# Here are some examples of callback functions
#
#
# Use these to adjust skip/ignore based on filename/content-type
# Or to filter content (pdf -> text, for example)
#
# Remember to include the code references in the config, above.
#
#----------------------------------------------------------------------


# This subroutine lets you check a URL before requesting the
# document from the server.
# Return false to skip the link.

sub test_url {
    my ( $uri, $server ) = @_;
    # return 1;   # Ok to index/spider
    # return 0;   # No, don't index or spider

    # only spider/index URLs whose path ends in .htm or .html
    return $uri->path =~ /\.html?$/;

}
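
# Purely illustrative (not called by this config): how the callback above
# behaves, assuming it is handed the URI objects that the spider passes to
# test_url (the hostname here is made up).
#
#   use URI;
#   test_url( URI->new('http://example.invalid/docs/page.html'),  {} );   # true  -> spidered
#   test_url( URI->new('http://example.invalid/images/logo.gif'), {} );   # false -> skipped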

# This routine is called when the *first* block of data comes back
# from the server. If you return false, no more content will be read
# from the server. $response is an HTTP::Response object.


sub test_response {
    my ( $uri, $server, $response ) = @_;

    # flag non-HTML documents so only the title/filename is indexed, not the body
    $server->{no_contents}++ unless $response->content_type =~ m[^text/html];
    return 1;   # ok to index and spider
}

# This routine can be used to filter content

sub filter_content {
    my ( $uri, $server, $response, $content_ref ) = @_;

    # modify $content_ref
    $$content_ref = modify_content( $content_ref );
    return 1;   # make sure you return true!

}

# Maybe do something here ;)
sub modify_content {
    my $content_ref = shift;


    return $$content_ref;
}
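
# Purely as an illustration of what modify_content might do (an assumption,
# not part of the original sample): collapse runs of whitespace before
# handing the content back, much like the tr/ / /s calls in the filters below.
#
#   sub modify_content {
#       my $content_ref = shift;
#       ( my $text = $$content_ref ) =~ s/\s+/ /g;   # squeeze whitespace
#       return $text;
#   }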



# Here are some real examples

# This converts PDF files into HTML. The second parameter of
# pdf2html tells which pdf info field to set as <title>.

use pdf2html;   # included example pdf converter module
sub pdf {
    my ( $uri, $server, $response, $content_ref ) = @_;

    return 1 unless $response->content_type eq 'application/pdf';

    # for logging counts
    $server->{counts}{'PDF transformed'}++;

    $$content_ref = ${ pdf2html( $content_ref, 'title' ) };
    $$content_ref =~ tr/ / /s;   # squeeze repeated spaces
    return 1;
}

use doc2txt;   # included example MS Word converter module
sub doc {
    my ( $uri, $server, $response, $content_ref ) = @_;

    return 1 unless $response->content_type eq 'application/msword';

    # for logging counts
    $server->{counts}{'DOC transformed'}++;

    $$content_ref = ${ doc2txt( $content_ref ) };
    $$content_ref =~ tr/ / /s;   # squeeze repeated spaces
    return 1;
}

# Must return true...

1;
