/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/src/swishspider
ViewVC logotype

Contents of /mitgcm.org/devel/buildweb/pkg/swish-e/src/swishspider

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1.1.1 - (show annotations) (download) (vendor branch)
Fri Sep 20 19:47:29 2002 UTC (22 years, 10 months ago) by adcroft
Branch: Import, MAIN
CVS Tags: baseline, HEAD
Changes since 1.1: +0 -0 lines
Error occurred while calculating annotation data.
Importing web-site building process.

1 #!/usr/local/bin/perl -w
2
3 # print STDERR "spider $$ [@ARGV]\n";
4
5 #
6 # SWISH-E http method Spider
7 # $Id: swishspider,v 1.8 2002/08/15 05:31:02 whmoseley Exp $
8 #
9
10 use strict;
11
12 use LWP::UserAgent;
13 use HTTP::Status;
14 use HTML::Parser 3.00;
15 use HTML::LinkExtor;
16
17 if (scalar(@ARGV) != 2) {
18 print STDERR "Usage: SwishSpider localpath url\n";
19 exit(1);
20 }
21
22 my $ua = new LWP::UserAgent;
23 $ua->agent( "SwishSpider http://swish-e.org" );
24
25 my $localpath = shift;
26 my $url = shift;
27
28 my $request = new HTTP::Request( "GET", $url );
29 my $response = $ua->simple_request( $request );
30
31 #
32 # Write out important meta-data. This includes the HTTP code. Depending on the
33 # code, we write out other data. Redirects have the location printed, everything
34 # else gets the content-type.
35 #
36 open( RESP, ">$localpath.response" ) || die( "Could not open response file $localpath.response" );
37
38 print RESP $response->code() . "\n";
39
40 if( $response->code() == RC_OK ) {
41 print RESP $response->header( "content-type" ) . "\n";
42
43 } elsif( $response->is_redirect() ) {
44 print RESP ($response->header( "location" ) ||'') . "\n";
45 }
46
47 print RESP ($response->last_modified || 0), "\n";
48
49 close( RESP );
50
51 #
52 # Write out the actual data assuming the retrieval was succesful. Also, if
53 # we have actual data and it's of type text/html, write out all the links it
54 # refers to
55 #
56 if( $response->code() == RC_OK ) {
57 my $contents = $response->content();
58
59 open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );
60 print CONTENTS $contents;
61 close( CONTENTS );
62
63 if( $response->header("content-type") =~ "text/html" ) {
64 open( LINKS, ">$localpath.links" ) || die( "Could not open links file $localpath.links\n" );
65 my $p = HTML::LinkExtor->new( \&linkcb, $url );
66 $p->parse( $contents );
67
68 close( LINKS );
69 }
70 }
71
72
73 sub linkcb {
74 my($tag, %links) = @_;
75 if (($tag eq "a") && ($links{"href"})) {
76 my $link = $links{"href"};
77 #
78 # Remove fragments
79 #
80 $link =~ s/(.*)#.*/$1/;
81
82 #
83 # Remove ../ This is important because the abs() function
84 # can leave these in and cause never ending loops.
85 #
86 $link =~ s/\.\.\///g;
87
88 print LINKS "$link\n";
89 }
90 }

  ViewVC Help
Powered by ViewVC 1.1.22