/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/src/swishspider
ViewVC logotype

Annotation of /mitgcm.org/devel/buildweb/pkg/swish-e/src/swishspider

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1 - (hide annotations) (download)
Fri Sep 20 19:47:29 2002 UTC (22 years, 10 months ago) by adcroft
Branch point for: Import, MAIN
Initial revision

1 adcroft 1.1 #!/usr/local/bin/perl -w
2    
3     # print STDERR "spider $$ [@ARGV]\n";
4    
5     #
6     # SWISH-E http method Spider
7     # $Id: swishspider,v 1.8 2002/08/15 05:31:02 whmoseley Exp $
8     #
9    
10     use strict;
11    
12     use LWP::UserAgent;
13     use HTTP::Status;
14     use HTML::Parser 3.00;
15     use HTML::LinkExtor;
16    
17     if (scalar(@ARGV) != 2) {
18     print STDERR "Usage: SwishSpider localpath url\n";
19     exit(1);
20     }
21    
22     my $ua = new LWP::UserAgent;
23     $ua->agent( "SwishSpider http://swish-e.org" );
24    
25     my $localpath = shift;
26     my $url = shift;
27    
28     my $request = new HTTP::Request( "GET", $url );
29     my $response = $ua->simple_request( $request );
30    
31     #
32     # Write out important meta-data. This includes the HTTP code. Depending on the
33     # code, we write out other data. Redirects have the location printed, everything
34     # else gets the content-type.
35     #
36     open( RESP, ">$localpath.response" ) || die( "Could not open response file $localpath.response" );
37    
38     print RESP $response->code() . "\n";
39    
40     if( $response->code() == RC_OK ) {
41     print RESP $response->header( "content-type" ) . "\n";
42    
43     } elsif( $response->is_redirect() ) {
44     print RESP ($response->header( "location" ) ||'') . "\n";
45     }
46    
47     print RESP ($response->last_modified || 0), "\n";
48    
49     close( RESP );
50    
51     #
52     # Write out the actual data assuming the retrieval was succesful. Also, if
53     # we have actual data and it's of type text/html, write out all the links it
54     # refers to
55     #
56     if( $response->code() == RC_OK ) {
57     my $contents = $response->content();
58    
59     open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );
60     print CONTENTS $contents;
61     close( CONTENTS );
62    
63     if( $response->header("content-type") =~ "text/html" ) {
64     open( LINKS, ">$localpath.links" ) || die( "Could not open links file $localpath.links\n" );
65     my $p = HTML::LinkExtor->new( \&linkcb, $url );
66     $p->parse( $contents );
67    
68     close( LINKS );
69     }
70     }
71    
72    
73     sub linkcb {
74     my($tag, %links) = @_;
75     if (($tag eq "a") && ($links{"href"})) {
76     my $link = $links{"href"};
77     #
78     # Remove fragments
79     #
80     $link =~ s/(.*)#.*/$1/;
81    
82     #
83     # Remove ../ This is important because the abs() function
84     # can leave these in and cause never ending loops.
85     #
86     $link =~ s/\.\.\///g;
87    
88     print LINKS "$link\n";
89     }
90     }
91    

  ViewVC Help
Powered by ViewVC 1.1.22