Annotation of /mitgcm.org/devel/buildweb/pkg/swish-e/src/swishspider

#!/usr/local/bin/perl -w

# print STDERR "spider $$ [@ARGV]\n";

#
# SWISH-E http method Spider
# $Id: swishspider,v 1.8 2002/08/15 05:31:02 whmoseley Exp $ 
#

use strict;

use LWP::UserAgent;
use HTTP::Status;
use HTML::Parser 3.00;
use HTML::LinkExtor;

if (scalar(@ARGV) != 2) {
    print STDERR "Usage: SwishSpider localpath url\n";
    exit(1);
}

my $ua = new LWP::UserAgent;
$ua->agent( "SwishSpider http://swish-e.org" );

my $localpath = shift;
my $url = shift;

my $request = new HTTP::Request( "GET", $url );
my $response = $ua->simple_request( $request );

#
# Write out important meta-data.  This includes the HTTP code.  Depending on the
# code, we write out other data.  Redirects have the location printed, everything
# else gets the content-type.
#
open( RESP, ">$localpath.response" ) || die( "Could not open response file $localpath.response" );

print RESP $response->code() . "\n";

if( $response->code() == RC_OK ) {
    print RESP $response->header( "content-type" ) . "\n";

} elsif( $response->is_redirect() ) {
    print RESP ($response->header( "location" ) ||'') . "\n";
}

print RESP ($response->last_modified || 0), "\n";

close( RESP );

#
# Write out the actual data assuming the retrieval was succesful.  Also, if
# we have actual data and it's of type text/html, write out all the links it
# refers to
#
if( $response->code() == RC_OK ) {
    my $contents = $response->content();

    open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );
    print CONTENTS $contents;
    close( CONTENTS );

    if( $response->header("content-type") =~ "text/html" ) {
	open( LINKS, ">$localpath.links" ) || die( "Could not open links file $localpath.links\n" );
	my $p = HTML::LinkExtor->new( \&linkcb, $url );
	$p->parse( $contents );

	close( LINKS );
    }
}


sub linkcb {
    my($tag, %links) = @_;
    if (($tag eq "a") && ($links{"href"})) {
	my $link = $links{"href"};
	#
	# Remove fragments
	#
	$link =~ s/(.*)#.*/$1/;
	
	#
	# Remove ../  This is important because the abs() function
	# can leave these in and cause never ending loops.
	#
	$link =~ s/\.\.\///g;

	print LINKS "$link\n";
    }
}

1	adcroft	1.1	#!/usr/local/bin/perl -w
2
3			# print STDERR "spider $$ [@ARGV]\n";
4
5			#
6			# SWISH-E http method Spider
7			# $Id: swishspider,v 1.8 2002/08/15 05:31:02 whmoseley Exp $
8			#
9
10			use strict;
11
12			use LWP::UserAgent;
13			use HTTP::Status;
14			use HTML::Parser 3.00;
15			use HTML::LinkExtor;
16
17			if (scalar(@ARGV) != 2) {
18			print STDERR "Usage: SwishSpider localpath url\n";
19			exit(1);
20			}
21
22			my $ua = new LWP::UserAgent;
23			$ua->agent( "SwishSpider http://swish-e.org" );
24
25			my $localpath = shift;
26			my $url = shift;
27
28			my $request = new HTTP::Request( "GET", $url );
29			my $response = $ua->simple_request( $request );
30
31			#
32			# Write out important meta-data. This includes the HTTP code. Depending on the
33			# code, we write out other data. Redirects have the location printed, everything
34			# else gets the content-type.
35			#
36			open( RESP, ">$localpath.response" ) \|\| die( "Could not open response file $localpath.response" );
37
38			print RESP $response->code() . "\n";
39
40			if( $response->code() == RC_OK ) {
41			print RESP $response->header( "content-type" ) . "\n";
42
43			} elsif( $response->is_redirect() ) {
44			print RESP ($response->header( "location" ) \|\|'') . "\n";
45			}
46
47			print RESP ($response->last_modified \|\| 0), "\n";
48
49			close( RESP );
50
51			#
52			# Write out the actual data assuming the retrieval was succesful. Also, if
53			# we have actual data and it's of type text/html, write out all the links it
54			# refers to
55			#
56			if( $response->code() == RC_OK ) {
57			my $contents = $response->content();
58
59			open( CONTENTS, ">$localpath.contents" ) \|\| die( "Could not open contents file $localpath.contents\n" );
60			print CONTENTS $contents;
61			close( CONTENTS );
62
63			if( $response->header("content-type") =~ "text/html" ) {
64			open( LINKS, ">$localpath.links" ) \|\| die( "Could not open links file $localpath.links\n" );
65			my $p = HTML::LinkExtor->new( \&linkcb, $url );
66			$p->parse( $contents );
67
68			close( LINKS );
69			}
70			}
71
72
73			sub linkcb {
74			my($tag, %links) = @_;
75			if (($tag eq "a") && ($links{"href"})) {
76			my $link = $links{"href"};
77			#
78			# Remove fragments
79			#
80			$link =~ s/(.)#./$1/;
81
82			#
83			# Remove ../ This is important because the abs() function
84			# can leave these in and cause never ending loops.
85			#
86			$link =~ s/\.\.\///g;
87
88			print LINKS "$link\n";
89			}
90			}
91