/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/filter-bin/_pdf2html.pl
ViewVC logotype

Contents of /mitgcm.org/devel/buildweb/pkg/swish-e/filter-bin/_pdf2html.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1.1.1 - (show annotations) (download) (vendor branch)
Fri Sep 20 19:47:30 2002 UTC (21 years, 7 months ago) by adcroft
Branch: Import, MAIN
CVS Tags: baseline, HEAD
Changes since 1.1: +0 -0 lines
File MIME type: text/plain
Importing web-site building process.

1 #! /usr/local/bin/perl -w
2 use strict;
3
4 # -- Filter PDF to simple HTML for swish
5 # --
6 # -- 2000-05 rasc
7 #
8 =pod
9
10 This filter requires two programs "pdfinfo" and "pdftotext".
11 These programs are part of the xpdf package found at
12 http://www.foolabs.com/xpdf/xpdf.html.
13
14 These programs must be found in the PATH when indexing is run; otherwise,
15 set the path explicitly in this program:
16
17 $ENV{PATH} = '/path/to/programs'
18
19 "pdfinfo" extracts the document info from a pdf file, if any exist,
20 and creates metanames for swish to index. See the pdfinfo(1) man page for
21 information on which keywords are available.
22
23 An HTML title is created from the "title" and "subject" pdf info data.
24 Adjust as needed below.
25
26 How the extracted keyword info is indexed in Swish-e is controlled by
27 the following Swish-e configuration settings: MetaNames, PropertyNames,
28 UndefinedMetaTags.
29
30 Passing the -raw option to pdftotext may improve indexing time by
31 reducing the size of the converted output.
32
33 =cut
34
35
# -- Command-line argument: path of the PDF file to convert.
# A defined/length check is used so that a file literally named "0" is
# still accepted.
my $file = shift;
die "Usage: $0 <filename>\n" unless defined $file && length $file;

#
# -- read pdf meta information
#

# Maps lower-cased, underscore-joined pdfinfo keys to HTML-escaped values.
my %metadata;

# List-form pipe open runs pdfinfo directly, without a shell, so a file
# name containing spaces or shell metacharacters is passed through as a
# single argument.  The low-precedence "or" makes the failure check apply
# to open() itself rather than to its last argument.
open my $info_fh, '-|', 'pdfinfo', $file
    or die "$0: Failed to open $file $!";

while ( my $line = <$info_fh> ) {

    # Expect "Key:   value" lines; anything else is ignored.
    next unless $line =~ /^\s*([^:]+):\s+(.+)$/;

    # Copy the captures before calling any sub that runs its own regexes.
    my ( $raw_name, $raw_value ) = ( $1, $2 );

    my $metaname = lc $raw_name;
    $metaname =~ tr/ /_/;

    # The name ends up inside an HTML attribute just like the value does,
    # so both are escaped.
    $metadata{ escapeHTML($metaname) } = escapeHTML($raw_value);
}
close $info_fh
    or die "$0: Failed close on pipe to pdfinfo for $file: $?";


# Set the default title from the title and subject info.
# delete() on a hash slice returns the removed values (undef for missing
# keys), which takes both entries out of %metadata so they are not also
# emitted as <meta> tags.

my @title =
    grep { defined($_) && length($_) }
    delete @metadata{ qw/title subject/ };

my $title = join ' // ', ( @title ? @title : 'Unknown title' );

# One <meta> tag per remaining pdfinfo key, in sorted key order.
my $meta_tags =
    join "\n",
    map { qq[<meta name="$_" content="$metadata{$_}">] }
    sort keys %metadata;


print <<EOF;
<html>
<head>
<title>
$title
</title>
$meta_tags
</head>
<body>
EOF

# Might be faster to use sysread and read in larger blocks

# Same shell-free list form as above; "-" asks pdftotext to write the
# extracted text to standard output.
open my $text_fh, '-|', 'pdftotext', $file, '-'
    or die "$0: failed to run pdftotext: $!";
print escapeHTML($_) while ( <$text_fh> );
close $text_fh
    or die "$0: Failed close on pipe to pdftotext for $file: $?";

print "</body></html>\n";
88
89
90 # How are URLs printed with pdftotext?
# escapeHTML( $text )
#
# Returns a copy of $text that is safe to embed in HTML text or in a
# double-quoted attribute value: & < > " are replaced by their entities
# and form-feed characters (^L) are replaced by a space.  The caller's
# argument is not modified.
sub escapeHTML {
    my ($text) = @_;

    # '&' has to go first, otherwise the ampersands introduced by the
    # entities below would themselves be escaped.
    $text =~ s/&/&amp;/go;
    $text =~ s/</&lt;/go;
    $text =~ s/>/&gt;/go;
    $text =~ s/"/&quot;/go;

    # ^L (form feed) -> space
    $text =~ tr/\014/ /;

    return $text;
}
104

  ViewVC Help
Powered by ViewVC 1.1.22