/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/prog-bin/pdf2html.pm
ViewVC logotype

Contents of /mitgcm.org/devel/buildweb/pkg/swish-e/prog-bin/pdf2html.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1.1.1 - (show annotations) (download) (vendor branch)
Fri Sep 20 19:47:30 2002 UTC (22 years, 10 months ago) by adcroft
Branch: Import, MAIN
CVS Tags: baseline, HEAD
Changes since 1.1: +0 -0 lines
Importing web-site building process.

1 package pdf2html;
2 use strict;
3
4 =pod
5
6 =head1 NAME
7
8 pdf2html - swish-e sample module to convert pdf to html
9
10 =head1 SYNOPSIS
11
12 use pdf2html;
13 my $html_record_ref = pdf2html( $pdf_file_name, 'title' );
14
15 # or by passing content in a scalar reference
16 my $html_text_ref = pdf2html( \$pdf_content, 'title' );
17
18
19
20
21 =head1 DESCRIPTION
22
23 Sample module for use with other swish-e 'prog' document source programs.
24
25 Pass either a file name, or a scalar reference.
26
27 The difference is that when you pass a reference to a scalar,
28 only the content is returned. When you pass a file name,
29 an entire record is returned ready to be fed to swish -- this
30 includes the headers required by swish for indexing.
31
32 The second optional parameter is the extracted PDF info tag to use as the HTML title.
33
34
35
36 The plan is to find a library that will do this to avoid forking an external
37 program.
38
39 =head1 REQUIREMENTS
40
41 Uses the xpdf package that includes the pdftotext conversion program.
42 This is available from http://www.foolabs.com/xpdf/xpdf.html.
43
44 You will also need the module File::Temp (and its dependencies)
45 available from CPAN if passing content to this module (instead of a file name).
46
47
48 =head1 AUTHOR
49
50 Bill Moseley
51
52 =cut
53
54 use Symbol;
55
56
# Package globals used by Exporter and for version reporting.
use vars qw( @ISA @EXPORT $VERSION );

# $Id: pdf2html.pm,v 1.3 2002/01/31 00:37:43 whmoseley Exp $
# Build a numeric version string (e.g. "1.03") from the CVS revision keyword.
$VERSION = sprintf '%d.%02d', q$Revision: 1.3 $ =~ /: (\d+)\.(\d+)/;

# pdf2html() is exported into the caller's namespace by default.
require Exporter;
@ISA    = ('Exporter');
@EXPORT = ('pdf2html');

# Names of the PDF info tags.  Nothing in this file reads the list.
# NOTE(review): the file has no trailing "1;", so this assignment appears to
# be the last statement executed at load time and therefore what makes
# "require pdf2html" return true -- confirm before removing it.
my @InfoTags = qw(
    Title Subject Author CreationDate
    Creator Producer ModDate Keywords
);
71
72
# pdf2html( $file_or_content [, $title_tag] )
#
# Converts a PDF document into a minimal HTML page: the pdfinfo metadata goes
# into <head> and the pdftotext output goes into a <pre> block in <body>.
#
#   $file_or_content  either a file name, or a reference to a scalar that
#                     holds the PDF content itself
#   $title_tag        optional name of the extracted PDF info tag to use as
#                     the HTML <title>
#
# Returns a reference to a scalar:
#   * scalar-ref input -> just the HTML text
#   * file-name input  -> the HTML text preceded by the Content-Length,
#                         Last-Mtime and Path-Name headers
#
# Dies if one of the helpers dies.  A temporary file created for scalar-ref
# input is removed in every case, including when a helper dies.
sub pdf2html {
    my $file_or_content = shift;
    my $title_tag       = shift;

    # Content passed by reference is first written to a temporary file,
    # because the helpers are given a file name to work on.
    my $is_temp = ref $file_or_content ? 1 : 0;
    my $file    = $is_temp
        ? create_temp_file($file_or_content)
        : $file_or_content;

    # Run the helpers inside eval so the temporary file can be removed before
    # any exception is propagated.
    my ( $headers, $content_ref );
    my $ok = eval {
        $headers     = get_pdf_headers( $file, $title_tag ) || '';
        $content_ref = get_pdf_content_ref($file);
        1;
    };
    my $error = $@;

    unlink $file if $is_temp;

    die( $error || "$0: pdf2html failed for unknown reason\n" ) unless $ok;

    my $txt = <<EOF;
<html>
<head>
$headers
</head>
<body>
<pre>
$$content_ref
</pre>
</body>
</html>
EOF

    # Scalar-ref callers only want the converted content.
    return \$txt if $is_temp;

    # File-name callers get a complete record: headers, blank line, content.
    my $mtime = ( stat $file )[9];
    my $size  = length $txt;

    my $ret = <<EOF;
Content-Length: $size
Last-Mtime: $mtime
Path-Name: $file

EOF

    $ret .= $txt;

    return \$ret;
}
121
# get_pdf_headers( $file [, $title_tag] )
#
# Runs "pdfinfo" on $file and turns every "Name: value" line of its output
# into an HTML <meta> tag.  Meta names are lower-cased and spaces in them are
# replaced by underscores; values are passed through escapeXML().
#
#   $file       name of the PDF file to inspect
#   $title_tag  optional meta name whose value is also emitted as <title>;
#               it is normalised the same way as the meta names, so "Title"
#               and "title" are equivalent
#
# Returns the <title> (if any) and <meta> tags joined by newlines, or an empty
# string when pdfinfo produced no usable output.  Dies if pdfinfo cannot be
# started; warns if closing the pipe fails.
sub get_pdf_headers {
    my $file      = shift;
    my $title_tag = shift;

    # List-form pipe open (Perl 5.8+): the file name is handed to pdfinfo as a
    # single argument without going through a shell, so spaces and shell
    # metacharacters in the name are harmless.  "or" (not "||") is required so
    # that the failure check applies to open() rather than to its last
    # argument.
    open my $pipe, '-|', 'pdfinfo', $file
        or die "$0: Failed to run pdfinfo for $file: $!";

    my %metadata;

    # A named lexical is used instead of $_ so the caller's $_ is untouched.
    while ( my $line = <$pipe> ) {
        next unless $line =~ /^\s*([^:]+):\s+(.+)$/;

        my ( $metaname, $value ) = ( lc($1), escapeXML($2) );
        $metaname =~ tr/ /_/;
        $metadata{$metaname} = $value;
    }

    close $pipe or warn "$0: Failed close on pipe to pdfinfo for $file: $?";

    my $metas = join "\n",
        map { qq[<meta name="$_" content="$metadata{$_}">] }
        sort keys %metadata;

    if ($title_tag) {
        # Keys were lower-cased and underscore-normalised above; do the same
        # to the requested tag so the lookup can succeed.
        ( my $key = lc $title_tag ) =~ tr/ /_/;

        $metas = "<title>$metadata{$key}</title>\n$metas"
            if exists $metadata{$key};
    }

    return $metas;
}
152
# get_pdf_content_ref( $file )
#
# Runs "pdftotext $file -" (text written to standard output), slurps the
# whole output and passes it through escapeXML().
#
# Returns a reference to the escaped text; the text is an empty string when
# pdftotext produced no output.  Dies if pdftotext cannot be started; warns if
# closing the pipe fails.
sub get_pdf_content_ref {
    my $file = shift;

    # List-form pipe open (Perl 5.8+): no shell is involved, so the file name
    # cannot be split on whitespace or interpreted as shell syntax.
    open my $pipe, '-|', 'pdftotext', $file, '-'
        or die "$0: failed to run pdftotext: $!";

    # Slurp mode, limited to this one read.
    my $raw = do { local $/; <$pipe> };

    close $pipe or warn "$0: Failed close on pipe to pdftotext for $file: $?";

    # Guard against an undefined read result so callers can always
    # dereference and interpolate the returned value.
    $raw = '' unless defined $raw;

    my $content = escapeXML($raw);

    return \$content;
}
166
167
168
# How are URLs printed with pdftotext?

# escapeXML( $str )
#
# Returns a copy of $str that is safe to embed in HTML text and in
# double-quoted attribute values:
#   &  -> &amp;    (done first so the entities added below are not re-escaped)
#   <  -> &lt;
#   >  -> &gt;
#   "  -> &quot;
# Form-feed characters (^L, octal 014) are replaced by a space.
#
# An undefined argument is returned unchanged (undef).
sub escapeXML {
    my $str = shift;

    return $str unless defined $str;

    $str =~ s/&/&amp;/g;
    $str =~ s/</&lt;/g;
    $str =~ s/>/&gt;/g;
    $str =~ s/"/&quot;/g;
    $str =~ tr/\014/ /;    # ^L

    return $str;
}
183
# This is the portable way to do this, I suppose.
# Otherwise, just create a file in the local directory.

# create_temp_file( $scalar_ref )
#
# Writes the content referenced by $scalar_ref to a new temporary file
# created with File::Temp::tempfile().
#
# Returns the name of the file.  The file is not removed automatically here;
# the caller is expected to unlink it when done.  Dies if the content cannot
# be written or the file cannot be closed; in that case the partially written
# file is removed first.
sub create_temp_file {
    my $scalar_ref = shift;

    # Loaded on demand so File::Temp is only needed when content (rather than
    # a file name) is passed in.
    require File::Temp;

    my ( $fh, $file_name ) = File::Temp::tempfile();

    # PDF content is binary: disable newline translation so the bytes reach
    # the file unchanged on every platform.
    binmode $fh;

    unless ( print {$fh} $$scalar_ref ) {
        my $error = $!;
        close $fh;
        unlink $file_name;
        die "Failed to write to '$file_name' $error";
    }

    unless ( close $fh ) {
        my $error = $!;
        unlink $file_name;
        die "Failed to close '$file_name' $error";
    }

    return $file_name;
}
201
202

  ViewVC Help
Powered by ViewVC 1.1.22