/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/prog-bin/pdf2html.pm
ViewVC logotype

Annotation of /mitgcm.org/devel/buildweb/pkg/swish-e/prog-bin/pdf2html.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1.1.1 - (hide annotations) (download) (vendor branch)
Fri Sep 20 19:47:30 2002 UTC (22 years, 10 months ago) by adcroft
Branch: Import, MAIN
CVS Tags: baseline, HEAD
Changes since 1.1: +0 -0 lines
Importing web-site building process.

1 adcroft 1.1 package pdf2html;
2     use strict;
3    
4     =pod
5    
6     =head1 NAME
7    
8     pdf2html - swish-e sample module to convert pdf to html
9    
10     =head1 SYNOPSIS
11    
12     use pdf2html;
13     my $html_record_ref = pdf2html( $pdf_file_name, 'title' );
14    
15     # or by passing content in a scalar reference
16     my $html_text_ref = pdf2html( \$pdf_content, 'title' );
17    
18    
19    
20    
21     =head1 DESCRIPTION
22    
23     Sample module for use with other swish-e 'prog' document source programs.
24    
25     Pass either a file name, or a scalar reference.
26    
27     The difference is that when you pass a reference to a scalar
28     only the content is returned. When you pass a file name
29     an entire record is returned ready to be fed to swish -- this
30     includes the headers required by swish for indexing.
31    
32     The second optional parameter is the extracted PDF info tag to use as the HTML title.
33    
34    
35    
36     The plan is to find a library that will do this to avoid forking an external
37     program.
38    
39     =head1 REQUIREMENTS
40    
41     Uses the xpdf package that includes the pdftotext conversion program.
42     This is available from http://www.foolabs.com/xpdf/xpdf.html.
43    
44     You will also need the module File::Temp (and its dependencies)
45     available from CPAN if passing content to this module (instead of a file name).
46    
47    
48     =head1 AUTHOR
49    
50     Bill Moseley
51    
52     =cut
53    
use Symbol;

# Package globals: Exporter reads @ISA/@EXPORT, callers may check $VERSION.
use vars qw( @ISA @EXPORT $VERSION );

# $Id: pdf2html.pm,v 1.3 2002/01/31 00:37:43 whmoseley Exp $
# Derive the version string from the CVS Revision keyword:
# major as-is, minor zero-padded to two digits.
{
    my ( $major, $minor ) = q$Revision: 1.3 $ =~ /: (\d+)\.(\d+)/;
    $VERSION = sprintf '%d.%02d', $major, $minor;
}

# pdf2html() is exported into the caller's namespace by default.
require Exporter;
@ISA    = ('Exporter');
@EXPORT = ('pdf2html');

# Names of PDF info fields.  NOTE(review): not referenced by any sub
# visible in this file -- presumably kept for documentation; confirm.
my @InfoTags = qw(
    Title   Subject  Author  CreationDate
    Creator Producer ModDate Keywords
);
71    
72    
# pdf2html( $file_or_content [, $title_tag] )
#
# Convert a PDF into a minimal HTML document: the metadata produced by
# get_pdf_headers() goes into <head>, the text produced by
# get_pdf_content_ref() goes into a <pre> block in <body>.
#
# Parameters:
#   $file_or_content - either the path of a PDF file, or a reference to a
#                      scalar holding the PDF content (which is written to
#                      a temporary file for the duration of the call).
#   $title_tag       - optional; passed through to get_pdf_headers() to
#                      select the metadata field used as the HTML <title>.
#
# Returns a reference to a string:
#   * scalar reference passed -> the HTML text only;
#   * file name passed        -> the HTML text preceded by Content-Length,
#                                Last-Mtime and Path-Name header lines and
#                                a blank line.
#
# Dies with whatever error a helper dies with (e.g. get_pdf_content_ref()
# dies when pdftotext cannot be started).  The temporary file, if one was
# created, is removed before the error is re-thrown.
sub pdf2html {
    my $file_or_content = shift;
    my $title_tag       = shift;

    my $is_content = ref $file_or_content;

    my $file = $is_content
        ? create_temp_file($file_or_content)
        : $file_or_content;

    # Run both conversions inside eval so that a die in either helper
    # cannot leave the temporary file behind.
    my ( $headers, $content_ref );
    my $ok = eval {
        $headers     = get_pdf_headers( $file, $title_tag ) || '';
        $content_ref = get_pdf_content_ref($file);
        1;
    };
    my $error = $@;

    # Only a file we created ourselves is ever removed.
    unlink $file if $is_content;

    die $error unless $ok;

    my $txt = <<EOF;
<html>
<head>
$headers
</head>
<body>
<pre>
$$content_ref
</pre>
</body>
</html>
EOF

    # Content was passed in: the caller wants the converted text only.
    return \$txt if $is_content;

    # A file name was passed in: prepend the record headers.
    my $mtime = ( stat $file )[9];
    my $size  = length $txt;

    my $ret = <<EOF;
Content-Length: $size
Last-Mtime: $mtime
Path-Name: $file

EOF

    $ret .= $txt;

    return \$ret;
}
121    
# get_pdf_headers( $file [, $title_tag] )
#
# Run "pdfinfo" on $file and turn each "Name: value" line of its output
# into an HTML <meta> tag.  Field names are lower-cased and spaces in them
# become underscores; values are passed through escapeXML().
#
# Parameters:
#   $file      - path of the PDF file to inspect.
#   $title_tag - optional; name of a metadata field whose value is also
#                emitted as a leading <title> element.  It is normalised
#                the same way as the field names, so 'Title', 'title' and
#                'Creation Date' all work.
#
# Returns the tags as one newline-joined string (empty when pdfinfo
# produced no usable output).
#
# Dies if pdfinfo cannot be started; only warns if the pipe closes with a
# failure status.
sub get_pdf_headers {
    my $file      = shift;
    my $title_tag = shift;

    # List-form pipe open (perl 5.8+): the file name is handed to pdfinfo
    # as a single argument and never seen by a shell, so spaces and shell
    # metacharacters in it are harmless.  "or" (not "||") is required so
    # that the die is attached to open() rather than to its last argument.
    open my $pipe, '-|', 'pdfinfo', $file
        or die "$0: Failed to run pdfinfo on $file: $!";

    my %metadata;

    while ( my $line = <$pipe> ) {
        next unless $line =~ /^\s*([^:]+):\s+(.+)$/;

        # Copy the captures before calling anything that may run a regex.
        my ( $metaname, $raw_value ) = ( lc $1, $2 );
        $metaname =~ tr/ /_/;
        $metadata{$metaname} = escapeXML($raw_value);
    }

    close $pipe or warn "$0: Failed close on pipe to pdfinfo for $file: $?";

    my @meta_tags;
    for my $name ( sort keys %metadata ) {

        # escapeXML() leaves double quotes alone, which would end the
        # content="..." attribute early; escape them for the attribute only.
        ( my $attr_value = $metadata{$name} ) =~ s/"/&quot;/g;
        push @meta_tags, qq[<meta name="$name" content="$attr_value">];
    }
    my $metas = join "\n", @meta_tags;

    if ($title_tag) {

        # Keys are stored lower-cased with underscores, so look the
        # requested tag up in the same form.
        ( my $title_key = lc $title_tag ) =~ tr/ /_/;

        if ( exists $metadata{$title_key} ) {
            $metas = "<title>$metadata{$title_key}</title>\n$metas";
        }
    }

    return $metas;
}
152    
# get_pdf_content_ref( $file )
#
# Run "pdftotext $file -" (text written to standard output), slurp the
# whole output and pass it through escapeXML().
#
# Parameters:
#   $file - path of the PDF file to convert.
#
# Returns a reference to the escaped text.  The referenced string is
# always defined; it is empty when pdftotext produced no output.
#
# Dies if pdftotext cannot be started; only warns if the pipe closes with
# a failure status.
sub get_pdf_content_ref {
    my $file = shift;

    # List-form pipe open (perl 5.8+): no shell is involved, so the file
    # name cannot be split or interpreted as shell syntax.
    open my $pipe, '-|', 'pdftotext', $file, '-'
        or die "$0: failed to run pdftotext: $!";

    # Slurp in scalar context.  A list-context read in slurp mode returns
    # an empty list when there is no output at all, which would hand
    # escapeXML() no argument and yield an undefined result.
    my $raw = do { local $/; <$pipe> };

    close $pipe or warn "$0: Failed close on pipe to pdftotext for $file: $?";

    my $content = escapeXML( defined $raw ? $raw : '' );

    return \$content;
}
166    
167    
168    
# How are URLs printed with pdftotext?
#
# escapeXML( $string )
#
# Return a copy of $string in which "<" and ">" are replaced by the
# entities &lt; and &gt;, and every form feed (^L, octal 014) is replaced
# by a single space.  The argument itself is not modified.
#
# "&" and double quotes are deliberately passed through unchanged:
#   s/&/&amp;/   -- disabled
#   s/"/&quot;/  -- disabled
sub escapeXML {
    my ($text) = @_;

    my %entity_for = (
        '<' => '&lt;',
        '>' => '&gt;',
    );

    # Both angle brackets in one pass, then form feeds.
    $text =~ s/([<>])/$entity_for{$1}/g;
    $text =~ tr/\014/ /;    # ^L

    return $text;
}
183    
# This is the portable way to do this, I suppose.
# Otherwise, just create a file in the local directory.
#
# create_temp_file( $scalar_ref )
#
# Write the content referenced by $scalar_ref to a new temporary file
# created with File::Temp::tempfile() (loaded on demand, so the module is
# only needed when content rather than a file name is passed in).
#
# Parameters:
#   $scalar_ref - reference to a scalar holding the data to write.
#
# Returns the name of the temporary file.  The caller is responsible for
# removing it.
#
# Dies if the data cannot be written or the file cannot be closed; the
# partially written file is removed first.
sub create_temp_file {
    my $scalar_ref = shift;

    require File::Temp;

    my ( $fh, $file_name ) = File::Temp::tempfile();

    # The data is a PDF, i.e. binary: make sure no newline translation
    # or other I/O layer alters the bytes on the way to disk.
    binmode $fh;

    unless ( print {$fh} $$scalar_ref ) {
        my $write_error = $!;
        close $fh;
        unlink $file_name;
        die "Failed to write '$file_name' $write_error";
    }

    # Buffered write errors can surface at close time.
    unless ( close $fh ) {
        my $close_error = $!;
        unlink $file_name;
        die "Failed to close '$file_name' $close_error";
    }

    return $file_name;
}
201    
202    

  ViewVC Help
Powered by ViewVC 1.1.22