swish-e/prog-bin/pdf2html.pm

package pdf2html;
use strict;

=pod

=head1 NAME

pdf2html - swish-e sample module to convert pdf to html

=head1 SYNOPSIS

    use pdf2html;
    my $html_record_ref = pdf2html( $pdf_file_name, 'title' );

    # or by passing content in a scalar reference
    my $html_text_ref = pdf2html( \$pdf_content, 'title' );


=head1 DESCRIPTION

Sample module for use with other swish-e 'prog' document source programs.

Pass either a file name, or a scalar reference.

The difference is that when you pass a reference to a scalar,
only the content is returned.  When you pass a file name,
an entire record is returned, ready to be fed to swish -- this
includes the headers required by swish for indexing.

The second optional parameter is the extracted PDF info tag to use as the HTML title.


The plan is to find a library that will do this to avoid forking an external
program.

=head1 REQUIREMENTS

Uses the xpdf package that includes the pdftotext conversion program.
This is available from http://www.foolabs.com/xpdf/xpdf.html.

You will also need the module File::Temp (and its dependencies)
available from CPAN if passing content to this module (instead of a file name).


=head1 AUTHOR

Bill Moseley

=cut

use Symbol;


# Package globals: @ISA and @EXPORT drive Exporter, $VERSION reports the
# module version.
use vars qw( @ISA @EXPORT $VERSION );

# Export pdf2html() into the caller's namespace by default.
require Exporter;
@ISA    = ('Exporter');
@EXPORT = ('pdf2html');

# $Id: pdf2html.pm,v 1.3 2002/01/31 00:37:43 whmoseley Exp $
# The revision keyword is split into major/minor and the minor part is
# zero-padded to two digits (revision 1.3 becomes "1.03").
$VERSION = sprintf '%d.%02d', q$Revision: 1.3 $ =~ /: (\d+)\.(\d+)/;

# Names of the PDF info tags.
my @InfoTags = qw(
    Title
    Subject
    Author
    CreationDate
    Creator
    Producer
    ModDate
    Keywords
);


# pdf2html( $file_or_content, $title_tag )
#
# Converts a PDF into a small HTML document.
#
#   $file_or_content - either the name of a PDF file, or a reference to a
#                      scalar holding the PDF data.
#   $title_tag       - optional; passed to get_pdf_headers() to select the
#                      PDF info tag used for the <title> element.
#
# Returns a reference to a scalar:
#   * for a scalar-ref argument, just the HTML text;
#   * for a file name, the HTML text preceded by Content-Length,
#     Last-Mtime and Path-Name header lines and a blank line.
#
# Dies with the helper's error if the headers or the text cannot be
# extracted.  When the PDF data was passed by reference, the temporary file
# created for it is removed whether or not the extraction succeeds.
sub pdf2html {
    my $file_or_content = shift;
    my $title_tag       = shift;

    my $is_content = ref $file_or_content;

    my $file = $is_content
        ? create_temp_file( $file_or_content )
        : $file_or_content;

    # Run both helpers inside eval so that a die in either of them cannot
    # skip the temp-file cleanup below.
    my ( $headers, $content_ref );
    my $ok = eval {
        $headers     = get_pdf_headers( $file, $title_tag ) || '';
        $content_ref = get_pdf_content_ref( $file );
        1;
    };
    my $error = $@;

    # Only a file we created ourselves is ever removed.
    unlink $file if $is_content;

    die( $error || "$0: pdf2html failed for '$file'\n" ) unless $ok;

    # Built from explicit strings (rather than a here-doc) so the four
    # trailing spaces after <html> stay visible and are not lost to editors
    # that strip trailing whitespace.
    my $txt = join '',
        "<html>    \n",
        "<head>\n",
        "$headers\n",
        "</head>\n",
        "<body>\n",
        "<pre>\n",
        "$$content_ref\n",
        "</pre>\n",
        "</body>\n",
        "</html>\n";

    return \$txt if $is_content;

    # File-name mode: prepend the record headers.
    my $mtime = ( stat $file )[9];
    my $size  = length $txt;

    my $ret = join '',
        "Content-Length: $size\n",
        "Last-Mtime: $mtime\n",
        "Path-Name: $file\n",
        "\n",
        $txt;

    return \$ret;
}

# get_pdf_headers( $file, $title_tag )
#
# Runs "pdfinfo" on $file and turns every "Name: value" line of its output
# into an HTML <meta> tag.  Names are lower-cased and their spaces replaced
# with underscores; values are passed through escapeXML().
#
#   $file      - name of the PDF file to inspect.
#   $title_tag - optional name of the info tag to emit as <title>.  It is
#                normalised the same way as the tag names (lower-case,
#                spaces to underscores), so 'Title' and 'title' both match.
#
# Returns the <meta> tags (sorted by name, one per line) as a single string,
# preceded by a <title> line when $title_tag names a tag that was found.
# Returns an empty string when pdfinfo produced no usable lines.
#
# Dies if pdfinfo cannot be started; warns if closing the pipe fails.
sub get_pdf_headers {
    my $file      = shift;
    my $title_tag = shift;

    # List-form pipe open: pdfinfo is executed directly with $file as a
    # single argument, so the name is never interpreted by a shell (spaces
    # and metacharacters are safe).  The low-precedence "or" makes sure the
    # die really guards the open.
    open my $pipe, '-|', 'pdfinfo', $file
        or die "$0: Failed to run pdfinfo on $file: $!";

    my %metadata;

    while ( my $line = <$pipe> ) {
        next unless $line =~ /^\s*([^:]+):\s+(.+)$/;

        my ( $metaname, $value ) = ( lc( $1 ), escapeXML( $2 ) );
        $metaname =~ tr/ /_/;
        $metadata{$metaname} = $value;
    }

    close $pipe or warn "$0: Failed close on pipe to pdfinfo for $file: $?";

    # NOTE(review): escapeXML() only rewrites < and >, so a double quote in
    # a value would end the content attribute early -- confirm whether the
    # consumer of this HTML cares.
    my $metas = join "\n",
        map { qq[<meta name="$_" content="$metadata{$_}">] }
        sort keys %metadata;

    if ( $title_tag ) {
        # Keys in %metadata are lower-case with underscores; bring the
        # requested tag into the same form before looking it up.
        my $title_key = lc $title_tag;
        $title_key =~ tr/ /_/;

        if ( exists $metadata{$title_key} ) {
            $metas = "<title>$metadata{$title_key}</title>\n$metas";
        }
    }

    return $metas;
}

# get_pdf_content_ref( $file )
#
# Runs "pdftotext $file -" (text written to standard output), reads all of
# the output, and passes it through escapeXML().
#
# Returns a reference to a scalar holding the escaped text; the scalar is
# an empty string when pdftotext produced no output.
#
# Dies if pdftotext cannot be started; warns if closing the pipe fails.
sub get_pdf_content_ref {
    my $file = shift;

    # List-form pipe open: $file is handed to pdftotext as one argument and
    # is never interpreted by a shell.
    open my $pipe, '-|', 'pdftotext', $file, '-'
        or die "$0: failed to run pdftotext: $!";

    # Slurp in scalar context so exactly one value (or undef) comes back.
    my $raw = do { local $/ = undef; <$pipe> };

    close $pipe or warn "$0: Failed close on pipe to pdftotext for $file: $?";

    my $content = escapeXML( defined $raw ? $raw : '' );

    return \$content;
}


# How are URLs printed with pdftotext?
# escapeXML( $str )
#
# Returns a copy of $str with every "<" replaced by "&lt;", every ">"
# replaced by "&gt;", and every form feed (^L, octal 014) replaced by a
# space.  The argument itself is not modified.
sub escapeXML {
    my ( $text ) = @_;

    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ tr/\014/ /;    # ^L

    # Ampersands and double quotes are passed through unchanged; the
    # substitutions for them are disabled:
    #   s/&/&amp;/go;
    #   s/"/&quot;/go;

    return $text;
}

# This is the portable way to do this, I suppose.
# Otherwise, just create a file in the local directory.

# create_temp_file( $scalar_ref )
#
# Writes the data referenced by $scalar_ref to a new temporary file created
# with File::Temp::tempfile() and returns the file's name.  The caller is
# responsible for removing the file.
#
# Dies if the data cannot be written or the file cannot be closed; in that
# case the partially written temporary file is removed first.
sub create_temp_file {
    my $scalar_ref = shift;

    # Loaded on demand so File::Temp is only needed when content (rather
    # than a file name) is passed in.
    require File::Temp;

    my ( $fh, $file_name ) = File::Temp::tempfile();

    # The data is a PDF, i.e. binary: switch off any newline translation so
    # the bytes reach the file unchanged on every platform.
    binmode $fh;

    my $failure;
    my $printed = print {$fh} $$scalar_ref;

    if ( !$printed ) {
        $failure = "$!";
        close $fh;    # release the handle so the unlink below can succeed
    }
    elsif ( !close $fh ) {
        $failure = "Failed to close '$file_name' $!";
    }

    if ( defined $failure ) {
        unlink $file_name;
        die $failure;
    }

    return $file_name;
}
    

1	adcroft	1.1	package pdf2html;
2			use strict;
3
4			=pod
5
6			=head1 NAME
7
8			pdf2html - swish-e sample module to convert pdf to html
9
10			=head1 SYNOPSIS
11
12			use pdf2html;
13			my $html_record_ref = pdf2html( $pdf_file_name, 'title' );
14
15			# or by passing content in a scalar reference
16			my $html_text_ref = pdf2html( \$pdf_content, 'title' );
17
18
19
20
21			=head1 DESCRIPTION
22
23			Sample module for use with other swish-e 'prog' document source programs.
24
25			Pass either a file name, or a scalar reference.
26
27			The differece is when you pass a reference to a scalar
28			only the content is returned. When you pass a file name
29			an entire record is returned ready to be fed to swish -- this
30			includes the headers required by swish for indexing.
31
32			The second optional parameter is the extracted PDF info tag to use as the HTML title.
33
34
35
36			The plan is to find a library that will do this to avoid forking an external
37			program.
38
39			=head1 REQUIREMENTS
40
41			Uses the xpdf package that includes the pdftotext conversion program.
42			This is available from http://www.foolabs.com/xpdf/xpdf.html.
43
44			You will also need the module File::Temp (and its dependencies)
45			available from CPAN if passing content to this module (instead of a file name).
46
47
48			=head1 AUTHOR
49
50			Bill Moseley
51
52			=cut
53
54			use Symbol;
55
56
57			use vars qw(
58			@ISA
59			@EXPORT
60			$VERSION
61			);
62
63			# $Id: pdf2html.pm,v 1.3 2002/01/31 00:37:43 whmoseley Exp $
64			$VERSION = sprintf '%d.%02d', q$Revision: 1.3 $ =~ /: (\d+)\.(\d+)/;
65
66			require Exporter;
67			@ISA = qw(Exporter);
68			@EXPORT = qw(pdf2html);
69
70			my @InfoTags = qw/Title Subject Author CreationDate Creator Producer ModDate Keywords/;
71
72
73			sub pdf2html {
74			my $file_or_content = shift;
75			my $title_tag = shift;
76
77
78			my $file = ref $file_or_content
79			? create_temp_file( $file_or_content )
80			: $file_or_content;
81
82			my $headers = get_pdf_headers( $file, $title_tag ) \|\| '';
83
84			my $content_ref = get_pdf_content_ref( $file );
85
86			my $txt = <<EOF;
87			<html>
88			<head>
89			$headers
90			</head>
91			<body>
92			<pre>
93			$$content_ref
94			</pre>
95			</body>
96			</html>
97			EOF
98
99			if ( ref $file_or_content ) {
100			unlink $file;
101			return \$txt;
102			}
103
104			my $mtime = (stat $file )[9];
105
106			my $size = length $txt;
107
108			my $ret = <<EOF;
109			Content-Length: $size
110			Last-Mtime: $mtime
111			Path-Name: $file
112
113			EOF
114
115			$ret .= $txt;
116
117			return \$ret;
118
119
120			}
121
122			sub get_pdf_headers {
123
124			my $file = shift;
125			my $title_tag = shift;
126
127
128			my $sym = gensym;
129
130			open $sym, "pdfinfo $file \|" \|\| die "$0: Failed to open $file $!";
131
132			my %metadata;
133
134			while (<$sym>) {
135			if ( /^\s*([^:]+):\s+(.+)$/ ) {
136			my ( $metaname, $value ) = ( lc( $1 ), escapeXML( $2 ) );
137			$metaname =~ tr/ /_/;
138			$metadata{$metaname} = $value;
139			}
140			}
141			close $sym or warn "$0: Failed close on pipe to pdfinfo for $file: $?";
142
143			my $metas = join "\n", map { qq[<meta name="$_" content="$metadata{$_}">] } sort keys %metadata;
144
145			if ( $title_tag && exists $metadata{ $title_tag } ) {
146			$metas = "<title>$metadata{ $title_tag }</title>\n$metas";
147			}
148
149			return $metas;
150
151			}
152
153			sub get_pdf_content_ref {
154			my $file = shift;
155
156			my $sym = gensym;
157			open $sym, "pdftotext $file - \|" or die "$0: failed to run pdftotext: $!";
158
159			local $/ = undef;
160			my $content = escapeXML(<$sym>);
161
162			close $sym or warn "$0: Failed close on pipe to pdftotext for $file: $?";
163
164			return \$content;
165			}
166
167
168
169			# How are URLs printed with pdftotext?
170			sub escapeXML {
171
172			my $str = shift;
173
174			for ( $str ) {
175			s/</</go;
176			s/>/>/go;
177			tr/\014/ /; # ^L
178			# s/&/&/go;
179			# s/"/"/go;
180			}
181			return $str;
182			}
183
184			# This is the portable way to do this, I suppose.
185			# Otherwise, just create a file in the local directory.
186
187			sub create_temp_file {
188			my $scalar_ref = shift;
189
190			require "File/Temp.pm";
191
192			my ( $fh, $file_name ) = File::Temp::tempfile();
193
194			print $fh $$scalar_ref or die $!;
195
196
197			close $fh or die "Failed to close '$file_name' $!";
198
199			return $file_name;
200			}
201
202