package pdf2html;
use strict;
=pod
=head1 NAME
pdf2html - swish-e sample module to convert pdf to html
=head1 SYNOPSIS
use pdf2html;
my $html_record_ref = pdf2html( $pdf_file_name, 'title' );
# or by passing content in a scalar reference
my $html_text_ref = pdf2html( \$pdf_content, 'title' );
=head1 DESCRIPTION
Sample module for use with other swish-e 'prog' document source programs.
Pass either a file name, or a scalar reference.
The difference is that when you pass a reference to a scalar,
only the content is returned. When you pass a file name,
an entire record is returned, ready to be fed to swish -- this
includes the headers required by swish for indexing.
The optional second parameter names the extracted PDF info tag to use as the HTML title.
The plan is to find a library that will do this to avoid forking an external
program.
=head1 REQUIREMENTS
Uses the xpdf package that includes the pdftotext conversion program.
This is available from http://www.foolabs.com/xpdf/xpdf.html.
You will also need the module File::Temp (and its dependencies)
available from CPAN if passing content to this module (instead of a file name).
=head1 AUTHOR
Bill Moseley
=cut
use Symbol;
# Package globals: the two consumed by Exporter plus the module version.
use vars qw( @ISA @EXPORT $VERSION );

# $Id: pdf2html.pm,v 1.1 2002/09/20 19:47:30 adcroft Exp $
# Build "major.minor" (minor zero-padded to two digits) from the RCS
# Revision keyword. The match is evaluated in list context, so its two
# captures become the arguments for the sprintf format.
$VERSION = sprintf( '%d.%02d', q$Revision: 1.1 $ =~ /: (\d+)\.(\d+)/ );

# Exporter is loaded at run time; pdf2html() is exported by default.
require Exporter;
@ISA    = ('Exporter');
@EXPORT = ('pdf2html');

# Names of PDF info tags (how they are consumed is outside this block).
my @InfoTags = qw(
    Title Subject Author CreationDate
    Creator Producer ModDate Keywords
);
sub pdf2html {
my $file_or_content = shift;
my $title_tag = shift;
my $file = ref $file_or_content
? create_temp_file( $file_or_content )
: $file_or_content;
my $headers = get_pdf_headers( $file, $title_tag ) || '';
my $content_ref = get_pdf_content_ref( $file );
my $txt = <
$headers
$$content_ref