swish-e/filter-bin/_pdf2html.pl

#! /usr/local/bin/perl -w
use strict;

# -- Filter PDF to simple HTML for swish
# --
# -- 2000-05  rasc
#
=pod

This filter requires two programs "pdfinfo" and "pdftotext".
These programs are part of the xpdf package found at
http://www.foolabs.com/xpdf/xpdf.html.

These programs must be found in the PATH when indexing is run, or 
explicitly set the path in this program:

  $ENV{PATH} = '/path/to/programs'

"pdfinfo" extracts the document info from a pdf file, if any exist,
and creates metanames for swish to index.  See man pdfinfo(1) for
information what keywords are available.

An HTML title is created from the "title" and "subject" pdf info data.
Adjust as needed below.

How the extracted keyword info is indexed in Swish-e is controlled by
the following Swish-e configuration settings: MetaNames, PropertyNames,
UndefinedMetaTags.

Passing the -raw option to pdftotext may improve indexing time by
reducing the size of the converted output.

=cut


# Usage: <this script> <filename>
#
# Converts the PDF named on the command line into a minimal HTML document
# on STDOUT: the pdfinfo fields become the <title> and <meta> tags, and the
# pdftotext output becomes the <body>.  Dies with a message on STDERR if
# either helper program cannot be started or reports failure.

my $file = shift;

# Test for defined/non-empty rather than truth so a file named "0" is accepted.
die "Usage: $0 <filename>\n" unless defined $file && length $file;

#
# -- read pdf meta information
#

my %metadata;

# List-form pipe open: the filename is handed to pdfinfo as one argument and
# never passes through a shell, so spaces and shell metacharacters in the
# name are harmless.  The low-precedence "or" makes sure a failed open is
# actually detected.
# NOTE(review): list-form pipe open needs Perl 5.8+ (5.22+ on Windows).
open my $info_fh, '-|', 'pdfinfo', $file
    or die "$0: Failed to open $file $!";

while ( my $line = <$info_fh> ) {

    # pdfinfo prints one "Key:   value" pair per line.
    if ( $line =~ /^\s*([^:]+):\s+(.+)$/ ) {

        # Copy the captures before calling anything else that uses regexes.
        my ( $metaname, $value ) = ( lc( $1 ), $2 );
        $metaname =~ tr/ /_/;

        # Both the name and the value end up inside double-quoted HTML
        # attributes below, so both are escaped.
        $metadata{ escapeHTML( $metaname ) } = escapeHTML( $value );
    }
}
close $info_fh
    or die "$0: Failed close on pipe to pdfinfo for $file: $?";


# Set the default title from the title and subject info

# Keep any non-empty value (a plain truth test would discard a title of "0").
my @title = grep { defined && length } @metadata{ qw/title subject/ };

# title/subject are emitted in <title>, not as <meta> tags.
delete $metadata{$_} for qw/title subject/;


my $title = join ' // ', ( @title ? @title : 'Unknown title' );

my $metadata =
    join "\n",
        map { qq[<meta name="$_" content="$metadata{$_}">] }
                   sort keys %metadata;


print <<EOF;
<html>
<head>
    <title>
        $title
    </title>
    $metadata
</head>
<body>
EOF

# Might be faster to use sysread and read in larger blocks

# The trailing "-" argument asks pdftotext to write the text to stdout.
open my $text_fh, '-|', 'pdftotext', $file, '-'
    or die "$0: failed to run pdftotext: $!";

while ( my $line = <$text_fh> ) {
    print escapeHTML( $line );
}
close $text_fh
    or die "$0: Failed close on pipe to pdftotext for $file: $?";

print "</body></html>\n";


# How are URLs printed with pdftotext?
# escapeHTML( $text )
#
# Returns a copy of $text with the characters & < > " replaced by their
# HTML entities and every form feed (^L, octal 014) replaced by a space.
# The argument itself is not modified.
sub escapeHTML {
    my ($text) = @_;

    # Replacement entity for each character that must not appear literally.
    my %entity_for = (
        '&' => '&amp;',
        '<' => '&lt;',
        '>' => '&gt;',
        '"' => '&quot;',
    );

    # Single pass: each special character is replaced exactly once, so the
    # "&" of a freshly inserted entity is never escaped again.
    $text =~ s/([&<>"])/$entity_for{$1}/g;

    $text =~ tr/\014/ /;    # ^L

    return $text;
}

1	#! /usr/local/bin/perl -w
2	use strict;
3
4	# -- Filter PDF to simple HTML for swish
5	# --
6	# -- 2000-05 rasc
7	#
8	=pod
9
10	This filter requires two programs "pdfinfo" and "pdftotext".
11	These programs are part of the xpdf package found at
12	http://www.foolabs.com/xpdf/xpdf.html.
13
14	These programs must be found in the PATH when indexing is run, or
15	explicitly set the path in this program:
16
17	$ENV{PATH} = '/path/to/programs'
18
19	"pdfinfo" extracts the document info from a pdf file, if any exist,
20	and creates metanames for swish to index. See man pdfinfo(1) for
21	information what keywords are available.
22
23	An HTML title is created from the "title" and "subject" pdf info data.
24	Adjust as needed below.
25
26	How the extracted keyword info is indexed in Swish-e is controlled by
27	the following Swish-e configuration settings: MetaNames, PropertyNames,
28	UndefinedMetaTags.
29
30	Passing the -raw option to pdftotext may improve indexing time by
31	reducing the size of the converted output.
32
33	=cut
34
35
36	my $file = shift || die "Usage: $0 <filename>\n";
37
38	#
39	# -- read pdf meta information
40	#
41
42	my %metadata;
43
44	open F, "pdfinfo $file |" || die "$0: Failed to open $file $!";
45
46	while (<F>) {
47	if ( /^\s*([^:]+):\s+(.+)$/ ) {
48	my ( $metaname, $value ) = ( lc( $1 ), escapeHTML( $2 ) );
49	$metaname =~ tr/ /_/;
50	$metadata{$metaname} = $value;
51	}
52	}
53	close F or die "$0: Failed close on pipe to pdfinfo for $file: $?";
54
55
56	# Set the default title from the title and subject info
57
58	my @title = grep { $_ } @metadata{ qw/title subject/ };
59	delete $metadata{$_} for qw/title subject/;
60
61
62	my $title = join ' // ', ( @title ? @title : 'Unknown title' );
63
64	my $metadata =
65	join "\n",
66	map { qq[<meta name="$_" content="$metadata{$_}">] }
67	sort keys %metadata;
68
69
70	print <<EOF;
71	<html>
72	<head>
73	<title>
74	$title
75	</title>
76	$metadata
77	</head>
78	<body>
79	EOF
80
81	# Might be faster to use sysread and read in larger blocks
82
83	open F, "pdftotext $file - |" or die "$0: failed to run pdftotext: $!";
84	print escapeHTML($_) while ( <F> );
85	close F or die "$0: Failed close on pipe to pdftotext for $file: $?";
86
87	print "</body></html>\n";
88
89
90	# How are URLs printed with pdftotext?
91	sub escapeHTML {
92
93	my $str = shift;
94
95	for ( $str ) {
96	s/&/&amp;/go;
97	s/</&lt;/go;
98	s/>/&gt;/go;
99	s/"/&quot;/go;
100	tr/\014/ /; # ^L
101	}
102	return $str;
103	}
104