1 |
#! /usr/local/bin/perl -w |
2 |
use strict; |
3 |
|
4 |
# -- Filter PDF to simple HTML for swish |
5 |
# -- |
6 |
# -- 2000-05 rasc |
7 |
# |
8 |
=pod |
9 |
|
10 |
This filter requires two programs "pdfinfo" and "pdftotext". |
11 |
These programs are part of the xpdf package found at |
12 |
http://www.foolabs.com/xpdf/xpdf.html. |
13 |
|
14 |
These programs must be found in the PATH when indexing is run, or |
15 |
explicitly set the path in this program: |
16 |
|
17 |
$ENV{PATH} = '/path/to/programs' |
18 |
|
19 |
"pdfinfo" extracts the document info from a pdf file, if any exist, |
20 |
and creates metanames for swish to index.  See man pdfinfo(1) for
information on which keywords are available.
22 |
|
23 |
An HTML title is created from the "title" and "subject" pdf info data. |
24 |
Adjust as needed below. |
25 |
|
26 |
How the extracted keyword info is indexed in Swish-e is controlled by |
27 |
the following Swish-e configuration settings: MetaNames, PropertyNames, |
28 |
UndefinedMetaTags. |
29 |
|
30 |
Passing the -raw option to pdftotext may improve indexing time by |
31 |
reducing the size of the converted output. |
32 |
|
33 |
=cut |
34 |
|
35 |
|
36 |
# Name of the PDF file to convert: the first (required) command-line argument.
# Tested with defined/length rather than truth so a file named "0" is accepted.
my $file = shift;
die "Usage: $0 <filename>\n" unless defined($file) && length($file);

#
# -- read pdf meta information
#

# Maps a lower-cased, underscore-joined pdfinfo field name (e.g. "page_size")
# to its HTML-escaped value.
my %metadata;

# List-form pipe open: pdfinfo is run directly rather than through a shell,
# so spaces or shell metacharacters in $file cannot break or hijack the
# command.  "or" (not "||") is required here so the die applies to open()
# itself and not to its last argument.
open my $pdfinfo, '-|', 'pdfinfo', $file
    or die "$0: Failed to run pdfinfo for $file: $!";

while ( my $line = <$pdfinfo> ) {

    # pdfinfo prints "Field Name:   value" lines; ignore anything else.
    next unless $line =~ /^\s*([^:]+):\s+(.+)$/;

    # Copy the captures before calling anything that runs further matches.
    my ( $metaname, $value ) = ( lc $1, $2 );

    $metaname =~ tr/ /_/;
    $metadata{$metaname} = escapeHTML($value);
}

# close() on a pipe returns false when the child exited non-zero; the wait
# status is then in $?.
close $pdfinfo
    or die "$0: Failed close on pipe to pdfinfo for $file: $?";
54 |
|
55 |
|
56 |
# Build the HTML <title> from the pdfinfo "title" and "subject" fields.
# The delete-slice both fetches the two values and removes them from
# %metadata so they are not repeated as <meta> tags below; empty or
# missing values are dropped by the grep.
my @title = grep { $_ } delete @metadata{ qw/title subject/ };

my $title = @title ? join( ' // ', @title ) : 'Unknown title';

# One <meta> tag per remaining pdfinfo field, in sorted key order.
my @meta_tags;
for my $name ( sort keys %metadata ) {
    push @meta_tags, qq[<meta name="$name" content="$metadata{$name}">];
}

my $metadata = join "\n", @meta_tags;
68 |
|
69 |
|
70 |
# Emit the document head: the computed title followed by the <meta> tags.
print <<EOF;
<html>
<head>
<title>
$title
</title>
$metadata
</head>
<body>
EOF

# Might be faster to use sysread and read in larger blocks

# Stream the extracted text ("-" tells pdftotext to write to stdout) into
# the HTML body, escaping each line.  List-form pipe open runs pdftotext
# without a shell, so the file name is passed through verbatim.
open my $pdftotext, '-|', 'pdftotext', $file, '-'
    or die "$0: failed to run pdftotext: $!";

while ( my $line = <$pdftotext> ) {
    print escapeHTML($line);
}

# A false close() on a pipe means the child exited non-zero; status is in $?.
close $pdftotext
    or die "$0: Failed close on pipe to pdftotext for $file: $?";

print "</body></html>\n";
88 |
|
89 |
|
90 |
# How are URLs printed with pdftotext? |
91 |
# escapeHTML( $str )
#
# Returns a copy of $str made safe for use in HTML text and in
# double-quoted attribute values:
#   &  ->  &amp;     <  ->  &lt;     >  ->  &gt;     "  ->  &quot;
# Form feeds (^L, octal 014), which pdftotext emits between pages, are
# replaced by a single space.  The caller's string is not modified.
sub escapeHTML {

    my $str = shift;

    # Kept inside the sub on purpose: this sub is defined at the bottom of
    # the script but called from top-level code that runs earlier, so a
    # file-scoped lexical declared here would still be empty at call time.
    my %entity_for = (
        '&' => '&amp;',
        '<' => '&lt;',
        '>' => '&gt;',
        '"' => '&quot;',
    );

    # Single pass, so an "&" introduced by one replacement is never
    # escaped a second time.
    $str =~ s/([&<>"])/$entity_for{$1}/g;
    $str =~ tr/\014/ /;    # ^L

    return $str;
}
104 |
|