/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/prog-bin/pdf2xml.pm
ViewVC logotype

Contents of /mitgcm.org/devel/buildweb/pkg/swish-e/prog-bin/pdf2xml.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1.1.1 - (show annotations) (download) (vendor branch)
Fri Sep 20 19:47:30 2002 UTC (22 years, 10 months ago) by adcroft
Branch: Import, MAIN
CVS Tags: baseline, HEAD
Changes since 1.1: +0 -0 lines
Importing web-site building process.

1 package pdf2xml;
2 use strict;
3
4 =pod
5
6 =head1 NAME
7
8 pdf2xml - swish-e sample module to convert PDF files to XML
9
10 =head1 SYNOPSIS
11
12 use pdf2xml;
13 my $xml_record_ref = pdf2xml( $pdf_file_name );
14
15 # or by passing content in a scalar reference
16 my $xml_text_ref = pdf2xml( \$pdf_content );
17
18
19
20
21 =head1 DESCRIPTION
22
23 Sample module for use with other swish-e 'prog' document source programs.
24
25 Pass either a file name, or a scalar reference.
26
27 The difference is that when you pass a reference to a scalar,
28 only the content is returned. When you pass a file name,
29 an entire record is returned, ready to be fed to swish -- this
30 includes the headers required by swish for indexing.
31
32 The plan is to find a library that will do this to avoid forking an external
33 program.
34
35 =head1 REQUIREMENTS
36
37 Uses the xpdf package that includes the pdftotext conversion program.
38 This is available from http://www.foolabs.com/xpdf/xpdf.html.
39
40 You will also need the module File::Temp (and its dependencies)
41 available from CPAN if passing content to this module (instead of a file name).
42
43
44 =head1 AUTHOR
45
46 Bill Moseley
47
48 =cut
49
use Symbol;    # provides gensym()

# Package globals that Exporter and version checks look up by name.
use vars qw( @ISA @EXPORT $VERSION );

# $Id: pdf2xml.pm,v 1.3 2001/05/11 14:48:47 whmoseley Exp $
# Build a "major.minor" version string from the CVS revision keyword;
# the minor part is zero-padded to two digits (revision 1.3 => "1.03").
$VERSION = sprintf '%d.%02d', q$Revision: 1.3 $ =~ /: (\d+)\.(\d+)/;

# Export pdf2xml() into the caller's namespace by default.
require Exporter;
@ISA    = ('Exporter');
@EXPORT = ('pdf2xml');

# PDF information field names (presumably the labels pdfinfo prints --
# confirm).  No sub in this file reads the list.
#
# NOTE: this file has no trailing "1;".  This assignment is the last
# statement executed at load time, and its true value is what lets
# "require"/"use" of the module succeed -- keep it non-empty and last.
my @InfoTags = qw(
    Title Subject Author CreationDate
    Creator Producer ModDate Keywords
);
67
68
# pdf2xml( $pdf_file_name )
# pdf2xml( \$pdf_content )
#
# Converts one PDF document into the XML wrapper (<all>, <headers>,
# <content>) built from the output of get_pdf_headers() and
# get_pdf_content_ref().
#
# Argument: either the name of a PDF file, or a reference to a scalar
# holding the PDF data itself.
#
# Returns a reference to a string:
#   * scalar ref passed -- just the XML text;
#   * file name passed  -- the XML text preceded by Content-Length,
#                          Last-Mtime and Path-Name header lines and a
#                          blank line.
#
# Dies if either helper (or temp file creation) dies.
sub pdf2xml {
    my $file_or_content = shift;

    my $given_content = ref $file_or_content;

    # The external converters read from a file, so in-memory content is
    # first spooled to a temporary file.
    my $file = $given_content
        ? create_temp_file( $file_or_content )
        : $file_or_content;

    my $headers     = get_pdf_headers( $file ) || '';
    my $content_ref = get_pdf_content_ref( $file );

    my $txt = <<EOF;
<all>

<headers>
$headers
</headers>
<content>
$$content_ref
</content>
</all>
EOF

    if ( $given_content ) {
        # Remove the spool file right away instead of letting temp files
        # pile up until the process exits (important when many PDFs are
        # converted in one run).  If a helper died above we never get
        # here; create_temp_file() requests UNLINK => 1, which covers
        # that case at program exit.
        unlink $file;
        return \$txt;
    }

    my $mtime = ( stat $file )[9];
    my $size  = length $txt;

    # Record headers, then a blank line, then the document itself.
    my $ret = <<EOF;
Content-Length: $size
Last-Mtime: $mtime
Path-Name: $file

EOF

    $ret .= $txt;

    return \$ret;
}
112
# get_pdf_headers( $file )
#
# Runs "pdfinfo" on $file and turns each "Name: value" line of its output
# into an XML element.  The element name is the lower-cased field name
# with spaces replaced by underscores; the value is passed through
# escapeXML().
#
# Returns the elements joined with newlines, sorted by element name
# (an empty string when pdfinfo printed no matching lines).
#
# Dies if pdfinfo cannot be started or if closing the pipe fails (which
# includes pdfinfo exiting with a non-zero status).
sub get_pdf_headers {
    my $file = shift;

    # List-form pipe open: the file name is handed to pdfinfo as a single
    # argument and never passes through a shell, so spaces or shell
    # metacharacters in $file are harmless.  "or" (not "||") is required
    # so that the die is attached to open() rather than to the last
    # argument.
    open my $pipe, '-|', 'pdfinfo', $file
        or die "$0: Failed to run pdfinfo on $file: $!";

    my %metadata;

    # A lexical line variable keeps the caller's $_ intact.
    while ( my $line = <$pipe> ) {
        next unless $line =~ /^\s*([^:]+):\s+(.+)$/;

        my ( $metaname, $value ) = ( lc( $1 ), escapeXML( $2 ) );
        $metaname =~ tr/ /_/;
        $metadata{$metaname} = $value;
    }

    close $pipe or die "$0: Failed close on pipe to pdfinfo for $file: $?";

    return join "\n", map { "<$_>$metadata{$_}</$_>" } sort keys %metadata;
}
134
# get_pdf_content_ref( $file )
#
# Runs "pdftotext $file -" (text written to standard output), reads the
# whole output and passes it through escapeXML().
#
# Returns a reference to the escaped text (a reference to an empty string
# when pdftotext produced no output).
#
# Dies if pdftotext cannot be started or if closing the pipe fails (which
# includes pdftotext exiting with a non-zero status).
sub get_pdf_content_ref {
    my $file = shift;

    # List-form pipe open: no shell is involved, so the file name cannot
    # be split on whitespace or interpreted as shell syntax.
    open my $pipe, '-|', 'pdftotext', $file, '-'
        or die "$0: failed to run pdftotext: $!";

    # Slurp in scalar context so exactly one value is read, whatever the
    # output size.
    my $raw = do { local $/; <$pipe> };

    close $pipe or die "$0: Failed close on pipe to pdftotext for $file: $?";

    my $content = escapeXML( defined $raw ? $raw : '' );

    return \$content;
}
148
149
150
# How are URLs printed with pdftotext?

# escapeXML( $text )
#
# Returns a copy of $text made safe for use as XML character data:
#   &  becomes &amp;
#   <  becomes &lt;
#   >  becomes &gt;
#   form feed (^L, octal 014) becomes a space
# An undefined argument yields an empty string.  The argument itself is
# not modified.
sub escapeXML {
    my $str = shift;

    return '' unless defined $str;

    for ( $str ) {
        # Ampersands must be handled first; otherwise the "&" of the
        # &lt; / &gt; produced below would be escaped a second time.
        s/&/&amp;/g;
        s/</&lt;/g;
        s/>/&gt;/g;
        tr/\014/ /;    # ^L
    }

    return $str;
}
165
# This is the portable way to do this, I suppose.
# Otherwise, just create a file in the local directory.

# create_temp_file( \$data )
#
# Writes the data referenced by the argument to a new temporary file
# obtained from File::Temp::tempfile( UNLINK => 1 ) and returns the
# file's name.  The handle is closed before returning.
#
# Dies if the write or the close fails.
sub create_temp_file {
    my $data_ref = shift;

    # Loaded on demand: only needed when content, not a file name, is
    # handed to the module.
    require File::Temp;

    my ( $tmp_fh, $file_name ) = File::Temp::tempfile( UNLINK => 1 );

    unless ( print {$tmp_fh} ${$data_ref} ) {
        die $!;
    }

    unless ( close $tmp_fh ) {
        die "Failed to close '$file_name' $!";
    }

    return $file_name;
}
183
184

  ViewVC Help
Powered by ViewVC 1.1.22