/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/prog-bin/pdf2xml.pm
ViewVC logotype

Annotation of /mitgcm.org/devel/buildweb/pkg/swish-e/prog-bin/pdf2xml.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1.1.1 - (hide annotations) (download) (vendor branch)
Fri Sep 20 19:47:30 2002 UTC (22 years, 10 months ago) by adcroft
Branch: Import, MAIN
CVS Tags: baseline, HEAD
Changes since 1.1: +0 -0 lines
Importing web-site building process.

1 adcroft 1.1 package pdf2xml;
2     use strict;
3    
4     =pod
5    
6     =head1 NAME
7    
8     pdf2xml - swish-e sample module to convert PDF to XML
9    
10     =head1 SYNOPSIS
11    
12     use pdf2xml;
13     my $xml_record_ref = pdf2xml( $pdf_file_name );
14    
15     # or by passing content in a scalar reference
16     my $xml_text_ref = pdf2xml( \$pdf_content );
17    
18    
19    
20    
21     =head1 DESCRIPTION
22    
23     Sample module for use with other swish-e 'prog' document source programs.
24    
25     Pass either a file name, or a scalar reference.
26    
27     The difference is when you pass a reference to a scalar
28     only the content is returned. When you pass a file name
29     an entire record is returned ready to be fed to swish -- this
30     includes the headers required by swish for indexing.
31    
32     The plan is to find a library that will do this to avoid forking an external
33     program.
34    
35     =head1 REQUIREMENTS
36    
37     Uses the xpdf package that includes the pdftotext conversion program.
38     This is available from http://www.foolabs.com/xpdf/xpdf.html.
39    
40     You will also need the module File::Temp (and its dependencies)
41     available from CPAN if passing content to this module (instead of a file name).
42    
43    
44     =head1 AUTHOR
45    
46     Bill Moseley
47    
48     =cut
49    
50     use Symbol;
51    
52    
53     use vars qw(
54     @ISA
55     @EXPORT
56     $VERSION
57     );
58    
# $VERSION is built from the CVS Revision keyword below: the major number is
# kept as-is and the minor number is zero-padded to two digits, so a
# revision of 1.3 yields the version string "1.03".
59     # $Id: pdf2xml.pm,v 1.3 2001/05/11 14:48:47 whmoseley Exp $
60     $VERSION = sprintf '%d.%02d', q$Revision: 1.3 $ =~ /: (\d+)\.(\d+)/;
61    
62     require Exporter;
63     @ISA = qw(Exporter);
64     @EXPORT = qw(pdf2xml);
65    
# Names of PDF document-information fields.
# NOTE(review): no subroutine visible in this file references @InfoTags;
# get_pdf_headers() emits every "name: value" line printed by pdfinfo rather
# than filtering on this list -- confirm whether it is still needed.
66     my @InfoTags = qw/Title Subject Author CreationDate Creator Producer ModDate Keywords/;
67    
68    
# pdf2xml( $file_name )  or  pdf2xml( \$pdf_content )
#
# Converts a PDF into a small XML document of the form
#   <all><headers>...</headers><content>...</content></all>
#
# Argument:
#   a file name (plain scalar), or a reference to a scalar that holds the
#   raw PDF content.
#
# Returns a reference to a scalar:
#   * given a reference: only the XML text;
#   * given a file name: the XML text preceded by the Content-Length,
#     Last-Mtime and Path-Name header lines and a blank line.
#
# Dies if either helper (get_pdf_headers / get_pdf_content_ref) dies.
sub pdf2xml {
    my $file_or_content = shift;

    my $from_content = ref $file_or_content;

    # Content passed by reference is first written to a temporary file so
    # that the external converters can read it.
    my $file = $from_content
        ? create_temp_file( $file_or_content )
        : $file_or_content;

    my ( $headers, $content_ref );
    my $ok = eval {
        $headers     = get_pdf_headers( $file ) || '';
        $content_ref = get_pdf_content_ref( $file );
        1;
    };
    my $error = $@;

    # create_temp_file() only schedules its file for removal at program exit
    # (UNLINK => 1).  Remove it as soon as we are done with it -- on success
    # or failure -- so a long indexing run does not pile up one temporary
    # file per document.
    unlink $file if $from_content;

    die $error unless $ok;

    my $txt = <<EOF;
<all>

<headers>
$headers
</headers>
<content>
$$content_ref
</content>
</all>
EOF

    return \$txt if $from_content;

    my $mtime = ( stat $file )[9];
    my $size  = length $txt;

    my $ret = <<EOF;
Content-Length: $size
Last-Mtime: $mtime
Path-Name: $file

EOF

    $ret .= $txt;

    return \$ret;
}
112    
# get_pdf_headers( $file )
#
# Runs "pdfinfo" on $file and turns each "Name: value" line of its output
# into an XML element.  The element name is the lower-cased field name with
# spaces replaced by underscores; the value is passed through escapeXML().
#
# Returns the elements joined by newlines, sorted by element name (an empty
# string when pdfinfo printed no matching lines).
#
# Dies if pdfinfo cannot be started, or if closing the pipe fails (which
# includes pdfinfo exiting with a non-zero status).
sub get_pdf_headers {
    my $file = shift;

    # List-form pipe open (perl 5.8+): pdfinfo is executed directly, so the
    # file name is never parsed by a shell.  Names containing spaces or shell
    # metacharacters are passed through intact and cannot inject commands.
    # Note the low-precedence "or": with "||" the die would bind to the last
    # open() argument and could never fire.
    open my $pipe, '-|', 'pdfinfo', $file
        or die "$0: Failed to open $file $!";

    my %metadata;

    # Read into a lexical so the caller's $_ is left untouched.
    while ( my $line = <$pipe> ) {
        next unless $line =~ /^\s*([^:]+):\s+(.+)$/;

        my ( $metaname, $value ) = ( lc($1), escapeXML($2) );
        $metaname =~ tr/ /_/;
        $metadata{$metaname} = $value;
    }

    close $pipe or die "$0: Failed close on pipe to pdfinfo for $file: $?";

    return join "\n", map { "<$_>$metadata{$_}</$_>" } sort keys %metadata;
}
134    
# get_pdf_content_ref( $file )
#
# Runs "pdftotext $file -" (text written to standard output), slurps the
# whole output and passes it through escapeXML().
#
# Returns a reference to the escaped text (a reference to an empty string
# when pdftotext produced no output).
#
# Dies if pdftotext cannot be started, or if closing the pipe fails (which
# includes pdftotext exiting with a non-zero status).
sub get_pdf_content_ref {
    my $file = shift;

    # List-form pipe open (perl 5.8+): pdftotext is executed directly, so the
    # file name is never parsed by a shell.  Names containing spaces or shell
    # metacharacters are passed through intact and cannot inject commands.
    open my $pipe, '-|', 'pdftotext', $file, '-'
        or die "$0: failed to run pdftotext: $!";

    # Slurp in scalar context; reading inside the argument list of a sub call
    # would be list context and can hand escapeXML() nothing at all.
    my $raw = do { local $/; <$pipe> };
    $raw = '' unless defined $raw;

    my $content = escapeXML($raw);

    close $pipe or die "$0: Failed close on pipe to pdftotext for $file: $?";

    return \$content;
}
148    
149    
150    
151     # How are URLs printed with pdftotext?
# escapeXML( $string )
#
# Returns a copy of $string with "<" and ">" replaced by the entities
# "&lt;" and "&gt;", and every form feed (^L, octal 014) replaced by a
# single space.  Ampersands and double quotes are left untouched.
sub escapeXML {
    my ($text) = @_;

    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ tr/\014/ /;    # ^L

    return $text;
}
165    
166     # This is the portable way to do this, I suppose.
167     # Otherwise, just create a file in the local directory.
168    
# create_temp_file( \$content )
#
# Writes the scalar referenced by the argument to a new temporary file
# created with File::Temp (loaded on demand) and returns the file's name.
# The file is created with UNLINK => 1.
#
# Dies if the write or the close fails.
sub create_temp_file {
    my ($scalar_ref) = @_;

    require File::Temp;

    my ( $handle, $file_name ) = File::Temp::tempfile( UNLINK => 1 );

    print {$handle} ${$scalar_ref} or die $!;
    close $handle or die "Failed to close '$file_name' $!";

    return $file_name;
}
183    
184    

  ViewVC Help
Powered by ViewVC 1.1.22