1 |
adcroft |
1.1 |
package doc2txt; |
2 |
|
|
use strict; |
3 |
|
|
|
4 |
|
|
=pod |
5 |
|
|
|
6 |
|
|
=head1 NAME |
7 |
|
|
|
8 |
|
|
doc2txt - swish-e sample module to convert MS Word docs to text |
9 |
|
|
|
10 |
|
|
=head1 SYNOPSIS |
11 |
|
|
|
12 |
|
|
use doc2txt; |
13 |
|
|
my $doc_record_ref = doc2txt( $doc_file_name ); |
14 |
|
|
|
15 |
|
|
# or by passing content in a scalar reference |
16 |
|
|
my $doc_text_ref = doc2txt( \$doc_content ); |
17 |
|
|
|
18 |
|
|
|
19 |
|
|
|
20 |
|
|
|
21 |
|
|
=head1 DESCRIPTION |
22 |
|
|
|
23 |
|
|
Sample module for use with other swish-e 'prog' document source programs. |
24 |
|
|
|
25 |
|
|
Pass either a file name, or a scalar reference. |
26 |
|
|
|
27 |
|
|
The difference is that when you pass a reference to a scalar |
28 |
|
|
only the content is returned. When you pass a file name |
29 |
|
|
an entire record is returned ready to be fed to swish -- this |
30 |
|
|
includes the headers required by swish for indexing. |
31 |
|
|
|
32 |
|
|
|
33 |
|
|
=head1 REQUIREMENTS |
34 |
|
|
|
35 |
|
|
Uses the catdoc program. http://www.fe.msk.ru/~vitus/catdoc/ |
36 |
|
|
|
37 |
|
|
You may need to adjust the parameters used to call catdoc. |
38 |
|
|
|
39 |
|
|
You will also need the module File::Temp available from CPAN if passing content |
40 |
|
|
to this module (instead of a file name). I'm not thrilled about how that |
41 |
|
|
currently works... |
42 |
|
|
|
43 |
|
|
|
44 |
|
|
=head1 AUTHOR |
45 |
|
|
|
46 |
|
|
Bill Moseley |
47 |
|
|
|
48 |
|
|
=cut |
49 |
|
|
|
50 |
|
|
use Symbol; |
51 |
|
|
|
52 |
|
|
|
53 |
|
|
# Package globals: two consumed by Exporter, one holding the module version.
use vars qw( @ISA @EXPORT $VERSION );

# $Id: doc2txt.pm,v 1.2 2002/05/27 06:35:32 whmoseley Exp $
# Derive "major.minor" (minor zero-padded to two digits) from the CVS
# Revision keyword embedded in the q$...$ string below.
$VERSION = do {
    my @revision_parts = q$Revision: 1.2 $ =~ /: (\d+)\.(\d+)/;
    sprintf '%d.%02d', @revision_parts;
};

# doc2txt() is exported into the caller's namespace by default.
require Exporter;
@ISA    = ('Exporter');
@EXPORT = ('doc2txt');

# List of document info tag names.
# NOTE(review): not referenced by any code visible in this file -- confirm
# whether it is still needed before removing.
my @InfoTags = (
    'Title',   'Subject',  'Author',  'CreationDate',
    'Creator', 'Producer', 'ModDate', 'Keywords',
);

# Command line used to invoke catdoc; adjust the options as needed
# (Rainer uses: catdoc -s8859-1 -d8859-1).
my $catdoc = 'catdoc -a';
69 |
|
|
|
70 |
|
|
|
71 |
|
|
# doc2txt( $file_name )   -> reference to a complete swish-e record
# doc2txt( \$doc_content ) -> reference to the extracted text only
#
# Converts an MS Word document to plain text by running the external
# catdoc program (command line taken from the file-level $catdoc).
#
#   * Given a scalar reference, the referenced content is written to a
#     temporary file via create_temp_file(), converted, and a reference
#     to the extracted text alone is returned.
#   * Given a file name, the returned record consists of Content-Length,
#     Last-Mtime and Path-Name headers, a blank line, and the extracted
#     text.
#
# If catdoc cannot be started, or exits with a failure status, a warning
# is printed and whatever text was produced (possibly none) is used; the
# function does not die on conversion errors.
sub doc2txt {
    my $file_or_content = shift;

    my $from_content = ref $file_or_content;

    my $file = $from_content
        ? create_temp_file( $file_or_content )
        : $file_or_content;

    # Run catdoc directly (list-form pipe open) rather than through the
    # shell, so file names containing spaces or shell metacharacters are
    # passed through verbatim and cannot inject commands.  $catdoc holds
    # the program name plus options separated by whitespace.
    my @command = ( split( ' ', $catdoc ), $file );

    my $content = '';
    if ( open my $pipe, '-|', @command ) {
        local $/;    # slurp all of catdoc's output at once
        $content = <$pipe>;
        $content = '' unless defined $content;

        # close() on a pipe is false when the child exits non-zero.
        close $pipe
            or warn "doc2txt: '@command' failed (status $?) $!\n";
    }
    else {
        warn "doc2txt: cannot run '@command': $!\n";
    }

    if ( $from_content ) {
        # Remove the temporary file now instead of leaving it until
        # program exit, so long indexing runs do not accumulate files.
        unlink $file;
        return \$content;
    }

    # otherwise build the headers

    my $mtime = ( stat $file )[9];
    my $size  = length $content;

    # Headers are terminated by an empty line, followed by the text.
    my $ret = join '',
        "Content-Length: $size\n",
        "Last-Mtime: $mtime\n",
        "Path-Name: $file\n",
        "\n",
        $content;

    return \$ret;
}
103 |
|
|
|
104 |
|
|
|
105 |
|
|
# This is the portable way to do this, I suppose. |
106 |
|
|
# Otherwise, just create a file in the local directory. |
107 |
|
|
|
108 |
|
|
# create_temp_file( \$content ) -> $file_name
#
# Writes the data referenced by the argument to a newly created temporary
# file and returns that file's name.  The file is created with
# File::Temp::tempfile( UNLINK => 1 ), which asks File::Temp to remove it
# when the program exits.
#
# Dies if the data cannot be written or the file cannot be closed.
sub create_temp_file {
    my $scalar_ref = shift;

    # Loaded on demand: only needed when content (not a file name) is passed.
    require File::Temp;

    my ( $fh, $file_name ) = File::Temp::tempfile( UNLINK => 1 );

    # The data is a binary document; make sure no newline translation or
    # encoding layer alters it on the way to disk.
    binmode $fh;

    print {$fh} $$scalar_ref
        or die "Failed to write to '$file_name' $!";

    # Buffered write errors surface at close, so check it too.
    close $fh or die "Failed to close '$file_name' $!";

    return $file_name;
}
122 |
|
|
|
123 |
|
|
|