| 1 | adcroft | 1.1 | #!/usr/bin/perl -w | 
| 2 |  |  | use strict; | 
| 3 |  |  |  | 
| 4 |  |  | # This is a short example that basically does the same | 
| 5 |  |  | # thing as the default file system access method by | 
| 6 |  |  | # recursing directories, but also shows how to process different | 
| 7 |  |  | # file types -- in this example pdf is converted to xml for indexing. | 
| 8 |  |  |  | 
| 9 |  |  | # for the odd chance of running under Windows | 
| 10 |  |  | # Now extprog.c expects file in text mode so no need to binmode. | 
| 11 |  |  | # binmode STDOUT; | 
| 12 |  |  |  | 
| 13 |  |  | use File::Find;  # for recursing a directory tree | 
| 14 |  |  | #use pdf2xml;     # example module for pdf to xml conversion | 
| 15 |  |  | # Note that you need IndexContents XML .pdf in the | 
| 16 |  |  | # swish-e config file | 
| 17 |  |  |  | 
| 18 |  |  | # See perldoc File::Find for information on following symbolic links | 
| 19 |  |  | # and other important topics. | 
| 20 |  |  |  | 
| 21 |  |  | use constant DEBUG => 0; | 
| 22 |  |  |  | 
| 23 |  |  |  | 
# Walk every path given on the command line and hand each entry found
# to wanted().
my %find_options = (
    wanted => \&wanted,
    # no_chdir => 1,  # 5.6 feature
);

find( \%find_options, @ARGV );
| 31 |  |  |  | 
# File::Find callback, called once for each entry found during the walk.
#
#   * directories are ignored
#   * unreadable files produce a warning and are skipped
#   * files whose name ends in .html or .htm are printed to STDOUT in the
#     form built by get_content()
#   * anything else is skipped (reported on STDERR only when DEBUG is on)
#
# $_ is used for the file tests and for opening the file, while
# $File::Find::name is used in all messages.
sub wanted {
    return if -d $_;

    # "_" reuses the stat information gathered by the -d test above.
    unless ( -r _ ) {
        warn "$File::Find::name is not readable\n";
        return;
    }

    # Example of handling another file type (needs the pdf2xml module):
    #    if ( /\.pdf$/ ) {
    #        print STDERR "Indexing pdf $File::Find::name\n" if DEBUG;
    #        print ${ pdf2xml( $File::Find::name ) };
    #        return;
    #    }
    # The original list of indexed extensions was (txt|log|pl|html|htm).

    unless ( $_ =~ /\.(html|htm)$/ ) {
        print STDERR "Skipping $File::Find::name\n" if DEBUG;
        return;
    }

    print STDERR "Indexing $File::Find::name\n" if DEBUG;
    my $document = get_content($_);
    print ${$document};
}
| 53 |  |  |  | 
| 54 |  |  |  | 
# Build the output record for one file: a header block (Content-Length,
# Last-Mtime, Path-Name), a blank line, then the filtered file text.
#
# Argument: $path - name of the file to read, relative to the current
#                   directory.
# Returns:  a reference to a scalar holding the headers plus the content.
# Dies:     if the file cannot be opened.
#
# Filtering, applied line by line:
#   * text from "Header:" or "Name:" up to the end of the line is removed
#   * every line from one containing <!--Navigation Panel--> through the
#     one containing <!--End of Navigation Panel--> is dropped
#
# Content-Length is the length of the filtered text, not the size on disk.
# Path-Name is taken from $File::Find::name rather than from $path, so the
# caller must have that variable set.
sub get_content {
    my $path = shift;

    # Three-argument open with a lexical handle: the file name is used
    # literally, so a name starting with ">" or ending with "|" cannot be
    # mistaken for an open mode or a command.
    open my $fh, '<', $path or die "Failed to open $path: $!";

    my $mtime = ( stat $fh )[9];

    # Start from an empty string so that an empty file (or one consisting
    # only of a navigation panel) still yields "Content-Length: 0".
    my $fcontent  = '';
    my $in_navpan = 0;

    # Read into a lexical instead of $_ so the caller's $_ (set by
    # File::Find) is left untouched.
    while ( my $line = <$fh> ) {
        $line =~ s/Header:.*$//g;
        $line =~ s/Name:.*$//g;

        # The start marker is checked before appending and the end marker
        # after, so both marker lines themselves are dropped as well.
        $in_navpan = 1 if $line =~ /<!--Navigation Panel-->/;
        $fcontent .= $line unless $in_navpan;
        $in_navpan = 0 if $line =~ /<!--End of Navigation Panel-->/;
    }
    close $fh;

    my $size = length $fcontent;

    my $content = "Content-Length: $size\n"
                . "Last-Mtime: $mtime\n"
                . "Path-Name: $File::Find::name\n"
                . "\n"
                . $fcontent;

    return \$content;
}
| 88 |  |  |  |