#!/usr/bin/perl -w use strict; # This is a short example that basically does the same # thing as the default file system access method by # recursing directories, but also shows how to process different # file types -- in this example pdf is converted to xml for indexing. # for the odd chance of running under Windows # Now extprog.c expects file in text mode so no need to binmode. # binmode STDOUT; use File::Find; # for recursing a directory tree #use pdf2xml; # example module for pdf to xml conversion # Not that you need IndexContents XML .pdf in the # swish-e config file # See perldoc File::Find for information on following symbolic links # and other important topics. use constant DEBUG => 0; find( { wanted => \&wanted, # no_chdir => 1, # 5.6 feature }, @ARGV, ); sub wanted { return if -d; if ( !-r _ ) { warn "$File::Find::name is not readable\n"; return; } # if ( /\.pdf$/ ) { # print STDERR "Indexing pdf $File::Find::name\n" if DEBUG; # print ${ pdf2xml( $File::Find::name ) }; # # } elsif ( /\.(txt|log|pl|html|htm)$/ ) { if ( /\.(html|htm)$/ ) { print STDERR "Indexing $File::Find::name\n" if DEBUG; print ${ get_content( $_ ) }; } else { print STDERR "Skipping $File::Find::name\n" if DEBUG; } } sub get_content { my $path = shift; my ( $size, $mtime ) = (stat $path )[7,9]; open FH, $path or die "Failed to open $path: $!"; # local $/ = undef; # $_ = ; # my $fcontent = $_; my $fcontent; my $navpan=0; while () { s/Header:.*$//g; s/Name:.*$//g; my $line = $_; if (//) { $navpan=1; }; if ($navpan == 0) { $fcontent .= $line; }; if (//) { $navpan=0; }; } $size = length $fcontent; my $content = <; return \$content; }