1 |
#!/usr/local/bin/perl -w |
2 |
use strict; |
3 |
|
4 |
=pod |
5 |
|
6 |
This is an example program for use with swish-e's -S prog indexing method. |
7 |
|
8 |
This will scan and index a hypermail (http://hypermail.org) mailing list archive. |
9 |
|
10 |
You might use a config file such as: |
11 |
|
12 |
IndexDir ./index_hypermail.pl |
13 |
SwishProgParameters /usr/local/hypermail/foo |
14 |
|
15 |
MetaNames swishtitle name email |
16 |
PropertyNames name email |
17 |
PropertyNamesDate sent |
18 |
IndexContents HTML2 .html |
19 |
StoreDescription HTML2 <body> 100000 |
20 |
UndefinedMetaTags ignore |
21 |
|
22 |
The above expects this file (index_hypermail.pl) to be in the current diretory, |
23 |
and expects the hypermail files to be in the directory /usr/local/hypermail/foo. |
24 |
|
25 |
Index with the command: |
26 |
|
27 |
./swish-e -c swish.conf -S prog |
28 |
|
29 |
See perldoc examples/swish.cgi for how to search this index. Here's a possible |
30 |
config file for use with swish.cgi: |
31 |
|
32 |
>cat .swishcgi.conf |
33 |
|
34 |
return { |
35 |
title => "Search the Foo List Archive", |
36 |
swish_binary => '../swish-e', |
37 |
display_props => [qw/ name email sent /], |
38 |
sorts => [qw/swishrank swishtitle email sent/], |
39 |
secondary_sort => [qw/sent desc/], |
40 |
metanames => [qw/swishdefault swishtitle name email/], |
41 |
name_labels => { |
42 |
swishrank => 'Rank', |
43 |
swishtitle => 'Subject Only', |
44 |
name => "Poster's Name", |
45 |
email => "Poster's Email", |
46 |
sent => 'Message Date', |
47 |
swishdefault => 'Subject & Body', |
48 |
}, |
49 |
|
50 |
highlight => { |
51 |
package => 'PhraseHighlight', |
52 |
show_words => 10, # Number of swish words words to show around highlighted word |
53 |
max_words => 100, # If no words are found to highlighted then show this many words |
54 |
occurrences => 6, # Limit number of occurrences of highlighted words |
55 |
highlight_on => '<font style="background:#FFFF99">', |
56 |
highlight_off => '</font>', |
57 |
meta_to_prop_map => { # this maps search metatags to display properties |
58 |
swishdefault => [ qw/swishtitle swishdescription/ ], |
59 |
swishtitle => [ qw/swishtitle/ ], |
60 |
email => [ qw/email/ ], |
61 |
name => [ qw/name/ ], |
62 |
swishdocpath => [ qw/swishdocpath/ ], |
63 |
}, |
64 |
}, |
65 |
date_ranges => { |
66 |
property_name => 'sent', # property name to limit by |
67 |
time_periods => [ |
68 |
'All', |
69 |
'Today', |
70 |
'Yesterday', |
71 |
'This Week', |
72 |
'Last Week', |
73 |
'Last 90 Days', |
74 |
'This Month', |
75 |
'Last Month', |
76 |
], |
77 |
|
78 |
line_break => 0, |
79 |
default => 'All', |
80 |
date_range => 1, |
81 |
}, |
82 |
}; |
83 |
|
84 |
|
85 |
|
86 |
=cut |
87 |
|
88 |
|
89 |
|
90 |
use File::Find; # for recursing a directory tree |
91 |
use Date::Parse; |
92 |
|
93 |
# Recurse the directory(s) passed in on the command line |
94 |
|
95 |
find( { wanted => \&wanted, no_chdir => 1, }, @ARGV ); |
96 |
|
97 |
|
98 |
sub wanted { |
99 |
return if -d; |
100 |
return unless m!(^|/)\d+\.html$!; |
101 |
|
102 |
my $mtime = (stat $File::Find::name )[9]; |
103 |
|
104 |
my $html = format_message($File::Find::name ); |
105 |
return unless $html; |
106 |
|
107 |
my $size = length $html; |
108 |
|
109 |
my $name = $File::Find::name; |
110 |
$name =~ s[^./][]; |
111 |
|
112 |
print <<EOF; |
113 |
Content-Length: $size |
114 |
Last-Mtime: $mtime |
115 |
Path-Name: $name |
116 |
|
117 |
EOF |
118 |
|
119 |
print $html; |
120 |
} |
121 |
|
122 |
|
123 |
|
124 |
sub format_message { |
125 |
my $file = shift; |
126 |
local $_; |
127 |
|
128 |
unless ( open FH, "<$file" ) { |
129 |
warn "Failed to open '$file'. Error: $!"; |
130 |
return; |
131 |
} |
132 |
|
133 |
my %fields; |
134 |
|
135 |
while (<FH>) { |
136 |
if ( my( $tag, $content) = /<!-- ([^=]+)="(.+)" -->$/ ) { |
137 |
last if $tag eq 'body'; |
138 |
|
139 |
unless ( $content ) { |
140 |
warn "File '$file' tag '$tag' empty content\n"; |
141 |
next; |
142 |
} |
143 |
|
144 |
if ( $tag eq 'sent' ) { |
145 |
my $date = str2time $content; |
146 |
unless ( defined $date ) { |
147 |
warn "File '$file' tag '$tag' failed to parse '$content'\n"; |
148 |
next; |
149 |
} |
150 |
$content = $date; |
151 |
} |
152 |
|
153 |
if ( $tag eq 'received' ) { |
154 |
my $date = str2time $content; |
155 |
unless ( defined $date ) { |
156 |
warn "File '$file' tag '$tag' failed to parse '$content'\n"; |
157 |
next; |
158 |
} |
159 |
$content = $date; |
160 |
} |
161 |
|
162 |
|
163 |
if ( $tag eq 'subject' ) { |
164 |
$content =~ s/\Q[SWISH-E]\E//; |
165 |
if ( $content =~ s/\s*Re:\s*//i ) { |
166 |
$content .= ' (Re)'; |
167 |
} |
168 |
} |
169 |
|
170 |
$fields{$tag} = $content; |
171 |
} |
172 |
} |
173 |
|
174 |
# Let's now use received header, if possible |
175 |
$fields{sent} = $fields{received} || $fields{sent}; |
176 |
|
177 |
my $body = ''; |
178 |
|
179 |
while ( <FH> ) { |
180 |
last if /<!-- body="end" -->/ || /^-- $/ || /^--$/ || /^(_|-){40,}\s*$/; |
181 |
|
182 |
$body .= $_; |
183 |
} |
184 |
|
185 |
return join "\n", |
186 |
'<html>', |
187 |
'<head>', |
188 |
'<title>', |
189 |
($fields{subject} || '' ), |
190 |
'</title>', |
191 |
map( { qq[<meta name="$_" content="$fields{$_}">] } keys %fields ), |
192 |
'</head><body>', |
193 |
$body, |
194 |
'</body>', |
195 |
'</html>', |
196 |
''; |
197 |
} |
198 |
|