1 |
adcroft |
1.1 |
/* |
2 |
|
|
** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company |
3 |
|
|
** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94 |
4 |
|
|
** |
5 |
|
|
** This program and library is free software; you can redistribute it and/or |
6 |
|
|
** modify it under the terms of the GNU (Library) General Public License |
7 |
|
|
** as published by the Free Software Foundation; either version 2 |
8 |
|
|
** of the License, or any later version. |
9 |
|
|
** |
10 |
|
|
** This program is distributed in the hope that it will be useful, |
11 |
|
|
** but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 |
|
|
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 |
|
|
** GNU (Library) General Public License for more details. |
14 |
|
|
** |
15 |
|
|
** You should have received a copy of the GNU (Library) General Public License |
16 |
|
|
** along with this program; if not, write to the Free Software |
17 |
|
|
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
18 |
|
|
**-------------------------------------------------------------------- |
19 |
|
|
** All the code in this file added by Ron Klachko ron@ckm.ucsf.edu 9/98 |
20 |
|
|
** |
21 |
|
|
** change sprintf to snprintf to avoid corruption, |
22 |
|
|
** test length of spiderdirectory before strcat to avoid corruption, |
23 |
|
|
** added safestrcpy() macro to avoid corruption from strcpy overflow, |
24 |
|
|
** define MAXPIDLEN instead of literal "32" - assumed return length from lgetpid() |
25 |
|
|
** SRE 11/17/99 |
26 |
|
|
** |
27 |
|
|
** added buffer size arg to grabStringValue - core dumping from overrun |
28 |
|
|
** SRE 2/22/00 |
29 |
|
|
** |
30 |
|
|
** 2000-11 jruiz,rasc some redesign |
31 |
|
|
*/ |
32 |
|
|
|
33 |
|
|
/* |
34 |
|
|
** http.c |
35 |
|
|
*/ |
36 |
|
|
|
37 |
|
|
#ifdef HAVE_CONFIG_H |
38 |
|
|
#include "acconfig.h" |
39 |
|
|
#endif |
40 |
|
|
|
41 |
|
|
#ifdef HAVE_UNISTD_H |
42 |
|
|
#include <unistd.h> |
43 |
|
|
#endif |
44 |
|
|
|
45 |
|
|
#ifdef HAVE_STDLIB_H |
46 |
|
|
#include <stdlib.h> |
47 |
|
|
#endif |
48 |
|
|
|
49 |
|
|
#ifdef HAVE_PROCESS_H |
50 |
|
|
#include <process.h> |
51 |
|
|
#endif |
52 |
|
|
|
53 |
|
|
#include <time.h> |
54 |
|
|
#include <stdarg.h> |
55 |
|
|
|
56 |
|
|
// for wait |
57 |
|
|
#ifndef _WIN32 |
58 |
|
|
#include <sys/types.h> |
59 |
|
|
#include <sys/wait.h> |
60 |
|
|
#endif |
61 |
|
|
|
62 |
|
|
#include "swish.h" |
63 |
|
|
#include "mem.h" |
64 |
|
|
#include "string.h" |
65 |
|
|
#include "index.h" |
66 |
|
|
#include "hash.h" |
67 |
|
|
#include "file.h" |
68 |
|
|
#include "check.h" |
69 |
|
|
#include "error.h" |
70 |
|
|
|
71 |
|
|
#include "http.h" |
72 |
|
|
#include "httpserver.h" |
73 |
|
|
|
74 |
|
|
#include "xml.h" |
75 |
|
|
#include "txt.h" |
76 |
|
|
#include "html.h" |
77 |
|
|
|
78 |
|
|
/* |
79 |
|
|
-- init structures for this module |
80 |
|
|
*/ |
81 |
|
|
|
82 |
|
|
/*
** Allocate and initialize the HTTP module's private state (sw->HTTP).
*/
void initModule_HTTP(SWISH * sw)
{
    struct MOD_HTTP *http = (struct MOD_HTTP *) emalloc(sizeof(struct MOD_HTTP));
    int     i;

    sw->HTTP = http;

    /* Set up the spider directory buffer and seed it with the default */
    http->lenspiderdirectory = MAXSTRLEN;
    http->spiderdirectory = (char *) emalloc(http->lenspiderdirectory + 1);
    http->spiderdirectory[0] = '\0';
    http->spiderdirectory = SafeStrCopy(http->spiderdirectory, SPIDERDIRECTORY, &http->lenspiderdirectory);

    /* Empty the visited-URL hash table */
    for (i = 0; i < BIGHASHSIZE; i++)
        http->url_hash[i] = NULL;

    http->equivalentservers = NULL;

    /* Default crawl parameters (overridable via config directives) */
    http->maxdepth = 5;
    http->delay = 60;
}
106 |
|
|
|
107 |
|
|
/*
** Tear down the HTTP module: release the spider directory string and
** the module struct itself, then clear the back-pointer in sw.
*/
void freeModule_HTTP(SWISH * sw)
{
    struct MOD_HTTP *module = sw->HTTP;

    if (module->spiderdirectory)
        efree(module->spiderdirectory);

    efree(module);

    sw->HTTP = NULL;
}
116 |
|
|
|
117 |
|
|
/*
** Handle the HTTP crawler's configuration directives:
**   MaxDepth <n>            -- stop following links deeper than n
**   Delay <seconds>         -- minimum delay between fetches per server
**   SpiderDirectory <path>  -- where the swishspider helper lives
**   EquivalentServer <s...> -- servers to treat as one host
** Returns 1 when the directive was recognized, 0 otherwise.
*/
int configModule_HTTP(SWISH * sw, StringList * sl)
{
    struct MOD_HTTP *http = sw->HTTP;
    char   *directive = sl->word[0];
    int     handled = 1;        /* 1 => directive belongs to this module */

    int     i;
    struct multiswline *servergroup;
    struct swline *entry;

    if (strcasecmp(directive, "maxdepth") == 0)
    {
        if (sl->n == 2)
        {
            handled = 1;
            http->maxdepth = atoi(sl->word[1]);
        }
        else
            progerr("MaxDepth requires one value");
    }
    else if (strcasecmp(directive, "delay") == 0)
    {
        if (sl->n == 2)
        {
            handled = 1;
            http->delay = atoi(sl->word[1]);
        }
        else
            progerr("Delay requires one value");
    }
    else if (strcasecmp(directive, "spiderdirectory") == 0)
    {
        if (sl->n == 2)
        {
            handled = 1;
            /* +2 leaves room for a trailing '/' plus the NUL */
            http->spiderdirectory = erealloc(http->spiderdirectory, strlen(sl->word[1]) + 2);
            strcpy(http->spiderdirectory, sl->word[1]);
            normalize_path(http->spiderdirectory);

            if (!isdirectory(http->spiderdirectory))
            {
                progerr("SpiderDirectory. %s is not a directory", http->spiderdirectory);
            }

            /* Append the path delimiter unless the path is just "/" */
            if (strlen(http->spiderdirectory) != 1 || http->spiderdirectory[0] != '/')
                strcat(http->spiderdirectory, "/");

        }
        else
            progerr("SpiderDirectory requires one value");
    }
    else if (strcasecmp(directive, "equivalentserver") == 0)
    {
        if (sl->n > 1)
        {
            handled = 1;

            /* Start a new group of servers that should be treated as one */
            servergroup = (struct multiswline *) emalloc(sizeof(struct multiswline));
            servergroup->next = http->equivalentservers;
            servergroup->list = 0;
            http->equivalentservers = servergroup;

            for (i = 1; i < sl->n; i++)
            {
                /* Push each named server onto the group's list */
                entry = (struct swline *) emalloc(sizeof(struct swline));
                entry->line = estrdup(sl->word[i]);
                entry->next = servergroup->list;
                servergroup->list = entry;
            }

        }
        else
            progerr("EquivalentServers requires at least one value");
    }
    else
    {
        handled = 0;
    }

    return handled;
}
202 |
|
|
typedef struct urldepth |
203 |
|
|
{ |
204 |
|
|
char *url; |
205 |
|
|
int depth; |
206 |
|
|
struct urldepth *next; |
207 |
|
|
} |
208 |
|
|
urldepth; |
209 |
|
|
|
210 |
|
|
|
211 |
|
|
int http_already_indexed(SWISH * sw, char *url); |
212 |
|
|
urldepth *add_url(SWISH * sw, urldepth * list, char *url, int depth, char *baseurl); |
213 |
|
|
|
214 |
|
|
|
215 |
|
|
/*
** Queue "url" for retrieval unless one of the skip rules applies
** (wrong server/method, too deep, excluded suffix, robots.txt, or
** already indexed).  New entries are appended at the tail so the crawl
** proceeds breadth-first.  Returns the (possibly updated) list head.
*/
urldepth *add_url(SWISH * sw, urldepth * list, char *url, int depth, char *baseurl)
{
    struct MOD_HTTP *http = sw->HTTP;
    urldepth *entry;

    if (!equivalentserver(sw, url, baseurl))
    {
        if (sw->verbose >= 3)
            printf("Skipping %s: %s\n", url, "Wrong method or server.");
        return list;
    }

    if (http->maxdepth && (depth >= http->maxdepth))
    {
        if (sw->verbose >= 3)
            printf("Skipping %s: %s\n", url, "Too deep.");
        return list;
    }

    if (sw->nocontentslist && isoksuffix(url, sw->nocontentslist))
    {
        if (sw->verbose >= 3)
            printf("Skipping %s: %s\n", url, "Wrong suffix.");
        return list;
    }

    if (urldisallowed(sw, url))
    {
        if (sw->verbose >= 3)
            printf("Skipping %s: %s\n", url, "URL disallowed by robots.txt.");
        return list;
    }

    /* NOTE: http_already_indexed() also registers the URL when it is new */
    if (http_already_indexed(sw, url))
        return list;

    entry = (urldepth *) emalloc(sizeof(urldepth));
    entry->url = estrdup(url);
    entry->depth = depth;
    entry->next = 0;

    /* Append at the tail (breadth-first search order) */
    if (!list)
    {
        list = entry;
    }
    else
    {
        urldepth *tail;

        for (tail = list; tail->next; tail = tail->next)
        {
        }
        tail->next = entry;
    }

    return list;
}
276 |
|
|
|
277 |
|
|
|
278 |
|
|
/* Have we already indexed a file or directory? |
279 |
|
|
** This function is used to avoid multiple index entries |
280 |
|
|
** or endless looping due to symbolic links. |
281 |
|
|
*/ |
282 |
|
|
|
283 |
|
|
int http_already_indexed(SWISH * sw, char *url) |
284 |
|
|
{ |
285 |
|
|
struct url_info *p; |
286 |
|
|
|
287 |
|
|
int len; |
288 |
|
|
unsigned hashval; |
289 |
|
|
struct MOD_HTTP *http = sw->HTTP; |
290 |
|
|
|
291 |
|
|
/* Hash with via the uri alone. Depending on the equivalent |
292 |
|
|
** servers, we may or may not make the decision of the entire |
293 |
|
|
** url or just the uri. |
294 |
|
|
*/ |
295 |
|
|
hashval = bighash(url_uri(url, &len)); /* Search hash for this file. */ |
296 |
|
|
for (p = http->url_hash[hashval]; p != NULL; p = p->next) |
297 |
|
|
if ((strcmp(url, p->url) == 0) || (equivalentserver(sw, url, p->url) && (strcmp(url_uri(url, &len), url_uri(p->url, &len)) == 0))) |
298 |
|
|
{ /* We found it. */ |
299 |
|
|
if (sw->verbose >= 3) |
300 |
|
|
printf("Skipping %s: %s\n", url, "Already indexed."); |
301 |
|
|
return 1; |
302 |
|
|
} |
303 |
|
|
|
304 |
|
|
/* Not found, make new entry. */ |
305 |
|
|
p = (struct url_info *) emalloc(sizeof(struct url_info)); |
306 |
|
|
|
307 |
|
|
p->url = estrdup(url); |
308 |
|
|
p->next = http->url_hash[hashval]; |
309 |
|
|
http->url_hash[hashval] = p; |
310 |
|
|
|
311 |
|
|
return 0; |
312 |
|
|
} |
313 |
|
|
|
314 |
|
|
|
315 |
|
|
/*
** Return "url" itself and set *plen to the length of the method part
** (the text before "://"), or return NULL when the URL contains no
** "://" separator.
*/
char   *url_method(char *url, int *plen)
{
    char   *sep = strstr(url, "://");

    if (sep == NULL)
        return NULL;

    *plen = sep - url;
    return url;
}
326 |
|
|
|
327 |
|
|
|
328 |
|
|
/*
** Return a pointer to the server[:port] portion of "url" and set *plen
** to its length.  Returns NULL when the URL has no method part.
*/
char   *url_serverport(char *url, int *plen)
{
    int     methodlen;
    char   *start;
    char   *slash;

    if (url_method(url, &methodlen) == NULL)
        return NULL;

    /* Skip past the method name and the "://" separator */
    start = url + methodlen + 3;

    /* Server part runs up to the first '/' or to end of string */
    slash = strchr(start, '/');
    if (slash == NULL)
        *plen = strlen(start);
    else
        *plen = slash - start;

    return start;
}
353 |
|
|
|
354 |
|
|
|
355 |
|
|
/*
** Return a pointer to the URI (path) part of "url" and set *plen to its
** length.  Returns 0 when the URL cannot be parsed.
*/
char   *url_uri(char *url, int *plen)
{
    char   *server = url_serverport(url, plen);

    if (server == 0)
        return 0;

    /* The URI begins immediately after the server[:port] section */
    url = server + *plen;
    *plen = strlen(url);
    return url;
}
365 |
|
|
/************************************************************ |
366 |
|
|
* |
367 |
|
|
* Fork and exec a program, and wait for child to exit. |
368 |
|
|
* Returns |
369 |
|
|
* |
370 |
|
|
*************************************************************/ |
371 |
|
|
#ifndef _WIN32 |
372 |
|
|
static void run_program(char* prog, char** args)
{
    pid_t pid = fork();
    int status;

    /* BUGFIX: a failed fork() returns -1, which the old code treated as
    ** "parent" and then wait()ed on a child that was never created. */
    if ( pid == (pid_t) -1 )
        progerrno("Failed to fork '%s'. Error: ", prog );

    /* In parent, wait for child */
    if ( pid )
    {
        wait( &status );

        /* NOTE(review): any normal exit -- even non-zero -- is accepted
        ** here; only abnormal termination (signal) is fatal.  This looks
        ** deliberate (tolerates spider failures on bad URLs) -- confirm. */
        if ( WIFEXITED( status ) )
            return;

        progerr("%s exited with non-zero status (%d)", prog, WEXITSTATUS(status) );
    }

    /* In child: replace this process with the spider program */
    execvp (prog, args);
    progerrno("Failed to exec '%s'. Error: ", prog );
}
390 |
|
|
#endif |
391 |
|
|
|
392 |
|
|
/************************************************************ |
393 |
|
|
* |
394 |
|
|
* Fetch a URL |
395 |
|
|
* Side effect that it appends to "response_file" |
396 |
|
|
* -- lazy programmer hoping that -S http will go away... |
397 |
|
|
* |
398 |
|
|
* Under Windows system() is used to call "perl" |
399 |
|
|
* Otherwise, exec is called on the swishspider program |
400 |
|
|
* |
401 |
|
|
*************************************************************/ |
402 |
|
|
|
403 |
|
|
/*
** Fetch "url" via the external swishspider helper, which writes
** <file_prefix>.response / .contents / .links files.
** Fills in contenttype_or_redirect (content type on 200, Location on 3xx)
** and *last_modified (on 200).  Honors the per-server retrieval delay via
** *plastretrieval.  Appends ".response" to file_prefix as a side effect.
** Returns the HTTP status code (500 on failure).
*/
int get(SWISH * sw, char *contenttype_or_redirect, time_t *last_modified, time_t * plastretrieval, char *file_prefix, char *url)
{
    int code = 500;
    FILE *fp;
    struct MOD_HTTP *http = sw->HTTP;

    /* Build path to swishspider program -- spiderdirectory MUST be set
    ** and already end in '/'; "+fill" pads the allocation for the NUL. */
    char *spider_prog = emalloc( strlen(http->spiderdirectory) + strlen("swishspider+fill") );
    sprintf(spider_prog, "%sswishspider", http->spiderdirectory );

    /* Sleep a little so we don't overwhelm the server */
    if ((time(0) - *plastretrieval) < http->delay)
    {
        int num_sec = http->delay - (time(0) - *plastretrieval);
        sleep(num_sec);
    }

    *plastretrieval = time(0);

#ifdef _WIN32
    /* Should be in autoconf or obsoleted by extprog. - DLN 2001-11-05 */
    {
        int retval;
        char commandline[] = "perl %s %s \"%s\"";
        char *command = emalloc( strlen(commandline) + strlen(spider_prog) + strlen(file_prefix) + strlen(url) + 1 );

        sprintf(command, commandline, spider_prog, file_prefix, url);

        retval = system( command );
        efree( command );
        efree( spider_prog );

        printf("Returned %d\n", retval );

        if ( retval )
            return 500;
    }
#else
    {
        char *args[4];

        args[0] = spider_prog;
        args[1] = file_prefix;
        args[2] = url;
        args[3] = NULL;
        run_program( spider_prog, args );
        efree( spider_prog );
    }
#endif

    /* NAUGHTY SIDE EFFECT */
    strcat( file_prefix, ".response" );

    if ( !(fp = fopen(file_prefix, F_READ_TEXT)) )
    {
        progerrno("Failed to open file '%s': ", file_prefix );
    }
    else
    {
        char buffer[500];

        /* First line of the response file is the HTTP status code.
        ** BUGFIX: check fgets -- an empty response file previously left
        ** "buffer" uninitialized before atoi() (undefined behavior). */
        if (fgets(buffer, 400, fp) == NULL)
            buffer[0] = '\0';           /* atoi("") == 0, treated as error code */
        code = atoi(buffer);

        if ((code == 200) || ((code / 100) == 3))
        {
            /* Second line: content-type (200) or redirect Location (3xx).
            ** BUGFIX: only strip the trailing newline when a line was read,
            ** instead of chopping whatever stale data the caller passed in. */
            if (fgets(contenttype_or_redirect, MAXSTRLEN, fp) != NULL)
                *(contenttype_or_redirect + strlen(contenttype_or_redirect) - 1) = '\0';
            else
                *contenttype_or_redirect = '\0';
        }
        if (code == 200)
        {
            /* Third line: last-modified time as a decimal epoch value */
            if (fgets(buffer, 400, fp) != NULL)
                *last_modified = (time_t)strtol(buffer, NULL, 10);
        }

        fclose(fp);
    }

    return code;
}
488 |
|
|
|
489 |
|
|
/*
** Format "fmt" with (string, pid) into a scratch buffer and hand the
** result to "cmd" (e.g. unlink).  Returns whatever cmd returns.
*/
int cmdf(int (*cmd) (const char *), char *fmt, char *string, pid_t pid)
{
    int     result;
    char   *scratch;

    /* sizeof(pid_t) * 8 is (more than) enough room for the pid digits */
    scratch = emalloc(strlen(fmt) + strlen(string) + sizeof(pid_t) * 8 + 1);

    sprintf(scratch, fmt, string, pid);

    result = cmd(scratch);
    efree(scratch);
    return result;
}
502 |
|
|
|
503 |
|
|
/*
** Read one line of arbitrary length from fp into a static, growing
** buffer.  Returns the buffer (valid only until the next call) or NULL
** at end of file.  The returned line keeps its trailing newline, except
** possibly for a final line that ends at EOF without one.
*/
char *readline(FILE * fp)
{
    static char *buffer = 0;
    static int buffersize = 512;

    if (buffer == 0)
    {
        buffer = (char *) emalloc(buffersize);
    }
    /*
     * Try to read in the line
     */

    if (fgets(buffer, buffersize, fp) == NULL)
    {
        return NULL;
    }

    /* BUGFIX: a NUL byte at the start of the stream makes fgets() return
    ** an empty string, and buffer[strlen(buffer) - 1] below would then
    ** read before the start of the buffer.  Return the empty line as-is. */
    if (buffer[0] == '\0')
    {
        return buffer;
    }

    /*
     * Make sure we read the entire line. If not, double the buffer
     * size and try to read the rest
     */
    while (buffer[strlen(buffer) - 1] != '\n')
    {
        buffer = (char *) erealloc(buffer, buffersize * 2);

        /*
         * The easiest way to verify that this line is okay is to consider
         * the situation where the buffer is 2 bytes long.  Since fgets()
         * always guarantees to put the trailing NUL, it will have essentially
         * used only 1 byte.  We double it to four, so we now have the left
         * over byte (that currently contains NUL) in addition to the doubling
         * which gets us to read buffersize + 1.
         */
        if (fgets(buffer + buffersize - 1, buffersize + 1, fp) == 0)
        {
            break;
        }
        buffersize *= 2;
    }

    return buffer;
}
546 |
|
|
|
547 |
|
|
|
548 |
|
|
/* A local version of getpid() so that we don't have to suffer |
549 |
|
|
** a system call each time we need it. |
550 |
|
|
*/ |
551 |
|
|
/* A local version of getpid(): caches the process id on first use so we
** don't pay for a system call each time we need it.
*/
pid_t lgetpid()
{
    static pid_t cached = -1;

    if (cached == -1)
        cached = getpid();

    return cached;
}
561 |
|
|
|
562 |
|
|
#if 0 |
563 |
|
|
|
564 |
|
|
/* Testing the robot rules parsing code... |
565 |
|
|
**/ |
566 |
|
|
void http_indexpath(char *url) |
567 |
|
|
{ |
568 |
|
|
httpserverinfo *server = getserverinfo(url); |
569 |
|
|
robotrules *robotrule; |
570 |
|
|
|
571 |
|
|
printf("User-agent: %s\n", server->useragent ? server->useragent : "(none)"); |
572 |
|
|
for (robotrule = server->robotrules; robotrule; robotrule = robotrule->next) |
573 |
|
|
{ |
574 |
|
|
printf("Disallow: %s\n", robotrule->disallow); |
575 |
|
|
} |
576 |
|
|
} |
577 |
|
|
|
578 |
|
|
#else |
579 |
|
|
|
580 |
|
|
/********************************************************/ |
581 |
|
|
/* "Public" functions */ |
582 |
|
|
/********************************************************/ |
583 |
|
|
|
584 |
|
|
/* The main entry point for the module. For fs.c, decides whether this |
585 |
|
|
** is a file or directory and routes to the correct routine. |
586 |
|
|
*/ |
587 |
|
|
void http_indexpath(SWISH * sw, char *url) |
588 |
|
|
{ |
589 |
|
|
urldepth *urllist = 0; |
590 |
|
|
urldepth *item; |
591 |
|
|
static int lentitle = 0; |
592 |
|
|
static char *title = NULL; |
593 |
|
|
char *tmptitle; |
594 |
|
|
static int lencontenttype = 0; |
595 |
|
|
static char *contenttype = NULL; |
596 |
|
|
int code; |
597 |
|
|
time_t last_modified = 0; |
598 |
|
|
|
599 |
|
|
httpserverinfo *server; |
600 |
|
|
char *link; |
601 |
|
|
char *p; |
602 |
|
|
FileProp *fprop; |
603 |
|
|
FILE *fp; |
604 |
|
|
struct MOD_Index *idx = sw->Index; |
605 |
|
|
|
606 |
|
|
char *file_prefix; // prefix for use with files written by swishspider -- should just be on the stack! |
607 |
|
|
char *file_suffix; // where to copy the suffix |
608 |
|
|
|
609 |
|
|
|
610 |
|
|
/* Initialize buffers */ |
611 |
|
|
|
612 |
|
|
|
613 |
|
|
file_prefix = emalloc( strlen(idx->tmpdir) + MAXPIDLEN + strlen("/swishspider@.contents+fill") ); |
614 |
|
|
sprintf(file_prefix, "%s/swishspider@%ld", idx->tmpdir, (long) lgetpid()); |
615 |
|
|
file_suffix = file_prefix + strlen( file_prefix ); |
616 |
|
|
|
617 |
|
|
|
618 |
|
|
if (!lentitle) |
619 |
|
|
title = emalloc((lentitle = MAXSTRLEN) + 1); |
620 |
|
|
|
621 |
|
|
if (!lencontenttype) |
622 |
|
|
contenttype = emalloc((lencontenttype = MAXSTRLEN) + 1); |
623 |
|
|
|
624 |
|
|
|
625 |
|
|
|
626 |
|
|
/* prime the pump with the first url */ |
627 |
|
|
urllist = add_url(sw, urllist, url, 0, url); |
628 |
|
|
|
629 |
|
|
|
630 |
|
|
|
631 |
|
|
/* retrieve each url and add urls to a certain depth */ |
632 |
|
|
|
633 |
|
|
while (urllist) |
634 |
|
|
{ |
635 |
|
|
item = urllist; |
636 |
|
|
urllist = urllist->next; |
637 |
|
|
|
638 |
|
|
if (sw->verbose >= 2) |
639 |
|
|
{ |
640 |
|
|
printf("retrieving %s (%d)...\n", item->url, item->depth); |
641 |
|
|
fflush(stdout); |
642 |
|
|
} |
643 |
|
|
|
644 |
|
|
/* We don't check if this url is legal here, because we do that before adding to the list. */ |
645 |
|
|
server = getserverinfo(sw, item->url); |
646 |
|
|
|
647 |
|
|
strcpy( file_suffix, "" ); // reset to just the prefix |
648 |
|
|
|
649 |
|
|
if ((code = get(sw, contenttype, &last_modified, &server->lastretrieval, file_prefix, item->url)) == 200) |
650 |
|
|
{ |
651 |
|
|
/* Set the file_prefix to be the path to "contents" */ |
652 |
|
|
strcpy( file_suffix, ".contents" ); |
653 |
|
|
|
654 |
|
|
|
655 |
|
|
/* Patch from Steve van der Burg */ |
656 |
|
|
/* change from strcmp to strncmp */ |
657 |
|
|
|
658 |
|
|
|
659 |
|
|
/* Fetch title from doc if it's HTML */ |
660 |
|
|
|
661 |
|
|
if (strncmp(contenttype, "text/html", 9) == 0) |
662 |
|
|
title = SafeStrCopy(title, (char *) (tmptitle = parseHTMLtitle(sw , file_prefix)), &lentitle); |
663 |
|
|
else |
664 |
|
|
if ((p = strrchr(item->url, '/'))) |
665 |
|
|
title = SafeStrCopy(title, p + 1, &lentitle); |
666 |
|
|
else |
667 |
|
|
title = SafeStrCopy(title, item->url, &lentitle); |
668 |
|
|
|
669 |
|
|
|
670 |
|
|
/* Now index the file */ |
671 |
|
|
|
672 |
|
|
/* What to do with non text files?? */ |
673 |
|
|
if ( strncmp(contenttype, "text/", 5) == 0 ) |
674 |
|
|
{ |
675 |
|
|
fprop = file_properties(item->url, file_prefix, sw); |
676 |
|
|
fprop->mtime = last_modified; |
677 |
|
|
|
678 |
|
|
/* only index contents of text docs */ |
679 |
|
|
// this would just index the path name |
680 |
|
|
//fprop->index_no_content = strncmp(contenttype, "text/", 5); |
681 |
|
|
|
682 |
|
|
do_index_file(sw, fprop); |
683 |
|
|
free_file_properties(fprop); |
684 |
|
|
} |
685 |
|
|
else if (sw->verbose >= 3) |
686 |
|
|
printf("Skipping %s: Wrong content type: %s.\n", url, contenttype); |
687 |
|
|
|
688 |
|
|
|
689 |
|
|
|
690 |
|
|
|
691 |
|
|
/* add new links as extracted by the spider */ |
692 |
|
|
|
693 |
|
|
if (strncmp(contenttype, "text/html", 9) == 0) |
694 |
|
|
{ |
695 |
|
|
strcpy( file_suffix, ".links" ); |
696 |
|
|
|
697 |
|
|
if ((fp = fopen(file_prefix, F_READ_TEXT)) != NULL) |
698 |
|
|
{ |
699 |
|
|
/* URLs can get quite large so don't depend on a fixed size buffer */ |
700 |
|
|
|
701 |
|
|
while ((link = readline(fp)) != NULL) |
702 |
|
|
{ |
703 |
|
|
*(link + strlen(link) - 1) = '\0'; |
704 |
|
|
urllist = add_url(sw, urllist, link, item->depth + 1, url); |
705 |
|
|
} |
706 |
|
|
fclose(fp); |
707 |
|
|
} |
708 |
|
|
} |
709 |
|
|
|
710 |
|
|
} |
711 |
|
|
else if ((code / 100) == 3) |
712 |
|
|
{ |
713 |
|
|
if ( *contenttype ) |
714 |
|
|
urllist = add_url(sw, urllist, contenttype, item->depth, url); |
715 |
|
|
else |
716 |
|
|
if (sw->verbose >= 3) |
717 |
|
|
printf("URL '%s' returned redirect code %d without a Location.\n", url, code); |
718 |
|
|
} |
719 |
|
|
|
720 |
|
|
|
721 |
|
|
|
722 |
|
|
/* Clean up the files left by swishspider */ |
723 |
|
|
cmdf(unlink, "%s/swishspider@%ld.response", idx->tmpdir, lgetpid()); |
724 |
|
|
cmdf(unlink, "%s/swishspider@%ld.contents", idx->tmpdir, lgetpid()); |
725 |
|
|
cmdf(unlink, "%s/swishspider@%ld.links", idx->tmpdir, lgetpid()); |
726 |
|
|
} |
727 |
|
|
efree(file_prefix); |
728 |
|
|
} |
729 |
|
|
|
730 |
|
|
#endif |
731 |
|
|
|
732 |
|
|
|
733 |
|
|
|
734 |
|
|
|
735 |
|
|
struct _indexing_data_source_def HTTPIndexingDataSource = { |
736 |
|
|
"HTTP-Crawler", |
737 |
|
|
"http", |
738 |
|
|
http_indexpath, |
739 |
|
|
configModule_HTTP |
740 |
|
|
}; |