/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/src/http.c
Revision 1.1.1.1 (vendor branch)
Fri Sep 20 19:47:29 2002 UTC by adcroft
Branch: Import, MAIN
CVS tags: baseline, HEAD
Changes since 1.1: +0 -0 lines
File MIME type: text/plain
Log message: Importing web-site building process.

/*
** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
**
** This program and library is free software; you can redistribute it and/or
** modify it under the terms of the GNU (Library) General Public License
** as published by the Free Software Foundation; either version 2
** of the License, or any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU (Library) General Public License for more details.
**
** You should have received a copy of the GNU (Library) General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
**--------------------------------------------------------------------
** All the code in this file added by Ron Klachko ron@ckm.ucsf.edu 9/98
**
** changed sprintf to snprintf to avoid corruption,
** test length of spiderdirectory before strcat to avoid corruption,
** added safestrcpy() macro to avoid corruption from strcpy overflow,
** defined MAXPIDLEN instead of literal "32" - assumed return length from lgetpid()
** SRE 11/17/99
**
** added buffer size arg to grabStringValue - core dumping from overrun
** SRE 2/22/00
**
** 2000-11 jruiz,rasc some redesign
*/

/*
** http.c
*/

#ifdef HAVE_CONFIG_H
#include "acconfig.h"
#endif

#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif

#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif

#ifdef HAVE_PROCESS_H
#include <process.h>
#endif

#include <time.h>
#include <stdarg.h>

// for wait
#ifndef _WIN32
#include <sys/types.h>
#include <sys/wait.h>
#endif

#include "swish.h"
#include "mem.h"
#include "string.h"
#include "index.h"
#include "hash.h"
#include "file.h"
#include "check.h"
#include "error.h"

#include "http.h"
#include "httpserver.h"

#include "xml.h"
#include "txt.h"
#include "html.h"

/*
-- init structures for this module
*/

void initModule_HTTP(SWISH * sw)
{
    struct MOD_HTTP *http;
    int i;

    http = (struct MOD_HTTP *) emalloc(sizeof(struct MOD_HTTP));

    sw->HTTP = http;

    http->lenspiderdirectory = MAXSTRLEN;
    http->spiderdirectory = (char *) emalloc(http->lenspiderdirectory + 1);
    http->spiderdirectory[0] = '\0';
    /* Initialize spider directory */
    http->spiderdirectory = SafeStrCopy(http->spiderdirectory, SPIDERDIRECTORY, &http->lenspiderdirectory);

    for (i = 0; i < BIGHASHSIZE; i++)
        http->url_hash[i] = NULL;

    http->equivalentservers = NULL;

    /* http default system parameters */
    http->maxdepth = 5;
    http->delay = 60;
}

void freeModule_HTTP(SWISH * sw)
{
    struct MOD_HTTP *http = sw->HTTP;

    if (http->spiderdirectory)
        efree(http->spiderdirectory);
    efree(http);
    sw->HTTP = NULL;
}

int configModule_HTTP(SWISH * sw, StringList * sl)
{
    struct MOD_HTTP *http = sw->HTTP;
    char *w0 = sl->word[0];
    int retval = 1;

    int i;
    struct multiswline *list;
    struct swline *slist;

    if (strcasecmp(w0, "maxdepth") == 0)
    {
        if (sl->n == 2)
        {
            retval = 1;
            http->maxdepth = atoi(sl->word[1]);
        }
        else
            progerr("MaxDepth requires one value");
    }
    else if (strcasecmp(w0, "delay") == 0)
    {
        if (sl->n == 2)
        {
            retval = 1;
            http->delay = atoi(sl->word[1]);
        }
        else
            progerr("Delay requires one value");
    }
    else if (strcasecmp(w0, "spiderdirectory") == 0)
    {
        if (sl->n == 2)
        {
            retval = 1;
            /* +2 leaves room for a trailing '/' plus the terminating NUL */
            http->spiderdirectory = erealloc(http->spiderdirectory, strlen(sl->word[1]) + 2);
            strcpy(http->spiderdirectory, sl->word[1]);
            normalize_path(http->spiderdirectory);

            if (!isdirectory(http->spiderdirectory))
            {
                progerr("SpiderDirectory. %s is not a directory", http->spiderdirectory);
            }

            if (strlen(http->spiderdirectory) != 1 || http->spiderdirectory[0] != '/')
                strcat(http->spiderdirectory, "/");  /* In this case, we just add the delimiter */
        }
        else
            progerr("SpiderDirectory requires one value");
    }
    else if (strcasecmp(w0, "equivalentserver") == 0)
    {
        if (sl->n > 1)
        {
            retval = 1;
            /* Add a new list of equivalent servers */
            list = (struct multiswline *) emalloc(sizeof(struct multiswline));

            list->next = http->equivalentservers;
            list->list = 0;
            http->equivalentservers = list;

            for (i = 1; i < sl->n; i++)
            {
                /* Add a new entry to this list */
                slist = (struct swline *) emalloc(sizeof(struct swline));

                slist->line = estrdup(sl->word[i]);
                slist->next = list->list;
                list->list = slist;
            }
        }
        else
            progerr("EquivalentServers requires at least one value");
    }
    else
    {
        retval = 0;
    }

    return retval;
}
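
/*
** Editorial example (not part of the original source): the directives
** handled above would appear in a swish-e configuration file roughly as
** follows. The server names are placeholders, and the first two values
** simply restate the defaults set in initModule_HTTP().
**
**   MaxDepth 5
**   Delay 60
**   SpiderDirectory /usr/local/lib/swish-e/
**   EquivalentServer http://www.example.com http://example.com
*/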

typedef struct urldepth
{
    char *url;
    int depth;
    struct urldepth *next;
}
urldepth;


int http_already_indexed(SWISH * sw, char *url);
urldepth *add_url(SWISH * sw, urldepth * list, char *url, int depth, char *baseurl);


urldepth *add_url(SWISH * sw, urldepth * list, char *url, int depth, char *baseurl)
{
    urldepth *item;
    struct MOD_HTTP *http = sw->HTTP;

    if (!equivalentserver(sw, url, baseurl))
    {
        if (sw->verbose >= 3)
            printf("Skipping %s: %s\n", url, "Wrong method or server.");
    }
    else if (http->maxdepth && (depth >= http->maxdepth))
    {
        if (sw->verbose >= 3)
            printf("Skipping %s: %s\n", url, "Too deep.");
    }
    else if (sw->nocontentslist && isoksuffix(url, sw->nocontentslist))
    {
        if (sw->verbose >= 3)
            printf("Skipping %s: %s\n", url, "Wrong suffix.");
    }
    else if (urldisallowed(sw, url))
    {
        if (sw->verbose >= 3)
            printf("Skipping %s: %s\n", url, "URL disallowed by robots.txt.");
    }
    else if (!http_already_indexed(sw, url))
    {
        item = (urldepth *) emalloc(sizeof(urldepth));
        item->url = estrdup(url);
        item->depth = depth;
#if 0
        /* Depth first searching */
        item->next = list;
        list = item;
#else
        /* Breadth first searching */
        item->next = 0;
        if (!list)
        {
            list = item;
        }
        else
        {
            urldepth *walk;

            /* walk to the tail of the list and append */
            for (walk = list; walk->next; walk = walk->next)
            {
            }
            walk->next = item;
        }
#endif
    }

    return list;
}
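
/*
** Editorial note: with the breadth-first branch compiled in above, new
** URLs are appended at the tail while http_indexpath() pops from the
** head, so the list behaves as a FIFO queue -- all depth-0 pages are
** fetched before any depth-1 page, and so on. The disabled #if 0 branch
** pushes onto the head instead, which would give depth-first order.
*/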


/* Have we already indexed a file or directory?
** This function is used to avoid multiple index entries
** or endless looping due to symbolic links.
*/

int http_already_indexed(SWISH * sw, char *url)
{
    struct url_info *p;

    int len;
    unsigned hashval;
    struct MOD_HTTP *http = sw->HTTP;

    /* Hash via the URI alone. Depending on the equivalent
    ** servers, the comparison below may be made on the entire
    ** URL or on just the URI.
    */
    hashval = bighash(url_uri(url, &len)); /* Search hash for this file. */
    for (p = http->url_hash[hashval]; p != NULL; p = p->next)
        if ((strcmp(url, p->url) == 0) || (equivalentserver(sw, url, p->url) && (strcmp(url_uri(url, &len), url_uri(p->url, &len)) == 0)))
        {   /* We found it. */
            if (sw->verbose >= 3)
                printf("Skipping %s: %s\n", url, "Already indexed.");
            return 1;
        }

    /* Not found, make new entry. */
    p = (struct url_info *) emalloc(sizeof(struct url_info));

    p->url = estrdup(url);
    p->next = http->url_hash[hashval];
    http->url_hash[hashval] = p;

    return 0;
}
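
/*
** Editorial example: given "EquivalentServer http://www.example.com
** http://example.com", the URLs http://www.example.com/a.html and
** http://example.com/a.html share a hash bucket (both URIs are
** "/a.html") and compare equal through equivalentserver(), so the
** second URL is treated as already indexed.
*/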


char *url_method(char *url, int *plen)
{
    char *end;

    if ((end = strstr(url, "://")) == NULL)
    {
        return NULL;
    }
    *plen = end - url;
    return url;
}


char *url_serverport(char *url, int *plen)
{
    int methodlen;
    char *serverstart;
    char *serverend;

    if (url_method(url, &methodlen) == NULL)
    {
        return NULL;
    }

    /* +3 to skip over the "://" separator */
    serverstart = url + methodlen + 3;
    if ((serverend = strchr(serverstart, '/')) == NULL)
    {
        *plen = strlen(serverstart);
    }
    else
    {
        *plen = serverend - serverstart;
    }

    return serverstart;
}


char *url_uri(char *url, int *plen)
{
    if ((url = url_serverport(url, plen)) == 0)
    {
        return 0;
    }
    url += *plen;
    *plen = strlen(url);
    return url;
}
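
/*
** Editorial example of how the three helpers above decompose a URL
** (each returns a pointer into the original string and reports the
** component length through *plen; nothing is copied):
**
**   url:                "http://example.com:8080/docs/index.html"
**   url_method       -> "http"               (*plen = 4)
**   url_serverport   -> "example.com:8080"   (*plen = 16)
**   url_uri          -> "/docs/index.html"   (*plen = 16)
*/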

/************************************************************
*
* Fork and exec a program, and wait for the child to exit.
* Returns normally only if the child exits with status zero;
* otherwise aborts via progerr()/progerrno().
*
*************************************************************/
#ifndef _WIN32
static void run_program(char *prog, char **args)
{
    pid_t pid = fork();
    int status;

    /* In parent, wait for child */
    if (pid)
    {
        wait(&status);
        if (WIFEXITED(status) && WEXITSTATUS(status) == 0)  /* child exited normally with status zero */
            return;

        progerr("%s exited with non-zero status (%d)", prog, WEXITSTATUS(status));
    }

    /* In child: exec the program; reached only if exec fails */
    execvp(prog, args);
    progerrno("Failed to exec '%s'. Error: ", prog);
}
#endif

/************************************************************
*
* Fetch a URL.
* Side effect: appends ".response" to "file_prefix"
* -- lazy programmer hoping that -S http will go away...
*
* Under Windows system() is used to call "perl"
* Otherwise, exec is called on the swishspider program
*
*************************************************************/

int get(SWISH * sw, char *contenttype_or_redirect, time_t *last_modified, time_t *plastretrieval, char *file_prefix, char *url)
{
    int code = 500;
    FILE *fp;
    struct MOD_HTTP *http = sw->HTTP;

    /* Build path to swishspider program; the "+fill" padding covers the terminating NUL */
    char *spider_prog = emalloc(strlen(http->spiderdirectory) + strlen("swishspider+fill"));
    sprintf(spider_prog, "%sswishspider", http->spiderdirectory);  // note that spiderdirectory MUST be set

    /* Sleep a little so we don't overwhelm the server */
    if ((time(0) - *plastretrieval) < http->delay)
    {
        int num_sec = http->delay - (time(0) - *plastretrieval);
        sleep(num_sec);
    }

    *plastretrieval = time(0);

#ifdef _WIN32
    /* Should be in autoconf or obsoleted by extprog. - DLN 2001-11-05 */
    {
        int retval;
        char commandline[] = "perl %s %s \"%s\"";
        char *command = emalloc(strlen(commandline) + strlen(spider_prog) + strlen(file_prefix) + strlen(url) + 1);

        sprintf(command, commandline, spider_prog, file_prefix, url);

        retval = system(command);
        efree(command);
        efree(spider_prog);

        printf("Returned %d\n", retval);

        if (retval)
            return 500;
    }
#else
    {
        char *args[4];

        args[0] = spider_prog;
        args[1] = file_prefix;
        args[2] = url;
        args[3] = NULL;
        run_program(spider_prog, args);
        efree(spider_prog);
    }
#endif

    /* NAUGHTY SIDE EFFECT */
    strcat(file_prefix, ".response");

    if (!(fp = fopen(file_prefix, F_READ_TEXT)))
    {
        progerrno("Failed to open file '%s': ", file_prefix);
    }
    else
    {
        char buffer[500];

        /* first line: the HTTP status code */
        fgets(buffer, sizeof(buffer), fp);
        code = atoi(buffer);
        if ((code == 200) || ((code / 100) == 3))
        {
            /* second line: content type, or the redirect location for 3xx */
            fgets(contenttype_or_redirect, MAXSTRLEN, fp); /* more yuck */
            *(contenttype_or_redirect + strlen(contenttype_or_redirect) - 1) = '\0';
        }
        if (code == 200)
        {
            /* third line: last-modified time */
            fgets(buffer, sizeof(buffer), fp); /* more yuck */
            *last_modified = (time_t) strtol(buffer, NULL, 10); // go away http.c -- no error checking
        }

        fclose(fp);
    }

    return code;
}
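
/*
** Editorial note on the format consumed above: swishspider writes
** <file_prefix>.response with the HTTP status code on the first line;
** for 200 and 3xx responses the second line holds the content type (or
** the redirect Location), and for 200 responses the third line holds
** the Last-Modified time in seconds since the epoch. The document body
** and extracted links go to the .contents and .links files read in
** http_indexpath() below.
*/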

int cmdf(int (*cmd) (const char *), char *fmt, char *string, pid_t pid)
{
    int rc;
    char *buffer;

    /* sizeof(pid_t) * 8 bytes is ample for the decimal digits of a pid */
    buffer = emalloc(strlen(fmt) + strlen(string) + sizeof(pid_t) * 8 + 1);

    sprintf(buffer, fmt, string, pid);

    rc = cmd(buffer);
    efree(buffer);
    return rc;
}
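
/*
** Editorial usage example: cmdf() formats a printf-style argument pair
** and hands the result to any "int f(const char *)" function;
** http_indexpath() uses it with unlink() to remove the temporary
** spider files:
**
**   cmdf(unlink, "%s/swishspider@%ld.response", idx->tmpdir, lgetpid());
*/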

char *readline(FILE * fp)
{
    static char *buffer = 0;
    static int buffersize = 512;

    if (buffer == 0)
    {
        buffer = (char *) emalloc(buffersize);
    }

    /*
     * Try to read in the line
     */
    if (fgets(buffer, buffersize, fp) == NULL)
    {
        return NULL;
    }

    /*
     * Make sure we read the entire line. If not, double the buffer
     * size and try to read the rest
     */
    while (buffer[strlen(buffer) - 1] != '\n')
    {
        buffer = (char *) erealloc(buffer, buffersize * 2);

        /*
         * The easiest way to verify that this is okay is to consider
         * the case where the buffer is 2 bytes long. Since fgets()
         * always writes the trailing NUL, it has effectively used only
         * 1 byte. We double the buffer to 4 bytes, so we now have the
         * leftover byte (which currently holds the NUL) in addition to
         * the doubling, which lets us read buffersize + 1 more bytes.
         */
        if (fgets(buffer + buffersize - 1, buffersize + 1, fp) == 0)
        {
            break;
        }
        buffersize *= 2;
    }

    return buffer;
}
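
/*
** Editorial worked example of the growth step above, starting from the
** initial 512-byte buffer: erealloc() grows it to 1024 bytes, and
** fgets(buffer + 511, 513, fp) then writes at most 513 bytes starting
** at offset 511, i.e. up through offset 1023 -- the last byte of the
** enlarged buffer. The first byte it overwrites is the terminating NUL
** left by the previous fgets() call.
*/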


/* A local version of getpid() so that we don't have to suffer
** a system call each time we need it.
*/
pid_t lgetpid()
{
    static pid_t pid = -1;

    if (pid == -1)
    {
        pid = getpid();
    }
    return pid;
}

#if 0

/* Testing the robot rules parsing code...
**/
void http_indexpath(char *url)
{
    httpserverinfo *server = getserverinfo(url);
    robotrules *robotrule;

    printf("User-agent: %s\n", server->useragent ? server->useragent : "(none)");
    for (robotrule = server->robotrules; robotrule; robotrule = robotrule->next)
    {
        printf("Disallow: %s\n", robotrule->disallow);
    }
}

#else

/********************************************************/
/*              "Public" functions                      */
/********************************************************/

/* The main entry point for the module. (The fs.c equivalent decides
** whether its path is a file or directory and routes to the correct
** routine; here we crawl the URL list, indexing each page retrieved.)
*/
void http_indexpath(SWISH * sw, char *url)
{
    urldepth *urllist = 0;
    urldepth *item;
    static int lentitle = 0;
    static char *title = NULL;
    char *tmptitle;
    static int lencontenttype = 0;
    static char *contenttype = NULL;
    int code;
    time_t last_modified = 0;

    httpserverinfo *server;
    char *link;
    char *p;
    FileProp *fprop;
    FILE *fp;
    struct MOD_Index *idx = sw->Index;

    char *file_prefix;  // prefix for use with files written by swishspider -- should just be on the stack!
    char *file_suffix;  // where to copy the suffix

    /* Initialize buffers */

    file_prefix = emalloc(strlen(idx->tmpdir) + MAXPIDLEN + strlen("/swishspider@.contents+fill"));
    sprintf(file_prefix, "%s/swishspider@%ld", idx->tmpdir, (long) lgetpid());
    file_suffix = file_prefix + strlen(file_prefix);

    if (!lentitle)
        title = emalloc((lentitle = MAXSTRLEN) + 1);

    if (!lencontenttype)
        contenttype = emalloc((lencontenttype = MAXSTRLEN) + 1);

    /* prime the pump with the first url */
    urllist = add_url(sw, urllist, url, 0, url);

    /* retrieve each url and add urls to a certain depth */

    while (urllist)
    {
        item = urllist;
        urllist = urllist->next;

        if (sw->verbose >= 2)
        {
            printf("retrieving %s (%d)...\n", item->url, item->depth);
            fflush(stdout);
        }

        /* We don't check if this url is legal here, because we do that before adding to the list. */
        server = getserverinfo(sw, item->url);

        strcpy(file_suffix, "");  // reset to just the prefix

        if ((code = get(sw, contenttype, &last_modified, &server->lastretrieval, file_prefix, item->url)) == 200)
        {
            /* Set the file_prefix to be the path to "contents" */
            strcpy(file_suffix, ".contents");

            /* Patch from Steve van der Burg */
            /* change from strcmp to strncmp */

            /* Fetch title from doc if it's HTML */
            if (strncmp(contenttype, "text/html", 9) == 0)
                title = SafeStrCopy(title, (char *) (tmptitle = parseHTMLtitle(sw, file_prefix)), &lentitle);
            else if ((p = strrchr(item->url, '/')))
                title = SafeStrCopy(title, p + 1, &lentitle);
            else
                title = SafeStrCopy(title, item->url, &lentitle);

            /* Now index the file */

            /* What to do with non text files?? */
            if (strncmp(contenttype, "text/", 5) == 0)
            {
                fprop = file_properties(item->url, file_prefix, sw);
                fprop->mtime = last_modified;

                /* only index contents of text docs */
                // this would just index the path name
                //fprop->index_no_content = strncmp(contenttype, "text/", 5);

                do_index_file(sw, fprop);
                free_file_properties(fprop);
            }
            else if (sw->verbose >= 3)
                printf("Skipping %s: Wrong content type: %s.\n", item->url, contenttype);

            /* add new links as extracted by the spider */

            if (strncmp(contenttype, "text/html", 9) == 0)
            {
                strcpy(file_suffix, ".links");

                if ((fp = fopen(file_prefix, F_READ_TEXT)) != NULL)
                {
                    /* URLs can get quite large so don't depend on a fixed size buffer */
                    while ((link = readline(fp)) != NULL)
                    {
                        *(link + strlen(link) - 1) = '\0';  /* strip the trailing newline */
                        urllist = add_url(sw, urllist, link, item->depth + 1, url);
                    }
                    fclose(fp);
                }
            }
        }
        else if ((code / 100) == 3)
        {
            /* for a redirect, get() returns the Location in contenttype */
            if (*contenttype)
                urllist = add_url(sw, urllist, contenttype, item->depth, url);
            else if (sw->verbose >= 3)
                printf("URL '%s' returned redirect code %d without a Location.\n", item->url, code);
        }

        /* Clean up the files left by swishspider */
        cmdf(unlink, "%s/swishspider@%ld.response", idx->tmpdir, lgetpid());
        cmdf(unlink, "%s/swishspider@%ld.contents", idx->tmpdir, lgetpid());
        cmdf(unlink, "%s/swishspider@%ld.links", idx->tmpdir, lgetpid());

        /* done with this queue entry */
        efree(item->url);
        efree(item);
    }
    efree(file_prefix);
}

#endif




struct _indexing_data_source_def HTTPIndexingDataSource = {
    "HTTP-Crawler",
    "http",
    http_indexpath,
    configModule_HTTP
};
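
/*
** Editorial note: swish-e selects an indexing data source by the name
** supplied with -S (here "-S http"); the struct above binds that name
** to this module's entry point and configuration handler. The exact
** dispatch table lives elsewhere in the source and is assumed here.
*/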
