Annotation of /mitgcm.org/devel/buildweb/pkg/swish-e/src/http.c

Revision 1.1
Fri Sep 20 19:47:29 2002 UTC by adcroft
Branch point for: Import, MAIN
File MIME type: text/plain
Initial revision

/*
** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
**
** This program and library is free software; you can redistribute it and/or
** modify it under the terms of the GNU (Library) General Public License
** as published by the Free Software Foundation; either version 2
** of the License, or any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU (Library) General Public License for more details.
**
** You should have received a copy of the GNU (Library) General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
**--------------------------------------------------------------------
** All the code in this file added by Ron Klachko ron@ckm.ucsf.edu 9/98
**
** change sprintf to snprintf to avoid corruption,
** test length of spiderdirectory before strcat to avoid corruption,
** added safestrcpy() macro to avoid corruption from strcpy overflow,
** define MAXPIDLEN instead of literal "32" - assumed return length from lgetpid()
** SRE 11/17/99
**
** added buffer size arg to grabStringValue - core dumping from overrun
** SRE 2/22/00
**
** 2000-11 jruiz,rasc some redesign
*/

/*
** http.c
*/

#ifdef HAVE_CONFIG_H
#include "acconfig.h"
#endif

#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif

#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif

#ifdef HAVE_PROCESS_H
#include <process.h>
#endif

#include <time.h>
#include <stdarg.h>

// for wait
#ifndef _WIN32
#include <sys/types.h>
#include <sys/wait.h>
#endif

#include "swish.h"
#include "mem.h"
#include "string.h"
#include "index.h"
#include "hash.h"
#include "file.h"
#include "check.h"
#include "error.h"

#include "http.h"
#include "httpserver.h"

#include "xml.h"
#include "txt.h"
#include "html.h"

/*
-- init structures for this module
*/

void initModule_HTTP(SWISH * sw)
{
    struct MOD_HTTP *http;
    int i;

    http = (struct MOD_HTTP *) emalloc(sizeof(struct MOD_HTTP));

    sw->HTTP = http;

    http->lenspiderdirectory = MAXSTRLEN;
    http->spiderdirectory = (char *) emalloc(http->lenspiderdirectory + 1);
    http->spiderdirectory[0] = '\0';
    /* Initialize spider directory */
    http->spiderdirectory = SafeStrCopy(http->spiderdirectory, SPIDERDIRECTORY, &http->lenspiderdirectory);

    for (i = 0; i < BIGHASHSIZE; i++)
        http->url_hash[i] = NULL;

    http->equivalentservers = NULL;

    /* http default system parameters */
    http->maxdepth = 5;
    http->delay = 60;
}

void freeModule_HTTP(SWISH * sw)
{
    struct MOD_HTTP *http = sw->HTTP;

    if (http->spiderdirectory)
        efree(http->spiderdirectory);
    efree(http);
    sw->HTTP = NULL;
}

int configModule_HTTP(SWISH * sw, StringList * sl)
{
    struct MOD_HTTP *http = sw->HTTP;
    char *w0 = sl->word[0];
    int retval = 1;

    int i;
    struct multiswline *list;
    struct swline *slist;

    if (strcasecmp(w0, "maxdepth") == 0)
    {
        if (sl->n == 2)
        {
            retval = 1;
            http->maxdepth = atoi(sl->word[1]);
        }
        else
            progerr("MaxDepth requires one value");
    }
    else if (strcasecmp(w0, "delay") == 0)
    {
        if (sl->n == 2)
        {
            retval = 1;
            http->delay = atoi(sl->word[1]);
        }
        else
            progerr("Delay requires one value");
    }
    else if (strcasecmp(w0, "spiderdirectory") == 0)
    {
        if (sl->n == 2)
        {
            retval = 1;
            http->spiderdirectory = erealloc(http->spiderdirectory, strlen(sl->word[1]) + 2);
            strcpy(http->spiderdirectory, sl->word[1]);
            normalize_path(http->spiderdirectory);

            if (!isdirectory(http->spiderdirectory))
            {
                progerr("SpiderDirectory. %s is not a directory", http->spiderdirectory);
            }

            if (strlen(http->spiderdirectory) != 1 || http->spiderdirectory[0] != '/')
                strcat(http->spiderdirectory, "/");     /* In this case, we just add the delimiter */
        }
        else
            progerr("SpiderDirectory requires one value");
    }
    else if (strcasecmp(w0, "equivalentserver") == 0)
    {
        if (sl->n > 1)
        {
            retval = 1;
            /* Add a new list of equivalent servers */
            list = (struct multiswline *) emalloc(sizeof(struct multiswline));

            list->next = http->equivalentservers;
            list->list = 0;
            http->equivalentservers = list;

            for (i = 1; i < sl->n; i++)
            {
                /* Add a new entry to this list */
                slist = (struct swline *) emalloc(sizeof(struct swline));

                slist->line = estrdup(sl->word[i]);
                slist->next = list->list;
                list->list = slist;
            }
        }
        else
            progerr("EquivalentServers requires at least one value");
    }
    else
    {
        retval = 0;
    }

    return retval;
}
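
/*
** Example configuration directives handled above -- an illustrative
** sketch based on the strcasecmp() checks (see the swish-e docs for
** the authoritative syntax; the values here are invented):
**
**   MaxDepth 5
**   Delay 60
**   SpiderDirectory /usr/local/lib/swish-e/
**   EquivalentServer http://library.example.edu http://www.library.example.edu
*/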
typedef struct urldepth
{
    char *url;
    int depth;
    struct urldepth *next;
}
urldepth;


int http_already_indexed(SWISH * sw, char *url);
urldepth *add_url(SWISH * sw, urldepth * list, char *url, int depth, char *baseurl);


urldepth *add_url(SWISH * sw, urldepth * list, char *url, int depth, char *baseurl)
{
    urldepth *item;
    struct MOD_HTTP *http = sw->HTTP;

    if (!equivalentserver(sw, url, baseurl))
    {
        if (sw->verbose >= 3)
            printf("Skipping %s: %s\n", url, "Wrong method or server.");
    }
    else if (http->maxdepth && (depth >= http->maxdepth))
    {
        if (sw->verbose >= 3)
            printf("Skipping %s: %s\n", url, "Too deep.");
    }
    else if (sw->nocontentslist && isoksuffix(url, sw->nocontentslist))
    {
        if (sw->verbose >= 3)
            printf("Skipping %s: %s\n", url, "Wrong suffix.");
    }
    else if (urldisallowed(sw, url))
    {
        if (sw->verbose >= 3)
            printf("Skipping %s: %s\n", url, "URL disallowed by robots.txt.");
    }
    else if (!http_already_indexed(sw, url))
    {
        item = (urldepth *) emalloc(sizeof(urldepth));
        item->url = estrdup(url);
        item->depth = depth;
#if 0
        /* Depth-first searching */
        item->next = list;
        list = item;
#else
        /* Breadth-first searching */
        item->next = 0;
        if (!list)
        {
            list = item;
        }
        else
        {
            urldepth *walk;

            for (walk = list; walk->next; walk = walk->next)
            {
            }
            walk->next = item;
        }
#endif
    }

    return list;
}
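
/*
** With the breadth-first branch above, new URLs are appended at the
** tail of the list, so the crawl visits every URL at depth N before
** any URL at depth N + 1.
*/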


/* Have we already indexed a file or directory?
** This function is used to avoid multiple index entries
** or endless looping due to symbolic links.
*/

int http_already_indexed(SWISH * sw, char *url)
{
    struct url_info *p;

    int len;
    unsigned hashval;
    struct MOD_HTTP *http = sw->HTTP;

    /* Hash on the URI alone. Depending on the equivalent
    ** servers, we may or may not base the decision on the
    ** entire URL or just the URI.
    */
    hashval = bighash(url_uri(url, &len));      /* Search hash for this file. */
    for (p = http->url_hash[hashval]; p != NULL; p = p->next)
        if ((strcmp(url, p->url) == 0) || (equivalentserver(sw, url, p->url) && (strcmp(url_uri(url, &len), url_uri(p->url, &len)) == 0)))
        {                       /* We found it. */
            if (sw->verbose >= 3)
                printf("Skipping %s: %s\n", url, "Already indexed.");
            return 1;
        }

    /* Not found, make new entry. */
    p = (struct url_info *) emalloc(sizeof(struct url_info));

    p->url = estrdup(url);
    p->next = http->url_hash[hashval];
    http->url_hash[hashval] = p;

    return 0;
}


char *url_method(char *url, int *plen)
{
    char *end;

    if ((end = strstr(url, "://")) == NULL)
    {
        return NULL;
    }
    *plen = end - url;
    return url;
}


char *url_serverport(char *url, int *plen)
{
    int methodlen;
    char *serverstart;
    char *serverend;

    if (url_method(url, &methodlen) == NULL)
    {
        return NULL;
    }

    /* +3 to skip over the "://" separator */
    serverstart = url + methodlen + 3;
    if ((serverend = strchr(serverstart, '/')) == NULL)
    {
        *plen = strlen(serverstart);
    }
    else
    {
        *plen = serverend - serverstart;
    }

    return serverstart;
}


char *url_uri(char *url, int *plen)
{
    if ((url = url_serverport(url, plen)) == 0)
    {
        return 0;
    }
    url += *plen;
    *plen = strlen(url);
    return url;
}
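
/*
** Illustrative decomposition (hypothetical URL):
**   url = "http://www.example.com:80/a/b.html"
**   url_method()     -> "http...",               *plen = 4
**   url_serverport() -> "www.example.com:80...", *plen = 18
**   url_uri()        -> "/a/b.html",             *plen = 9
** The returned pointers point into the original string, so only the
** first *plen characters belong to each component.
*/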
/************************************************************
*
* Fork and exec a program, and wait for the child to exit.
* Returns when the child has exited; aborts via progerr()
* or progerrno() on fork/exec failure.
*
*************************************************************/
#ifndef _WIN32
static void run_program(char* prog, char** args)
{
    pid_t pid = fork();
    int status;

    if (pid == (pid_t) -1)
        progerrno("Failed to fork '%s'. Error: ", prog);

    /* In parent, wait for child */
    if (pid)
    {
        wait(&status);
        if (WIFEXITED(status))  /* child exited normally */
            return;

        progerr("%s exited with non-zero status (%d)", prog, WEXITSTATUS(status));
    }

    /* In child: execvp() only returns on failure */
    execvp(prog, args);
    progerrno("Failed to exec '%s'. Error: ", prog);
}
#endif

/************************************************************
*
* Fetch a URL
* Side effect: appends ".response" to file_prefix
* -- lazy programmer hoping that -S http will go away...
*
* Under Windows system() is used to call "perl"
* Otherwise, exec is called on the swishspider program
*
*************************************************************/
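
/*
** The swishspider helper writes its results to <file_prefix>.response
** (plus the .contents and .links files read in http_indexpath()).
** Judging from the parsing below, a successful .response file looks
** roughly like this sketch (values invented for illustration):
**
**   200
**   text/html
**   1032546449      <- Last-Modified as seconds since the epoch
**
** For 3xx codes the second line carries the redirect location instead.
*/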

int get(SWISH * sw, char *contenttype_or_redirect, time_t *last_modified, time_t * plastretrieval, char *file_prefix, char *url)
{
    int code = 500;
    FILE *fp;
    struct MOD_HTTP *http = sw->HTTP;

    /* Build path to swishspider program */
    char *spider_prog = emalloc( strlen(http->spiderdirectory) + strlen("swishspider+fill") );
    sprintf(spider_prog, "%sswishspider", http->spiderdirectory ); // note that spiderdir MUST be set.

    /* Sleep a little so we don't overwhelm the server */
    if ((time(0) - *plastretrieval) < http->delay)
    {
        int num_sec = http->delay - (time(0) - *plastretrieval);
        sleep(num_sec);
    }

    *plastretrieval = time(0);

#ifdef _WIN32
    /* Should be in autoconf or obsoleted by extprog. - DLN 2001-11-05 */
    {
        int retval;
        char commandline[] = "perl %s %s \"%s\"";
        char *command = emalloc( strlen(commandline) + strlen(spider_prog) + strlen(file_prefix) + strlen(url) + 1 );

        sprintf(command, commandline, spider_prog, file_prefix, url);

        retval = system( command );
        efree( command );
        efree( spider_prog );

        printf("Returned %d\n", retval );

        if ( retval )
            return 500;
    }
#else
    {
        char *args[4];

        args[0] = spider_prog;
        args[1] = file_prefix;
        args[2] = url;
        args[3] = NULL;
        run_program( spider_prog, args );
        efree( spider_prog );
    }
#endif

    /* NAUGHTY SIDE EFFECT */
    strcat( file_prefix, ".response" );

    if ( !(fp = fopen(file_prefix, F_READ_TEXT)) )
    {
        progerrno("Failed to open file '%s': ", file_prefix );
    }
    else
    {
        char buffer[500];

        fgets(buffer, 400, fp);
        code = atoi(buffer);
        if ((code == 200) || ((code / 100) == 3))
        {
            /* read content-type or redirect location */
            fgets(contenttype_or_redirect, MAXSTRLEN, fp); /* more yuck */
            *(contenttype_or_redirect + strlen(contenttype_or_redirect) - 1) = '\0';
        }
        if (code == 200)
        {
            /* read last-mod time */
            fgets(buffer, 400, fp); /* more yuck */
            *last_modified = (time_t)strtol(buffer, NULL, 10); // go away http.c -- no error checking
        }

        fclose(fp);
    }

    return code;
}

int cmdf(int (*cmd) (const char *), char *fmt, char *string, pid_t pid)
{
    int rc;
    char *buffer;

    buffer = emalloc(strlen(fmt) + strlen(string) + sizeof(pid_t) * 8 + 1);

    sprintf(buffer, fmt, string, pid);

    rc = cmd(buffer);
    efree(buffer);
    return rc;
}

char *readline(FILE * fp)
{
    static char *buffer = 0;
    static int buffersize = 512;

    if (buffer == 0)
    {
        buffer = (char *) emalloc(buffersize);
    }

    /*
    * Try to read in the line
    */
    if (fgets(buffer, buffersize, fp) == NULL)
    {
        return NULL;
    }

    /*
    * Make sure we read the entire line. If not, double the buffer
    * size and try to read the rest
    */
    while (buffer[strlen(buffer) - 1] != '\n')
    {
        buffer = (char *) erealloc(buffer, buffersize * 2);

        /*
        * The easiest way to verify that this line is okay is to consider
        * the situation where the buffer is 2 bytes long. Since fgets()
        * always guarantees to write the trailing NUL, it will have essentially
        * used only 1 byte. We double the buffer to 4 bytes, so we now have
        * the leftover byte (that currently contains the NUL) in addition to
        * the doubling, which lets us read buffersize + 1 more characters.
        */
        if (fgets(buffer + buffersize - 1, buffersize + 1, fp) == 0)
        {
            break;
        }
        buffersize *= 2;
    }

    return buffer;
}
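
/*
** Note: readline() hands back a pointer to its static buffer, so the
** line is overwritten by the next call, and the trailing '\n' is left
** in place -- the caller in http_indexpath() strips it.
*/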


/* A local version of getpid() so that we don't have to suffer
** a system call each time we need it.
*/
pid_t lgetpid()
{
    static pid_t pid = -1;

    if (pid == -1)
    {
        pid = getpid();
    }
    return pid;
}

#if 0

/* Testing the robot rules parsing code...
**/
void http_indexpath(char *url)
{
    httpserverinfo *server = getserverinfo(url);
    robotrules *robotrule;

    printf("User-agent: %s\n", server->useragent ? server->useragent : "(none)");
    for (robotrule = server->robotrules; robotrule; robotrule = robotrule->next)
    {
        printf("Disallow: %s\n", robotrule->disallow);
    }
}

#else

/********************************************************/
/*                "Public" functions                    */
/********************************************************/

/* The main entry point for the module: crawl from the starting URL,
** retrieving and indexing documents and following their links up to
** MaxDepth.
*/
void http_indexpath(SWISH * sw, char *url)
{
    urldepth *urllist = 0;
    urldepth *item;
    static int lentitle = 0;
    static char *title = NULL;
    char *tmptitle;
    static int lencontenttype = 0;
    static char *contenttype = NULL;
    int code;
    time_t last_modified = 0;

    httpserverinfo *server;
    char *link;
    char *p;
    FileProp *fprop;
    FILE *fp;
    struct MOD_Index *idx = sw->Index;

    char *file_prefix;  // prefix for use with files written by swishspider -- should just be on the stack!
    char *file_suffix;  // where to copy the suffix

    /* Initialize buffers */

    file_prefix = emalloc( strlen(idx->tmpdir) + MAXPIDLEN + strlen("/swishspider@.contents+fill") );
    sprintf(file_prefix, "%s/swishspider@%ld", idx->tmpdir, (long) lgetpid());
    file_suffix = file_prefix + strlen( file_prefix );

    if (!lentitle)
        title = emalloc((lentitle = MAXSTRLEN) + 1);

    if (!lencontenttype)
        contenttype = emalloc((lencontenttype = MAXSTRLEN) + 1);

    /* prime the pump with the first url */
    urllist = add_url(sw, urllist, url, 0, url);

    /* retrieve each url and add urls to a certain depth */
    while (urllist)
    {
        item = urllist;
        urllist = urllist->next;

        if (sw->verbose >= 2)
        {
            printf("retrieving %s (%d)...\n", item->url, item->depth);
            fflush(stdout);
        }

        /* We don't check if this url is legal here, because we do that before adding to the list. */
        server = getserverinfo(sw, item->url);

        strcpy( file_suffix, "" ); // reset to just the prefix

        if ((code = get(sw, contenttype, &last_modified, &server->lastretrieval, file_prefix, item->url)) == 200)
        {
            /* Set the file_prefix to be the path to "contents" */
            strcpy( file_suffix, ".contents" );

            /* Patch from Steve van der Burg */
            /* change from strcmp to strncmp */

            /* Fetch title from doc if it's HTML */
            if (strncmp(contenttype, "text/html", 9) == 0)
                title = SafeStrCopy(title, (char *) (tmptitle = parseHTMLtitle(sw, file_prefix)), &lentitle);
            else if ((p = strrchr(item->url, '/')))
                title = SafeStrCopy(title, p + 1, &lentitle);
            else
                title = SafeStrCopy(title, item->url, &lentitle);

            /* Now index the file */

            /* What to do with non text files?? */
            if ( strncmp(contenttype, "text/", 5) == 0 )
            {
                fprop = file_properties(item->url, file_prefix, sw);
                fprop->mtime = last_modified;

                /* only index contents of text docs */
                // this would just index the path name
                //fprop->index_no_content = strncmp(contenttype, "text/", 5);

                do_index_file(sw, fprop);
                free_file_properties(fprop);
            }
            else if (sw->verbose >= 3)
                printf("Skipping %s: Wrong content type: %s.\n", item->url, contenttype);

            /* add new links as extracted by the spider */
            if (strncmp(contenttype, "text/html", 9) == 0)
            {
                strcpy( file_suffix, ".links" );

                if ((fp = fopen(file_prefix, F_READ_TEXT)) != NULL)
                {
                    /* URLs can get quite large so don't depend on a fixed size buffer */
                    while ((link = readline(fp)) != NULL)
                    {
                        *(link + strlen(link) - 1) = '\0';
                        urllist = add_url(sw, urllist, link, item->depth + 1, url);
                    }
                    fclose(fp);
                }
            }
        }
        else if ((code / 100) == 3)
        {
            if ( *contenttype )
                urllist = add_url(sw, urllist, contenttype, item->depth, url);
            else if (sw->verbose >= 3)
                printf("URL '%s' returned redirect code %d without a Location.\n", item->url, code);
        }

        /* Clean up the files left by swishspider */
        cmdf(unlink, "%s/swishspider@%ld.response", idx->tmpdir, lgetpid());
        cmdf(unlink, "%s/swishspider@%ld.contents", idx->tmpdir, lgetpid());
        cmdf(unlink, "%s/swishspider@%ld.links", idx->tmpdir, lgetpid());
    }
    efree(file_prefix);
}

#endif


struct _indexing_data_source_def HTTPIndexingDataSource = {
    "HTTP-Crawler",
    "http",
    http_indexpath,
    configModule_HTTP
};
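
/*
** This struct registers the module as an indexing data source
** ("-S http" on the command line, per the comment in get() above):
** http_indexpath() is the crawl entry point and configModule_HTTP()
** handles this module's configuration directives.
*/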
