/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/src/httpserver.c
ViewVC logotype

Annotation of /mitgcm.org/devel/buildweb/pkg/swish-e/src/httpserver.c

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1 - (hide annotations) (download)
Fri Sep 20 19:47:29 2002 UTC (22 years, 10 months ago) by adcroft
Branch point for: Import, MAIN
File MIME type: text/plain
Initial revision

1 adcroft 1.1 /*
2     $Id: httpserver.c,v 1.10 2002/08/14 22:08:48 whmoseley Exp $
3     **
4     ** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
5     ** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
6     **
7     ** This program and library is free software; you can redistribute it and/or
8     ** modify it under the terms of the GNU (Library) General Public License
9     ** as published by the Free Software Foundation; either version 2
10     ** of the License, or any later version.
11     **
12     ** This program is distributed in the hope that it will be useful,
13     ** but WITHOUT ANY WARRANTY; without even the implied warranty of
14     ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15     ** GNU (Library) General Public License for more details.
16     **
17     ** You should have received a copy of the GNU (Library) General Public License
18     ** along with this program; if not, write to the Free Software
19     ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
20     **--------------------------------------------------------------------
21     ** All the code in this file added by Ron Klachko ron@ckm.ucsf.edu 9/98
22     **
23     ** change sprintf to snprintf to avoid corruption
24     ** SRE 11/17/99
25     **
26     ** fixed cast to int problems pointed out by "gcc -Wall"
27     ** SRE 2/22/00
28     **
29     */
30    
31     /*
32     ** httpserver.c
33     */
34    
35     #ifndef _WIN32
36     #include <unistd.h>
37     #endif
38    
39     #include <time.h>
40     #include <stdarg.h>
41    
42     #include "swish.h"
43     #include "mem.h"
44     #include "string.h"
45     #include "index.h"
46    
47     #include "http.h"
48     #include "httpserver.h"
49     #include "file.h"
50    
51    
52     /* The list of servers that we are acting on.
53     **/
54     static httpserverinfo *servers = 0;
55    
56    
57     static void parserobotstxt(char *robots_buffer, int buflen, httpserverinfo *server);
58     static char *isolatevalue(char *line, char *keyword, int *plen);
59     static int serverinlist(char *url, struct swline *list);
60    
61    
62    
63     /* Find the robot rules for this URL. If haven't retrieved them
64     ** yet, do so now.
65     **/
66     httpserverinfo *getserverinfo(SWISH *sw, char *url)
67     {
68     httpserverinfo *server;
69     char *method;
70     int methodlen;
71     char *serverport;
72     int serverportlen;
73     static int lencontenttype=0;
74     static char *contenttype=NULL;
75     static int lenbuffer=0;
76     static char *buffer=NULL;
77     FILE *fp;
78     struct MOD_Index *idx = sw->Index;
79     time_t last_modified;
80    
81     // argh, this is ugly
82     char *file_prefix; // prefix for use with files written by swishspider -- should just be on the stack!
83    
84    
85     if(!lenbuffer)buffer=emalloc((lenbuffer=MAXSTRLEN)+1);
86     if(!lencontenttype)contenttype=emalloc((lencontenttype=MAXSTRLEN)+1);
87    
88     if ((method = url_method(url, &methodlen)) == 0) {
89     return 0;
90     }
91     if ((serverport = url_serverport(url, &serverportlen)) == 0) {
92     return 0;
93     }
94    
95     /* Search for the rules
96     **/
97     for (server = servers; server; server = server->next) {
98     if (equivalentserver(sw, url, server->baseurl)) {
99     return server;
100     }
101     }
102    
103     /* Create a new entry for this server and add it to the list.
104     **/
105     server = (httpserverinfo *)emalloc(sizeof(httpserverinfo));
106    
107     /* +3 for the ://, +1 for the trailing /, +1 for the terminating null
108     **/
109     server->baseurl = (char *)emalloc(methodlen + serverportlen + 5);
110     /* These 4 lines to avoid a call to non ANSI snprintf . May not be the
111     best way but it ensures no buffer overruns */
112     memcpy (server->baseurl,method,methodlen);
113     memcpy (server->baseurl+methodlen,"://",3);
114     memcpy (server->baseurl+methodlen+3,serverport,serverportlen);
115     strcpy (server->baseurl+methodlen+3+serverportlen,"/");
116    
117     server->lastretrieval = 0;
118     server->robotrules = 0;
119     server->next = servers;
120     servers = server;
121    
122     /* Only http(s) servers can full rules, all the other ones just get dummies
123     ** (this is useful for holding last retrieval)
124     **
125     ** http://info.webcrawler.com/mak/projects/robots/norobots.html holds what
126     ** many people consider the official web exclusion rules. Unfortunately,
127     ** the rules are not consistent about how records are formed. One line
128     ** states "the file consists of one or more records separated by one or more
129     ** blank lines" while another states "the record starts with one or more User-agent
130     ** lines, followed by one or more Disallow lines."
131     **
132     ** So, does a blank line after a User-agent line end a record? The spec is
133     ** unclear on this matter. If the next legal line afer the blank line is
134     ** a Disallow line, the blank line should most likely be ignored. But what
135     ** if the next line is another User-agent line? For example:
136     **
137     ** User-agent: MooBot
138     **
139     ** User-agent: CreepySpider
140     ** Disallow: /cgi-bin
141     **
142     ** One interpretation (based on blank lines termination records) is that MooBot
143     ** may visit any location (since there are no Disallows for it). Another
144     ** interpretation (based on records needing both User-agent and Disallow lines)
145     ** is that MooBot may not visit /cgi-bin
146     **
147     ** While poking around, I found at least one site (www.sun.com) that uses blank
148     ** lines within records. Because of that, I have decided to rely on records
149     ** having both User-agent and Disallow lines (the second interpretation above).
150     **/
151     if (strncmp(server->baseurl, "http", 4) == 0) {
152     if((int)(strlen(server->baseurl)+20)>=lenbuffer) {
153     lenbuffer=strlen(server->baseurl)+20+200;
154     buffer=erealloc(buffer,lenbuffer+1);
155     }
156     sprintf(buffer, "%srobots.txt", server->baseurl);
157    
158    
159    
160     file_prefix = emalloc( strlen(idx->tmpdir) + MAXPIDLEN + strlen("/swishspider@.contents+fill") );
161     sprintf(file_prefix, "%s/swishspider@%ld", idx->tmpdir, (long) lgetpid());
162    
163    
164     if (get(sw,contenttype, &last_modified, &server->lastretrieval, file_prefix, buffer) == 200)
165     {
166     char *robots_buffer;
167     int filelen;
168     int bytes_read;
169    
170     if((int)(strlen(idx->tmpdir)+MAXPIDLEN+30)>=lenbuffer) {
171     lenbuffer=strlen(idx->tmpdir)+MAXPIDLEN+30+200;
172     buffer=erealloc(buffer,lenbuffer+1);
173     }
174     sprintf(buffer, "%s/swishspider@%ld.contents", idx->tmpdir, (long)lgetpid());
175     fp = fopen(buffer, F_READ_TEXT);
176    
177     filelen = getsize(buffer);
178    
179     robots_buffer = emalloc( filelen + 1 );
180     *robots_buffer = '\0';
181     bytes_read = fread(robots_buffer, 1, filelen, fp);
182     robots_buffer[bytes_read] = '\0';
183     parserobotstxt( robots_buffer, bytes_read, server );
184    
185     efree( robots_buffer );
186    
187     //parserobotstxt(fp, server);
188     //fclose(fp);
189     }
190     efree( file_prefix );
191    
192     cmdf(unlink, "%s/swishspider@%ld.response", idx->tmpdir, lgetpid());
193     cmdf(unlink, "%s/swishspider@%ld.contents", idx->tmpdir, lgetpid());
194     cmdf(unlink, "%s/swishspider@%ld.links", idx->tmpdir, lgetpid());
195     }
196    
197     return server;
198     }
199    
200    
201     int urldisallowed(SWISH *sw, char *url)
202     {
203     httpserverinfo *server;
204     robotrules *rule;
205     char *uri;
206     int urilen;
207    
208     if ((server = getserverinfo(sw, url)) == 0) {
209     return 1;
210     }
211     if ((uri = url_uri(url, &urilen)) == 0) {
212     return 1;
213     }
214    
215     for (rule = server->robotrules; rule; rule = rule->next) {
216     if (strncmp(uri, rule->disallow, strlen(rule->disallow)) == 0) {
217     return 1;
218     }
219     }
220    
221     return 0;
222     }
223    
// Line splitter that copes with Unix, Mac and Windows line endings.
//
// Pass in:
//   char **next_start == in: where to resume scanning; out: first byte after
//                        the terminator written for the returned line
//                        (left untouched when NULL is returned).
//   char *last_char   == pointer to the last valid char in the buffer.  The
//                        buffer MUST have room for one more char after it,
//                        because the terminator may be written there.
//
// Returns the start of the next non-empty line, NUL-terminated in place,
// or NULL when there are no more lines.

static char *next_line( char **next_start, char *last_char )
{
    char *cursor;
    char *line;

    // Step over any run of line terminators (and stray NUL bytes).
    for ( cursor = *next_start; cursor <= last_char; cursor++ )
    {
        if ( *cursor != '\0' && *cursor != '\n' && *cursor != '\r' )
            break;
    }

    if ( cursor > last_char )
        return NULL;

    line = cursor;

    // Advance to the first terminator, or one past the end of the data.
    for ( ; cursor <= last_char; cursor++ )
    {
        if ( *cursor == '\0' || *cursor == '\n' || *cursor == '\r' )
            break;
    }

    *cursor = '\0';             // cut the line off here
    *next_start = cursor + 1;   // resume just after the cut next time

    return line;
}
257    
258     static char useragent[] = "user-agent:";
259     static char disallow[] = "disallow:";
260     static char swishspider[] = "swishspider";
261    
262     static void parserobotstxt(char *robots_buffer, int buflen, httpserverinfo *server)
263     {
264     char *buffer;
265     char *bufend = robots_buffer + buflen -1; // last char of string
266     char *next_start = robots_buffer;
267    
268     enum {START, USERAGENT, DISALLOW} state = START;
269     enum {SPECIFIC, GENERIC, SKIPPING} useragentstate = SKIPPING;
270     char *p;
271     int len;
272     robotrules *entry;
273     robotrules *entry2;
274    
275     server->useragent = 0;
276    
277     buffer = NULL;
278    
279     while ( (buffer = next_line( &next_start, bufend ) ) )
280     {
281     if ( strchr( buffer, '#' ) )
282     *(strchr( buffer, '#' )) = '\0';
283    
284     if ((*buffer == '#') || (*buffer == '\0'))
285     continue;
286    
287    
288     if (strncasecmp(buffer, useragent, sizeof(useragent) - 1) == 0) {
289     switch (state) {
290     case DISALLOW:
291     /* Since we found our specific user-agent, we can
292     ** skip the rest of the file.
293     **/
294     if (useragentstate == SPECIFIC) {
295     return;
296     }
297    
298     useragentstate = SKIPPING;
299    
300     /* explict fallthrough */
301    
302     case START:
303     case USERAGENT:
304     state = USERAGENT;
305    
306     if (useragentstate != SPECIFIC) {
307     p = isolatevalue(buffer, useragent, &len);
308    
309     if ((len == (sizeof(swishspider) - 1)) &&
310     (strncasecmp(p, swishspider, sizeof(swishspider) - 1) == 0) ) {
311     useragentstate = SPECIFIC;
312    
313     /* We might have already parsed generic rules,
314     ** so clean them up if necessary.
315     */
316     if (server->useragent) {
317     efree(server->useragent);
318     }
319     for (entry = server->robotrules; entry; ) {
320     entry2 = entry->next;
321     efree(entry);
322     entry = entry2;
323     }
324     server->robotrules = 0;
325    
326     server->useragent = (char *)emalloc(len + 1);
327     strncpy(server->useragent, p, len);
328     *(server->useragent + len) = '\0';
329    
330     }
331     else if ((len == 1) && (*p == '*')) {
332     useragentstate = GENERIC;
333     server->useragent = (char *)emalloc(2);
334     strcpy(server->useragent, "*"); /* emalloc'd 2 bytes, no safestrcpy */
335     }
336    
337     }
338    
339    
340     break;
341    
342     }
343     }
344    
345     if (strncasecmp(buffer, disallow, sizeof(disallow) - 1) == 0) {
346     state = DISALLOW;
347     if (useragentstate != SKIPPING) {
348     p = isolatevalue(buffer, disallow, &len);
349     if (len) {
350     entry = (robotrules *)emalloc(sizeof(robotrules));
351     entry->next = server->robotrules;
352     server->robotrules = entry;
353     entry->disallow = (char *)emalloc(len + 1);
354     strncpy(entry->disallow, p, len);
355     *(entry->disallow + len) = '\0';
356     }
357     }
358     }
359     }
360     }
361    
362    
/* Locate the value that follows `keyword` at the start of `line`.
**
** line     NUL-terminated text that begins with the keyword (the caller
**          has already matched it); it is not modified.
** keyword  only its length is used, to step past it.
** plen     receives the length of the value with trailing whitespace
**          excluded; never negative, 0 when the value is empty.
**
** Returns a pointer into `line` at the first non-whitespace character
** after the keyword.  The value is NOT NUL-terminated at *plen.
**/
static char *isolatevalue(char *line, char *keyword, int *plen)
{
    int len;

    /* Find the beginning of the value
    **/
    line += strlen(keyword);
    while (isspace((int)((unsigned char)*line))) {
        line++;
    }

    /* Strip off trailing spaces.  Stop at 0 so an empty value (for example
    ** "Disallow: ") never looks at bytes in front of the value or yields a
    ** negative length.
    **/
    len = (int)strlen(line);
    while (len > 0 && isspace((int)((unsigned char)line[len - 1]))) {
        len--;
    }

    *plen = len;
    return line;
}
377    
378    
379     int equivalentserver(SWISH *sw, char *url, char *baseurl)
380     {
381     char *method;
382     int methodlen;
383     char *serverport;
384     int serverportlen;
385     char *basemethod;
386     int basemethodlen;
387     char *baseserverport;
388     int baseserverportlen;
389     struct multiswline *walk=NULL;
390     struct MOD_HTTP *http = sw->HTTP;
391    
392     method = url_method(url, &methodlen);
393     serverport = url_serverport(url, &serverportlen);
394     basemethod = url_method(baseurl, &basemethodlen);
395     baseserverport = url_serverport(baseurl, &baseserverportlen);
396    
397     if (!method || !serverport || !basemethod || !baseserverport) {
398     return 0;
399     }
400    
401     /* If this is the same server, we just go for it
402     **/
403     if ((methodlen == basemethodlen) && (serverportlen == baseserverportlen) &&
404     (strncasecmp(method, basemethod, methodlen) == 0) &&
405     (strncasecmp(serverport, baseserverport, serverportlen) == 0)) {
406     return 1;
407     }
408    
409     /* Do we find the method/server info for this and the base url
410     ** in the same equivalence list?
411     **/
412     for (walk = http->equivalentservers; walk; walk = walk->next ) {
413     if (serverinlist(url, walk->list) &&
414     serverinlist(baseurl, walk->list)) {
415     return 1;
416     }
417     }
418    
419     return 0;
420     }
421    
422    
423     static int serverinlist(char *url, struct swline *list)
424     {
425     char *method;
426     int methodlen;
427     char *serverport;
428     int serverportlen;
429     char *listmethod;
430     int listmethodlen;
431     char *listserverport;
432     int listserverportlen;
433    
434     method = url_method(url, &methodlen);
435     serverport = url_serverport(url, &serverportlen);
436     if (!method || !serverport) {
437     return 0;
438     }
439    
440     for ( ; list; list = list->next) {
441     listmethod = url_method(list->line, &listmethodlen);
442     listserverport = url_serverport(list->line, &listserverportlen);
443     if (listmethod && listserverport) {
444     if ((methodlen == listmethodlen) && (serverportlen == listserverportlen) &&
445     (strncasecmp(method, listmethod, methodlen) == 0) &&
446     (strncasecmp(serverport, listserverport, serverportlen) == 0)) {
447     return 1;
448     }
449     }
450     }
451     return 0;
452     }
453    

  ViewVC Help
Powered by ViewVC 1.1.22