swish-e/src/httpserver.c

/*
$Id: httpserver.c,v 1.10 2002/08/14 22:08:48 whmoseley Exp $
**
** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
**
** This program and library is free software; you can redistribute it and/or
** modify it under the terms of the GNU (Library) General Public License
** as published by the Free Software Foundation; either version 2
** of the License, or any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU (Library) General Public License for more details.
**
** You should have received a copy of the GNU (Library) General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
**--------------------------------------------------------------------
** All the code in this file added by Ron Klachko ron@ckm.ucsf.edu 9/98
**
** change sprintf to snprintf to avoid corruption
** SRE 11/17/99
**
** fixed cast to int problems pointed out by "gcc -Wall"
** SRE 2/22/00
** 
*/

/*
** httpserver.c
*/

#ifndef _WIN32
#include <unistd.h>
#endif

#include <time.h>
#include <stdarg.h>

#include "swish.h"
#include "mem.h"
#include "string.h"
#include "index.h"

#include "http.h"
#include "httpserver.h"
#include "file.h"


/* The list of servers that we are acting on.
** This is the head of a singly linked list chained through ->next;
** getserverinfo() pushes every newly created entry onto the front.
**/
static httpserverinfo *servers = 0;


/* Forward declarations of the file-local helpers defined further down. */
static void parserobotstxt(char *robots_buffer, int buflen, httpserverinfo *server);
static char *isolatevalue(char *line, char *keyword, int *plen);
static int serverinlist(char *url, struct swline *list);


/* Find the robot rules for this URL.  If we haven't retrieved them
** yet, do so now.
**
** sw   - swish handle; sw->Index->tmpdir names the directory where the
**        spider's temporary files live.
** url  - URL whose server entry is wanted.
**
** Returns the server entry for the URL's method/server/port, or 0 when the
** method or server:port cannot be extracted from the URL.  The entry is
** linked into the file-level `servers` list; callers must not free it.
**
** The first time a server is seen, a new entry is created.  If its base URL
** starts with "http", <baseurl>robots.txt is fetched via get() and parsed.
** When the fetch does not return 200, or the downloaded file cannot be
** opened, the entry is left with no rules.
**/
httpserverinfo *getserverinfo(SWISH *sw, char *url)
{
    httpserverinfo *server;
    char *method;
    int methodlen;
    char *serverport;
    int serverportlen;
    /* Scratch buffers are allocated once and reused across calls. */
    static int lencontenttype = 0;
    static char *contenttype = NULL;
    static int lenbuffer = 0;
    static char *buffer = NULL;
    FILE *fp;
    struct MOD_Index *idx = sw->Index;
    time_t last_modified;
    char *file_prefix;  /* prefix of the files written by swishspider */

    if (!lenbuffer)
        buffer = emalloc((lenbuffer = MAXSTRLEN) + 1);
    if (!lencontenttype)
        contenttype = emalloc((lencontenttype = MAXSTRLEN) + 1);

    if ((method = url_method(url, &methodlen)) == 0) {
        return 0;
    }
    if ((serverport = url_serverport(url, &serverportlen)) == 0) {
        return 0;
    }

    /* Search for the rules
    **/
    for (server = servers; server; server = server->next) {
        if (equivalentserver(sw, url, server->baseurl)) {
            return server;
        }
    }

    /* Create a new entry for this server and add it to the list.
    **/
    server = (httpserverinfo *)emalloc(sizeof(httpserverinfo));

    /* +3 for the ://, +1 for the trailing /, +1 for the terminating null
    **/
    server->baseurl = (char *)emalloc(methodlen + serverportlen + 5);
    /* Assemble "<method>://<server:port>/" piecewise; every copy is bounded
    ** by the lengths used to size the allocation above. */
    memcpy(server->baseurl, method, methodlen);
    memcpy(server->baseurl + methodlen, "://", 3);
    memcpy(server->baseurl + methodlen + 3, serverport, serverportlen);
    strcpy(server->baseurl + methodlen + 3 + serverportlen, "/");

    server->lastretrieval = 0;
    server->robotrules = 0;
    /* parserobotstxt() is the only other place that sets this field, and it
    ** is not reached for non-http servers or failed fetches. */
    server->useragent = 0;
    server->next = servers;
    servers = server;

    /* Only http(s) servers can have full rules, all the other ones just get
    ** dummies (this is useful for holding last retrieval)
    **
    ** http://info.webcrawler.com/mak/projects/robots/norobots.html holds what
    ** many people consider the official web exclusion rules.  Unfortunately,
    ** the rules are not consistent about how records are formed.  One line
    ** states "the file consists of one or more records separated by one or more
    ** blank lines" while another states "the record starts with one or more User-agent
    ** lines, followed by one or more Disallow lines."
    **
    ** So, does a blank line after a User-agent line end a record?  The spec is
    ** unclear on this matter.  If the next legal line after the blank line is
    ** a Disallow line, the blank line should most likely be ignored.  But what
    ** if the next line is another User-agent line?  For example:
    **
    ** User-agent: MooBot
    **
    ** User-agent: CreepySpider
    ** Disallow: /cgi-bin
    **
    ** One interpretation (based on blank lines terminating records) is that MooBot
    ** may visit any location (since there are no Disallows for it).  Another
    ** interpretation (based on records needing both User-agent and Disallow lines)
    ** is that MooBot may not visit /cgi-bin
    **
    ** While poking around, I found at least one site (www.sun.com) that uses blank
    ** lines within records.  Because of that, I have decided to rely on records
    ** having both User-agent and Disallow lines (the second interpretation above).
    **/
    if (strncmp(server->baseurl, "http", 4) == 0) {
        if ((int)(strlen(server->baseurl) + 20) >= lenbuffer) {
            lenbuffer = strlen(server->baseurl) + 20 + 200;
            buffer = erealloc(buffer, lenbuffer + 1);
        }
        sprintf(buffer, "%srobots.txt", server->baseurl);

        file_prefix = emalloc(strlen(idx->tmpdir) + MAXPIDLEN + strlen("/swishspider@.contents+fill"));
        sprintf(file_prefix, "%s/swishspider@%ld", idx->tmpdir, (long) lgetpid());

        if (get(sw, contenttype, &last_modified, &server->lastretrieval, file_prefix, buffer) == 200) {
            char *robots_buffer;
            int filelen;
            int bytes_read;

            if ((int)(strlen(idx->tmpdir) + MAXPIDLEN + 30) >= lenbuffer) {
                lenbuffer = strlen(idx->tmpdir) + MAXPIDLEN + 30 + 200;
                buffer = erealloc(buffer, lenbuffer + 1);
            }
            sprintf(buffer, "%s/swishspider@%ld.contents", idx->tmpdir, (long) lgetpid());

            fp = fopen(buffer, F_READ_TEXT);
            if (fp) {
                filelen = getsize(buffer);
                if (filelen < 0)
                    filelen = 0;

                /* One spare byte: next_line() may write a terminator just
                ** past the last byte that was read. */
                robots_buffer = emalloc(filelen + 1);
                bytes_read = (int) fread(robots_buffer, 1, filelen, fp);
                robots_buffer[bytes_read] = '\0';
                fclose(fp);     /* the stream used to be left open */

                parserobotstxt(robots_buffer, bytes_read, server);

                efree(robots_buffer);
            }
            /* If the contents file could not be opened, the server simply
            ** keeps an empty rule list. */
        }
        efree(file_prefix);

        /* Remove the spider's temporary files.  The pid is cast to long to
        ** match the %ld conversion, as in the sprintf calls above. */
        cmdf(unlink, "%s/swishspider@%ld.response", idx->tmpdir, (long) lgetpid());
        cmdf(unlink, "%s/swishspider@%ld.contents", idx->tmpdir, (long) lgetpid());
        cmdf(unlink, "%s/swishspider@%ld.links", idx->tmpdir, (long) lgetpid());
    }

    return server;
}


/* Decide whether the robot rules of the URL's server forbid fetching it.
**
** Returns 1 when the server entry or the URI part cannot be obtained, or
** when the URI begins with any of the server's Disallow prefixes.
** Returns 0 otherwise.
**/
int urldisallowed(SWISH *sw, char *url)
{
    httpserverinfo *info;
    robotrules *r;
    char *path;
    int pathlen;

    info = getserverinfo(sw, url);
    if (!info)
        return 1;

    path = url_uri(url, &pathlen);
    if (!path)
        return 1;

    /* A rule matches when its disallow string is a prefix of the URI. */
    r = info->robotrules;
    while (r) {
        if (!strncmp(path, r->disallow, strlen(r->disallow)))
            return 1;
        r = r->next;
    }

    return 0;
}

// Line splitter that copes with Unix, Mac and Windows line endings.
// Pass in:
//      char **next_start == address of the scan position; updated to point just
//                           past the terminator written for the returned line.
//      char *last_char   == pointer to last char in buffer.  Buffer MUST have room
//                           for one more char, since the terminator may be written there.
//
// Runs of '\0', '\n' and '\r' are treated as separators and skipped.
// returns the start of the next line (now NUL-terminated in place),
// or NULL on no more strings

static char *next_line( char **next_start, char *last_char  )
{
    char *cursor = *next_start;
    char *line;

    // step past separators in front of the line
    for ( ; cursor <= last_char; cursor++ )
    {
        if ( *cursor != '\0' && *cursor != '\n' && *cursor != '\r' )
            break;
    }

    if ( cursor > last_char )
        return NULL;

    line = cursor;

    // advance to the first separator, or one past the end of the data
    for ( ; cursor <= last_char; cursor++ )
    {
        if ( *cursor == '\0' || *cursor == '\n' || *cursor == '\r' )
            break;
    }

    // terminate the line in place and resume after the terminator next time
    *cursor = '\0';
    *next_start = cursor + 1;

    return line;
}

/* Keywords that parserobotstxt() looks for at the start of a robots.txt
** line; the comparison there is case-insensitive.
**/
static char useragent[] = "user-agent:";
static char disallow[] = "disallow:";
/* User-agent value that parserobotstxt() treats as addressing this spider
** specifically (also compared case-insensitively).
**/
static char swishspider[] = "swishspider";

/* Parse the text of a robots.txt file into server->robotrules.
**
** robots_buffer - file contents; modified in place (lines are NUL-terminated
**                 and '#' comments cut off).  Must have one writable byte
**                 past buflen, as next_line() requires.
** buflen        - number of valid bytes in robots_buffer.
** server        - entry that receives the rules; server->useragent is reset
**                 and server->robotrules is expected to be initialised.
**
** Records addressed to "swishspider" take precedence: when one is found, any
** generic ("*") rules collected so far are discarded, and parsing stops at
** the next User-agent line that follows its Disallow lines.  Otherwise the
** rules of "*" records are collected.  Disallow lines with an empty value
** add no rule.
**/
static void parserobotstxt(char *robots_buffer, int buflen, httpserverinfo *server)
{
    char *buffer;
    char *bufend;
    char *next_start = robots_buffer;
    char *comment;

    enum {START, USERAGENT, DISALLOW} state = START;
    enum {SPECIFIC, GENERIC, SKIPPING} useragentstate = SKIPPING;
    char *p;
    int len;
    robotrules *entry;
    robotrules *entry2;

    server->useragent = 0;

    /* Nothing to parse; also avoids forming a pointer before the buffer. */
    if (buflen <= 0)
        return;

    bufend = robots_buffer + buflen - 1;  /* last char of string */

    while ( (buffer = next_line( &next_start, bufend ) ) )
    {
        /* Cut off a trailing comment, then skip lines that end up empty. */
        comment = strchr( buffer, '#' );
        if ( comment )
            *comment = '\0';

        if (*buffer == '\0')
            continue;

        if (strncasecmp(buffer, useragent, sizeof(useragent) - 1) == 0) {
            switch (state) {
            case DISALLOW:
                /* Since we found our specific user-agent, we can
                ** skip the rest of the file.
                **/
                if (useragentstate == SPECIFIC) {
                    return;
                }

                /* A new record starts: ignore it until it names us or "*". */
                useragentstate = SKIPPING;

                /* explicit fallthrough */

            case START:
            case USERAGENT:
                state = USERAGENT;

                if (useragentstate != SPECIFIC) {
                    p = isolatevalue(buffer, useragent, &len);

                    if ((len == (int)(sizeof(swishspider) - 1)) &&
                        (strncasecmp(p, swishspider, sizeof(swishspider) - 1) == 0) ) {
                        useragentstate = SPECIFIC;

                        /* We might have already parsed generic rules,
                        ** so clean them up if necessary, including the
                        ** disallow string owned by each rule.
                        */
                        if (server->useragent) {
                            efree(server->useragent);
                        }
                        for (entry = server->robotrules; entry; ) {
                            entry2 = entry->next;
                            efree(entry->disallow);
                            efree(entry);
                            entry = entry2;
                        }
                        server->robotrules = 0;

                        server->useragent = (char *)emalloc(len + 1);
                        memcpy(server->useragent, p, len);
                        *(server->useragent + len) = '\0';
                    }
                    else if ((len == 1) && (*p == '*')) {
                        useragentstate = GENERIC;
                        /* A file may contain several "*" records; do not
                        ** leak the name stored for an earlier one. */
                        if (server->useragent) {
                            efree(server->useragent);
                        }
                        server->useragent = (char *)emalloc(2);
                        strcpy(server->useragent, "*"); /* emalloc'd 2 bytes, no safestrcpy */
                    }
                }

                break;
            }
        }

        if (strncasecmp(buffer, disallow, sizeof(disallow) - 1) == 0) {
            state = DISALLOW;
            if (useragentstate != SKIPPING) {
                p = isolatevalue(buffer, disallow, &len);
                /* Only a positive length is a usable value; anything else
                ** must never reach the allocation and copy below. */
                if (len > 0) {
                    entry = (robotrules *)emalloc(sizeof(robotrules));
                    entry->next = server->robotrules;
                    server->robotrules = entry;
                    entry->disallow = (char *)emalloc(len + 1);
                    memcpy(entry->disallow, p, len);
                    *(entry->disallow + len) = '\0';
                }
            }
        }
    }
}


/* Locate the value that follows a keyword at the start of a line.
**
** line    - NUL-terminated line that begins with the keyword.
** keyword - the keyword; only its length is used, to skip past it.
** plen    - receives the length of the value with trailing whitespace
**           excluded.  Never negative; 0 when the value is empty.
**
** Returns a pointer into `line` at the first non-space character after the
** keyword.  The line itself is not modified.
**/
static char *isolatevalue(char *line, char *keyword, int *plen)
{
    int len;

    /* Find the beginning of the value
    **/
    line += strlen(keyword);
    while (isspace((int)((unsigned char)*line))) {
        line++;
    }

    /* Strip off trailing spaces.  Stop at the start of the value: without
    ** the len > 0 bound, an empty value preceded by whitespace (for example
    ** "Disallow: ") made the scan run backwards and produce a negative length.
    **/
    len = (int)strlen(line);
    while (len > 0 && isspace((int)((unsigned char)line[len - 1]))) {
        len--;
    }

    *plen = len;
    return line;
}


/* Report whether two URLs refer to the same server.
**
** Returns 1 when method and server:port of `url` and `baseurl` match
** case-insensitively, or when both URLs are found in the same list of
** sw->HTTP->equivalentservers.  Returns 0 otherwise, including when any
** method or server:port part cannot be extracted.
**/
int equivalentserver(SWISH *sw, char *url, char *baseurl)
{
    struct MOD_HTTP *http = sw->HTTP;
    struct multiswline *group;
    char *scheme, *host;
    char *basescheme, *basehost;
    int schemelen, hostlen;
    int baseschemelen, basehostlen;

    scheme = url_method(url, &schemelen);
    host = url_serverport(url, &hostlen);
    basescheme = url_method(baseurl, &baseschemelen);
    basehost = url_serverport(baseurl, &basehostlen);

    if (!(scheme && host && basescheme && basehost))
        return 0;

    /* Identical method and server:port: no need to consult the lists. */
    if (schemelen == baseschemelen && hostlen == basehostlen) {
        if (strncasecmp(scheme, basescheme, schemelen) == 0 &&
            strncasecmp(host, basehost, hostlen) == 0)
            return 1;
    }

    /* Otherwise both URLs must appear in one and the same equivalence list. */
    group = http->equivalentservers;
    while (group) {
        if (serverinlist(url, group->list) && serverinlist(baseurl, group->list))
            return 1;
        group = group->next;
    }

    return 0;
}


/* Report whether the method and server:port of `url` match those of any
** entry in `list` (lengths equal, text compared case-insensitively).
**
** Returns 1 on the first match.  Returns 0 when nothing matches or when
** the method or server:port cannot be extracted from `url`.  List entries
** whose own parts cannot be extracted are skipped.
**/
static int serverinlist(char *url, struct swline *list)
{
    struct swline *node;
    char *scheme, *host;
    char *itemscheme, *itemhost;
    int schemelen, hostlen;
    int itemschemelen, itemhostlen;

    scheme = url_method(url, &schemelen);
    host = url_serverport(url, &hostlen);
    if (scheme == NULL || host == NULL)
        return 0;

    for (node = list; node != NULL; node = node->next) {
        itemscheme = url_method(node->line, &itemschemelen);
        itemhost = url_serverport(node->line, &itemhostlen);

        if (!itemscheme || !itemhost)
            continue;
        if (schemelen != itemschemelen || hostlen != itemhostlen)
            continue;

        if (strncasecmp(scheme, itemscheme, schemelen) == 0 &&
            strncasecmp(host, itemhost, hostlen) == 0)
            return 1;
    }

    return 0;
}

1	adcroft	1.1	/*
2			$Id: httpserver.c,v 1.10 2002/08/14 22:08:48 whmoseley Exp $
3			**
4			** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
5			** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
6			**
7			** This program and library is free software; you can redistribute it and/or
8			** modify it under the terms of the GNU (Library) General Public License
9			** as published by the Free Software Foundation; either version 2
10			** of the License, or any later version.
11			**
12			** This program is distributed in the hope that it will be useful,
13			** but WITHOUT ANY WARRANTY; without even the implied warranty of
14			** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15			** GNU (Library) General Public License for more details.
16			**
17			** You should have received a copy of the GNU (Library) General Public License
18			** long with this program; if not, write to the Free Software
19			** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
20			**--------------------------------------------------------------------
21			** All the code in this file added by Ron Klachko ron@ckm.ucsf.edu 9/98
22			**
23			** change sprintf to snprintf to avoid corruption
24			** SRE 11/17/99
25			**
26			** fixed cast to int problems pointed out by "gcc -Wall"
27			** SRE 2/22/00
28			**
29			*/
30
31			/*
32			** httpserver.c
33			*/
34
35			#ifndef _WIN32
36			#include <unistd.h>
37			#endif
38
39			#include <time.h>
40			#include <stdarg.h>
41
42			#include "swish.h"
43			#include "mem.h"
44			#include "string.h"
45			#include "index.h"
46
47			#include "http.h"
48			#include "httpserver.h"
49			#include "file.h"
50
51
52			/* The list of servers that we are acting on.
53			**/
54			static httpserverinfo *servers = 0;
55
56
57			static void parserobotstxt(char robots_buffer, int buflen, httpserverinfo server);
58			static char isolatevalue(char line, char keyword, int plen);
59			static int serverinlist(char url, struct swline list);
60
61
62
63			/* Find the robot rules for this URL. If haven't retrieved them
64			** yet, do so now.
65			**/
66			httpserverinfo getserverinfo(SWISH sw, char *url)
67			{
68			httpserverinfo *server;
69			char *method;
70			int methodlen;
71			char *serverport;
72			int serverportlen;
73			static int lencontenttype=0;
74			static char *contenttype=NULL;
75			static int lenbuffer=0;
76			static char *buffer=NULL;
77			FILE *fp;
78			struct MOD_Index *idx = sw->Index;
79			time_t last_modified;
80
81			// argh, this is ugly
82			char *file_prefix; // prefix for use with files written by swishspider -- should just be on the stack!
83
84
85			if(!lenbuffer)buffer=emalloc((lenbuffer=MAXSTRLEN)+1);
86			if(!lencontenttype)contenttype=emalloc((lencontenttype=MAXSTRLEN)+1);
87
88			if ((method = url_method(url, &methodlen)) == 0) {
89			return 0;
90			}
91			if ((serverport = url_serverport(url, &serverportlen)) == 0) {
92			return 0;
93			}
94
95			/* Search for the rules
96			**/
97			for (server = servers; server; server = server->next) {
98			if (equivalentserver(sw, url, server->baseurl)) {
99			return server;
100			}
101			}
102
103			/* Create a new entry for this server and add it to the list.
104			**/
105			server = (httpserverinfo *)emalloc(sizeof(httpserverinfo));
106
107			/* +3 for the ://, +1 for the trailing /, +1 for the terminating null
108			**/
109			server->baseurl = (char *)emalloc(methodlen + serverportlen + 5);
110			/* These 4 lines to avoid a call to non ANSI snprintf . May not be the
111			best way but it ensures no buffer overruns */
112			memcpy (server->baseurl,method,methodlen);
113			memcpy (server->baseurl+methodlen,"://",3);
114			memcpy (server->baseurl+methodlen+3,serverport,serverportlen);
115			strcpy (server->baseurl+methodlen+3+serverportlen,"/");
116
117			server->lastretrieval = 0;
118			server->robotrules = 0;
119			server->next = servers;
120			servers = server;
121
122			/* Only http(s) servers can full rules, all the other ones just get dummies
123			** (this is useful for holding last retrieval)
124			**
125			** http://info.webcrawler.com/mak/projects/robots/norobots.html holds what
126			** many people consider the official web exclusion rules. Unfortunately,
127			** the rules are not consistent about how records are formed. One line
128			** states "the file consists of one or more records separated by one or more
129			** blank lines" while another states "the record starts with one or more User-agent
130			** lines, followed by one or more Disallow lines."
131			**
132			** So, does a blank line after a User-agent line end a record? The spec is
133			** unclear on this matter. If the next legal line afer the blank line is
134			** a Disallow line, the blank line should most likely be ignored. But what
135			** if the next line is another User-agent line? For example:
136			**
137			** User-agent: MooBot
138			**
139			** User-agent: CreepySpider
140			** Disallow: /cgi-bin
141			**
142			** One interpretation (based on blank lines termination records) is that MooBot
143			** may visit any location (since there are no Disallows for it). Another
144			** interpretation (based on records needing both User-agent and Disallow lines)
145			** is that MooBot may not visit /cgi-bin
146			**
147			** While poking around, I found at least one site (www.sun.com) that uses blank
148			** lines within records. Because of that, I have decided to rely on records
149			** having both User-agent and Disallow lines (the second interpretation above).
150			**/
151			if (strncmp(server->baseurl, "http", 4) == 0) {
152			if((int)(strlen(server->baseurl)+20)>=lenbuffer) {
153			lenbuffer=strlen(server->baseurl)+20+200;
154			buffer=erealloc(buffer,lenbuffer+1);
155			}
156			sprintf(buffer, "%srobots.txt", server->baseurl);
157
158
159
160			file_prefix = emalloc( strlen(idx->tmpdir) + MAXPIDLEN + strlen("/swishspider@.contents+fill") );
161			sprintf(file_prefix, "%s/swishspider@%ld", idx->tmpdir, (long) lgetpid());
162
163
164			if (get(sw,contenttype, &last_modified, &server->lastretrieval, file_prefix, buffer) == 200)
165			{
166			char *robots_buffer;
167			int filelen;
168			int bytes_read;
169
170			if((int)(strlen(idx->tmpdir)+MAXPIDLEN+30)>=lenbuffer) {
171			lenbuffer=strlen(idx->tmpdir)+MAXPIDLEN+30+200;
172			buffer=erealloc(buffer,lenbuffer+1);
173			}
174			sprintf(buffer, "%s/swishspider@%ld.contents", idx->tmpdir, (long)lgetpid());
175			fp = fopen(buffer, F_READ_TEXT);
176
177			filelen = getsize(buffer);
178
179			robots_buffer = emalloc( filelen + 1 );
180			*robots_buffer = '\0';
181			bytes_read = fread(robots_buffer, 1, filelen, fp);
182			robots_buffer[bytes_read] = '\0';
183			parserobotstxt( robots_buffer, bytes_read, server );
184
185			efree( robots_buffer );
186
187			//parserobotstxt(fp, server);
188			//fclose(fp);
189			}
190			efree( file_prefix );
191
192			cmdf(unlink, "%s/swishspider@%ld.response", idx->tmpdir, lgetpid());
193			cmdf(unlink, "%s/swishspider@%ld.contents", idx->tmpdir, lgetpid());
194			cmdf(unlink, "%s/swishspider@%ld.links", idx->tmpdir, lgetpid());
195			}
196
197			return server;
198			}
199
200
201			int urldisallowed(SWISH sw, char url)
202			{
203			httpserverinfo *server;
204			robotrules *rule;
205			char *uri;
206			int urilen;
207
208			if ((server = getserverinfo(sw, url)) == 0) {
209			return 1;
210			}
211			if ((uri = url_uri(url, &urilen)) == 0) {
212			return 1;
213			}
214
215			for (rule = server->robotrules; rule; rule = rule->next) {
216			if (strncmp(uri, rule->disallow, strlen(rule->disallow)) == 0) {
217			return 1;
218			}
219			}
220
221			return 0;
222			}
223
224			// quick fix to parse from Mac and Windows.
225			// Pass in:
226			// char *next_start == pointer to a char that has where the next string starts.
227			// char *last_char == pointer to last char in buffer. Buffer MUST have room for one more char
228			//
229			// returns NULL on no more strings
230
231			static char next_line( char next_start, char last_char )
232			{
233			char buffer = next_start;
234			char *start;
235
236
237			// skip over any leading new lines or cr.
238			while ( buffer <= last_char && ( buffer == '\0' \|\| buffer == '\n' \|\| *buffer == '\r' ) )
239			buffer++;
240
241			if ( buffer > last_char )
242			return NULL;
243
244			start = buffer; // start of this word
245
246			// Now find the end of this string
247			while ( buffer <= last_char && ( buffer != '\0' && buffer != '\n' && *buffer != '\r' ) )
248			buffer++;
249
250			*buffer = '\0'; // mark the end of the string
251
252			buffer++;
253			*next_start = buffer;
254
255			return start;
256			}
257
258			static char useragent[] = "user-agent:";
259			static char disallow[] = "disallow:";
260			static char swishspider[] = "swishspider";
261
262			static void parserobotstxt(char robots_buffer, int buflen, httpserverinfo server)
263			{
264			char *buffer;
265			char *bufend = robots_buffer + buflen -1; // last char of string
266			char *next_start = robots_buffer;
267
268			enum {START, USERAGENT, DISALLOW} state = START;
269			enum {SPECIFIC, GENERIC, SKIPPING} useragentstate = SKIPPING;
270			char *p;
271			int len;
272			robotrules *entry;
273			robotrules *entry2;
274
275			server->useragent = 0;
276
277			buffer = NULL;
278
279			while ( (buffer = next_line( &next_start, bufend ) ) )
280			{
281			if ( strchr( buffer, '#' ) )
282			*(strchr( buffer, '#' )) = '\0';
283
284			if ((buffer == '#') \|\| (buffer == '\0'))
285			continue;
286
287
288			if (strncasecmp(buffer, useragent, sizeof(useragent) - 1) == 0) {
289			switch (state) {
290			case DISALLOW:
291			/* Since we found our specific user-agent, we can
292			** skip the rest of the file.
293			**/
294			if (useragentstate == SPECIFIC) {
295			return;
296			}
297
298			useragentstate = SKIPPING;
299
300			/* explict fallthrough */
301
302			case START:
303			case USERAGENT:
304			state = USERAGENT;
305
306			if (useragentstate != SPECIFIC) {
307			p = isolatevalue(buffer, useragent, &len);
308
309			if ((len == (sizeof(swishspider) - 1)) &&
310			(strncasecmp(p, swishspider, sizeof(swishspider) - 1) == 0) ) {
311			useragentstate = SPECIFIC;
312
313			/* We might have already parsed generic rules,
314			** so clean them up if necessary.
315			*/
316			if (server->useragent) {
317			efree(server->useragent);
318			}
319			for (entry = server->robotrules; entry; ) {
320			entry2 = entry->next;
321			efree(entry);
322			entry = entry2;
323			}
324			server->robotrules = 0;
325
326			server->useragent = (char *)emalloc(len + 1);
327			strncpy(server->useragent, p, len);
328			*(server->useragent + len) = '\0';
329
330			}
331			else if ((len == 1) && (p == '')) {
332			useragentstate = GENERIC;
333			server->useragent = (char *)emalloc(2);
334			strcpy(server->useragent, ""); / emalloc'd 2 bytes, no safestrcpy */
335			}
336
337			}
338
339
340			break;
341
342			}
343			}
344
345			if (strncasecmp(buffer, disallow, sizeof(disallow) - 1) == 0) {
346			state = DISALLOW;
347			if (useragentstate != SKIPPING) {
348			p = isolatevalue(buffer, disallow, &len);
349			if (len) {
350			entry = (robotrules *)emalloc(sizeof(robotrules));
351			entry->next = server->robotrules;
352			server->robotrules = entry;
353			entry->disallow = (char *)emalloc(len + 1);
354			strncpy(entry->disallow, p, len);
355			*(entry->disallow + len) = '\0';
356			}
357			}
358			}
359			}
360			}
361
362
363			static char isolatevalue(char line, char keyword, int plen)
364			{
365			/* Find the beginning of the value
366			**/
367			for (line += strlen(keyword); isspace((int)((unsigned char)line)); line++ ) { / cast to int 2/22/00 */
368			}
369
370			/* Strip off trailing spaces
371			**/
372			for (plen = strlen(line); isspace((int)((unsigned char)(line + plen - 1))); (plen)--) { /* cast to int 2/22/00 */
373			}
374
375			return line;
376			}
377
378
379			int equivalentserver(SWISH sw, char url, char *baseurl)
380			{
381			char *method;
382			int methodlen;
383			char *serverport;
384			int serverportlen;
385			char *basemethod;
386			int basemethodlen;
387			char *baseserverport;
388			int baseserverportlen;
389			struct multiswline *walk=NULL;
390			struct MOD_HTTP *http = sw->HTTP;
391
392			method = url_method(url, &methodlen);
393			serverport = url_serverport(url, &serverportlen);
394			basemethod = url_method(baseurl, &basemethodlen);
395			baseserverport = url_serverport(baseurl, &baseserverportlen);
396
397			if (!method \|\| !serverport \|\| !basemethod \|\| !baseserverport) {
398			return 0;
399			}
400
401			/* If this is the same server, we just go for it
402			**/
403			if ((methodlen == basemethodlen) && (serverportlen == baseserverportlen) &&
404			(strncasecmp(method, basemethod, methodlen) == 0) &&
405			(strncasecmp(serverport, baseserverport, serverportlen) == 0)) {
406			return 1;
407			}
408
409			/* Do we find the method/server info for this and the base url
410			** in the same equivalence list?
411			**/
412			for (walk = http->equivalentservers; walk; walk = walk->next ) {
413			if (serverinlist(url, walk->list) &&
414			serverinlist(baseurl, walk->list)) {
415			return 1;
416			}
417			}
418
419			return 0;
420			}
421
422
423			static int serverinlist(char url, struct swline list)
424			{
425			char *method;
426			int methodlen;
427			char *serverport;
428			int serverportlen;
429			char *listmethod;
430			int listmethodlen;
431			char *listserverport;
432			int listserverportlen;
433
434			method = url_method(url, &methodlen);
435			serverport = url_serverport(url, &serverportlen);
436			if (!method \|\| !serverport) {
437			return 0;
438			}
439
440			for ( ; list; list = list->next) {
441			listmethod = url_method(list->line, &listmethodlen);
442			listserverport = url_serverport(list->line, &listserverportlen);
443			if (listmethod && listserverport) {
444			if ((methodlen == listmethodlen) && (serverportlen == listserverportlen) &&
445			(strncasecmp(method, listmethod, methodlen) == 0) &&
446			(strncasecmp(serverport, listserverport, serverportlen) == 0)) {
447			return 1;
448			}
449			}
450			}
451			return 0;
452			}
453