/* swish-e/src/httpserver.c */

/*
$Id: httpserver.c,v 1.10 2002/08/14 22:08:48 whmoseley Exp $
**
** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
**
** This program and library is free software; you can redistribute it and/or
** modify it under the terms of the GNU (Library) General Public License
** as published by the Free Software Foundation; either version 2
** of the License, or any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU (Library) General Public License for more details.
**
** You should have received a copy of the GNU (Library) General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
**--------------------------------------------------------------------
** All the code in this file added by Ron Klachko ron@ckm.ucsf.edu 9/98
**
** change sprintf to snprintf to avoid corruption
** SRE 11/17/99
**
** fixed cast to int problems pointed out by "gcc -Wall"
** SRE 2/22/00
** 
*/

/*
** httpserver.c
*/

#ifndef _WIN32
#include <unistd.h>
#endif

#include <time.h>
#include <stdarg.h>

#include "swish.h"
#include "mem.h"
#include "string.h"
#include "index.h"

#include "http.h"
#include "httpserver.h"
#include "file.h"


/* The list of servers that we are acting on.
** Singly linked through ->next; getserverinfo() pushes each newly seen
** server onto the front of this list and searches it on later calls.
**/
static httpserverinfo *servers = 0;


/* Forward declarations of the file-local helpers defined further down. */
static void parserobotstxt(char *robots_buffer, int buflen, httpserverinfo *server);
static char *isolatevalue(char *line, char *keyword, int *plen);
static int serverinlist(char *url, struct swline *list);


/* Find the robot rules for this URL.  If we haven't retrieved them
** yet, do so now.
**
** sw   - the SWISH handle; sw->Index supplies the temporary directory.
** url  - URL whose server entry is wanted.
**
** Returns the (cached or newly created) httpserverinfo for the URL's
** server, or 0 when the method or server:port part cannot be extracted
** from the URL.  New entries are pushed onto the file-level `servers`
** list, so each server's robots.txt is requested only once.
**/
httpserverinfo *getserverinfo(SWISH *sw, char *url)
{
    httpserverinfo *server;
    char *method;
    int methodlen;
    char *serverport;
    int serverportlen;
    static int lencontenttype=0;
    static char *contenttype=NULL;
    static int lenbuffer=0;
    static char *buffer=NULL;
    FILE *fp;
    struct MOD_Index *idx = sw->Index;
    time_t  last_modified;

    /* prefix for use with files written by swishspider */
    char   *file_prefix;


    /* The scratch buffers are allocated once and reused across calls. */
    if(!lenbuffer)buffer=emalloc((lenbuffer=MAXSTRLEN)+1);
    if(!lencontenttype)contenttype=emalloc((lencontenttype=MAXSTRLEN)+1);

    if ((method = url_method(url, &methodlen)) == 0) {
        return 0;
    }
    if ((serverport = url_serverport(url, &serverportlen)) == 0) {
        return 0;
    }

    /* Search for the rules
    **/
    for (server = servers; server; server = server->next) {
        if (equivalentserver(sw, url, server->baseurl)) {
            return server;
        }
    }

    /* Create a new entry for this server and add it to the list.
    **/
    server = (httpserverinfo *)emalloc(sizeof(httpserverinfo));

    /* +3 for the ://, +1 for the trailing /, +1 for the terminating null
    **/
    server->baseurl = (char *)emalloc(methodlen + serverportlen + 5);
    /* These 4 lines avoid a call to the non-ANSI snprintf.  May not be the
    ** best way but it ensures no buffer overruns. */
    memcpy (server->baseurl,method,methodlen);
    memcpy (server->baseurl+methodlen,"://",3);
    memcpy (server->baseurl+methodlen+3,serverport,serverportlen);
    strcpy (server->baseurl+methodlen+3+serverportlen,"/");

    server->lastretrieval = 0;
    server->robotrules = 0;
    /* parserobotstxt() only runs for http servers whose robots.txt was
    ** fetched, so give useragent a defined value for every other entry. */
    server->useragent = 0;
    server->next = servers;
    servers = server;

    /* Only http(s) servers can have full rules, all the other ones just get
    ** dummies (this is useful for holding last retrieval)
    **
    ** http://info.webcrawler.com/mak/projects/robots/norobots.html holds what
    ** many people consider the official web exclusion rules.  Unfortunately,
    ** the rules are not consistent about how records are formed.  One line
    ** states "the file consists of one or more records separated by one or more
    ** blank lines" while another states "the record starts with one or more User-agent
    ** lines, followed by one or more Disallow lines."
    **
    ** So, does a blank line after a User-agent line end a record?  The spec is
    ** unclear on this matter.  If the next legal line after the blank line is
    ** a Disallow line, the blank line should most likely be ignored.  But what
    ** if the next line is another User-agent line?  For example:
    **
    ** User-agent: MooBot
    **
    ** User-agent: CreepySpider
    ** Disallow: /cgi-bin
    **
    ** One interpretation (based on blank lines terminating records) is that MooBot
    ** may visit any location (since there are no Disallows for it).  Another
    ** interpretation (based on records needing both User-agent and Disallow lines)
    ** is that MooBot may not visit /cgi-bin
    **
    ** While poking around, I found at least one site (www.sun.com) that uses blank
    ** lines within records.  Because of that, I have decided to rely on records
    ** having both User-agent and Disallow lines (the second interpretation above).
    **/
    if (strncmp(server->baseurl, "http", 4) == 0) {
        if((int)(strlen(server->baseurl)+20)>=lenbuffer) {
            lenbuffer=strlen(server->baseurl)+20+200;
            buffer=erealloc(buffer,lenbuffer+1);
        }
        sprintf(buffer, "%srobots.txt", server->baseurl);

        file_prefix = emalloc( strlen(idx->tmpdir) + MAXPIDLEN + strlen("/swishspider@.contents+fill") );
        sprintf(file_prefix, "%s/swishspider@%ld", idx->tmpdir, (long) lgetpid());

        /* 200 is presumably the HTTP status reported by get() -- the body is
        ** then expected in "<file_prefix>.contents". */
        if (get(sw,contenttype, &last_modified, &server->lastretrieval, file_prefix, buffer) == 200)
        {
            char   *robots_buffer;
            int     filelen;
            int     bytes_read;

            if((int)(strlen(idx->tmpdir)+MAXPIDLEN+30)>=lenbuffer) {
                lenbuffer=strlen(idx->tmpdir)+MAXPIDLEN+30+200;
                buffer=erealloc(buffer,lenbuffer+1);
            }
            sprintf(buffer, "%s/swishspider@%ld.contents", idx->tmpdir, (long)lgetpid());

            /* Only read when the file really opened; a missing contents
            ** file simply leaves this server without rules. */
            fp = fopen(buffer, F_READ_TEXT);
            if (fp) {
                filelen = getsize(buffer);
                /* NOTE(review): getsize() is assumed to signal failure with a
                ** negative value; never size a buffer from that. */
                if (filelen < 0)
                    filelen = 0;

                /* +1: next_line() needs room for one terminator past the data */
                robots_buffer = emalloc( filelen + 1 );
                *robots_buffer = '\0';
                bytes_read = fread(robots_buffer, 1, filelen, fp);
                fclose(fp);   /* previously leaked one stream per server */

                robots_buffer[bytes_read] = '\0';
                parserobotstxt( robots_buffer, bytes_read, server );

                efree( robots_buffer );
            }
        }
        efree( file_prefix );

        /* Remove the temporary files whether or not the fetch succeeded. */
        cmdf(unlink, "%s/swishspider@%ld.response", idx->tmpdir, (long)lgetpid());
        cmdf(unlink, "%s/swishspider@%ld.contents", idx->tmpdir, (long)lgetpid());
        cmdf(unlink, "%s/swishspider@%ld.links", idx->tmpdir, (long)lgetpid());
    }

    return server;
}


/* Decide whether the robots rules recorded for url's server forbid it.
**
** Returns 1 (disallowed) when no server entry can be obtained, when
** url_uri() yields nothing for the URL, or when the URI begins with any
** stored Disallow prefix.  Returns 0 otherwise.
**/
int urldisallowed(SWISH *sw, char *url)
{
    httpserverinfo *info = getserverinfo(sw, url);
    robotrules *r;
    char *path;
    int pathlen;

    if (!info)
        return 1;

    path = url_uri(url, &pathlen);
    if (!path)
        return 1;

    /* A rule matches when it is a leading prefix of the URI. */
    r = info->robotrules;
    while (r) {
        size_t prefixlen = strlen(r->disallow);

        if (!strncmp(path, r->disallow, prefixlen))
            return 1;
        r = r->next;
    }

    return 0;
}

// Line splitter that copes with Unix (\n), Mac (\r) and Windows (\r\n)
// line endings by treating any run of '\n', '\r' or '\0' as a separator.
// Pass in:
//      char **next_start == address of the cursor; on success it is advanced
//                           to just past the terminator written for this line.
//      char *last_char   == pointer to last char in buffer.  Buffer MUST have
//                           room for one more char, because the terminator may
//                           be written at last_char + 1.
//
// Returns the start of the next (now NUL-terminated) line, or NULL when no
// more strings remain; *next_start is left unchanged in that case.

static char *next_line( char **next_start, char *last_char  )
{
    char *cur = *next_start;
    char *line;

    // step over separator characters in front of the line
    for ( ; cur <= last_char; cur++ )
        if ( *cur != '\0' && *cur != '\n' && *cur != '\r' )
            break;

    if ( cur > last_char )
        return NULL;

    line = cur;

    // advance to the first separator, or one past the end of the data
    for ( ; cur <= last_char; cur++ )
        if ( *cur == '\0' || *cur == '\n' || *cur == '\r' )
            break;

    *cur = '\0';            // terminate this line in place
    *next_start = cur + 1;  // resume after the terminator next time

    return line;
}

/* Field names recognised in robots.txt (compared case-insensitively) and
** the user-agent name whose records take priority over "*" records. */
static char useragent[] = "user-agent:";
static char disallow[] = "disallow:";
static char swishspider[] = "swishspider";

/* Parse an in-memory robots.txt and record the applicable Disallow rules.
**
** robots_buffer - file contents; modified in place (lines and comments are
**                 NUL-terminated) and must have one spare byte after buflen.
** buflen        - number of data bytes in robots_buffer.
** server        - entry to fill in: ->useragent receives the matched agent
**                 name ("*" or the swishspider name as written) or 0, and
**                 each non-empty Disallow value of the chosen record(s) is
**                 pushed onto ->robotrules.
**
** Records for "*" are collected until a record naming swishspider is seen;
** at that point the generic rules are discarded and only the specific
** record is kept.  Parsing stops at the first User-agent line that follows
** the specific record's Disallow lines.
**/
static void parserobotstxt(char *robots_buffer, int buflen, httpserverinfo *server)
{
    char *buffer;
    char *bufend = robots_buffer + buflen -1;  // last char of string
    char *next_start = robots_buffer;

    enum {START, USERAGENT, DISALLOW} state = START;
    enum {SPECIFIC, GENERIC, SKIPPING} useragentstate = SKIPPING;
    char *p;
    char *comment;
    int len;
    robotrules *entry;
    robotrules *entry2;

    server->useragent = 0;

    while ( (buffer = next_line( &next_start, bufend ) ) )
    {
        /* Cut the line at the first '#'; a line that was only a comment
        ** becomes empty and is skipped. */
        comment = strchr( buffer, '#' );
        if ( comment )
            *comment = '\0';

        if (*buffer == '\0')
            continue;


        if (strncasecmp(buffer, useragent, sizeof(useragent) - 1) == 0) {
            switch (state) {
            case DISALLOW:
                /* Since we found our specific user-agent, we can
                ** skip the rest of the file.
                **/
                if (useragentstate == SPECIFIC) {
                    return;
                }

                /* A User-agent line after Disallow lines starts a new record. */
                useragentstate = SKIPPING;

                /* explicit fallthrough */

            case START:
            case USERAGENT:
                state = USERAGENT;

                if (useragentstate != SPECIFIC) {
                    p = isolatevalue(buffer, useragent, &len);

                    if ((len == (int)(sizeof(swishspider) - 1)) &&
                        (strncasecmp(p, swishspider, sizeof(swishspider) - 1) == 0) ) {
                        useragentstate = SPECIFIC;

                        /* We might have already parsed generic rules,
                        ** so clean them up if necessary.
                        */
                        if (server->useragent) {
                            efree(server->useragent);
                        }
                        for (entry = server->robotrules; entry; ) {
                            entry2 = entry->next;
                            /* the path string is owned by the rule node */
                            efree(entry->disallow);
                            efree(entry);
                            entry = entry2;
                        }
                        server->robotrules = 0;

                        server->useragent = (char *)emalloc(len + 1);
                        memcpy(server->useragent, p, len);
                        *(server->useragent + len) = '\0';

                    }
                    else if ((len == 1) && (*p == '*')) {
                        useragentstate = GENERIC;
                        /* a file may hold several "*" records; don't leak
                        ** the name stored for an earlier one */
                        if (server->useragent) {
                            efree(server->useragent);
                        }
                        server->useragent = (char *)emalloc(2);
                        strcpy(server->useragent, "*"); /* emalloc'd 2 bytes, no safestrcpy */
                    }

                }

                break;

            }
        }

        if (strncasecmp(buffer, disallow, sizeof(disallow) - 1) == 0) {
            state = DISALLOW;
            if (useragentstate != SKIPPING) {
                p = isolatevalue(buffer, disallow, &len);
                /* An empty Disallow adds no rule.  "> 0" rather than
                ** "!= 0" so a bogus negative length can never reach
                ** emalloc/memcpy. */
                if (len > 0) {
                    entry = (robotrules *)emalloc(sizeof(robotrules));
                    entry->next = server->robotrules;
                    server->robotrules = entry;
                    entry->disallow = (char *)emalloc(len + 1);
                    memcpy(entry->disallow, p, len);
                    *(entry->disallow + len) = '\0';
                }
            }
        }
    }
}


/* Locate the value that follows a "keyword:" field on a robots.txt line.
**
** line    - NUL-terminated line that begins with keyword.
** keyword - the field name; only its length is used, to skip past it.
** plen    - out: length of the value with trailing whitespace excluded;
**           never negative, 0 when the value is empty or all whitespace.
**
** Returns a pointer into line at the first non-whitespace character after
** the keyword.  The line itself is not modified, so the value is NOT
** NUL-terminated at *plen.
**/
static char *isolatevalue(char *line, char *keyword, int *plen)
{
    int len;

    /* Find the beginning of the value
    **/
    line += strlen(keyword);
    while (isspace((int)((unsigned char)*line))) { /* cast to int 2/22/00 */
        line++;
    }

    /* Strip off trailing spaces.  The len > 0 bound matters: for a value
    ** that is only whitespace, line now points at the terminator and the
    ** characters *before* it are spaces, so an unbounded scan would walk
    ** backwards past the start and yield a negative length.
    **/
    len = (int)strlen(line);
    while (len > 0 && isspace((int)((unsigned char)line[len - 1]))) {
        len--;
    }

    *plen = len;
    return line;
}


/* Report whether url and baseurl refer to the same server.
**
** Returns 1 when both URLs have the same method and server:port
** (compared case-insensitively), or when both are found in one of the
** lists in sw->HTTP->equivalentservers.  Returns 0 otherwise, including
** when the method or server:port cannot be extracted from either URL.
**/
int equivalentserver(SWISH *sw, char *url, char *baseurl)
{
    struct MOD_HTTP *http = sw->HTTP;
    struct multiswline *group;
    int urlmethodlen;
    int urlhostlen;
    int basemethodlen;
    int basehostlen;
    char *urlmethod = url_method(url, &urlmethodlen);
    char *urlhost = url_serverport(url, &urlhostlen);
    char *basemethod = url_method(baseurl, &basemethodlen);
    char *basehost = url_serverport(baseurl, &basehostlen);

    if (!(urlmethod && urlhost && basemethod && basehost))
        return 0;

    /* Identical method and server:port -- same server, nothing more to do. */
    if (urlmethodlen == basemethodlen
        && urlhostlen == basehostlen
        && !strncasecmp(urlmethod, basemethod, urlmethodlen)
        && !strncasecmp(urlhost, basehost, urlhostlen))
        return 1;

    /* Otherwise both URLs must appear together in one equivalence list. */
    group = http->equivalentservers;
    while (group) {
        if (serverinlist(url, group->list) && serverinlist(baseurl, group->list))
            return 1;
        group = group->next;
    }

    return 0;
}


/* Report whether url's method and server:port match any entry of list.
**
** Each list->line is itself split with url_method()/url_serverport();
** entries for which either part is missing are ignored.  Comparison is
** case-insensitive and requires equal lengths.  Returns 1 on the first
** match, 0 when there is none or when url itself cannot be split.
**/
static int serverinlist(char *url, struct swline *list)
{
    int wantmethodlen;
    int wanthostlen;
    char *wantmethod = url_method(url, &wantmethodlen);
    char *wanthost = url_serverport(url, &wanthostlen);
    struct swline *item;

    if (!(wantmethod && wanthost))
        return 0;

    item = list;
    while (item) {
        int itemmethodlen;
        int itemhostlen;
        char *itemmethod = url_method(item->line, &itemmethodlen);
        char *itemhost = url_serverport(item->line, &itemhostlen);

        if (itemmethod && itemhost
            && wantmethodlen == itemmethodlen
            && wanthostlen == itemhostlen
            && !strncasecmp(wantmethod, itemmethod, wantmethodlen)
            && !strncasecmp(wanthost, itemhost, wanthostlen))
            return 1;

        item = item->next;
    }

    return 0;
}

1	/*
2	$Id: httpserver.c,v 1.10 2002/08/14 22:08:48 whmoseley Exp $
3	**
4	** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
5	** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
6	**
7	** This program and library is free software; you can redistribute it and/or
8	** modify it under the terms of the GNU (Library) General Public License
9	** as published by the Free Software Foundation; either version 2
10	** of the License, or any later version.
11	**
12	** This program is distributed in the hope that it will be useful,
13	** but WITHOUT ANY WARRANTY; without even the implied warranty of
14	** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	** GNU (Library) General Public License for more details.
16	**
17	** You should have received a copy of the GNU (Library) General Public License
18	** long with this program; if not, write to the Free Software
19	** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
20	**--------------------------------------------------------------------
21	** All the code in this file added by Ron Klachko ron@ckm.ucsf.edu 9/98
22	**
23	** change sprintf to snprintf to avoid corruption
24	** SRE 11/17/99
25	**
26	** fixed cast to int problems pointed out by "gcc -Wall"
27	** SRE 2/22/00
28	**
29	*/
30
31	/*
32	** httpserver.c
33	*/
34
35	#ifndef _WIN32
36	#include <unistd.h>
37	#endif
38
39	#include <time.h>
40	#include <stdarg.h>
41
42	#include "swish.h"
43	#include "mem.h"
44	#include "string.h"
45	#include "index.h"
46
47	#include "http.h"
48	#include "httpserver.h"
49	#include "file.h"
50
51
52	/* The list of servers that we are acting on.
53	**/
54	static httpserverinfo *servers = 0;
55
56
57	static void parserobotstxt(char robots_buffer, int buflen, httpserverinfo server);
58	static char isolatevalue(char line, char keyword, int plen);
59	static int serverinlist(char url, struct swline list);
60
61
62
63	/* Find the robot rules for this URL. If haven't retrieved them
64	** yet, do so now.
65	**/
66	httpserverinfo getserverinfo(SWISH sw, char *url)
67	{
68	httpserverinfo *server;
69	char *method;
70	int methodlen;
71	char *serverport;
72	int serverportlen;
73	static int lencontenttype=0;
74	static char *contenttype=NULL;
75	static int lenbuffer=0;
76	static char *buffer=NULL;
77	FILE *fp;
78	struct MOD_Index *idx = sw->Index;
79	time_t last_modified;
80
81	// argh, this is ugly
82	char *file_prefix; // prefix for use with files written by swishspider -- should just be on the stack!
83
84
85	if(!lenbuffer)buffer=emalloc((lenbuffer=MAXSTRLEN)+1);
86	if(!lencontenttype)contenttype=emalloc((lencontenttype=MAXSTRLEN)+1);
87
88	if ((method = url_method(url, &methodlen)) == 0) {
89	return 0;
90	}
91	if ((serverport = url_serverport(url, &serverportlen)) == 0) {
92	return 0;
93	}
94
95	/* Search for the rules
96	**/
97	for (server = servers; server; server = server->next) {
98	if (equivalentserver(sw, url, server->baseurl)) {
99	return server;
100	}
101	}
102
103	/* Create a new entry for this server and add it to the list.
104	**/
105	server = (httpserverinfo *)emalloc(sizeof(httpserverinfo));
106
107	/* +3 for the ://, +1 for the trailing /, +1 for the terminating null
108	**/
109	server->baseurl = (char *)emalloc(methodlen + serverportlen + 5);
110	/* These 4 lines to avoid a call to non ANSI snprintf . May not be the
111	best way but it ensures no buffer overruns */
112	memcpy (server->baseurl,method,methodlen);
113	memcpy (server->baseurl+methodlen,"://",3);
114	memcpy (server->baseurl+methodlen+3,serverport,serverportlen);
115	strcpy (server->baseurl+methodlen+3+serverportlen,"/");
116
117	server->lastretrieval = 0;
118	server->robotrules = 0;
119	server->next = servers;
120	servers = server;
121
122	/* Only http(s) servers can full rules, all the other ones just get dummies
123	** (this is useful for holding last retrieval)
124	**
125	** http://info.webcrawler.com/mak/projects/robots/norobots.html holds what
126	** many people consider the official web exclusion rules. Unfortunately,
127	** the rules are not consistent about how records are formed. One line
128	** states "the file consists of one or more records separated by one or more
129	** blank lines" while another states "the record starts with one or more User-agent
130	** lines, followed by one or more Disallow lines."
131	**
132	** So, does a blank line after a User-agent line end a record? The spec is
133	** unclear on this matter. If the next legal line afer the blank line is
134	** a Disallow line, the blank line should most likely be ignored. But what
135	** if the next line is another User-agent line? For example:
136	**
137	** User-agent: MooBot
138	**
139	** User-agent: CreepySpider
140	** Disallow: /cgi-bin
141	**
142	** One interpretation (based on blank lines termination records) is that MooBot
143	** may visit any location (since there are no Disallows for it). Another
144	** interpretation (based on records needing both User-agent and Disallow lines)
145	** is that MooBot may not visit /cgi-bin
146	**
147	** While poking around, I found at least one site (www.sun.com) that uses blank
148	** lines within records. Because of that, I have decided to rely on records
149	** having both User-agent and Disallow lines (the second interpretation above).
150	**/
151	if (strncmp(server->baseurl, "http", 4) == 0) {
152	if((int)(strlen(server->baseurl)+20)>=lenbuffer) {
153	lenbuffer=strlen(server->baseurl)+20+200;
154	buffer=erealloc(buffer,lenbuffer+1);
155	}
156	sprintf(buffer, "%srobots.txt", server->baseurl);
157
158
159
160	file_prefix = emalloc( strlen(idx->tmpdir) + MAXPIDLEN + strlen("/swishspider@.contents+fill") );
161	sprintf(file_prefix, "%s/swishspider@%ld", idx->tmpdir, (long) lgetpid());
162
163
164	if (get(sw,contenttype, &last_modified, &server->lastretrieval, file_prefix, buffer) == 200)
165	{
166	char *robots_buffer;
167	int filelen;
168	int bytes_read;
169
170	if((int)(strlen(idx->tmpdir)+MAXPIDLEN+30)>=lenbuffer) {
171	lenbuffer=strlen(idx->tmpdir)+MAXPIDLEN+30+200;
172	buffer=erealloc(buffer,lenbuffer+1);
173	}
174	sprintf(buffer, "%s/swishspider@%ld.contents", idx->tmpdir, (long)lgetpid());
175	fp = fopen(buffer, F_READ_TEXT);
176
177	filelen = getsize(buffer);
178
179	robots_buffer = emalloc( filelen + 1 );
180	*robots_buffer = '\0';
181	bytes_read = fread(robots_buffer, 1, filelen, fp);
182	robots_buffer[bytes_read] = '\0';
183	parserobotstxt( robots_buffer, bytes_read, server );
184
185	efree( robots_buffer );
186
187	//parserobotstxt(fp, server);
188	//fclose(fp);
189	}
190	efree( file_prefix );
191
192	cmdf(unlink, "%s/swishspider@%ld.response", idx->tmpdir, lgetpid());
193	cmdf(unlink, "%s/swishspider@%ld.contents", idx->tmpdir, lgetpid());
194	cmdf(unlink, "%s/swishspider@%ld.links", idx->tmpdir, lgetpid());
195	}
196
197	return server;
198	}
199
200
201	int urldisallowed(SWISH sw, char url)
202	{
203	httpserverinfo *server;
204	robotrules *rule;
205	char *uri;
206	int urilen;
207
208	if ((server = getserverinfo(sw, url)) == 0) {
209	return 1;
210	}
211	if ((uri = url_uri(url, &urilen)) == 0) {
212	return 1;
213	}
214
215	for (rule = server->robotrules; rule; rule = rule->next) {
216	if (strncmp(uri, rule->disallow, strlen(rule->disallow)) == 0) {
217	return 1;
218	}
219	}
220
221	return 0;
222	}
223
224	// quick fix to parse from Mac and Windows.
225	// Pass in:
226	// char *next_start == pointer to a char that has where the next string starts.
227	// char *last_char == pointer to last char in buffer. Buffer MUST have room for one more char
228	//
229	// returns NULL on no more strings
230
231	static char next_line( char next_start, char last_char )
232	{
233	char buffer = next_start;
234	char *start;
235
236
237	// skip over any leading new lines or cr.
238	while ( buffer <= last_char && ( buffer == '\0' \|\| buffer == '\n' \|\| *buffer == '\r' ) )
239	buffer++;
240
241	if ( buffer > last_char )
242	return NULL;
243
244	start = buffer; // start of this word
245
246	// Now find the end of this string
247	while ( buffer <= last_char && ( buffer != '\0' && buffer != '\n' && *buffer != '\r' ) )
248	buffer++;
249
250	*buffer = '\0'; // mark the end of the string
251
252	buffer++;
253	*next_start = buffer;
254
255	return start;
256	}
257
258	static char useragent[] = "user-agent:";
259	static char disallow[] = "disallow:";
260	static char swishspider[] = "swishspider";
261
262	static void parserobotstxt(char robots_buffer, int buflen, httpserverinfo server)
263	{
264	char *buffer;
265	char *bufend = robots_buffer + buflen -1; // last char of string
266	char *next_start = robots_buffer;
267
268	enum {START, USERAGENT, DISALLOW} state = START;
269	enum {SPECIFIC, GENERIC, SKIPPING} useragentstate = SKIPPING;
270	char *p;
271	int len;
272	robotrules *entry;
273	robotrules *entry2;
274
275	server->useragent = 0;
276
277	buffer = NULL;
278
279	while ( (buffer = next_line( &next_start, bufend ) ) )
280	{
281	if ( strchr( buffer, '#' ) )
282	*(strchr( buffer, '#' )) = '\0';
283
284	if ((buffer == '#') \|\| (buffer == '\0'))
285	continue;
286
287
288	if (strncasecmp(buffer, useragent, sizeof(useragent) - 1) == 0) {
289	switch (state) {
290	case DISALLOW:
291	/* Since we found our specific user-agent, we can
292	** skip the rest of the file.
293	**/
294	if (useragentstate == SPECIFIC) {
295	return;
296	}
297
298	useragentstate = SKIPPING;
299
300	/* explict fallthrough */
301
302	case START:
303	case USERAGENT:
304	state = USERAGENT;
305
306	if (useragentstate != SPECIFIC) {
307	p = isolatevalue(buffer, useragent, &len);
308
309	if ((len == (sizeof(swishspider) - 1)) &&
310	(strncasecmp(p, swishspider, sizeof(swishspider) - 1) == 0) ) {
311	useragentstate = SPECIFIC;
312
313	/* We might have already parsed generic rules,
314	** so clean them up if necessary.
315	*/
316	if (server->useragent) {
317	efree(server->useragent);
318	}
319	for (entry = server->robotrules; entry; ) {
320	entry2 = entry->next;
321	efree(entry);
322	entry = entry2;
323	}
324	server->robotrules = 0;
325
326	server->useragent = (char *)emalloc(len + 1);
327	strncpy(server->useragent, p, len);
328	*(server->useragent + len) = '\0';
329
330	}
331	else if ((len == 1) && (p == '')) {
332	useragentstate = GENERIC;
333	server->useragent = (char *)emalloc(2);
334	strcpy(server->useragent, ""); / emalloc'd 2 bytes, no safestrcpy */
335	}
336
337	}
338
339
340	break;
341
342	}
343	}
344
345	if (strncasecmp(buffer, disallow, sizeof(disallow) - 1) == 0) {
346	state = DISALLOW;
347	if (useragentstate != SKIPPING) {
348	p = isolatevalue(buffer, disallow, &len);
349	if (len) {
350	entry = (robotrules *)emalloc(sizeof(robotrules));
351	entry->next = server->robotrules;
352	server->robotrules = entry;
353	entry->disallow = (char *)emalloc(len + 1);
354	strncpy(entry->disallow, p, len);
355	*(entry->disallow + len) = '\0';
356	}
357	}
358	}
359	}
360	}
361
362
363	static char isolatevalue(char line, char keyword, int plen)
364	{
365	/* Find the beginning of the value
366	**/
367	for (line += strlen(keyword); isspace((int)((unsigned char)line)); line++ ) { / cast to int 2/22/00 */
368	}
369
370	/* Strip off trailing spaces
371	**/
372	for (plen = strlen(line); isspace((int)((unsigned char)(line + plen - 1))); (plen)--) { /* cast to int 2/22/00 */
373	}
374
375	return line;
376	}
377
378
379	int equivalentserver(SWISH sw, char url, char *baseurl)
380	{
381	char *method;
382	int methodlen;
383	char *serverport;
384	int serverportlen;
385	char *basemethod;
386	int basemethodlen;
387	char *baseserverport;
388	int baseserverportlen;
389	struct multiswline *walk=NULL;
390	struct MOD_HTTP *http = sw->HTTP;
391
392	method = url_method(url, &methodlen);
393	serverport = url_serverport(url, &serverportlen);
394	basemethod = url_method(baseurl, &basemethodlen);
395	baseserverport = url_serverport(baseurl, &baseserverportlen);
396
397	if (!method \|\| !serverport \|\| !basemethod \|\| !baseserverport) {
398	return 0;
399	}
400
401	/* If this is the same server, we just go for it
402	**/
403	if ((methodlen == basemethodlen) && (serverportlen == baseserverportlen) &&
404	(strncasecmp(method, basemethod, methodlen) == 0) &&
405	(strncasecmp(serverport, baseserverport, serverportlen) == 0)) {
406	return 1;
407	}
408
409	/* Do we find the method/server info for this and the base url
410	** in the same equivalence list?
411	**/
412	for (walk = http->equivalentservers; walk; walk = walk->next ) {
413	if (serverinlist(url, walk->list) &&
414	serverinlist(baseurl, walk->list)) {
415	return 1;
416	}
417	}
418
419	return 0;
420	}
421
422
423	static int serverinlist(char url, struct swline list)
424	{
425	char *method;
426	int methodlen;
427	char *serverport;
428	int serverportlen;
429	char *listmethod;
430	int listmethodlen;
431	char *listserverport;
432	int listserverportlen;
433
434	method = url_method(url, &methodlen);
435	serverport = url_serverport(url, &serverportlen);
436	if (!method \|\| !serverport) {
437	return 0;
438	}
439
440	for ( ; list; list = list->next) {
441	listmethod = url_method(list->line, &listmethodlen);
442	listserverport = url_serverport(list->line, &listserverportlen);
443	if (listmethod && listserverport) {
444	if ((methodlen == listmethodlen) && (serverportlen == listserverportlen) &&
445	(strncasecmp(method, listmethod, methodlen) == 0) &&
446	(strncasecmp(serverport, listserverport, serverportlen) == 0)) {
447	return 1;
448	}
449	}
450	}
451	return 0;
452	}
453