1 |
adcroft |
1.1 |
/* |
2 |
|
|
$Id: httpserver.c,v 1.10 2002/08/14 22:08:48 whmoseley Exp $ |
3 |
|
|
** |
4 |
|
|
** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company |
5 |
|
|
** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94 |
6 |
|
|
** |
7 |
|
|
** This program and library is free software; you can redistribute it and/or |
8 |
|
|
** modify it under the terms of the GNU (Library) General Public License |
9 |
|
|
** as published by the Free Software Foundation; either version 2 |
10 |
|
|
** of the License, or any later version. |
11 |
|
|
** |
12 |
|
|
** This program is distributed in the hope that it will be useful, |
13 |
|
|
** but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 |
|
|
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 |
|
|
** GNU (Library) General Public License for more details. |
16 |
|
|
** |
17 |
|
|
** You should have received a copy of the GNU (Library) General Public License |
18 |
|
|
** along with this program; if not, write to the Free Software |
19 |
|
|
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
20 |
|
|
**-------------------------------------------------------------------- |
21 |
|
|
** All the code in this file added by Ron Klachko ron@ckm.ucsf.edu 9/98 |
22 |
|
|
** |
23 |
|
|
** change sprintf to snprintf to avoid corruption |
24 |
|
|
** SRE 11/17/99 |
25 |
|
|
** |
26 |
|
|
** fixed cast to int problems pointed out by "gcc -Wall" |
27 |
|
|
** SRE 2/22/00 |
28 |
|
|
** |
29 |
|
|
*/ |
30 |
|
|
|
31 |
|
|
/* |
32 |
|
|
** httpserver.c |
33 |
|
|
*/ |
34 |
|
|
|
35 |
|
|
#ifndef _WIN32 |
36 |
|
|
#include <unistd.h> |
37 |
|
|
#endif |
38 |
|
|
|
39 |
|
|
#include <time.h> |
40 |
|
|
#include <stdarg.h> |
41 |
|
|
|
42 |
|
|
#include "swish.h" |
43 |
|
|
#include "mem.h" |
44 |
|
|
#include "string.h" |
45 |
|
|
#include "index.h" |
46 |
|
|
|
47 |
|
|
#include "http.h" |
48 |
|
|
#include "httpserver.h" |
49 |
|
|
#include "file.h" |
50 |
|
|
|
51 |
|
|
|
52 |
|
|
/* The list of servers that we are acting on. |
53 |
|
|
**/ |
54 |
|
|
static httpserverinfo *servers = 0; |
55 |
|
|
|
56 |
|
|
|
57 |
|
|
static void parserobotstxt(char *robots_buffer, int buflen, httpserverinfo *server); |
58 |
|
|
static char *isolatevalue(char *line, char *keyword, int *plen); |
59 |
|
|
static int serverinlist(char *url, struct swline *list); |
60 |
|
|
|
61 |
|
|
|
62 |
|
|
|
63 |
|
|
/* Find the robot rules for this URL. If haven't retrieved them |
64 |
|
|
** yet, do so now. |
65 |
|
|
**/ |
66 |
|
|
httpserverinfo *getserverinfo(SWISH *sw, char *url) |
67 |
|
|
{ |
68 |
|
|
httpserverinfo *server; |
69 |
|
|
char *method; |
70 |
|
|
int methodlen; |
71 |
|
|
char *serverport; |
72 |
|
|
int serverportlen; |
73 |
|
|
static int lencontenttype=0; |
74 |
|
|
static char *contenttype=NULL; |
75 |
|
|
static int lenbuffer=0; |
76 |
|
|
static char *buffer=NULL; |
77 |
|
|
FILE *fp; |
78 |
|
|
struct MOD_Index *idx = sw->Index; |
79 |
|
|
time_t last_modified; |
80 |
|
|
|
81 |
|
|
// argh, this is ugly |
82 |
|
|
char *file_prefix; // prefix for use with files written by swishspider -- should just be on the stack! |
83 |
|
|
|
84 |
|
|
|
85 |
|
|
if(!lenbuffer)buffer=emalloc((lenbuffer=MAXSTRLEN)+1); |
86 |
|
|
if(!lencontenttype)contenttype=emalloc((lencontenttype=MAXSTRLEN)+1); |
87 |
|
|
|
88 |
|
|
if ((method = url_method(url, &methodlen)) == 0) { |
89 |
|
|
return 0; |
90 |
|
|
} |
91 |
|
|
if ((serverport = url_serverport(url, &serverportlen)) == 0) { |
92 |
|
|
return 0; |
93 |
|
|
} |
94 |
|
|
|
95 |
|
|
/* Search for the rules |
96 |
|
|
**/ |
97 |
|
|
for (server = servers; server; server = server->next) { |
98 |
|
|
if (equivalentserver(sw, url, server->baseurl)) { |
99 |
|
|
return server; |
100 |
|
|
} |
101 |
|
|
} |
102 |
|
|
|
103 |
|
|
/* Create a new entry for this server and add it to the list. |
104 |
|
|
**/ |
105 |
|
|
server = (httpserverinfo *)emalloc(sizeof(httpserverinfo)); |
106 |
|
|
|
107 |
|
|
/* +3 for the ://, +1 for the trailing /, +1 for the terminating null |
108 |
|
|
**/ |
109 |
|
|
server->baseurl = (char *)emalloc(methodlen + serverportlen + 5); |
110 |
|
|
/* These 4 lines to avoid a call to non ANSI snprintf . May not be the |
111 |
|
|
best way but it ensures no buffer overruns */ |
112 |
|
|
memcpy (server->baseurl,method,methodlen); |
113 |
|
|
memcpy (server->baseurl+methodlen,"://",3); |
114 |
|
|
memcpy (server->baseurl+methodlen+3,serverport,serverportlen); |
115 |
|
|
strcpy (server->baseurl+methodlen+3+serverportlen,"/"); |
116 |
|
|
|
117 |
|
|
server->lastretrieval = 0; |
118 |
|
|
server->robotrules = 0; |
119 |
|
|
server->next = servers; |
120 |
|
|
servers = server; |
121 |
|
|
|
122 |
|
|
/* Only http(s) servers can full rules, all the other ones just get dummies |
123 |
|
|
** (this is useful for holding last retrieval) |
124 |
|
|
** |
125 |
|
|
** http://info.webcrawler.com/mak/projects/robots/norobots.html holds what |
126 |
|
|
** many people consider the official web exclusion rules. Unfortunately, |
127 |
|
|
** the rules are not consistent about how records are formed. One line |
128 |
|
|
** states "the file consists of one or more records separated by one or more |
129 |
|
|
** blank lines" while another states "the record starts with one or more User-agent |
130 |
|
|
** lines, followed by one or more Disallow lines." |
131 |
|
|
** |
132 |
|
|
** So, does a blank line after a User-agent line end a record? The spec is |
133 |
|
|
** unclear on this matter. If the next legal line afer the blank line is |
134 |
|
|
** a Disallow line, the blank line should most likely be ignored. But what |
135 |
|
|
** if the next line is another User-agent line? For example: |
136 |
|
|
** |
137 |
|
|
** User-agent: MooBot |
138 |
|
|
** |
139 |
|
|
** User-agent: CreepySpider |
140 |
|
|
** Disallow: /cgi-bin |
141 |
|
|
** |
142 |
|
|
** One interpretation (based on blank lines termination records) is that MooBot |
143 |
|
|
** may visit any location (since there are no Disallows for it). Another |
144 |
|
|
** interpretation (based on records needing both User-agent and Disallow lines) |
145 |
|
|
** is that MooBot may not visit /cgi-bin |
146 |
|
|
** |
147 |
|
|
** While poking around, I found at least one site (www.sun.com) that uses blank |
148 |
|
|
** lines within records. Because of that, I have decided to rely on records |
149 |
|
|
** having both User-agent and Disallow lines (the second interpretation above). |
150 |
|
|
**/ |
151 |
|
|
if (strncmp(server->baseurl, "http", 4) == 0) { |
152 |
|
|
if((int)(strlen(server->baseurl)+20)>=lenbuffer) { |
153 |
|
|
lenbuffer=strlen(server->baseurl)+20+200; |
154 |
|
|
buffer=erealloc(buffer,lenbuffer+1); |
155 |
|
|
} |
156 |
|
|
sprintf(buffer, "%srobots.txt", server->baseurl); |
157 |
|
|
|
158 |
|
|
|
159 |
|
|
|
160 |
|
|
file_prefix = emalloc( strlen(idx->tmpdir) + MAXPIDLEN + strlen("/swishspider@.contents+fill") ); |
161 |
|
|
sprintf(file_prefix, "%s/swishspider@%ld", idx->tmpdir, (long) lgetpid()); |
162 |
|
|
|
163 |
|
|
|
164 |
|
|
if (get(sw,contenttype, &last_modified, &server->lastretrieval, file_prefix, buffer) == 200) |
165 |
|
|
{ |
166 |
|
|
char *robots_buffer; |
167 |
|
|
int filelen; |
168 |
|
|
int bytes_read; |
169 |
|
|
|
170 |
|
|
if((int)(strlen(idx->tmpdir)+MAXPIDLEN+30)>=lenbuffer) { |
171 |
|
|
lenbuffer=strlen(idx->tmpdir)+MAXPIDLEN+30+200; |
172 |
|
|
buffer=erealloc(buffer,lenbuffer+1); |
173 |
|
|
} |
174 |
|
|
sprintf(buffer, "%s/swishspider@%ld.contents", idx->tmpdir, (long)lgetpid()); |
175 |
|
|
fp = fopen(buffer, F_READ_TEXT); |
176 |
|
|
|
177 |
|
|
filelen = getsize(buffer); |
178 |
|
|
|
179 |
|
|
robots_buffer = emalloc( filelen + 1 ); |
180 |
|
|
*robots_buffer = '\0'; |
181 |
|
|
bytes_read = fread(robots_buffer, 1, filelen, fp); |
182 |
|
|
robots_buffer[bytes_read] = '\0'; |
183 |
|
|
parserobotstxt( robots_buffer, bytes_read, server ); |
184 |
|
|
|
185 |
|
|
efree( robots_buffer ); |
186 |
|
|
|
187 |
|
|
//parserobotstxt(fp, server); |
188 |
|
|
//fclose(fp); |
189 |
|
|
} |
190 |
|
|
efree( file_prefix ); |
191 |
|
|
|
192 |
|
|
cmdf(unlink, "%s/swishspider@%ld.response", idx->tmpdir, lgetpid()); |
193 |
|
|
cmdf(unlink, "%s/swishspider@%ld.contents", idx->tmpdir, lgetpid()); |
194 |
|
|
cmdf(unlink, "%s/swishspider@%ld.links", idx->tmpdir, lgetpid()); |
195 |
|
|
} |
196 |
|
|
|
197 |
|
|
return server; |
198 |
|
|
} |
199 |
|
|
|
200 |
|
|
|
201 |
|
|
/* Report whether the robot rules recorded for url's server forbid it.
**
** Returns 1 when no server entry can be obtained, when the URI part of
** url cannot be extracted, or when the URI begins with one of the
** server's disallow prefixes.  Returns 0 otherwise.
**/
int urldisallowed(SWISH *sw, char *url)
{
    httpserverinfo *info;
    robotrules     *cur;
    char           *path;
    int             pathlen;

    info = getserverinfo(sw, url);
    if (info == 0)
        return 1;

    path = url_uri(url, &pathlen);
    if (path == 0)
        return 1;

    /* A rule matches when its whole disallow string is a prefix of the URI. */
    cur = info->robotrules;
    while (cur) {
        if (strncmp(path, cur->disallow, strlen(cur->disallow)) == 0)
            return 1;
        cur = cur->next;
    }

    return 0;
}
223 |
|
|
|
224 |
|
|
// Line splitter that copes with Unix, Mac and Windows line endings.
//
// Pass in:
//    char **next_start == address of the scan position; updated to point just
//                         past the terminator written for the returned line.
//    char *last_char   == pointer to the last char in the buffer.  The buffer
//                         MUST have room for one more char after it, because
//                         the terminator for the final line is written there.
//
// Returns the start of the next non-empty line (now NUL-terminated in place),
// or NULL when there are no more lines.

static char *next_line( char **next_start, char *last_char )
{
    char *cursor = *next_start;
    char *line;

    // Step over any run of separators (NUL, LF or CR) in front of the line.
    for ( ; cursor <= last_char; cursor++ )
        if ( *cursor != '\0' && *cursor != '\n' && *cursor != '\r' )
            break;

    if ( cursor > last_char )
        return NULL;

    line = cursor;

    // Advance to the first separator, or one past last_char if there is none.
    for ( ; cursor <= last_char; cursor++ )
        if ( *cursor == '\0' || *cursor == '\n' || *cursor == '\r' )
            break;

    *cursor = '\0';             // terminate the line in place
    *next_start = cursor + 1;   // resume after the terminator next time

    return line;
}
257 |
|
|
|
258 |
|
|
/* Keywords recognised by parserobotstxt().  They are kept as arrays (not
** pointers) because the parser uses sizeof() on them to get their lengths,
** and are compared case-insensitively.
*/
static char useragent[]   = "user-agent:";   /* field name that opens a record */
static char disallow[]    = "disallow:";     /* field name of a rule line */
static char swishspider[] = "swishspider";   /* agent name matched for specific rules */
261 |
|
|
|
262 |
|
|
/* Parse the text of a robots.txt file and record the applicable rules.
**
** robots_buffer - file contents; modified in place (lines are terminated
**                 and comments cut off).  Must have room for one extra
**                 char past buflen, as next_line() requires.
** buflen        - number of chars of content in robots_buffer.
** server        - entry to fill in: server->useragent receives the agent
**                 name whose record was used ("*" or the swishspider name
**                 as written in the file) and server->robotrules receives
**                 one entry per non-empty Disallow value.
**
** Rules from a record naming swishspider replace any generic ("*") rules
** collected earlier, and parsing stops at the first User-agent line that
** follows that record's Disallow lines.
**/
static void parserobotstxt(char *robots_buffer, int buflen, httpserverinfo *server)
{
    char *buffer;
    char *bufend = robots_buffer + buflen - 1;   /* last char of string */
    char *next_start = robots_buffer;

    enum {START, USERAGENT, DISALLOW} state = START;
    enum {SPECIFIC, GENERIC, SKIPPING} useragentstate = SKIPPING;
    char *p;
    char *comment;
    int len;
    robotrules *entry;
    robotrules *entry2;

    server->useragent = 0;

    while ( (buffer = next_line( &next_start, bufend ) ) )
    {
        /* Cut the line at the first '#'; the rest is a comment. */
        comment = strchr( buffer, '#' );
        if ( comment )
            *comment = '\0';

        if (*buffer == '\0')
            continue;


        if (strncasecmp(buffer, useragent, sizeof(useragent) - 1) == 0) {
            switch (state) {
            case DISALLOW:
                /* Since we found our specific user-agent, we can
                ** skip the rest of the file.
                **/
                if (useragentstate == SPECIFIC) {
                    return;
                }

                useragentstate = SKIPPING;

                /* explicit fallthrough */

            case START:
            case USERAGENT:
                state = USERAGENT;

                if (useragentstate != SPECIFIC) {
                    p = isolatevalue(buffer, useragent, &len);

                    if ((len == (int)(sizeof(swishspider) - 1)) &&
                        (strncasecmp(p, swishspider, sizeof(swishspider) - 1) == 0) ) {
                        useragentstate = SPECIFIC;

                        /* We might have already parsed generic rules,
                        ** so clean them up if necessary.
                        */
                        if (server->useragent) {
                            efree(server->useragent);
                        }
                        for (entry = server->robotrules; entry; ) {
                            entry2 = entry->next;
                            /* Release the rule text too, not just the node. */
                            efree(entry->disallow);
                            efree(entry);
                            entry = entry2;
                        }
                        server->robotrules = 0;

                        server->useragent = (char *)emalloc(len + 1);
                        memcpy(server->useragent, p, len);
                        *(server->useragent + len) = '\0';
                    }
                    else if ((len == 1) && (*p == '*')) {
                        useragentstate = GENERIC;
                        /* A file may hold several "*" records; do not leak
                        ** the name stored for an earlier one. */
                        if (server->useragent) {
                            efree(server->useragent);
                        }
                        server->useragent = (char *)emalloc(2);
                        strcpy(server->useragent, "*"); /* emalloc'd 2 bytes, no safestrcpy */
                    }
                }

                break;
            }
        }

        if (strncasecmp(buffer, disallow, sizeof(disallow) - 1) == 0) {
            state = DISALLOW;
            if (useragentstate != SKIPPING) {
                p = isolatevalue(buffer, disallow, &len);
                /* An empty Disallow value adds no rule.  "> 0" also rejects
                ** a negative length rather than passing it to emalloc. */
                if (len > 0) {
                    entry = (robotrules *)emalloc(sizeof(robotrules));
                    entry->next = server->robotrules;
                    server->robotrules = entry;
                    entry->disallow = (char *)emalloc(len + 1);
                    memcpy(entry->disallow, p, len);
                    *(entry->disallow + len) = '\0';
                }
            }
        }
    }
}
361 |
|
|
|
362 |
|
|
|
363 |
|
|
/* Locate the value that follows a keyword at the start of a line.
**
** line    - NUL-terminated text that begins with keyword (the caller has
**           already matched it); not modified.
** keyword - the keyword; only its length is used, to step past it.
** plen    - receives the length of the value with trailing whitespace
**           excluded; 0 when the value is empty or all whitespace.
**
** Returns a pointer into line at the first non-space char after the
** keyword.  The value is NOT terminated at *plen; callers must use the
** length.
**/
static char *isolatevalue(char *line, char *keyword, int *plen)
{
    int len;

    /* Find the beginning of the value
    **/
    line += strlen(keyword);
    while (isspace((int)((unsigned char)*line))) {
        line++;
    }

    /* Strip off trailing spaces.  The len > 0 bound keeps the scan inside
    ** the value: without it an empty value (e.g. "Disallow: ") would be
    ** examined at line[-1] and could produce a negative length.
    **/
    len = (int)strlen(line);
    while (len > 0 && isspace((int)((unsigned char)line[len - 1]))) {
        len--;
    }

    *plen = len;
    return line;
}
377 |
|
|
|
378 |
|
|
|
379 |
|
|
/* Decide whether url and baseurl refer to the same server.
**
** Returns 1 when both have the same method and server:port (compared
** case-insensitively), or when both appear together in one of the lists
** under sw->HTTP->equivalentservers.  Returns 0 otherwise, including when
** the method or server:port cannot be extracted from either URL.
**/
int equivalentserver(SWISH *sw, char *url, char *baseurl)
{
    struct MOD_HTTP    *http = sw->HTTP;
    struct multiswline *group;
    char *urlmethod;
    char *urlhost;
    char *basemethod;
    char *basehost;
    int   urlmethodlen;
    int   urlhostlen;
    int   basemethodlen;
    int   basehostlen;

    urlmethod  = url_method(url, &urlmethodlen);
    urlhost    = url_serverport(url, &urlhostlen);
    basemethod = url_method(baseurl, &basemethodlen);
    basehost   = url_serverport(baseurl, &basehostlen);

    if (urlmethod == NULL || urlhost == NULL || basemethod == NULL || basehost == NULL)
        return 0;

    /* If this is the same server, we just go for it
    **/
    if (urlmethodlen == basemethodlen
        && urlhostlen == basehostlen
        && strncasecmp(urlmethod, basemethod, urlmethodlen) == 0
        && strncasecmp(urlhost, basehost, urlhostlen) == 0)
        return 1;

    /* Otherwise look for an equivalence list that names both servers.
    **/
    group = http->equivalentservers;
    while (group != NULL) {
        if (serverinlist(url, group->list) && serverinlist(baseurl, group->list))
            return 1;
        group = group->next;
    }

    return 0;
}
421 |
|
|
|
422 |
|
|
|
423 |
|
|
static int serverinlist(char *url, struct swline *list) |
424 |
|
|
{ |
425 |
|
|
char *method; |
426 |
|
|
int methodlen; |
427 |
|
|
char *serverport; |
428 |
|
|
int serverportlen; |
429 |
|
|
char *listmethod; |
430 |
|
|
int listmethodlen; |
431 |
|
|
char *listserverport; |
432 |
|
|
int listserverportlen; |
433 |
|
|
|
434 |
|
|
method = url_method(url, &methodlen); |
435 |
|
|
serverport = url_serverport(url, &serverportlen); |
436 |
|
|
if (!method || !serverport) { |
437 |
|
|
return 0; |
438 |
|
|
} |
439 |
|
|
|
440 |
|
|
for ( ; list; list = list->next) { |
441 |
|
|
listmethod = url_method(list->line, &listmethodlen); |
442 |
|
|
listserverport = url_serverport(list->line, &listserverportlen); |
443 |
|
|
if (listmethod && listserverport) { |
444 |
|
|
if ((methodlen == listmethodlen) && (serverportlen == listserverportlen) && |
445 |
|
|
(strncasecmp(method, listmethod, methodlen) == 0) && |
446 |
|
|
(strncasecmp(serverport, listserverport, serverportlen) == 0)) { |
447 |
|
|
return 1; |
448 |
|
|
} |
449 |
|
|
} |
450 |
|
|
} |
451 |
|
|
return 0; |
452 |
|
|
} |
453 |
|
|
|