1 |
/* |
2 |
$Id: httpserver.c,v 1.10 2002/08/14 22:08:48 whmoseley Exp $ |
3 |
** |
4 |
** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company |
5 |
** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94 |
6 |
** |
7 |
** This program and library is free software; you can redistribute it and/or |
8 |
** modify it under the terms of the GNU (Library) General Public License |
9 |
** as published by the Free Software Foundation; either version 2 |
10 |
** of the License, or any later version. |
11 |
** |
12 |
** This program is distributed in the hope that it will be useful, |
13 |
** but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 |
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 |
** GNU (Library) General Public License for more details. |
16 |
** |
17 |
** You should have received a copy of the GNU (Library) General Public License |
18 |
** along with this program; if not, write to the Free Software |
19 |
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
20 |
**-------------------------------------------------------------------- |
21 |
** All the code in this file added by Ron Klachko ron@ckm.ucsf.edu 9/98 |
22 |
** |
23 |
** change sprintf to snprintf to avoid corruption |
24 |
** SRE 11/17/99 |
25 |
** |
26 |
** fixed cast to int problems pointed out by "gcc -Wall" |
27 |
** SRE 2/22/00 |
28 |
** |
29 |
*/ |
30 |
|
31 |
/* |
32 |
** httpserver.c |
33 |
*/ |
34 |
|
35 |
#ifndef _WIN32 |
36 |
#include <unistd.h> |
37 |
#endif |
38 |
|
39 |
#include <time.h> |
40 |
#include <stdarg.h> |
41 |
|
42 |
#include "swish.h" |
43 |
#include "mem.h" |
44 |
#include "string.h" |
45 |
#include "index.h" |
46 |
|
47 |
#include "http.h" |
48 |
#include "httpserver.h" |
49 |
#include "file.h" |
50 |
|
51 |
|
52 |
/* The list of servers that we are acting on. |
53 |
**/ |
54 |
static httpserverinfo *servers = 0; |
55 |
|
56 |
|
57 |
static void parserobotstxt(char *robots_buffer, int buflen, httpserverinfo *server); |
58 |
static char *isolatevalue(char *line, char *keyword, int *plen); |
59 |
static int serverinlist(char *url, struct swline *list); |
60 |
|
61 |
|
62 |
|
63 |
/* Find the robot rules for this URL. If haven't retrieved them |
64 |
** yet, do so now. |
65 |
**/ |
66 |
httpserverinfo *getserverinfo(SWISH *sw, char *url) |
67 |
{ |
68 |
httpserverinfo *server; |
69 |
char *method; |
70 |
int methodlen; |
71 |
char *serverport; |
72 |
int serverportlen; |
73 |
static int lencontenttype=0; |
74 |
static char *contenttype=NULL; |
75 |
static int lenbuffer=0; |
76 |
static char *buffer=NULL; |
77 |
FILE *fp; |
78 |
struct MOD_Index *idx = sw->Index; |
79 |
time_t last_modified; |
80 |
|
81 |
// argh, this is ugly |
82 |
char *file_prefix; // prefix for use with files written by swishspider -- should just be on the stack! |
83 |
|
84 |
|
85 |
if(!lenbuffer)buffer=emalloc((lenbuffer=MAXSTRLEN)+1); |
86 |
if(!lencontenttype)contenttype=emalloc((lencontenttype=MAXSTRLEN)+1); |
87 |
|
88 |
if ((method = url_method(url, &methodlen)) == 0) { |
89 |
return 0; |
90 |
} |
91 |
if ((serverport = url_serverport(url, &serverportlen)) == 0) { |
92 |
return 0; |
93 |
} |
94 |
|
95 |
/* Search for the rules |
96 |
**/ |
97 |
for (server = servers; server; server = server->next) { |
98 |
if (equivalentserver(sw, url, server->baseurl)) { |
99 |
return server; |
100 |
} |
101 |
} |
102 |
|
103 |
/* Create a new entry for this server and add it to the list. |
104 |
**/ |
105 |
server = (httpserverinfo *)emalloc(sizeof(httpserverinfo)); |
106 |
|
107 |
/* +3 for the ://, +1 for the trailing /, +1 for the terminating null |
108 |
**/ |
109 |
server->baseurl = (char *)emalloc(methodlen + serverportlen + 5); |
110 |
/* These 4 lines to avoid a call to non ANSI snprintf . May not be the |
111 |
best way but it ensures no buffer overruns */ |
112 |
memcpy (server->baseurl,method,methodlen); |
113 |
memcpy (server->baseurl+methodlen,"://",3); |
114 |
memcpy (server->baseurl+methodlen+3,serverport,serverportlen); |
115 |
strcpy (server->baseurl+methodlen+3+serverportlen,"/"); |
116 |
|
117 |
server->lastretrieval = 0; |
118 |
server->robotrules = 0; |
119 |
server->next = servers; |
120 |
servers = server; |
121 |
|
122 |
/* Only http(s) servers can full rules, all the other ones just get dummies |
123 |
** (this is useful for holding last retrieval) |
124 |
** |
125 |
** http://info.webcrawler.com/mak/projects/robots/norobots.html holds what |
126 |
** many people consider the official web exclusion rules. Unfortunately, |
127 |
** the rules are not consistent about how records are formed. One line |
128 |
** states "the file consists of one or more records separated by one or more |
129 |
** blank lines" while another states "the record starts with one or more User-agent |
130 |
** lines, followed by one or more Disallow lines." |
131 |
** |
132 |
** So, does a blank line after a User-agent line end a record? The spec is |
133 |
** unclear on this matter. If the next legal line afer the blank line is |
134 |
** a Disallow line, the blank line should most likely be ignored. But what |
135 |
** if the next line is another User-agent line? For example: |
136 |
** |
137 |
** User-agent: MooBot |
138 |
** |
139 |
** User-agent: CreepySpider |
140 |
** Disallow: /cgi-bin |
141 |
** |
142 |
** One interpretation (based on blank lines termination records) is that MooBot |
143 |
** may visit any location (since there are no Disallows for it). Another |
144 |
** interpretation (based on records needing both User-agent and Disallow lines) |
145 |
** is that MooBot may not visit /cgi-bin |
146 |
** |
147 |
** While poking around, I found at least one site (www.sun.com) that uses blank |
148 |
** lines within records. Because of that, I have decided to rely on records |
149 |
** having both User-agent and Disallow lines (the second interpretation above). |
150 |
**/ |
151 |
if (strncmp(server->baseurl, "http", 4) == 0) { |
152 |
if((int)(strlen(server->baseurl)+20)>=lenbuffer) { |
153 |
lenbuffer=strlen(server->baseurl)+20+200; |
154 |
buffer=erealloc(buffer,lenbuffer+1); |
155 |
} |
156 |
sprintf(buffer, "%srobots.txt", server->baseurl); |
157 |
|
158 |
|
159 |
|
160 |
file_prefix = emalloc( strlen(idx->tmpdir) + MAXPIDLEN + strlen("/swishspider@.contents+fill") ); |
161 |
sprintf(file_prefix, "%s/swishspider@%ld", idx->tmpdir, (long) lgetpid()); |
162 |
|
163 |
|
164 |
if (get(sw,contenttype, &last_modified, &server->lastretrieval, file_prefix, buffer) == 200) |
165 |
{ |
166 |
char *robots_buffer; |
167 |
int filelen; |
168 |
int bytes_read; |
169 |
|
170 |
if((int)(strlen(idx->tmpdir)+MAXPIDLEN+30)>=lenbuffer) { |
171 |
lenbuffer=strlen(idx->tmpdir)+MAXPIDLEN+30+200; |
172 |
buffer=erealloc(buffer,lenbuffer+1); |
173 |
} |
174 |
sprintf(buffer, "%s/swishspider@%ld.contents", idx->tmpdir, (long)lgetpid()); |
175 |
fp = fopen(buffer, F_READ_TEXT); |
176 |
|
177 |
filelen = getsize(buffer); |
178 |
|
179 |
robots_buffer = emalloc( filelen + 1 ); |
180 |
*robots_buffer = '\0'; |
181 |
bytes_read = fread(robots_buffer, 1, filelen, fp); |
182 |
robots_buffer[bytes_read] = '\0'; |
183 |
parserobotstxt( robots_buffer, bytes_read, server ); |
184 |
|
185 |
efree( robots_buffer ); |
186 |
|
187 |
//parserobotstxt(fp, server); |
188 |
//fclose(fp); |
189 |
} |
190 |
efree( file_prefix ); |
191 |
|
192 |
cmdf(unlink, "%s/swishspider@%ld.response", idx->tmpdir, lgetpid()); |
193 |
cmdf(unlink, "%s/swishspider@%ld.contents", idx->tmpdir, lgetpid()); |
194 |
cmdf(unlink, "%s/swishspider@%ld.links", idx->tmpdir, lgetpid()); |
195 |
} |
196 |
|
197 |
return server; |
198 |
} |
199 |
|
200 |
|
201 |
/* Decide whether the robot rules of the URL's server forbid fetching it.
**
** Returns 1 when the URL must not be fetched: either getserverinfo() or
** url_uri() failed for it, or its URI begins with one of the server's
** disallow prefixes.  Returns 0 otherwise.
**/
int urldisallowed(SWISH *sw, char *url)
{
    httpserverinfo *info;
    robotrules *r;
    char   *path;
    int     pathlen;

    info = getserverinfo(sw, url);
    if (!info)
        return 1;

    path = url_uri(url, &pathlen);
    if (!path)
        return 1;

    /* A rule matches when it is a prefix of the URI. */
    r = info->robotrules;
    while (r) {
        if (strncmp(path, r->disallow, strlen(r->disallow)) == 0)
            return 1;
        r = r->next;
    }

    return 0;
}
223 |
|
224 |
/* Split a text buffer into lines in place, accepting Unix (\n), Mac (\r)
** and Windows (\r\n) line endings.  Embedded NUL bytes count as line
** terminators too.
**
** next_start - in: where to resume scanning; out: the position just past
**              the terminator of the returned line (left untouched when
**              NULL is returned)
** last_char  - the last valid char of the text.  The buffer MUST have room
**              for one more char after it, because the terminating NUL of
**              the final line may be written there.
**
** Returns the start of the next non-empty line, now NUL terminated, or
** NULL when there are no more lines.
**/
static char *next_line( char **next_start, char *last_char )
{
    char *cursor = *next_start;
    char *line;

    /* Step past any terminators left over in front of the next line. */
    for ( ; cursor <= last_char; cursor++ ) {
        if ( *cursor != '\0' && *cursor != '\n' && *cursor != '\r' )
            break;
    }

    if ( cursor > last_char )
        return NULL;

    line = cursor;

    /* Advance to the terminator, or to one past the end of the text. */
    for ( ; cursor <= last_char; cursor++ ) {
        if ( *cursor == '\0' || *cursor == '\n' || *cursor == '\r' )
            break;
    }

    *cursor = '\0';
    *next_start = cursor + 1;

    return line;
}
257 |
|
258 |
static char useragent[] = "user-agent:"; |
259 |
static char disallow[] = "disallow:"; |
260 |
static char swishspider[] = "swishspider"; |
261 |
|
262 |
static void parserobotstxt(char *robots_buffer, int buflen, httpserverinfo *server) |
263 |
{ |
264 |
char *buffer; |
265 |
char *bufend = robots_buffer + buflen -1; // last char of string |
266 |
char *next_start = robots_buffer; |
267 |
|
268 |
enum {START, USERAGENT, DISALLOW} state = START; |
269 |
enum {SPECIFIC, GENERIC, SKIPPING} useragentstate = SKIPPING; |
270 |
char *p; |
271 |
int len; |
272 |
robotrules *entry; |
273 |
robotrules *entry2; |
274 |
|
275 |
server->useragent = 0; |
276 |
|
277 |
buffer = NULL; |
278 |
|
279 |
while ( (buffer = next_line( &next_start, bufend ) ) ) |
280 |
{ |
281 |
if ( strchr( buffer, '#' ) ) |
282 |
*(strchr( buffer, '#' )) = '\0'; |
283 |
|
284 |
if ((*buffer == '#') || (*buffer == '\0')) |
285 |
continue; |
286 |
|
287 |
|
288 |
if (strncasecmp(buffer, useragent, sizeof(useragent) - 1) == 0) { |
289 |
switch (state) { |
290 |
case DISALLOW: |
291 |
/* Since we found our specific user-agent, we can |
292 |
** skip the rest of the file. |
293 |
**/ |
294 |
if (useragentstate == SPECIFIC) { |
295 |
return; |
296 |
} |
297 |
|
298 |
useragentstate = SKIPPING; |
299 |
|
300 |
/* explict fallthrough */ |
301 |
|
302 |
case START: |
303 |
case USERAGENT: |
304 |
state = USERAGENT; |
305 |
|
306 |
if (useragentstate != SPECIFIC) { |
307 |
p = isolatevalue(buffer, useragent, &len); |
308 |
|
309 |
if ((len == (sizeof(swishspider) - 1)) && |
310 |
(strncasecmp(p, swishspider, sizeof(swishspider) - 1) == 0) ) { |
311 |
useragentstate = SPECIFIC; |
312 |
|
313 |
/* We might have already parsed generic rules, |
314 |
** so clean them up if necessary. |
315 |
*/ |
316 |
if (server->useragent) { |
317 |
efree(server->useragent); |
318 |
} |
319 |
for (entry = server->robotrules; entry; ) { |
320 |
entry2 = entry->next; |
321 |
efree(entry); |
322 |
entry = entry2; |
323 |
} |
324 |
server->robotrules = 0; |
325 |
|
326 |
server->useragent = (char *)emalloc(len + 1); |
327 |
strncpy(server->useragent, p, len); |
328 |
*(server->useragent + len) = '\0'; |
329 |
|
330 |
} |
331 |
else if ((len == 1) && (*p == '*')) { |
332 |
useragentstate = GENERIC; |
333 |
server->useragent = (char *)emalloc(2); |
334 |
strcpy(server->useragent, "*"); /* emalloc'd 2 bytes, no safestrcpy */ |
335 |
} |
336 |
|
337 |
} |
338 |
|
339 |
|
340 |
break; |
341 |
|
342 |
} |
343 |
} |
344 |
|
345 |
if (strncasecmp(buffer, disallow, sizeof(disallow) - 1) == 0) { |
346 |
state = DISALLOW; |
347 |
if (useragentstate != SKIPPING) { |
348 |
p = isolatevalue(buffer, disallow, &len); |
349 |
if (len) { |
350 |
entry = (robotrules *)emalloc(sizeof(robotrules)); |
351 |
entry->next = server->robotrules; |
352 |
server->robotrules = entry; |
353 |
entry->disallow = (char *)emalloc(len + 1); |
354 |
strncpy(entry->disallow, p, len); |
355 |
*(entry->disallow + len) = '\0'; |
356 |
} |
357 |
} |
358 |
} |
359 |
} |
360 |
} |
361 |
|
362 |
|
363 |
/* Isolate the value that follows a keyword on a robots.txt line.
**
** line    - the line; assumed by the callers in this file to begin with
**           `keyword` (only the keyword's length is used to skip it)
** keyword - the keyword at the start of the line, e.g. "disallow:"
** plen    - out: length of the value with leading and trailing white
**           space excluded; never negative, 0 for an empty value
**
** Returns a pointer into `line` at the first char of the value.  The line
** is not modified, so the value is NOT NUL terminated at *plen.
**/
static char *isolatevalue(char *line, char *keyword, int *plen)
{
    int len;

    /* Find the beginning of the value
    **/
    line += strlen(keyword);
    while (isspace((int) ((unsigned char) *line))) {
        line++;
    }

    /* Strip off trailing spaces.  Stop at zero: for a value that is empty
    ** or all white space (e.g. "Disallow: ") the scan must not walk back
    ** into the white space in front of the value and go negative.
    **/
    len = (int) strlen(line);
    while (len > 0 && isspace((int) ((unsigned char) line[len - 1]))) {
        len--;
    }

    *plen = len;
    return line;
}
377 |
|
378 |
|
379 |
/* Decide whether two URLs refer to the same server.
**
** Returns 1 when `url` and `baseurl` have the same method and server:port
** parts (compared case-insensitively), or when both are found in the same
** list of sw->HTTP->equivalentservers.  Returns 0 otherwise, including
** when url_method() or url_serverport() fails for either URL.
**/
int equivalentserver(SWISH *sw, char *url, char *baseurl)
{
    struct MOD_HTTP *http = sw->HTTP;
    struct multiswline *group;
    char   *umethod, *uhost;
    char   *bmethod, *bhost;
    int     umethodlen, uhostlen;
    int     bmethodlen, bhostlen;

    umethod = url_method(url, &umethodlen);
    uhost = url_serverport(url, &uhostlen);
    bmethod = url_method(baseurl, &bmethodlen);
    bhost = url_serverport(baseurl, &bhostlen);

    if (!(umethod && uhost && bmethod && bhost))
        return 0;

    /* If this is the same server, we just go for it
    **/
    if (umethodlen == bmethodlen && uhostlen == bhostlen) {
        if (strncasecmp(umethod, bmethod, umethodlen) == 0 &&
            strncasecmp(uhost, bhost, uhostlen) == 0)
            return 1;
    }

    /* Otherwise both URLs must appear together in one equivalence list.
    **/
    group = http->equivalentservers;
    while (group) {
        if (serverinlist(url, group->list) && serverinlist(baseurl, group->list))
            return 1;
        group = group->next;
    }

    return 0;
}
421 |
|
422 |
|
423 |
static int serverinlist(char *url, struct swline *list) |
424 |
{ |
425 |
char *method; |
426 |
int methodlen; |
427 |
char *serverport; |
428 |
int serverportlen; |
429 |
char *listmethod; |
430 |
int listmethodlen; |
431 |
char *listserverport; |
432 |
int listserverportlen; |
433 |
|
434 |
method = url_method(url, &methodlen); |
435 |
serverport = url_serverport(url, &serverportlen); |
436 |
if (!method || !serverport) { |
437 |
return 0; |
438 |
} |
439 |
|
440 |
for ( ; list; list = list->next) { |
441 |
listmethod = url_method(list->line, &listmethodlen); |
442 |
listserverport = url_serverport(list->line, &listserverportlen); |
443 |
if (listmethod && listserverport) { |
444 |
if ((methodlen == listmethodlen) && (serverportlen == listserverportlen) && |
445 |
(strncasecmp(method, listmethod, methodlen) == 0) && |
446 |
(strncasecmp(serverport, listserverport, serverportlen) == 0)) { |
447 |
return 1; |
448 |
} |
449 |
} |
450 |
} |
451 |
return 0; |
452 |
} |
453 |
|