/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/src/httpserver.c
ViewVC logotype

Contents of /mitgcm.org/devel/buildweb/pkg/swish-e/src/httpserver.c

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1.1.1 - (show annotations) (download) (vendor branch)
Fri Sep 20 19:47:29 2002 UTC (22 years, 10 months ago) by adcroft
Branch: Import, MAIN
CVS Tags: baseline, HEAD
Changes since 1.1: +0 -0 lines
File MIME type: text/plain
Importing web-site building process.

1 /*
2 $Id: httpserver.c,v 1.10 2002/08/14 22:08:48 whmoseley Exp $
3 **
4 ** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
5 ** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
6 **
7 ** This program and library is free software; you can redistribute it and/or
8 ** modify it under the terms of the GNU (Library) General Public License
9 ** as published by the Free Software Foundation; either version 2
10 ** of the License, or any later version.
11 **
12 ** This program is distributed in the hope that it will be useful,
13 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ** GNU (Library) General Public License for more details.
16 **
17 ** You should have received a copy of the GNU (Library) General Public License
18 ** along with this program; if not, write to the Free Software
19 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
20 **--------------------------------------------------------------------
21 ** All the code in this file added by Ron Klachko ron@ckm.ucsf.edu 9/98
22 **
23 ** change sprintf to snprintf to avoid corruption
24 ** SRE 11/17/99
25 **
26 ** fixed cast to int problems pointed out by "gcc -Wall"
27 ** SRE 2/22/00
28 **
29 */
30
31 /*
32 ** httpserver.c
33 */
34
35 #ifndef _WIN32
36 #include <unistd.h>
37 #endif
38
39 #include <time.h>
40 #include <stdarg.h>
41
42 #include "swish.h"
43 #include "mem.h"
44 #include "string.h"
45 #include "index.h"
46
47 #include "http.h"
48 #include "httpserver.h"
49 #include "file.h"
50
51
52 /* The list of servers that we are acting on.
53 **/
54 static httpserverinfo *servers = 0;
55
56
57 static void parserobotstxt(char *robots_buffer, int buflen, httpserverinfo *server);
58 static char *isolatevalue(char *line, char *keyword, int *plen);
59 static int serverinlist(char *url, struct swline *list);
60
61
62
63 /* Find the robot rules for this URL. If haven't retrieved them
64 ** yet, do so now.
65 **/
66 httpserverinfo *getserverinfo(SWISH *sw, char *url)
67 {
68 httpserverinfo *server;
69 char *method;
70 int methodlen;
71 char *serverport;
72 int serverportlen;
73 static int lencontenttype=0;
74 static char *contenttype=NULL;
75 static int lenbuffer=0;
76 static char *buffer=NULL;
77 FILE *fp;
78 struct MOD_Index *idx = sw->Index;
79 time_t last_modified;
80
81 // argh, this is ugly
82 char *file_prefix; // prefix for use with files written by swishspider -- should just be on the stack!
83
84
85 if(!lenbuffer)buffer=emalloc((lenbuffer=MAXSTRLEN)+1);
86 if(!lencontenttype)contenttype=emalloc((lencontenttype=MAXSTRLEN)+1);
87
88 if ((method = url_method(url, &methodlen)) == 0) {
89 return 0;
90 }
91 if ((serverport = url_serverport(url, &serverportlen)) == 0) {
92 return 0;
93 }
94
95 /* Search for the rules
96 **/
97 for (server = servers; server; server = server->next) {
98 if (equivalentserver(sw, url, server->baseurl)) {
99 return server;
100 }
101 }
102
103 /* Create a new entry for this server and add it to the list.
104 **/
105 server = (httpserverinfo *)emalloc(sizeof(httpserverinfo));
106
107 /* +3 for the ://, +1 for the trailing /, +1 for the terminating null
108 **/
109 server->baseurl = (char *)emalloc(methodlen + serverportlen + 5);
110 /* These 4 lines to avoid a call to non ANSI snprintf . May not be the
111 best way but it ensures no buffer overruns */
112 memcpy (server->baseurl,method,methodlen);
113 memcpy (server->baseurl+methodlen,"://",3);
114 memcpy (server->baseurl+methodlen+3,serverport,serverportlen);
115 strcpy (server->baseurl+methodlen+3+serverportlen,"/");
116
117 server->lastretrieval = 0;
118 server->robotrules = 0;
119 server->next = servers;
120 servers = server;
121
122 /* Only http(s) servers can full rules, all the other ones just get dummies
123 ** (this is useful for holding last retrieval)
124 **
125 ** http://info.webcrawler.com/mak/projects/robots/norobots.html holds what
126 ** many people consider the official web exclusion rules. Unfortunately,
127 ** the rules are not consistent about how records are formed. One line
128 ** states "the file consists of one or more records separated by one or more
129 ** blank lines" while another states "the record starts with one or more User-agent
130 ** lines, followed by one or more Disallow lines."
131 **
132 ** So, does a blank line after a User-agent line end a record? The spec is
133 ** unclear on this matter. If the next legal line afer the blank line is
134 ** a Disallow line, the blank line should most likely be ignored. But what
135 ** if the next line is another User-agent line? For example:
136 **
137 ** User-agent: MooBot
138 **
139 ** User-agent: CreepySpider
140 ** Disallow: /cgi-bin
141 **
142 ** One interpretation (based on blank lines termination records) is that MooBot
143 ** may visit any location (since there are no Disallows for it). Another
144 ** interpretation (based on records needing both User-agent and Disallow lines)
145 ** is that MooBot may not visit /cgi-bin
146 **
147 ** While poking around, I found at least one site (www.sun.com) that uses blank
148 ** lines within records. Because of that, I have decided to rely on records
149 ** having both User-agent and Disallow lines (the second interpretation above).
150 **/
151 if (strncmp(server->baseurl, "http", 4) == 0) {
152 if((int)(strlen(server->baseurl)+20)>=lenbuffer) {
153 lenbuffer=strlen(server->baseurl)+20+200;
154 buffer=erealloc(buffer,lenbuffer+1);
155 }
156 sprintf(buffer, "%srobots.txt", server->baseurl);
157
158
159
160 file_prefix = emalloc( strlen(idx->tmpdir) + MAXPIDLEN + strlen("/swishspider@.contents+fill") );
161 sprintf(file_prefix, "%s/swishspider@%ld", idx->tmpdir, (long) lgetpid());
162
163
164 if (get(sw,contenttype, &last_modified, &server->lastretrieval, file_prefix, buffer) == 200)
165 {
166 char *robots_buffer;
167 int filelen;
168 int bytes_read;
169
170 if((int)(strlen(idx->tmpdir)+MAXPIDLEN+30)>=lenbuffer) {
171 lenbuffer=strlen(idx->tmpdir)+MAXPIDLEN+30+200;
172 buffer=erealloc(buffer,lenbuffer+1);
173 }
174 sprintf(buffer, "%s/swishspider@%ld.contents", idx->tmpdir, (long)lgetpid());
175 fp = fopen(buffer, F_READ_TEXT);
176
177 filelen = getsize(buffer);
178
179 robots_buffer = emalloc( filelen + 1 );
180 *robots_buffer = '\0';
181 bytes_read = fread(robots_buffer, 1, filelen, fp);
182 robots_buffer[bytes_read] = '\0';
183 parserobotstxt( robots_buffer, bytes_read, server );
184
185 efree( robots_buffer );
186
187 //parserobotstxt(fp, server);
188 //fclose(fp);
189 }
190 efree( file_prefix );
191
192 cmdf(unlink, "%s/swishspider@%ld.response", idx->tmpdir, lgetpid());
193 cmdf(unlink, "%s/swishspider@%ld.contents", idx->tmpdir, lgetpid());
194 cmdf(unlink, "%s/swishspider@%ld.links", idx->tmpdir, lgetpid());
195 }
196
197 return server;
198 }
199
200
201 int urldisallowed(SWISH *sw, char *url)
202 {
203 httpserverinfo *server;
204 robotrules *rule;
205 char *uri;
206 int urilen;
207
208 if ((server = getserverinfo(sw, url)) == 0) {
209 return 1;
210 }
211 if ((uri = url_uri(url, &urilen)) == 0) {
212 return 1;
213 }
214
215 for (rule = server->robotrules; rule; rule = rule->next) {
216 if (strncmp(uri, rule->disallow, strlen(rule->disallow)) == 0) {
217 return 1;
218 }
219 }
220
221 return 0;
222 }
223
// Line splitter that copes with Unix, Mac and Windows line endings.
// Pass in:
//   char **next_start == where scanning resumes; moved past the returned line.
//   char *last_char   == last char in the buffer.  The buffer MUST have room for
//                        one more char, because the terminator of the final line
//                        may be written just after last_char.
//
// Returns the start of the next non-empty line, NUL-terminated in place,
// or NULL when no more lines are left (*next_start is then left unchanged).

static char *next_line( char **next_start, char *last_char )
{
    char *cursor = *next_start;
    char *line;

    // Step over the separators (NUL, LF, CR) in front of the line.
    for ( ; cursor <= last_char; cursor++ )
    {
        if ( *cursor != '\0' && *cursor != '\n' && *cursor != '\r' )
            break;
    }

    if ( cursor > last_char )
        return NULL;

    line = cursor;

    // Run forward to the first separator, or to one past last_char.
    for ( ; cursor <= last_char; cursor++ )
    {
        if ( *cursor == '\0' || *cursor == '\n' || *cursor == '\r' )
            break;
    }

    *cursor = '\0';             // terminate the line in place
    *next_start = cursor + 1;   // resume after the terminator

    return line;
}
257
258 static char useragent[] = "user-agent:";
259 static char disallow[] = "disallow:";
260 static char swishspider[] = "swishspider";
261
262 static void parserobotstxt(char *robots_buffer, int buflen, httpserverinfo *server)
263 {
264 char *buffer;
265 char *bufend = robots_buffer + buflen -1; // last char of string
266 char *next_start = robots_buffer;
267
268 enum {START, USERAGENT, DISALLOW} state = START;
269 enum {SPECIFIC, GENERIC, SKIPPING} useragentstate = SKIPPING;
270 char *p;
271 int len;
272 robotrules *entry;
273 robotrules *entry2;
274
275 server->useragent = 0;
276
277 buffer = NULL;
278
279 while ( (buffer = next_line( &next_start, bufend ) ) )
280 {
281 if ( strchr( buffer, '#' ) )
282 *(strchr( buffer, '#' )) = '\0';
283
284 if ((*buffer == '#') || (*buffer == '\0'))
285 continue;
286
287
288 if (strncasecmp(buffer, useragent, sizeof(useragent) - 1) == 0) {
289 switch (state) {
290 case DISALLOW:
291 /* Since we found our specific user-agent, we can
292 ** skip the rest of the file.
293 **/
294 if (useragentstate == SPECIFIC) {
295 return;
296 }
297
298 useragentstate = SKIPPING;
299
300 /* explict fallthrough */
301
302 case START:
303 case USERAGENT:
304 state = USERAGENT;
305
306 if (useragentstate != SPECIFIC) {
307 p = isolatevalue(buffer, useragent, &len);
308
309 if ((len == (sizeof(swishspider) - 1)) &&
310 (strncasecmp(p, swishspider, sizeof(swishspider) - 1) == 0) ) {
311 useragentstate = SPECIFIC;
312
313 /* We might have already parsed generic rules,
314 ** so clean them up if necessary.
315 */
316 if (server->useragent) {
317 efree(server->useragent);
318 }
319 for (entry = server->robotrules; entry; ) {
320 entry2 = entry->next;
321 efree(entry);
322 entry = entry2;
323 }
324 server->robotrules = 0;
325
326 server->useragent = (char *)emalloc(len + 1);
327 strncpy(server->useragent, p, len);
328 *(server->useragent + len) = '\0';
329
330 }
331 else if ((len == 1) && (*p == '*')) {
332 useragentstate = GENERIC;
333 server->useragent = (char *)emalloc(2);
334 strcpy(server->useragent, "*"); /* emalloc'd 2 bytes, no safestrcpy */
335 }
336
337 }
338
339
340 break;
341
342 }
343 }
344
345 if (strncasecmp(buffer, disallow, sizeof(disallow) - 1) == 0) {
346 state = DISALLOW;
347 if (useragentstate != SKIPPING) {
348 p = isolatevalue(buffer, disallow, &len);
349 if (len) {
350 entry = (robotrules *)emalloc(sizeof(robotrules));
351 entry->next = server->robotrules;
352 server->robotrules = entry;
353 entry->disallow = (char *)emalloc(len + 1);
354 strncpy(entry->disallow, p, len);
355 *(entry->disallow + len) = '\0';
356 }
357 }
358 }
359 }
360 }
361
362
/* Locate the value that follows a keyword at the start of a line.
**
** line    - text beginning with the keyword (e.g. "Disallow: /cgi-bin ").
** keyword - the keyword; only its length is used, to step over it.
** plen    - receives the length of the value with trailing whitespace
**           excluded; 0 when the value is empty or all whitespace.
**
** Returns a pointer into `line' at the first non-space character after
** the keyword.  The line itself is not modified.
**/
static char *isolatevalue(char *line, char *keyword, int *plen)
{
    int len;

    /* Find the beginning of the value
    **/
    line += strlen(keyword);
    while (isspace((int) ((unsigned char) *line))) {
        line++;
    }

    /* Strip off trailing spaces.  The len > 0 test keeps the scan from
    ** reading in front of the value, which would produce a negative length
    ** for a line such as "Disallow: ".
    **/
    len = (int) strlen(line);
    while (len > 0 && isspace((int) ((unsigned char) line[len - 1]))) {
        len--;
    }
    *plen = len;

    return line;
}
377
378
379 int equivalentserver(SWISH *sw, char *url, char *baseurl)
380 {
381 char *method;
382 int methodlen;
383 char *serverport;
384 int serverportlen;
385 char *basemethod;
386 int basemethodlen;
387 char *baseserverport;
388 int baseserverportlen;
389 struct multiswline *walk=NULL;
390 struct MOD_HTTP *http = sw->HTTP;
391
392 method = url_method(url, &methodlen);
393 serverport = url_serverport(url, &serverportlen);
394 basemethod = url_method(baseurl, &basemethodlen);
395 baseserverport = url_serverport(baseurl, &baseserverportlen);
396
397 if (!method || !serverport || !basemethod || !baseserverport) {
398 return 0;
399 }
400
401 /* If this is the same server, we just go for it
402 **/
403 if ((methodlen == basemethodlen) && (serverportlen == baseserverportlen) &&
404 (strncasecmp(method, basemethod, methodlen) == 0) &&
405 (strncasecmp(serverport, baseserverport, serverportlen) == 0)) {
406 return 1;
407 }
408
409 /* Do we find the method/server info for this and the base url
410 ** in the same equivalence list?
411 **/
412 for (walk = http->equivalentservers; walk; walk = walk->next ) {
413 if (serverinlist(url, walk->list) &&
414 serverinlist(baseurl, walk->list)) {
415 return 1;
416 }
417 }
418
419 return 0;
420 }
421
422
423 static int serverinlist(char *url, struct swline *list)
424 {
425 char *method;
426 int methodlen;
427 char *serverport;
428 int serverportlen;
429 char *listmethod;
430 int listmethodlen;
431 char *listserverport;
432 int listserverportlen;
433
434 method = url_method(url, &methodlen);
435 serverport = url_serverport(url, &serverportlen);
436 if (!method || !serverport) {
437 return 0;
438 }
439
440 for ( ; list; list = list->next) {
441 listmethod = url_method(list->line, &listmethodlen);
442 listserverport = url_serverport(list->line, &listserverportlen);
443 if (listmethod && listserverport) {
444 if ((methodlen == listmethodlen) && (serverportlen == listserverportlen) &&
445 (strncasecmp(method, listmethod, methodlen) == 0) &&
446 (strncasecmp(serverport, listserverport, serverportlen) == 0)) {
447 return 1;
448 }
449 }
450 }
451 return 0;
452 }
453

  ViewVC Help
Powered by ViewVC 1.1.22