Annotation of /mitgcm.org/devel/buildweb/pkg/swish-e/src/http.c

Revision 1.1
Fri Sep 20 19:47:29 2002 UTC by adcroft
Branch point for: Import, MAIN
File MIME type: text/plain
Initial revision

/*
** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
**
** This program and library is free software; you can redistribute it and/or
** modify it under the terms of the GNU (Library) General Public License
** as published by the Free Software Foundation; either version 2
** of the License, or any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU (Library) General Public License for more details.
**
** You should have received a copy of the GNU (Library) General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
**--------------------------------------------------------------------
** All the code in this file added by Ron Klachko ron@ckm.ucsf.edu 9/98
**
** change sprintf to snprintf to avoid corruption,
** test length of spiderdirectory before strcat to avoid corruption,
** added safestrcpy() macro to avoid corruption from strcpy overflow,
** define MAXPIDLEN instead of literal "32" - assumed return length from lgetpid()
** SRE 11/17/99
**
** added buffer size arg to grabStringValue - core dumping from overrun
** SRE 2/22/00
**
** 2000-11 jruiz,rasc some redesign
*/

/*
** http.c
*/

#ifdef HAVE_CONFIG_H
#include "acconfig.h"
#endif

#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif

#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif

#ifdef HAVE_PROCESS_H
#include <process.h>
#endif

#include <time.h>
#include <stdarg.h>

// for wait
#ifndef _WIN32
#include <sys/types.h>
#include <sys/wait.h>
#endif

#include "swish.h"
#include "mem.h"
#include "string.h"
#include "index.h"
#include "hash.h"
#include "file.h"
#include "check.h"
#include "error.h"

#include "http.h"
#include "httpserver.h"

#include "xml.h"
#include "txt.h"
#include "html.h"

/*
-- init structures for this module
*/

void initModule_HTTP(SWISH * sw)
{
    struct MOD_HTTP *http;
    int i;

    http = (struct MOD_HTTP *) emalloc(sizeof(struct MOD_HTTP));

    sw->HTTP = http;

    http->lenspiderdirectory = MAXSTRLEN;
    http->spiderdirectory = (char *) emalloc(http->lenspiderdirectory + 1);
    http->spiderdirectory[0] = '\0';
    /* Initialize spider directory */
    http->spiderdirectory = SafeStrCopy(http->spiderdirectory, SPIDERDIRECTORY, &http->lenspiderdirectory);

    for (i = 0; i < BIGHASHSIZE; i++)
        http->url_hash[i] = NULL;

    http->equivalentservers = NULL;

    /* http default system parameters */
    http->maxdepth = 5;
    http->delay = 60;
}

void freeModule_HTTP(SWISH * sw)
{
    struct MOD_HTTP *http = sw->HTTP;

    if (http->spiderdirectory)
        efree(http->spiderdirectory);
    efree(http);
    sw->HTTP = NULL;
}

int configModule_HTTP(SWISH * sw, StringList * sl)
{
    struct MOD_HTTP *http = sw->HTTP;
    char *w0 = sl->word[0];
    int retval = 1;

    int i;
    struct multiswline *list;
    struct swline *slist;

    if (strcasecmp(w0, "maxdepth") == 0)
    {
        if (sl->n == 2)
        {
            retval = 1;
            http->maxdepth = atoi(sl->word[1]);
        }
        else
            progerr("MaxDepth requires one value");
    }
    else if (strcasecmp(w0, "delay") == 0)
    {
        if (sl->n == 2)
        {
            retval = 1;
            http->delay = atoi(sl->word[1]);
        }
        else
            progerr("Delay requires one value");
    }
    else if (strcasecmp(w0, "spiderdirectory") == 0)
    {
        if (sl->n == 2)
        {
            retval = 1;
            http->spiderdirectory = erealloc(http->spiderdirectory, strlen(sl->word[1]) + 2);
            strcpy(http->spiderdirectory, sl->word[1]);
            normalize_path(http->spiderdirectory);

            if (!isdirectory(http->spiderdirectory))
            {
                progerr("SpiderDirectory. %s is not a directory", http->spiderdirectory);
            }

            if (strlen(http->spiderdirectory) != 1 || http->spiderdirectory[0] != '/')
                strcat(http->spiderdirectory, "/");     /* In this case, we just add the delimiter */
        }
        else
            progerr("SpiderDirectory requires one value");
    }
    else if (strcasecmp(w0, "equivalentserver") == 0)
    {
        if (sl->n > 1)
        {
            retval = 1;
            /* Add a new list of equivalent servers */
            list = (struct multiswline *) emalloc(sizeof(struct multiswline));

            list->next = http->equivalentservers;
            list->list = 0;
            http->equivalentservers = list;

            for (i = 1; i < sl->n; i++)
            {
                /* Add a new entry to this list */
                slist = (struct swline *) emalloc(sizeof(struct swline));

                slist->line = estrdup(sl->word[i]);
                slist->next = list->list;
                list->list = slist;
            }
        }
        else
            progerr("EquivalentServers requires at least one value");
    }
    else
    {
        retval = 0;
    }

    return retval;
}
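
/*
** Example configuration directives handled above -- an illustrative
** sketch based on the strcasecmp() checks (see the swish-e docs for
** the authoritative syntax; the values here are invented):
**
**   MaxDepth 5
**   Delay 60
**   SpiderDirectory /usr/local/lib/swish-e/
**   EquivalentServer http://library.example.edu http://www.library.example.edu
*/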
typedef struct urldepth
{
    char *url;
    int depth;
    struct urldepth *next;
}
urldepth;


int http_already_indexed(SWISH * sw, char *url);
urldepth *add_url(SWISH * sw, urldepth * list, char *url, int depth, char *baseurl);


urldepth *add_url(SWISH * sw, urldepth * list, char *url, int depth, char *baseurl)
{
    urldepth *item;
    struct MOD_HTTP *http = sw->HTTP;

    if (!equivalentserver(sw, url, baseurl))
    {
        if (sw->verbose >= 3)
            printf("Skipping %s: %s\n", url, "Wrong method or server.");
    }
    else if (http->maxdepth && (depth >= http->maxdepth))
    {
        if (sw->verbose >= 3)
            printf("Skipping %s: %s\n", url, "Too deep.");
    }
    else if (sw->nocontentslist && isoksuffix(url, sw->nocontentslist))
    {
        if (sw->verbose >= 3)
            printf("Skipping %s: %s\n", url, "Wrong suffix.");
    }
    else if (urldisallowed(sw, url))
    {
        if (sw->verbose >= 3)
            printf("Skipping %s: %s\n", url, "URL disallowed by robots.txt.");
    }
    else if (!http_already_indexed(sw, url))
    {
        item = (urldepth *) emalloc(sizeof(urldepth));
        item->url = estrdup(url);
        item->depth = depth;
#if 0
        /* Depth-first searching */
        item->next = list;
        list = item;
#else
        /* Breadth-first searching */
        item->next = 0;
        if (!list)
        {
            list = item;
        }
        else
        {
            urldepth *walk;

            for (walk = list; walk->next; walk = walk->next)
            {
            }
            walk->next = item;
        }
#endif
    }

    return list;
}
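
/*
** With the breadth-first branch above, new URLs are appended at the
** tail of the list, so the crawl visits every URL at depth N before
** any URL at depth N + 1.
*/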


/* Have we already indexed a file or directory?
** This function is used to avoid multiple index entries
** or endless looping due to symbolic links.
*/

int http_already_indexed(SWISH * sw, char *url)
{
    struct url_info *p;

    int len;
    unsigned hashval;
    struct MOD_HTTP *http = sw->HTTP;

    /* Hash on the URI alone. Depending on the equivalent
    ** servers, we may or may not base the decision on the
    ** entire URL or just the URI.
    */
    hashval = bighash(url_uri(url, &len));      /* Search hash for this file. */
    for (p = http->url_hash[hashval]; p != NULL; p = p->next)
        if ((strcmp(url, p->url) == 0) || (equivalentserver(sw, url, p->url) && (strcmp(url_uri(url, &len), url_uri(p->url, &len)) == 0)))
        {                       /* We found it. */
            if (sw->verbose >= 3)
                printf("Skipping %s: %s\n", url, "Already indexed.");
            return 1;
        }

    /* Not found, make new entry. */
    p = (struct url_info *) emalloc(sizeof(struct url_info));

    p->url = estrdup(url);
    p->next = http->url_hash[hashval];
    http->url_hash[hashval] = p;

    return 0;
}


char *url_method(char *url, int *plen)
{
    char *end;

    if ((end = strstr(url, "://")) == NULL)
    {
        return NULL;
    }
    *plen = end - url;
    return url;
}


char *url_serverport(char *url, int *plen)
{
    int methodlen;
    char *serverstart;
    char *serverend;

    if (url_method(url, &methodlen) == NULL)
    {
        return NULL;
    }

    /* +3 to skip over the "://" separator */
    serverstart = url + methodlen + 3;
    if ((serverend = strchr(serverstart, '/')) == NULL)
    {
        *plen = strlen(serverstart);
    }
    else
    {
        *plen = serverend - serverstart;
    }

    return serverstart;
}


char *url_uri(char *url, int *plen)
{
    if ((url = url_serverport(url, plen)) == 0)
    {
        return 0;
    }
    url += *plen;
    *plen = strlen(url);
    return url;
}
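
/*
** Illustrative decomposition (hypothetical URL):
**   url = "http://www.example.com:80/a/b.html"
**   url_method()     -> "http...",               *plen = 4
**   url_serverport() -> "www.example.com:80...", *plen = 18
**   url_uri()        -> "/a/b.html",             *plen = 9
** The returned pointers point into the original string, so only the
** first *plen characters belong to each component.
*/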
/************************************************************
*
* Fork and exec a program, and wait for the child to exit.
* Returns when the child has exited; aborts via progerr()
* or progerrno() on fork/exec failure.
*
*************************************************************/
#ifndef _WIN32
static void run_program(char* prog, char** args)
{
    pid_t pid = fork();
    int status;

    if (pid == (pid_t) -1)
        progerrno("Failed to fork '%s'. Error: ", prog);

    /* In parent, wait for child */
    if (pid)
    {
        wait(&status);
        if (WIFEXITED(status))  /* child exited normally */
            return;

        progerr("%s exited with non-zero status (%d)", prog, WEXITSTATUS(status));
    }

    /* In child: execvp() only returns on failure */
    execvp(prog, args);
    progerrno("Failed to exec '%s'. Error: ", prog);
}
#endif

/************************************************************
*
* Fetch a URL
* Side effect: appends ".response" to file_prefix
* -- lazy programmer hoping that -S http will go away...
*
* Under Windows system() is used to call "perl"
* Otherwise, exec is called on the swishspider program
*
*************************************************************/
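
/*
** The swishspider helper writes its results to <file_prefix>.response
** (plus the .contents and .links files read in http_indexpath()).
** Judging from the parsing below, a successful .response file looks
** roughly like this sketch (values invented for illustration):
**
**   200
**   text/html
**   1032546449      <- Last-Modified as seconds since the epoch
**
** For 3xx codes the second line carries the redirect location instead.
*/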

int get(SWISH * sw, char *contenttype_or_redirect, time_t *last_modified, time_t * plastretrieval, char *file_prefix, char *url)
{
    int code = 500;
    FILE *fp;
    struct MOD_HTTP *http = sw->HTTP;

    /* Build path to swishspider program */
    char *spider_prog = emalloc( strlen(http->spiderdirectory) + strlen("swishspider+fill") );
    sprintf(spider_prog, "%sswishspider", http->spiderdirectory ); // note that spiderdir MUST be set.

    /* Sleep a little so we don't overwhelm the server */
    if ((time(0) - *plastretrieval) < http->delay)
    {
        int num_sec = http->delay - (time(0) - *plastretrieval);
        sleep(num_sec);
    }

    *plastretrieval = time(0);

#ifdef _WIN32
    /* Should be in autoconf or obsoleted by extprog. - DLN 2001-11-05 */
    {
        int retval;
        char commandline[] = "perl %s %s \"%s\"";
        char *command = emalloc( strlen(commandline) + strlen(spider_prog) + strlen(file_prefix) + strlen(url) + 1 );

        sprintf(command, commandline, spider_prog, file_prefix, url);

        retval = system( command );
        efree( command );
        efree( spider_prog );

        printf("Returned %d\n", retval );

        if ( retval )
            return 500;
    }
#else
    {
        char *args[4];

        args[0] = spider_prog;
        args[1] = file_prefix;
        args[2] = url;
        args[3] = NULL;
        run_program( spider_prog, args );
        efree( spider_prog );
    }
#endif

    /* NAUGHTY SIDE EFFECT */
    strcat( file_prefix, ".response" );

    if ( !(fp = fopen(file_prefix, F_READ_TEXT)) )
    {
        progerrno("Failed to open file '%s': ", file_prefix );
    }
    else
    {
        char buffer[500];

        fgets(buffer, 400, fp);
        code = atoi(buffer);
        if ((code == 200) || ((code / 100) == 3))
        {
            /* read content-type or redirect location */
            fgets(contenttype_or_redirect, MAXSTRLEN, fp); /* more yuck */
            *(contenttype_or_redirect + strlen(contenttype_or_redirect) - 1) = '\0';
        }
        if (code == 200)
        {
            /* read last-mod time */
            fgets(buffer, 400, fp); /* more yuck */
            *last_modified = (time_t)strtol(buffer, NULL, 10); // go away http.c -- no error checking
        }

        fclose(fp);
    }

    return code;
}

int cmdf(int (*cmd) (const char *), char *fmt, char *string, pid_t pid)
{
    int rc;
    char *buffer;

    buffer = emalloc(strlen(fmt) + strlen(string) + sizeof(pid_t) * 8 + 1);

    sprintf(buffer, fmt, string, pid);

    rc = cmd(buffer);
    efree(buffer);
    return rc;
}

char *readline(FILE * fp)
{
    static char *buffer = 0;
    static int buffersize = 512;

    if (buffer == 0)
    {
        buffer = (char *) emalloc(buffersize);
    }

    /*
    * Try to read in the line
    */
    if (fgets(buffer, buffersize, fp) == NULL)
    {
        return NULL;
    }

    /*
    * Make sure we read the entire line. If not, double the buffer
    * size and try to read the rest
    */
    while (buffer[strlen(buffer) - 1] != '\n')
    {
        buffer = (char *) erealloc(buffer, buffersize * 2);

        /*
        * The easiest way to verify that this line is okay is to consider
        * the situation where the buffer is 2 bytes long. Since fgets()
        * always guarantees to write the trailing NUL, it will have essentially
        * used only 1 byte. We double the buffer to 4 bytes, so we now have
        * the leftover byte (that currently contains the NUL) in addition to
        * the doubling, which lets us read buffersize + 1 more characters.
        */
        if (fgets(buffer + buffersize - 1, buffersize + 1, fp) == 0)
        {
            break;
        }
        buffersize *= 2;
    }

    return buffer;
}
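
/*
** Note: readline() hands back a pointer to its static buffer, so the
** line is overwritten by the next call, and the trailing '\n' is left
** in place -- the caller in http_indexpath() strips it.
*/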


/* A local version of getpid() so that we don't have to suffer
** a system call each time we need it.
*/
pid_t lgetpid()
{
    static pid_t pid = -1;

    if (pid == -1)
    {
        pid = getpid();
    }
    return pid;
}

#if 0

/* Testing the robot rules parsing code...
**/
void http_indexpath(char *url)
{
    httpserverinfo *server = getserverinfo(url);
    robotrules *robotrule;

    printf("User-agent: %s\n", server->useragent ? server->useragent : "(none)");
    for (robotrule = server->robotrules; robotrule; robotrule = robotrule->next)
    {
        printf("Disallow: %s\n", robotrule->disallow);
    }
}

#else

/********************************************************/
/*                "Public" functions                    */
/********************************************************/

/* The main entry point for the module: crawl from the starting URL,
** retrieving and indexing documents and following their links up to
** MaxDepth.
*/
void http_indexpath(SWISH * sw, char *url)
{
    urldepth *urllist = 0;
    urldepth *item;
    static int lentitle = 0;
    static char *title = NULL;
    char *tmptitle;
    static int lencontenttype = 0;
    static char *contenttype = NULL;
    int code;
    time_t last_modified = 0;

    httpserverinfo *server;
    char *link;
    char *p;
    FileProp *fprop;
    FILE *fp;
    struct MOD_Index *idx = sw->Index;

    char *file_prefix;  // prefix for use with files written by swishspider -- should just be on the stack!
    char *file_suffix;  // where to copy the suffix

    /* Initialize buffers */

    file_prefix = emalloc( strlen(idx->tmpdir) + MAXPIDLEN + strlen("/swishspider@.contents+fill") );
    sprintf(file_prefix, "%s/swishspider@%ld", idx->tmpdir, (long) lgetpid());
    file_suffix = file_prefix + strlen( file_prefix );

    if (!lentitle)
        title = emalloc((lentitle = MAXSTRLEN) + 1);

    if (!lencontenttype)
        contenttype = emalloc((lencontenttype = MAXSTRLEN) + 1);

    /* prime the pump with the first url */
    urllist = add_url(sw, urllist, url, 0, url);

    /* retrieve each url and add urls to a certain depth */
    while (urllist)
    {
        item = urllist;
        urllist = urllist->next;

        if (sw->verbose >= 2)
        {
            printf("retrieving %s (%d)...\n", item->url, item->depth);
            fflush(stdout);
        }

        /* We don't check if this url is legal here, because we do that before adding to the list. */
        server = getserverinfo(sw, item->url);

        strcpy( file_suffix, "" ); // reset to just the prefix

        if ((code = get(sw, contenttype, &last_modified, &server->lastretrieval, file_prefix, item->url)) == 200)
        {
            /* Set the file_prefix to be the path to "contents" */
            strcpy( file_suffix, ".contents" );

            /* Patch from Steve van der Burg */
            /* change from strcmp to strncmp */

            /* Fetch title from doc if it's HTML */
            if (strncmp(contenttype, "text/html", 9) == 0)
                title = SafeStrCopy(title, (char *) (tmptitle = parseHTMLtitle(sw, file_prefix)), &lentitle);
            else if ((p = strrchr(item->url, '/')))
                title = SafeStrCopy(title, p + 1, &lentitle);
            else
                title = SafeStrCopy(title, item->url, &lentitle);

            /* Now index the file */

            /* What to do with non text files?? */
            if ( strncmp(contenttype, "text/", 5) == 0 )
            {
                fprop = file_properties(item->url, file_prefix, sw);
                fprop->mtime = last_modified;

                /* only index contents of text docs */
                // this would just index the path name
                //fprop->index_no_content = strncmp(contenttype, "text/", 5);

                do_index_file(sw, fprop);
                free_file_properties(fprop);
            }
            else if (sw->verbose >= 3)
                printf("Skipping %s: Wrong content type: %s.\n", item->url, contenttype);

            /* add new links as extracted by the spider */
            if (strncmp(contenttype, "text/html", 9) == 0)
            {
                strcpy( file_suffix, ".links" );

                if ((fp = fopen(file_prefix, F_READ_TEXT)) != NULL)
                {
                    /* URLs can get quite large so don't depend on a fixed size buffer */
                    while ((link = readline(fp)) != NULL)
                    {
                        *(link + strlen(link) - 1) = '\0';
                        urllist = add_url(sw, urllist, link, item->depth + 1, url);
                    }
                    fclose(fp);
                }
            }
        }
        else if ((code / 100) == 3)
        {
            if ( *contenttype )
                urllist = add_url(sw, urllist, contenttype, item->depth, url);
            else if (sw->verbose >= 3)
                printf("URL '%s' returned redirect code %d without a Location.\n", item->url, code);
        }

        /* Clean up the files left by swishspider */
        cmdf(unlink, "%s/swishspider@%ld.response", idx->tmpdir, lgetpid());
        cmdf(unlink, "%s/swishspider@%ld.contents", idx->tmpdir, lgetpid());
        cmdf(unlink, "%s/swishspider@%ld.links", idx->tmpdir, lgetpid());
    }
    efree(file_prefix);
}

#endif


struct _indexing_data_source_def HTTPIndexingDataSource = {
    "HTTP-Crawler",
    "http",
    http_indexpath,
    configModule_HTTP
};
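
/*
** This struct registers the module as an indexing data source
** ("-S http" on the command line, per the comment in get() above):
** http_indexpath() is the crawl entry point and configModule_HTTP()
** handles this module's configuration directives.
*/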
