/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/src/http.c
Revision 1.1.1.1 (vendor branch)
Fri Sep 20 19:47:29 2002 UTC by adcroft
Branch: Import, MAIN
CVS tags: baseline, HEAD
Changes since 1.1: +0 -0 lines
File MIME type: text/plain
Log message: Importing web-site building process.

/*
** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
**
** This program and library is free software; you can redistribute it and/or
** modify it under the terms of the GNU (Library) General Public License
** as published by the Free Software Foundation; either version 2
** of the License, or any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU (Library) General Public License for more details.
**
** You should have received a copy of the GNU (Library) General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
**--------------------------------------------------------------------
** All the code in this file added by Ron Klachko ron@ckm.ucsf.edu 9/98
**
** changed sprintf to snprintf to avoid corruption,
** test length of spiderdirectory before strcat to avoid corruption,
** added safestrcpy() macro to avoid corruption from strcpy overflow,
** defined MAXPIDLEN instead of literal "32" - assumed return length from lgetpid()
** SRE 11/17/99
**
** added buffer size arg to grabStringValue - core dumping from overrun
** SRE 2/22/00
**
** 2000-11 jruiz,rasc some redesign
*/

/*
** http.c
*/

#ifdef HAVE_CONFIG_H
#include "acconfig.h"
#endif

#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif

#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif

#ifdef HAVE_PROCESS_H
#include <process.h>
#endif

#include <time.h>
#include <stdarg.h>

// for wait
#ifndef _WIN32
#include <sys/types.h>
#include <sys/wait.h>
#endif

#include "swish.h"
#include "mem.h"
#include "string.h"
#include "index.h"
#include "hash.h"
#include "file.h"
#include "check.h"
#include "error.h"

#include "http.h"
#include "httpserver.h"

#include "xml.h"
#include "txt.h"
#include "html.h"

/*
-- init structures for this module
*/

void initModule_HTTP(SWISH * sw)
{
    struct MOD_HTTP *http;
    int i;

    http = (struct MOD_HTTP *) emalloc(sizeof(struct MOD_HTTP));

    sw->HTTP = http;

    http->lenspiderdirectory = MAXSTRLEN;
    http->spiderdirectory = (char *) emalloc(http->lenspiderdirectory + 1);
    http->spiderdirectory[0] = '\0';
    /* Initialize spider directory */
    http->spiderdirectory = SafeStrCopy(http->spiderdirectory, SPIDERDIRECTORY, &http->lenspiderdirectory);

    for (i = 0; i < BIGHASHSIZE; i++)
        http->url_hash[i] = NULL;

    http->equivalentservers = NULL;

    /* http default system parameters */
    http->maxdepth = 5;
    http->delay = 60;
}

void freeModule_HTTP(SWISH * sw)
{
    struct MOD_HTTP *http = sw->HTTP;

    if (http->spiderdirectory)
        efree(http->spiderdirectory);
    efree(http);
    sw->HTTP = NULL;
}

int configModule_HTTP(SWISH * sw, StringList * sl)
{
    struct MOD_HTTP *http = sw->HTTP;
    char *w0 = sl->word[0];
    int retval = 1;

    int i;
    struct multiswline *list;
    struct swline *slist;

    if (strcasecmp(w0, "maxdepth") == 0)
    {
        if (sl->n == 2)
        {
            retval = 1;
            http->maxdepth = atoi(sl->word[1]);
        }
        else
            progerr("MaxDepth requires one value");
    }
    else if (strcasecmp(w0, "delay") == 0)
    {
        if (sl->n == 2)
        {
            retval = 1;
            http->delay = atoi(sl->word[1]);
        }
        else
            progerr("Delay requires one value");
    }
    else if (strcasecmp(w0, "spiderdirectory") == 0)
    {
        if (sl->n == 2)
        {
            retval = 1;
            /* +2 leaves room for a trailing '/' plus the terminating NUL */
            http->spiderdirectory = erealloc(http->spiderdirectory, strlen(sl->word[1]) + 2);
            strcpy(http->spiderdirectory, sl->word[1]);
            normalize_path(http->spiderdirectory);

            if (!isdirectory(http->spiderdirectory))
            {
                progerr("SpiderDirectory. %s is not a directory", http->spiderdirectory);
            }

            if (strlen(http->spiderdirectory) != 1 || http->spiderdirectory[0] != '/')
                strcat(http->spiderdirectory, "/");  /* In this case, we just add the delimiter */
        }
        else
            progerr("SpiderDirectory requires one value");
    }
    else if (strcasecmp(w0, "equivalentserver") == 0)
    {
        if (sl->n > 1)
        {
            retval = 1;
            /* Add a new list of equivalent servers */
            list = (struct multiswline *) emalloc(sizeof(struct multiswline));

            list->next = http->equivalentservers;
            list->list = 0;
            http->equivalentservers = list;

            for (i = 1; i < sl->n; i++)
            {
                /* Add a new entry to this list */
                slist = (struct swline *) emalloc(sizeof(struct swline));

                slist->line = estrdup(sl->word[i]);
                slist->next = list->list;
                list->list = slist;
            }
        }
        else
            progerr("EquivalentServers requires at least one value");
    }
    else
    {
        retval = 0;
    }

    return retval;
}
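
/*
** Editorial example (not part of the original source): the directives
** handled above would appear in a swish-e configuration file roughly as
** follows. The server names are placeholders, and the first two values
** simply restate the defaults set in initModule_HTTP().
**
**   MaxDepth 5
**   Delay 60
**   SpiderDirectory /usr/local/lib/swish-e/
**   EquivalentServer http://www.example.com http://example.com
*/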

typedef struct urldepth
{
    char *url;
    int depth;
    struct urldepth *next;
}
urldepth;


int http_already_indexed(SWISH * sw, char *url);
urldepth *add_url(SWISH * sw, urldepth * list, char *url, int depth, char *baseurl);


urldepth *add_url(SWISH * sw, urldepth * list, char *url, int depth, char *baseurl)
{
    urldepth *item;
    struct MOD_HTTP *http = sw->HTTP;

    if (!equivalentserver(sw, url, baseurl))
    {
        if (sw->verbose >= 3)
            printf("Skipping %s: %s\n", url, "Wrong method or server.");
    }
    else if (http->maxdepth && (depth >= http->maxdepth))
    {
        if (sw->verbose >= 3)
            printf("Skipping %s: %s\n", url, "Too deep.");
    }
    else if (sw->nocontentslist && isoksuffix(url, sw->nocontentslist))
    {
        if (sw->verbose >= 3)
            printf("Skipping %s: %s\n", url, "Wrong suffix.");
    }
    else if (urldisallowed(sw, url))
    {
        if (sw->verbose >= 3)
            printf("Skipping %s: %s\n", url, "URL disallowed by robots.txt.");
    }
    else if (!http_already_indexed(sw, url))
    {
        item = (urldepth *) emalloc(sizeof(urldepth));
        item->url = estrdup(url);
        item->depth = depth;
#if 0
        /* Depth first searching */
        item->next = list;
        list = item;
#else
        /* Breadth first searching */
        item->next = 0;
        if (!list)
        {
            list = item;
        }
        else
        {
            urldepth *walk;

            /* walk to the tail of the list and append */
            for (walk = list; walk->next; walk = walk->next)
            {
            }
            walk->next = item;
        }
#endif
    }

    return list;
}
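
/*
** Editorial note: with the breadth-first branch compiled in above, new
** URLs are appended at the tail while http_indexpath() pops from the
** head, so the list behaves as a FIFO queue -- all depth-0 pages are
** fetched before any depth-1 page, and so on. The disabled #if 0 branch
** pushes onto the head instead, which would give depth-first order.
*/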


/* Have we already indexed a file or directory?
** This function is used to avoid multiple index entries
** or endless looping due to symbolic links.
*/

int http_already_indexed(SWISH * sw, char *url)
{
    struct url_info *p;

    int len;
    unsigned hashval;
    struct MOD_HTTP *http = sw->HTTP;

    /* Hash via the URI alone. Depending on the equivalent
    ** servers, the comparison below may be made on the entire
    ** URL or on just the URI.
    */
    hashval = bighash(url_uri(url, &len)); /* Search hash for this file. */
    for (p = http->url_hash[hashval]; p != NULL; p = p->next)
        if ((strcmp(url, p->url) == 0) || (equivalentserver(sw, url, p->url) && (strcmp(url_uri(url, &len), url_uri(p->url, &len)) == 0)))
        {   /* We found it. */
            if (sw->verbose >= 3)
                printf("Skipping %s: %s\n", url, "Already indexed.");
            return 1;
        }

    /* Not found, make new entry. */
    p = (struct url_info *) emalloc(sizeof(struct url_info));

    p->url = estrdup(url);
    p->next = http->url_hash[hashval];
    http->url_hash[hashval] = p;

    return 0;
}
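
/*
** Editorial example: given "EquivalentServer http://www.example.com
** http://example.com", the URLs http://www.example.com/a.html and
** http://example.com/a.html share a hash bucket (both URIs are
** "/a.html") and compare equal through equivalentserver(), so the
** second URL is treated as already indexed.
*/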


char *url_method(char *url, int *plen)
{
    char *end;

    if ((end = strstr(url, "://")) == NULL)
    {
        return NULL;
    }
    *plen = end - url;
    return url;
}


char *url_serverport(char *url, int *plen)
{
    int methodlen;
    char *serverstart;
    char *serverend;

    if (url_method(url, &methodlen) == NULL)
    {
        return NULL;
    }

    /* +3 to skip over the "://" separator */
    serverstart = url + methodlen + 3;
    if ((serverend = strchr(serverstart, '/')) == NULL)
    {
        *plen = strlen(serverstart);
    }
    else
    {
        *plen = serverend - serverstart;
    }

    return serverstart;
}


char *url_uri(char *url, int *plen)
{
    if ((url = url_serverport(url, plen)) == 0)
    {
        return 0;
    }
    url += *plen;
    *plen = strlen(url);
    return url;
}
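
/*
** Editorial example of how the three helpers above decompose a URL
** (each returns a pointer into the original string and reports the
** component length through *plen; nothing is copied):
**
**   url:                "http://example.com:8080/docs/index.html"
**   url_method       -> "http"               (*plen = 4)
**   url_serverport   -> "example.com:8080"   (*plen = 16)
**   url_uri          -> "/docs/index.html"   (*plen = 16)
*/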

/************************************************************
*
* Fork and exec a program, and wait for the child to exit.
* Returns normally only if the child exits with status zero;
* otherwise aborts via progerr()/progerrno().
*
*************************************************************/
#ifndef _WIN32
static void run_program(char *prog, char **args)
{
    pid_t pid = fork();
    int status;

    /* In parent, wait for child */
    if (pid)
    {
        wait(&status);
        if (WIFEXITED(status) && WEXITSTATUS(status) == 0)  /* child exited normally with status zero */
            return;

        progerr("%s exited with non-zero status (%d)", prog, WEXITSTATUS(status));
    }

    /* In child: exec the program; reached only if exec fails */
    execvp(prog, args);
    progerrno("Failed to exec '%s'. Error: ", prog);
}
#endif

/************************************************************
*
* Fetch a URL.
* Side effect: appends ".response" to "file_prefix"
* -- lazy programmer hoping that -S http will go away...
*
* Under Windows system() is used to call "perl"
* Otherwise, exec is called on the swishspider program
*
*************************************************************/

int get(SWISH * sw, char *contenttype_or_redirect, time_t *last_modified, time_t *plastretrieval, char *file_prefix, char *url)
{
    int code = 500;
    FILE *fp;
    struct MOD_HTTP *http = sw->HTTP;

    /* Build path to swishspider program; the "+fill" padding covers the terminating NUL */
    char *spider_prog = emalloc(strlen(http->spiderdirectory) + strlen("swishspider+fill"));
    sprintf(spider_prog, "%sswishspider", http->spiderdirectory);  // note that spiderdirectory MUST be set

    /* Sleep a little so we don't overwhelm the server */
    if ((time(0) - *plastretrieval) < http->delay)
    {
        int num_sec = http->delay - (time(0) - *plastretrieval);
        sleep(num_sec);
    }

    *plastretrieval = time(0);

#ifdef _WIN32
    /* Should be in autoconf or obsoleted by extprog. - DLN 2001-11-05 */
    {
        int retval;
        char commandline[] = "perl %s %s \"%s\"";
        char *command = emalloc(strlen(commandline) + strlen(spider_prog) + strlen(file_prefix) + strlen(url) + 1);

        sprintf(command, commandline, spider_prog, file_prefix, url);

        retval = system(command);
        efree(command);
        efree(spider_prog);

        printf("Returned %d\n", retval);

        if (retval)
            return 500;
    }
#else
    {
        char *args[4];

        args[0] = spider_prog;
        args[1] = file_prefix;
        args[2] = url;
        args[3] = NULL;
        run_program(spider_prog, args);
        efree(spider_prog);
    }
#endif

    /* NAUGHTY SIDE EFFECT */
    strcat(file_prefix, ".response");

    if (!(fp = fopen(file_prefix, F_READ_TEXT)))
    {
        progerrno("Failed to open file '%s': ", file_prefix);
    }
    else
    {
        char buffer[500];

        /* first line: the HTTP status code */
        fgets(buffer, sizeof(buffer), fp);
        code = atoi(buffer);
        if ((code == 200) || ((code / 100) == 3))
        {
            /* second line: content type, or the redirect location for 3xx */
            fgets(contenttype_or_redirect, MAXSTRLEN, fp); /* more yuck */
            *(contenttype_or_redirect + strlen(contenttype_or_redirect) - 1) = '\0';
        }
        if (code == 200)
        {
            /* third line: last-modified time */
            fgets(buffer, sizeof(buffer), fp); /* more yuck */
            *last_modified = (time_t) strtol(buffer, NULL, 10); // go away http.c -- no error checking
        }

        fclose(fp);
    }

    return code;
}
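
/*
** Editorial note on the format consumed above: swishspider writes
** <file_prefix>.response with the HTTP status code on the first line;
** for 200 and 3xx responses the second line holds the content type (or
** the redirect Location), and for 200 responses the third line holds
** the Last-Modified time in seconds since the epoch. The document body
** and extracted links go to the .contents and .links files read in
** http_indexpath() below.
*/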

int cmdf(int (*cmd) (const char *), char *fmt, char *string, pid_t pid)
{
    int rc;
    char *buffer;

    /* sizeof(pid_t) * 8 bytes is ample for the decimal digits of a pid */
    buffer = emalloc(strlen(fmt) + strlen(string) + sizeof(pid_t) * 8 + 1);

    sprintf(buffer, fmt, string, pid);

    rc = cmd(buffer);
    efree(buffer);
    return rc;
}
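
/*
** Editorial usage example: cmdf() formats a printf-style argument pair
** and hands the result to any "int f(const char *)" function;
** http_indexpath() uses it with unlink() to remove the temporary
** spider files:
**
**   cmdf(unlink, "%s/swishspider@%ld.response", idx->tmpdir, lgetpid());
*/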

char *readline(FILE * fp)
{
    static char *buffer = 0;
    static int buffersize = 512;

    if (buffer == 0)
    {
        buffer = (char *) emalloc(buffersize);
    }

    /*
     * Try to read in the line
     */
    if (fgets(buffer, buffersize, fp) == NULL)
    {
        return NULL;
    }

    /*
     * Make sure we read the entire line. If not, double the buffer
     * size and try to read the rest
     */
    while (buffer[strlen(buffer) - 1] != '\n')
    {
        buffer = (char *) erealloc(buffer, buffersize * 2);

        /*
         * The easiest way to verify that this is okay is to consider
         * the case where the buffer is 2 bytes long. Since fgets()
         * always writes the trailing NUL, it has effectively used only
         * 1 byte. We double the buffer to 4 bytes, so we now have the
         * leftover byte (which currently holds the NUL) in addition to
         * the doubling, which lets us read buffersize + 1 more bytes.
         */
        if (fgets(buffer + buffersize - 1, buffersize + 1, fp) == 0)
        {
            break;
        }
        buffersize *= 2;
    }

    return buffer;
}
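
/*
** Editorial worked example of the growth step above, starting from the
** initial 512-byte buffer: erealloc() grows it to 1024 bytes, and
** fgets(buffer + 511, 513, fp) then writes at most 513 bytes starting
** at offset 511, i.e. up through offset 1023 -- the last byte of the
** enlarged buffer. The first byte it overwrites is the terminating NUL
** left by the previous fgets() call.
*/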


/* A local version of getpid() so that we don't have to suffer
** a system call each time we need it.
*/
pid_t lgetpid()
{
    static pid_t pid = -1;

    if (pid == -1)
    {
        pid = getpid();
    }
    return pid;
}

#if 0

/* Testing the robot rules parsing code...
**/
void http_indexpath(char *url)
{
    httpserverinfo *server = getserverinfo(url);
    robotrules *robotrule;

    printf("User-agent: %s\n", server->useragent ? server->useragent : "(none)");
    for (robotrule = server->robotrules; robotrule; robotrule = robotrule->next)
    {
        printf("Disallow: %s\n", robotrule->disallow);
    }
}

#else

/********************************************************/
/*              "Public" functions                      */
/********************************************************/

/* The main entry point for the module. (The fs.c equivalent decides
** whether its path is a file or directory and routes to the correct
** routine; here we crawl the URL list, indexing each page retrieved.)
*/
void http_indexpath(SWISH * sw, char *url)
{
    urldepth *urllist = 0;
    urldepth *item;
    static int lentitle = 0;
    static char *title = NULL;
    char *tmptitle;
    static int lencontenttype = 0;
    static char *contenttype = NULL;
    int code;
    time_t last_modified = 0;

    httpserverinfo *server;
    char *link;
    char *p;
    FileProp *fprop;
    FILE *fp;
    struct MOD_Index *idx = sw->Index;

    char *file_prefix;  // prefix for use with files written by swishspider -- should just be on the stack!
    char *file_suffix;  // where to copy the suffix

    /* Initialize buffers */

    file_prefix = emalloc(strlen(idx->tmpdir) + MAXPIDLEN + strlen("/swishspider@.contents+fill"));
    sprintf(file_prefix, "%s/swishspider@%ld", idx->tmpdir, (long) lgetpid());
    file_suffix = file_prefix + strlen(file_prefix);

    if (!lentitle)
        title = emalloc((lentitle = MAXSTRLEN) + 1);

    if (!lencontenttype)
        contenttype = emalloc((lencontenttype = MAXSTRLEN) + 1);

    /* prime the pump with the first url */
    urllist = add_url(sw, urllist, url, 0, url);

    /* retrieve each url and add urls to a certain depth */

    while (urllist)
    {
        item = urllist;
        urllist = urllist->next;

        if (sw->verbose >= 2)
        {
            printf("retrieving %s (%d)...\n", item->url, item->depth);
            fflush(stdout);
        }

        /* We don't check if this url is legal here, because we do that before adding to the list. */
        server = getserverinfo(sw, item->url);

        strcpy(file_suffix, "");  // reset to just the prefix

        if ((code = get(sw, contenttype, &last_modified, &server->lastretrieval, file_prefix, item->url)) == 200)
        {
            /* Set the file_prefix to be the path to "contents" */
            strcpy(file_suffix, ".contents");

            /* Patch from Steve van der Burg */
            /* change from strcmp to strncmp */

            /* Fetch title from doc if it's HTML */
            if (strncmp(contenttype, "text/html", 9) == 0)
                title = SafeStrCopy(title, (char *) (tmptitle = parseHTMLtitle(sw, file_prefix)), &lentitle);
            else if ((p = strrchr(item->url, '/')))
                title = SafeStrCopy(title, p + 1, &lentitle);
            else
                title = SafeStrCopy(title, item->url, &lentitle);

            /* Now index the file */

            /* What to do with non text files?? */
            if (strncmp(contenttype, "text/", 5) == 0)
            {
                fprop = file_properties(item->url, file_prefix, sw);
                fprop->mtime = last_modified;

                /* only index contents of text docs */
                // this would just index the path name
                //fprop->index_no_content = strncmp(contenttype, "text/", 5);

                do_index_file(sw, fprop);
                free_file_properties(fprop);
            }
            else if (sw->verbose >= 3)
                printf("Skipping %s: Wrong content type: %s.\n", item->url, contenttype);

            /* add new links as extracted by the spider */

            if (strncmp(contenttype, "text/html", 9) == 0)
            {
                strcpy(file_suffix, ".links");

                if ((fp = fopen(file_prefix, F_READ_TEXT)) != NULL)
                {
                    /* URLs can get quite large so don't depend on a fixed size buffer */
                    while ((link = readline(fp)) != NULL)
                    {
                        *(link + strlen(link) - 1) = '\0';  /* strip the trailing newline */
                        urllist = add_url(sw, urllist, link, item->depth + 1, url);
                    }
                    fclose(fp);
                }
            }
        }
        else if ((code / 100) == 3)
        {
            /* for a redirect, get() returns the Location in contenttype */
            if (*contenttype)
                urllist = add_url(sw, urllist, contenttype, item->depth, url);
            else if (sw->verbose >= 3)
                printf("URL '%s' returned redirect code %d without a Location.\n", item->url, code);
        }

        /* Clean up the files left by swishspider */
        cmdf(unlink, "%s/swishspider@%ld.response", idx->tmpdir, lgetpid());
        cmdf(unlink, "%s/swishspider@%ld.contents", idx->tmpdir, lgetpid());
        cmdf(unlink, "%s/swishspider@%ld.links", idx->tmpdir, lgetpid());

        /* done with this queue entry */
        efree(item->url);
        efree(item);
    }
    efree(file_prefix);
}

#endif




struct _indexing_data_source_def HTTPIndexingDataSource = {
    "HTTP-Crawler",
    "http",
    http_indexpath,
    configModule_HTTP
};
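
/*
** Editorial note: swish-e selects an indexing data source by the name
** supplied with -S (here "-S http"); the struct above binds that name
** to this module's entry point and configuration handler. The exact
** dispatch table lives elsewhere in the source and is assumed here.
*/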
