swish-e/src/xml.c

/*
$Id: xml.c,v 1.55 2001/10/11 22:21:14 whmoseley Exp $
**
**
** This program and library is free software; you can redistribute it and/or
** modify it under the terms of the GNU (Library) General Public License
** as published by the Free Software Foundation; either version 2
** of the License, or any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU (Library) General Public License for more details.
**
**
** 2001-03-17  rasc  save real_filename as title (instead full real_path)
**                   was: compatibility issue to v 1.x.x
** 2001-05-09  rasc  entities changed (new module)
**
** 2001-07-25  moseley complete rewrite to use James Clark's Expat parser
**
** BUGS:
**      UndefinedMetaTags ignore is not coded
*/

#include "swish.h"
#include "merge.h"
#include "mem.h"
#include "string.h"
#include "docprop.h"
#include "error.h"
#include "index.h"
#include "metanames.h"

#include "xmlparse.h"   // James Clark's Expat

#define BUFFER_CHUNK_SIZE 20000

/* Growable text accumulator.  Filled by append_buffer() and emptied by
 * flush_buffer(); the storage is (re)allocated with erealloc(). */
typedef struct {
    char   *buffer;     // collected text; NULL until the first append
    int     cur;        // number of bytes currently stored (index of next write)
    int     max;        // allocated size of buffer in bytes
    int     defaultID;  // default ID for no meta names (not referenced in this file)
} CHAR_BUFFER;


// The property system could probably handle StoreDescription in a cleaner way.
// Ideally this code should not need to know about StoreDescription at all.

/* State for collecting the StoreDescription ("summary") property while a
 * document is parsed.  Filled in by countwords_XML() when the file has a
 * StoreDescription setting and the summary property exists. */
typedef struct {
    struct metaEntry    *meta;       /* summary property entry; NULL when summaries are not collected */
    int                 save_size;   /* meta->max_len as it was before this document overrode it */
    char                *tag;        /* name of the tag whose text becomes the summary */
    int                 active;      /* non-zero while inside the summary tag (counts nesting) */
} SUMMARY_INFO;
    

/* Per-document parser state.  One instance lives on the stack of
 * countwords_XML() and is handed to every Expat callback as user data. */
typedef struct {
    CHAR_BUFFER text_buffer;    // buffer for collecting text

    // CHAR_BUFFER prop_buffer;  // someday, may want a separate property buffer if want to collect tags within props

    SUMMARY_INFO    summary;     // StoreDescription state

    char       *ignore_tag;     // tag that triggered ignore; points into the config list, never freed here
    int         total_words;    // running count of words indexed for this document
    int         word_pos;       // current word position; starts at 1
    int         filenum;        // file number passed to indexstring()
    XML_Parser  parser;         // the Expat parser handle itself (XML_Parser is already a handle type)
    INDEXDATAHEADER *header;    // header of the index being built
    SWISH      *sw;             // swish handle
    FileProp   *fprop;          // properties of the file being parsed
    FileRec    *thisFileEntry;  // file record that receives the document properties

} PARSE_DATA;


/* Prototypes */
/* Expat element / character-data callbacks, registered in countwords_XML() */
static void start_hndl(void *data, const char *el, const char **attr);
static void end_hndl(void *data, const char *el);
static void char_hndl(void *data, const char *txt, int txtlen);
/* Text accumulation helpers used by the callbacks */
static void append_buffer( CHAR_BUFFER *buf, const char *txt, int txtlen );
static void flush_buffer( PARSE_DATA  *parse_data );
/* Expat comment callback, registered only when sw->indexComments is set */
static void comment_hndl(void *data, const char *txt);
/* Looks a (lowercased) tag up in sw->ignoremetalist */
static char *isIgnoreMetaName(SWISH * sw, char *tag);


/*********************************************************************
*   Entry to index an XML file.
*
*   Creates an Expat XML_Parser, registers the element and character-data
*   handlers (plus the comment handler when sw->indexComments is set) and
*   parses the whole in-memory document with a single XML_Parse() call.
*
*   Parameters:
*       sw      - swish handle; sw->indexlist supplies the index header
*       fprop   - file properties; real_path, fsize and stordesc are read
*       fi      - file record; filenum and docProperties are used
*       buffer  - document text, fprop->fsize bytes long
*
*   Returns:
*       Count of words indexed
*
*   A parse error is reported with progwarn(); whatever was indexed before
*   the error is kept and counted.
*
*   ToDo:
*       This is a stream parser, so could avoid loading entire document into RAM before parsing
*
*********************************************************************/

int countwords_XML (SWISH *sw, FileProp *fprop, FileRec *fi, char *buffer)
{
    PARSE_DATA          parse_data;
    XML_Parser          p = XML_ParserCreate(NULL);
    IndexFILE          *indexf = sw->indexlist;
    struct StoreDescription *stordesc = fprop->stordesc;


    /* Fail before any shared state (the summary property limit) is touched */
    if (!p)
    {
        progerr("Failed to create XML parser object for '%s'", fprop->real_path );
        return 0;   /* only reached if progerr() returns */
    }


    /* Set defaults  */
    memset(&parse_data, 0, sizeof(parse_data));

    parse_data.header = &indexf->header;
    parse_data.parser = p;
    parse_data.sw     = sw;
    parse_data.fprop  = fprop;
    parse_data.filenum = fi->filenum;
    parse_data.word_pos= 1;  /* compress doesn't like zero */
    parse_data.thisFileEntry = fi;


    /* StoreDescription: summary.meta is only assigned (and only non-NULL)
     * when this document overrides the summary property's size limit. */
    if ( stordesc && (parse_data.summary.meta = getPropNameByName(parse_data.header, AUTOPROPERTY_SUMMARY)))
    {
        /* Set property limit size for this document type, and store previous size limit */
        parse_data.summary.save_size = parse_data.summary.meta->max_len;
        parse_data.summary.meta->max_len = stordesc->size;
        parse_data.summary.tag = stordesc->field;
    }


    addCommonProperties(sw, fprop, fi, NULL,NULL, 0);


    /* Set event handlers */
    XML_SetUserData( p, (void *)&parse_data );          /* local data to pass around */
    XML_SetElementHandler(p, start_hndl, end_hndl);
    XML_SetCharacterDataHandler(p, char_hndl);

    if( sw->indexComments )
        XML_SetCommentHandler( p, comment_hndl );

    //XML_SetProcessingInstructionHandler(p, proc_hndl);

    if ( !XML_Parse(p, buffer, fprop->fsize, 1) )
        progwarn("XML parse error in file '%s' line %d.  Error: %s",
                     fprop->real_path, XML_GetCurrentLineNumber(p),XML_ErrorString(XML_GetErrorCode(p)));


    /* clean up */
    XML_ParserFree(p);

    /* Flush any text left in the buffer, and free the buffer */
    flush_buffer( &parse_data );

    if ( parse_data.text_buffer.buffer )
        efree( parse_data.text_buffer.buffer );


    /* Restore the size in the StoreDescription property.
     * Keyed on summary.meta rather than on save_size so that a previous
     * limit of zero is put back too; otherwise this document's limit
     * would stay in force for every following document. */
    if ( parse_data.summary.meta )
        parse_data.summary.meta->max_len = parse_data.summary.save_size;

    return parse_data.total_words;
}
    
/*********************************************************************
*   Start Tag Event Handler
*
*   These routines check to see if a given meta tag should be indexed
*   and if the tags should be added as a property
*
*   To Do:
*       deal with attributes!
*
*********************************************************************/


static void start_hndl(void *data, const char *el, const char **attr)
{
    PARSE_DATA *parse_data = (PARSE_DATA *)data;
    struct metaEntry *m;
    SWISH *sw = parse_data->sw;
    char  tag[MAXSTRLEN + 1];


    /* return if within an ignore block */
    if ( parse_data->ignore_tag )
        return;

    /* Flush any text in the buffer */
    flush_buffer( parse_data );


    if(strlen(el) >= MAXSTRLEN)  // easy way out
    {
        progwarn("Warning: Tag found in %s is too long: '%s'", parse_data->fprop->real_path, el );
        return;
    }

    strcpy(tag,(char *)el);
    strtolower( tag );  // $$$ swish ignores case in xml tags!


    /* Bump on all meta names, unless overridden */
    /* Done before the ignore tag check since still need to bump */

    if (!isDontBumpMetaName(sw->dontbumpstarttagslist, tag))
        parse_data->word_pos++;


    /* check for ignore tag (should propably remove char handler for speed) */
    if ( (parse_data->ignore_tag = isIgnoreMetaName( sw, tag )))
        return;


    /* Check for metaNames */

    if ( (m  = getMetaNameByName( parse_data->header, tag)) )
        m->in_tag++;

    else
    {
        if (sw->UndefinedMetaTags == UNDEF_META_AUTO)
        {
            if (sw->verbose)
                printf("!!!Adding automatic MetaName '%s' found in file '%s'\n", tag, parse_data->fprop->real_path);

            addMetaEntry( parse_data->header, tag, META_INDEX, 0)->in_tag++;
        }


        /* If set to "error" on undefined meta tags, then error */
        if (sw->UndefinedMetaTags == UNDEF_META_ERROR)
            progerr("UndefinedMetaNames=error.  Found meta name '%s' in file '%s', not listed as a MetaNames in config", tag, parse_data->fprop->real_path);
    }


    /* Check property names */

    if ( (m  = getPropNameByName( parse_data->header, tag)) )
        m->in_tag++;


    /* Look to enable StoreDescription */
    {
        SUMMARY_INFO    *summary = &parse_data->summary;
        if ( summary->tag && (strcasecmp( tag, summary->tag ) == 0 ))
            summary->active++;
    }

}


/*********************************************************************
*   End Tag Event Handler
*
*   Called by Expat for every closing tag.  Closes an ignore block when
*   the tag that opened it is closed; otherwise flushes pending text,
*   bumps the word position and marks the matching metaname / property /
*   summary tag as no longer open.
*
*   Parameters:
*       data - the PARSE_DATA registered as Expat user data
*       el   - element name as reported by Expat
*
*********************************************************************/


static void end_hndl(void *data, const char *el)
{
    PARSE_DATA *parse_data = (PARSE_DATA *)data;
    char  tag[MAXSTRLEN + 1];
    struct metaEntry *m;

    /* Same limit as start_hndl (>=): a tag that start_hndl skipped must be
     * skipped here too, or the in_tag / summary counters get unbalanced. */
    if(strlen(el) >= MAXSTRLEN)
    {
        progwarn("Warning: Tag found in %s is too long: '%s'", parse_data->fprop->real_path, el );
        return;
    }

    strcpy(tag,(char *)el);
    strtolower( tag );

    /* Inside an ignore block only the closing of the ignore tag matters */
    if ( parse_data->ignore_tag )
    {
        if  (strcmp( parse_data->ignore_tag, tag ) == 0)
            parse_data->ignore_tag = NULL;  /* don't free since it's a pointer to the config setting */
        return;
    }

    /* Flush any text in the buffer while this tag still counts as open */
    flush_buffer( parse_data );


    /* Don't allow matching across tag boundary */
    if (!isDontBumpMetaName(parse_data->sw->dontbumpendtagslist, tag))
       parse_data->word_pos++;


    /* Flag that we are not in tag anymore - tags must be balanced, of course. */

    if ( ( m = getMetaNameByName( parse_data->header, tag) ) )
        if ( m->in_tag )
            m->in_tag--;


    if ( ( m = getPropNameByName( parse_data->header, tag) ) )
        if ( m->in_tag )
            m->in_tag--;


    /* Look to disable StoreDescription.  Guarded like in_tag above so the
     * counter can never go negative (a negative value would read as
     * "active" in flush_buffer). */
    {
        SUMMARY_INFO    *summary = &parse_data->summary;
        if ( summary->tag && (strcasecmp( tag, summary->tag ) == 0 ))
            if ( summary->active > 0 )
                summary->active--;
    }

}

/*********************************************************************
*   Character Data Event Handler
*
*   Called by Expat with a run of character data.  The text is only
*   collected here; indexing and property handling happen later in
*   flush_buffer().  Text inside an ignore block is dropped.
*
*   Parameters:
*       data   - the PARSE_DATA registered as Expat user data
*       txt    - character data (txtlen bytes)
*       txtlen - number of bytes in txt
*
*********************************************************************/

static void char_hndl(void *data, const char *txt, int txtlen)
{
    PARSE_DATA *pd = (PARSE_DATA *)data;

    /* Collect the text unless an ignore block is open.                  */
    /* Some day a separate property buffer may be filled here as well,   */
    /* if more than plain text has to be collected for properties.       */
    if ( pd->ignore_tag == NULL )
        append_buffer( &pd->text_buffer, txt, txtlen );
}

/*********************************************************************
*   Append character data to the end of the buffer
*
*   Buffer is extended/created if needed.  It is grown in steps of
*   BUFFER_CHUNK_SIZE+1 until the new text fits AND one byte is left
*   over for the terminating NUL that flush_buffer() writes.
*
*   Parameters:
*       buf    - buffer to append to (buf->buffer may be NULL initially)
*       txt    - text to append (txtlen bytes, need not be NUL terminated)
*       txtlen - number of bytes in txt; zero or negative is ignored
*
*   ToDo: Flush buffer if it gets too large
*
*********************************************************************/

static void append_buffer( CHAR_BUFFER *buf, const char *txt, int txtlen )
{
    int needed;

    if ( txtlen <= 0 )  /* shouldn't happen */
        return;


    /* bytes required after the append, including the NUL added at flush time */
    needed = buf->cur + txtlen + 1;


    /* (re)allocate buf if needed.  A single chunk is not always enough:
     * one run of character data can be longer than BUFFER_CHUNK_SIZE. */

    if ( needed > buf->max )
    {
        int new_max = buf->max;

        while ( new_max < needed )
            new_max += BUFFER_CHUNK_SIZE + 1;

        buf->buffer = erealloc( buf->buffer, new_max );
        buf->max    = new_max;
    }


    memcpy( (void *) &(buf->buffer[buf->cur]), txt, txtlen );
    buf->cur += txtlen;
}


/*********************************************************************
*   Flush buffer - adds words to index, and properties
*
*   NUL-terminates the collected text, indexes it, hands it to the
*   property code, adds it to the summary property when the summary tag
*   is open, and then empties the buffer.  Does nothing when the buffer
*   holds no text.
*
*    2001-08 jmruiz Change structure from IN_FILE | IN_META to IN_FILE
*    Since structure does not have much sense in XML, if we use only IN_FILE
*    we will save memory and disk space (one byte per location)
*
*********************************************************************/
static void flush_buffer( PARSE_DATA  *parse_data )
{
    CHAR_BUFFER   *text    = &parse_data->text_buffer;
    SUMMARY_INFO  *summary = &parse_data->summary;
    FileRec       *file_entry;

    /* Empty buffer: nothing to index */
    if ( text->cur == 0 )
        return;

    /* The indexer expects a C string */
    text->buffer[text->cur] = '\0';

    /* Index the words; word_pos is advanced by the indexer */
    parse_data->total_words +=
        indexstring( parse_data->sw, text->buffer, parse_data->filenum, IN_FILE, 0, NULL, &(parse_data->word_pos) );

    file_entry = parse_data->thisFileEntry;

    /* Let the property code pick up the text */
    addDocProperties( parse_data->header, &(file_entry->docProperties), (unsigned char *)text->buffer, text->cur, parse_data->fprop->real_path );

    /* StoreDescription: the same text also goes to the summary when active */
    if ( summary->active )
        addDocProperty( &(file_entry->docProperties), summary->meta, (unsigned char *)text->buffer, text->cur, 0 );

    /* Start collecting from scratch; the allocation is kept for reuse */
    text->cur = 0;
}


/*********************************************************************
*   Comments
*
*   Called by Expat for every comment (only registered when
*   sw->indexComments is set).  The comment text is indexed directly
*   with the IN_COMMENTS structure, with the word position bumped
*   before and after it.
*
*   NOTE(review): pending character data is not flushed first and
*   ignore_tag is not consulted here - confirm that this is intended.
*
*   To Do:
*       Can't use DontBump with comments.  Might need a config variable for that.
*
*********************************************************************/
static void comment_hndl(void *data, const char *txt)
{
    PARSE_DATA  *pd = (PARSE_DATA *)data;

    /* Position gap in front of the comment - hard coded, always done to prevent phrase matching */
    pd->word_pos += 1;

    /* Index the comment words */
    pd->total_words +=
        indexstring( pd->sw, (char *)txt, pd->filenum, IN_COMMENTS, 0, NULL, &(pd->word_pos) );

    /* ... and another gap behind it */
    pd->word_pos += 1;
}


/*********************************************************************
*   check if a tag is an IgnoreTag
*
*   Walks sw->ignoremetalist and compares each entry with tag
*   (exact, case-sensitive match).
*
*   Returns:
*       pointer to the matching entry's string, or NULL if tag is not
*       in the list (or the list is empty)
*
*   Note: this returns a pointer to the config set tag, so don't free it!
*
*********************************************************************/

static char *isIgnoreMetaName(SWISH * sw, char *tag)
{
    struct swline *entry;

    for ( entry = sw->ignoremetalist; entry != NULL; entry = entry->next )
    {
        if ( strcmp( tag, entry->line ) == 0 )
            return entry->line;
    }

    return NULL;
}


1	adcroft	1.1	/*
2			$Id: xml.c,v 1.55 2001/10/11 22:21:14 whmoseley Exp $
3			**
4			**
5			** This program and library is free software; you can redistribute it and/or
6			** modify it under the terms of the GNU (Library) General Public License
7			** as published by the Free Software Foundation; either version 2
8			** of the License, or any later version.
9			**
10			** This program is distributed in the hope that it will be useful,
11			** but WITHOUT ANY WARRANTY; without even the implied warranty of
12			** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13			** GNU (Library) General Public License for more details.
14			**
15			**
16			** 2001-03-17 rasc save real_filename as title (instead full real_path)
17			** was: compatibility issue to v 1.x.x
18			** 2001-05-09 rasc entities changed (new module)
19			**
20			** 2001-07-25 moseley complete rewrite to use James Clark's Expat parser
21			**
22			** BUGS:
23			** UndefinedMetaTags ignore is not coded
24			*/
25
26			#include "swish.h"
27			#include "merge.h"
28			#include "mem.h"
29			#include "string.h"
30			#include "docprop.h"
31			#include "error.h"
32			#include "index.h"
33			#include "metanames.h"
34
35			#include "xmlparse.h" // James Clark's Expat
36
37			#define BUFFER_CHUNK_SIZE 20000
38
39			typedef struct {
40			char *buffer; // text for buffer
41			int cur; // pointer to end of buffer
42			int max; // max size of buffer
43			int defaultID; // default ID for no meta names.
44			} CHAR_BUFFER;
45
46
47			// I think that the property system can deal with StoreDescription in a cleaner way.
48			// This code shouldn't need to know about that StoreDescription.
49
50			typedef struct {
51			struct metaEntry *meta;
52			int save_size; /* save max size */
53			char tag; / summary tag */
54			int active; /* inside summary */
55			} SUMMARY_INFO;
56
57
58			typedef struct {
59			CHAR_BUFFER text_buffer; // buffer for collecting text
60
61			// CHAR_BUFFER prop_buffer; // someday, may want a separate property buffer if want to collect tags within props
62
63			SUMMARY_INFO summary; // argh.
64
65			char *ignore_tag; // tag that triggered ignore (currently used for both)
66			int total_words;
67			int word_pos;
68			int filenum;
69			XML_Parser *parser;
70			INDEXDATAHEADER *header;
71			SWISH *sw;
72			FileProp *fprop;
73			FileRec *thisFileEntry;
74
75			} PARSE_DATA;
76
77
78			/* Prototypes */
79			static void start_hndl(void data, const char el, const char **attr);
80			static void end_hndl(void data, const char el);
81			static void char_hndl(void data, const char txt, int txtlen);
82			static void append_buffer( CHAR_BUFFER buf, const char txt, int txtlen );
83			static void flush_buffer( PARSE_DATA *parse_data );
84			static void comment_hndl(void data, const char txt);
85			static char isIgnoreMetaName(SWISH sw, char *tag);
86
87
88
89
90			/*********************************************************************
91			* Entry to index an XML file.
92			*
93			* Creates an XML_Parser object and parses buffer
94			*
95			* Returns:
96			* Count of words indexed
97			*
98			* ToDo:
99			* This is a stream parser, so could avoid loading entire document into RAM before parsing
100			*
101			*********************************************************************/
102
103			int countwords_XML (SWISH sw, FileProp fprop, FileRec fi, char buffer)
104			{
105			PARSE_DATA parse_data;
106			XML_Parser p = XML_ParserCreate(NULL);
107			IndexFILE *indexf = sw->indexlist;
108			struct StoreDescription *stordesc = fprop->stordesc;
109
110
111			/* Set defaults */
112			memset(&parse_data, 0, sizeof(parse_data));
113
114			parse_data.header = &indexf->header;
115			parse_data.parser = p;
116			parse_data.sw = sw;
117			parse_data.fprop = fprop;
118			parse_data.filenum = fi->filenum;
119			parse_data.word_pos= 1; /* compress doesn't like zero */
120			parse_data.thisFileEntry = fi;
121
122
123			/* Don't really like this, as mentioned above */
124			if ( stordesc && (parse_data.summary.meta = getPropNameByName(parse_data.header, AUTOPROPERTY_SUMMARY)))
125			{
126			/* Set property limit size for this document type, and store previous size limit */
127			parse_data.summary.save_size = parse_data.summary.meta->max_len;
128			parse_data.summary.meta->max_len = stordesc->size;
129			parse_data.summary.tag = stordesc->field;
130			}
131
132
133			addCommonProperties(sw, fprop, fi, NULL,NULL, 0);
134
135
136
137			if (!p)
138			progerr("Failed to create XML parser object for '%s'", fprop->real_path );
139
140
141			/* Set event handlers */
142			XML_SetUserData( p, (void *)&parse_data ); // local data to pass around
143			XML_SetElementHandler(p, start_hndl, end_hndl);
144			XML_SetCharacterDataHandler(p, char_hndl);
145
146			if( sw->indexComments )
147			XML_SetCommentHandler( p, comment_hndl );
148
149			//XML_SetProcessingInstructionHandler(p, proc_hndl);
150
151			if ( !XML_Parse(p, buffer, fprop->fsize, 1) )
152			progwarn("XML parse error in file '%s' line %d. Error: %s",
153			fprop->real_path, XML_GetCurrentLineNumber(p),XML_ErrorString(XML_GetErrorCode(p)));
154
155
156			/* clean up */
157			XML_ParserFree(p);
158
159			/* Flush any text left in the buffer, and free the buffer */
160			flush_buffer( &parse_data );
161
162			if ( parse_data.text_buffer.buffer )
163			efree( parse_data.text_buffer.buffer );
164
165
166			/* Restore the size in the StoreDescription property */
167			if ( parse_data.summary.save_size )
168			parse_data.summary.meta->max_len = parse_data.summary.save_size;
169
170			return parse_data.total_words;
171			}
172
173			/*********************************************************************
174			* Start Tag Event Handler
175			*
176			* These routines check to see if a given meta tag should be indexed
177			* and if the tags should be added as a property
178			*
179			* To Do:
180			* deal with attributes!
181			*
182			*********************************************************************/
183
184
185			static void start_hndl(void data, const char el, const char **attr)
186			{
187			PARSE_DATA parse_data = (PARSE_DATA )data;
188			struct metaEntry *m;
189			SWISH *sw = parse_data->sw;
190			char tag[MAXSTRLEN + 1];
191
192
193			/* return if within an ignore block */
194			if ( parse_data->ignore_tag )
195			return;
196
197			/* Flush any text in the buffer */
198			flush_buffer( parse_data );
199
200
201			if(strlen(el) >= MAXSTRLEN) // easy way out
202			{
203			progwarn("Warning: Tag found in %s is too long: '%s'", parse_data->fprop->real_path, el );
204			return;
205			}
206
207			strcpy(tag,(char *)el);
208			strtolower( tag ); // $$$ swish ignores case in xml tags!
209
210
211
212			/* Bump on all meta names, unless overridden */
213			/* Done before the ignore tag check since still need to bump */
214
215			if (!isDontBumpMetaName(sw->dontbumpstarttagslist, tag))
216			parse_data->word_pos++;
217
218
219			/* check for ignore tag (should propably remove char handler for speed) */
220			if ( (parse_data->ignore_tag = isIgnoreMetaName( sw, tag )))
221			return;
222
223
224			/* Check for metaNames */
225
226			if ( (m = getMetaNameByName( parse_data->header, tag)) )
227			m->in_tag++;
228
229			else
230			{
231			if (sw->UndefinedMetaTags == UNDEF_META_AUTO)
232			{
233			if (sw->verbose)
234			printf("!!!Adding automatic MetaName '%s' found in file '%s'\n", tag, parse_data->fprop->real_path);
235
236			addMetaEntry( parse_data->header, tag, META_INDEX, 0)->in_tag++;
237			}
238
239
240			/* If set to "error" on undefined meta tags, then error */
241			if (sw->UndefinedMetaTags == UNDEF_META_ERROR)
242			progerr("UndefinedMetaNames=error. Found meta name '%s' in file '%s', not listed as a MetaNames in config", tag, parse_data->fprop->real_path);
243			}
244
245
246			/* Check property names */
247
248			if ( (m = getPropNameByName( parse_data->header, tag)) )
249			m->in_tag++;
250
251
252			/* Look to enable StoreDescription */
253			{
254			SUMMARY_INFO *summary = &parse_data->summary;
255			if ( summary->tag && (strcasecmp( tag, summary->tag ) == 0 ))
256			summary->active++;
257			}
258
259			}
260
261
262			/*********************************************************************
263			* End Tag Event Handler
264			*
265			*
266			*
267			*********************************************************************/
268
269
270			static void end_hndl(void data, const char el)
271			{
272			PARSE_DATA parse_data = (PARSE_DATA )data;
273			char tag[MAXSTRLEN + 1];
274			struct metaEntry *m;
275
276			if(strlen(el) > MAXSTRLEN)
277			{
278			progwarn("Warning: Tag found in %s is too long: '%s'", parse_data->fprop->real_path, el );
279			return;
280			}
281
282			strcpy(tag,(char *)el);
283			strtolower( tag );
284
285			if ( parse_data->ignore_tag )
286			{
287			if (strcmp( parse_data->ignore_tag, tag ) == 0)
288			parse_data->ignore_tag = NULL; // don't free since it's a pointer to the config setting
289			return;
290			}
291
292			/* Flush any text in the buffer */
293			flush_buffer( parse_data );
294
295
296			/* Don't allow matching across tag boundry */
297			if (!isDontBumpMetaName(parse_data->sw->dontbumpendtagslist, tag))
298			parse_data->word_pos++;
299
300
301
302			/* Flag that we are not in tag anymore - tags must be balanced, of course. */
303
304			if ( ( m = getMetaNameByName( parse_data->header, tag) ) )
305			if ( m->in_tag )
306			m->in_tag--;
307
308
309			if ( ( m = getPropNameByName( parse_data->header, tag) ) )
310			if ( m->in_tag )
311			m->in_tag--;
312
313
314			/* Look to disable StoreDescription */
315			{
316			SUMMARY_INFO *summary = &parse_data->summary;
317			if ( summary->tag && (strcasecmp( tag, summary->tag ) == 0 ))
318			summary->active--;
319			}
320
321			}
322
323			/*********************************************************************
324			* Character Data Event Handler
325			*
326			* This does the actual adding of text to the index and adding properties
327			* if any tags have been found to index
328			*
329			*
330			*********************************************************************/
331
332			static void char_hndl(void data, const char txt, int txtlen)
333			{
334			PARSE_DATA parse_data = (PARSE_DATA )data;
335
336
337			/* If currently in an ignore block, then return */
338			if ( parse_data->ignore_tag )
339			return;
340
341			/* Buffer the text */
342			append_buffer( &parse_data->text_buffer, txt, txtlen );
343
344			/* Some day, might want to have a separate property buffer if need to collect more than plain text */
345			// append_buffer( parse_data->prop_buffer, txt, txtlen );
346
347			}
348
349			/*********************************************************************
350			* Append character data to the end of the buffer
351			*
352			* Buffer is extended/created if needed
353			*
354			* ToDo: Flush buffer if it gets too large
355			*
356			*
357			*********************************************************************/
358
359			static void append_buffer( CHAR_BUFFER buf, const char txt, int txtlen )
360			{
361
362			if ( !txtlen ) // shouldn't happen
363			return;
364
365
366			/* (re)allocate buf if needed */
367
368			if ( buf->cur + txtlen >= buf->max )
369			buf->buffer = erealloc( buf->buffer, ( buf->max += BUFFER_CHUNK_SIZE+1 ) );
370
371
372			memcpy( (void *) &(buf->buffer[buf->cur]), txt, txtlen );
373			buf->cur += txtlen;
374			}
375
376
377
378
379			/*********************************************************************
380			* Flush buffer - adds words to index, and properties
381			*
382			* 2001-08 jmruiz Change structure from IN_FILE \| IN_META to IN_FILE
383			* Since structure does not have much sense in XML, if we use only IN_FILE
384			* we will save memory and disk space (one byte per location)
385			*
386			*
387			*********************************************************************/
388			static void flush_buffer( PARSE_DATA *parse_data )
389			{
390			CHAR_BUFFER *buf = &parse_data->text_buffer;
391			SWISH *sw = parse_data->sw;
392
393			/* anything to do? */
394			if ( !buf->cur )
395			return;
396
397			buf->buffer[buf->cur] = '\0';
398
399
400			/* Index the text */
401			parse_data->total_words +=
402			indexstring( sw, buf->buffer, parse_data->filenum, IN_FILE, 0, NULL, &(parse_data->word_pos) );
403
404
405			/* Add the properties */
406			addDocProperties( parse_data->header, &(parse_data->thisFileEntry->docProperties), (unsigned char *)buf->buffer, buf->cur, parse_data->fprop->real_path );
407
408
409			/* yuck. Ok, add to summary, if active */
410			{
411			SUMMARY_INFO *summary = &parse_data->summary;
412			if ( summary->active )
413			addDocProperty( &(parse_data->thisFileEntry->docProperties), summary->meta, (unsigned char *)buf->buffer, buf->cur, 0 );
414			}
415
416
417			/* clear the buffer */
418			buf->cur = 0;
419			}
420
421
422
423			/*********************************************************************
424			* Comments
425			*
426			* Should be able to call the char_hndl
427			*
428			* To Do:
429			* Can't use DontBump with comments. Might need a config variable for that.
430			*
431			*********************************************************************/
432			static void comment_hndl(void data, const char txt)
433			{
434			PARSE_DATA parse_data = (PARSE_DATA )data;
435			SWISH *sw = parse_data->sw;
436
437
438			/* Bump position around comments - hard coded, always done to prevent phrase matching */
439			parse_data->word_pos++;
440
441			/* Index the text */
442			parse_data->total_words +=
443			indexstring( sw, (char *)txt, parse_data->filenum, IN_COMMENTS, 0, NULL, &(parse_data->word_pos) );
444
445
446			parse_data->word_pos++;
447
448			}
449
450
451
452			/*********************************************************************
453			* check if a tag is an IgnoreTag
454			*
455			* Note: this returns a pointer to the config set tag, so don't free it!
456			*
457			*
458			*********************************************************************/
459
460			static char isIgnoreMetaName(SWISH sw, char *tag)
461			{
462			struct swline *tmplist = sw->ignoremetalist;
463
464			if (!tmplist)
465			return 0;
466
467			while (tmplist)
468			{
469			if (strcmp(tag, tmplist->line) == 0)
470			return tmplist->line;
471
472			tmplist = tmplist->next;
473			}
474
475			return NULL;
476			}
477
478