/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/src/parser.c
ViewVC logotype

Annotation of /mitgcm.org/devel/buildweb/pkg/swish-e/src/parser.c

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1 - (hide annotations) (download)
Fri Sep 20 19:47:29 2002 UTC (22 years, 10 months ago) by adcroft
Branch point for: Import, MAIN
File MIME type: text/plain
Initial revision

1 adcroft 1.1 /*
2     $Id: parser.c,v 1.42 2002/08/14 22:08:48 whmoseley Exp $
3     **
4     **
5     ** This program and library is free software; you can redistribute it and/or
6     ** modify it under the terms of the GNU General Public License
7     ** as published by the Free Software Foundation; either version 2
8     ** of the License, or any later version.
9     **
10     ** This program is distributed in the hope that it will be useful,
11     ** but WITHOUT ANY WARRANTY; without even the implied warranty of
12     ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13     ** GNU General Public License for more details.
14     **
15     **
16     ** 2001-09-21 new HTML parser using libxml2 (http://www.xmlsoft.org/) Bill Moseley
17     **
18     ** Parser reads from the stream in READ_CHUNK_SIZE (after an initial 4 byte chunk
19     ** to determine the document encoding). Text is accumulated in a buffer of
20     ** BUFFER_CHUNK_SIZE (10K?) size. The buffer is flushed when a new metatag
21     ** is found. That buffer will grow, if needed, but now it will attempt
22     ** to flush up to the last word boundary if > BUFFER_CHUNK_SIZE.
23     **
24     ** The buffer is really only flushed when a real metaName or PropertyName is
25     ** found, or when the structure changes -- anything that changes the
26     ** properties of the text that might be in the buffer.
27     **
28     ** An optional arrangement might be to flush the buffer after processing each
29     ** READ_CHUNK_SIZE from the stream (flush to last word). This would limit the
30     ** character buffer size. It might be nice to flush on any meta tag (not just
31     ** tags listed as PropertyNames or MetaNames), but for large XML files one would
32     ** expect some use of Meta/PropertyNames. HTML files should flush more often
33     ** since the structure will change often. Exceptions to this are large <pre>
34     ** sections, but then the append_buffer() routine will force a flush when the buffer
35     ** exceeds BUFFER_CHUNK_SIZE.
36     **
37     ** The TXT buffer does flush after every chunk read.
38     **
39     ** I doubt messing with any of these would change much...
40     **
41     **
42     ** TODO:
43     **
44     ** - FileRules title (and all abort_parsing calls - define some constants)
45     **
46     ** - There's a lot of mixing of xmlChar and char, which will generate warnings.
47     **
48     ** - Add a fprop->orig_path (before ReplaceRules) and a directive BaseURI to be used
49     ** to fixup relative urls (if no <BASE>).
50     ** This would save space in the property file, but probably not enough to worry about.
51     **
52     **
53     ** - UndefinedMetaTags ignore might throw things like structure off since
54     ** processing continues (unlike IgnoreMetaTags). But everything should balance out.
55     **
56     ** - There are two buffers that are created for every file, but these could be done once
57     ** and only expanded when needed. If that would make any difference in indexing speed.
58     **
59     ** - Note that these parse_*() functions get passed a "buffer" which is not used
60     ** (to be compatible with old swish-e buffer-based parsers)
61     **
62     ** - XML elements and attributes are all converted to lowercase.
63     **
64     */
65    
66     /* libxml2 */
67     #include <libxml/HTMLparser.h>
68     #include <libxml/xmlerror.h>
69     #include <libxml/uri.h>
70    
71    
72     #include <stdarg.h> // for va_list
73     #ifdef HAVE_VARARGS_H
74     #include <varargs.h> // va_list on Win32
75     #endif
76     #include "swish.h"
77     #include "fs.h" // for the title check
78     #include "merge.h"
79     #include "mem.h"
80     #include "string.h"
81     #include "docprop.h"
82     #include "error.h"
83     #include "index.h"
84     #include "metanames.h"
85    
86    
/* Tunables -- these should really live in config.h */

/* Size of the buffers used to accumulate text */
#define BUFFER_CHUNK_SIZE 10000

/* Size of each chunk read from the stream (4096 seems to cause problems) */
#define READ_CHUNK_SIZE 2048
91    
/*
 * CHAR_BUFFER -- growable text buffer.
 * Used to collect text until an end tag is found.
 */
typedef struct {
    char   *buffer;         /* the buffered text */
    int     cur;            /* number of bytes currently held */
    int     max;            /* allocated size of buffer */
    int     defaultID;      /* default ID when there are no meta names */
} CHAR_BUFFER;
100    
101    
102    
/*
 * SUMMARY_INFO -- state for the StoreDescription (document summary) feature.
 *
 * The property system could probably deal with StoreDescription in a cleaner
 * way; ideally this code would not need to know about it at all.
 */
typedef struct {
    struct metaEntry   *meta;       /* the summary property, if any */
    int                 save_size;  /* previous max size, saved for restoring */
    char               *tag;        /* summary tag */
    int                 active;     /* non-zero while inside the summary */
} SUMMARY_INFO;
112    
/* Initial stack size; the stack can grow. */
#define STACK_SIZE 255

/*
 * MetaStackElement -- one entry on a metaname/property stack.
 */
typedef struct MetaStackElement {
    struct MetaStackElement    *next;       /* pointer to *siblings*, if any */
    struct metaEntry           *meta;       /* the meta that is in use */
    int                         ignore;     /* set if this meta turned on ignore */
    char                        tag[1];     /* tag to look for (declared with one element) */
} MetaStackElement, *MetaStackElementPtr;
121    
122     typedef struct {
123     int pointer; // next empty slot in stack
124     int maxsize; // size of stack
125     int ignore_flag; // count of ignores
126     MetaStackElementPtr *stack; // pointer to an array of stack data
127     int is_meta; // is this a metaname or property stack?
128     } MetaStack;
129    
130    
131    
132    
133    
134    
135     /* This struct is returned in all call-back functions as user data */
136    
137     typedef struct {
138     CHAR_BUFFER text_buffer; // buffer for collecting text
139     // CHAR_BUFFER prop_buffer; // someday, may want a separate property buffer if want to collect tags within props
140     SUMMARY_INFO summary; // argh.
141     MetaStack meta_stack; // stacks for tracking the nested metas
142     MetaStack prop_stack;
143     int total_words;
144     int word_pos;
145     int filenum;
146     INDEXDATAHEADER *header;
147     SWISH *sw;
148     FileProp *fprop;
149     FileRec *thisFileEntry;
150     int structure[STRUCTURE_END+1];
151     int parsing_html;
152     struct metaEntry *titleProp;
153     struct metaEntry *titleMeta;
154     struct metaEntry *swishdefaultMeta;
155     int flush_word; // flag to flush buffer next time there's a white space.
156     xmlSAXHandlerPtr SAXHandler; // for aborting, I guess.
157     xmlParserCtxtPtr ctxt;
158     CHAR_BUFFER ISO_Latin1; // buffer to hold UTF-8 -> ISO Latin-1 converted text
159     int abort; // flag to stop parsing
160     char *baseURL; // for fixing up relative links
161     int swish_noindex; // swishindex swishnoindex -- for hiding blocks with comments
162     } PARSE_DATA;
163    
164    
165     /* Prototypes */
166     static void start_hndl(void *data, const char *el, const char **attr);
167     static void end_hndl(void *data, const char *el);
168     static void char_hndl(void *data, const char *txt, int txtlen);
169     static void Whitespace(void *data, const xmlChar *txt, int txtlen);
170     static void append_buffer( CHAR_BUFFER *buf, const char *txt, int txtlen );
171     static void flush_buffer( PARSE_DATA *parse_data, int clear );
172     static void comment_hndl(void *data, const char *txt);
173     static char *isIgnoreMetaName(SWISH * sw, char *tag);
174     static void error(void *data, const char *msg, ...);
175     static void warning(void *data, const char *msg, ...);
176     static void process_htmlmeta( PARSE_DATA *parse_data, const char ** attr );
177     static int check_html_tag( PARSE_DATA *parse_data, char * tag, int start );
178     static void start_metaTag( PARSE_DATA *parse_data, char * tag, char *endtag, int *meta_append, int *prop_append , int is_html_tag );
179     static void end_metaTag( PARSE_DATA *parse_data, char * tag, int is_html_tag );
180     static void init_sax_handler( xmlSAXHandlerPtr SAXHandler, SWISH * sw );
181     static void init_parse_data( PARSE_DATA *parse_data, SWISH * sw, FileProp * fprop, FileRec *fi, xmlSAXHandlerPtr SAXHandler );
182     static void free_parse_data( PARSE_DATA *parse_data );
183     static void Convert_to_latin1( PARSE_DATA *parse_data, char *txt, int txtlen );
184     static int parse_chunks( PARSE_DATA *parse_data );
185    
186     static void index_alt_tab( PARSE_DATA *parse_data, const char **attr );
187     static char *extract_html_links( PARSE_DATA *parse_data, const char **attr, struct metaEntry *meta_entry, char *tag );
188     static int read_next_chunk( FileProp *fprop, char *buf, int buf_size, int max_size );
189     static void abort_parsing( PARSE_DATA *parse_data, int abort_code );
190     static int get_structure( PARSE_DATA *parse_data );
191    
192     static void push_stack( MetaStack *stack, char *tag, struct metaEntry *meta, int *append, int ignore );
193     static int pop_stack_ifMatch( PARSE_DATA *parse_data, MetaStack *stack, char *tag );
194     static int pop_stack( MetaStack *stack );
195    
196     static void index_XML_attributes( PARSE_DATA *parse_data, char *tag, const char **attr );
197     static int start_XML_ClassAttributes( PARSE_DATA *parse_data, char *tag, const char **attr, int *meta_append, int *prop_append );
198     static char *isXMLClassAttribute(SWISH * sw, char *tag);
199    
200     static void debug_show_tag( char *tag, PARSE_DATA *parse_data, int start, char *message );
201     static void debug_show_parsed_text( PARSE_DATA *parse_data, char *txt, int len );
202    
203    
204    
205     /*********************************************************************
206     * XML Push parser
207     *
208     * Returns:
209     * Count of words indexed
210     *
211     *
212     *********************************************************************/
213    
214     int parse_XML(SWISH * sw, FileProp * fprop, FileRec *fi, char *buffer)
215    
216     {
217     xmlSAXHandler SAXHandlerStruct;
218     xmlSAXHandlerPtr SAXHandler = &SAXHandlerStruct;
219     PARSE_DATA parse_data;
220    
221    
222     init_sax_handler( SAXHandler, sw );
223     init_parse_data( &parse_data, sw, fprop, fi, SAXHandler );
224    
225    
226     /* Now parse the XML file */
227     return parse_chunks( &parse_data );
228    
229     }
230    
231     /*********************************************************************
232     * HTML Push parser
233     *
234     * Returns:
235     * Count of words indexed
236     *
237     *********************************************************************/
238    
239     int parse_HTML(SWISH * sw, FileProp * fprop, FileRec *fi, char *buffer)
240     {
241     htmlSAXHandler SAXHandlerStruct;
242     htmlSAXHandlerPtr SAXHandler = &SAXHandlerStruct;
243     PARSE_DATA parse_data;
244    
245     init_sax_handler( (xmlSAXHandlerPtr)SAXHandler, sw );
246     init_parse_data( &parse_data, sw, fprop, fi, (xmlSAXHandlerPtr)SAXHandler );
247    
248    
249     parse_data.parsing_html = 1;
250     parse_data.titleProp = getPropNameByName( parse_data.header, AUTOPROPERTY_TITLE );
251     parse_data.titleMeta = getMetaNameByName( parse_data.header, AUTOPROPERTY_TITLE );
252     parse_data.swishdefaultMeta = getMetaNameByName( parse_data.header, AUTOPROPERTY_DEFAULT );
253    
254     /* Now parse the HTML file */
255     return parse_chunks( &parse_data );
256    
257     }
258    
259     /*********************************************************************
260     * TXT "Push" parser
261     *
262     * Returns:
263     * Count of words indexed
264     *
265     *********************************************************************/
266    
267     int parse_TXT(SWISH * sw, FileProp * fprop, FileRec *fi, char *buffer)
268     {
269     PARSE_DATA parse_data;
270     int res;
271     char chars[READ_CHUNK_SIZE];
272    
273    
274    
275     /* This does stuff that's not needed for txt */
276     init_parse_data( &parse_data, sw, fprop, fi, NULL );
277    
278    
279     /* Document Summary */
280     if ( parse_data.summary.meta && parse_data.summary.meta->max_len )
281     parse_data.summary.active++;
282    
283    
284     while ( (res = read_next_chunk( fprop, chars, READ_CHUNK_SIZE, sw->truncateDocSize )) )
285     {
286     append_buffer( &parse_data.text_buffer, chars, res );
287     flush_buffer( &parse_data, 0 ); // flush upto whitespace
288    
289    
290     /* turn off summary when we exceed size */
291     if ( parse_data.summary.meta && parse_data.summary.meta->max_len && fprop->bytes_read > parse_data.summary.meta->max_len )
292     parse_data.summary.active = 0;
293    
294     }
295    
296     flush_buffer( &parse_data, 1 );
297     free_parse_data( &parse_data );
298     return parse_data.total_words;
299     }
300    
301    
302     /*********************************************************************
303     * Parse chunks (used for both XML and HTML parsing)
304     * Creates the parsers, reads in chunks as one might expect
305     *
306     *
307     *********************************************************************/
308     static int parse_chunks( PARSE_DATA *parse_data )
309     {
310     SWISH *sw = parse_data->sw;
311     FileProp *fprop = parse_data->fprop;
312     xmlSAXHandlerPtr SAXHandler = parse_data->SAXHandler;
313     int res;
314     char chars[READ_CHUNK_SIZE];
315     xmlParserCtxtPtr ctxt;
316    
317    
318     /* Now start pulling into the libxml2 parser */
319    
320     res = read_next_chunk( fprop, chars, READ_CHUNK_SIZE, sw->truncateDocSize );
321     if (res == 0)
322     return 0;
323    
324     /* Create parser */
325     if ( parse_data->parsing_html )
326     ctxt = (xmlParserCtxtPtr)htmlCreatePushParserCtxt((htmlSAXHandlerPtr)SAXHandler, parse_data, chars, res, fprop->real_path,0);
327     else
328     ctxt = xmlCreatePushParserCtxt(SAXHandler, parse_data, chars, res, fprop->real_path);
329    
330     parse_data->ctxt = ctxt; // save
331    
332    
333    
334     while ( !parse_data->abort && (res = read_next_chunk( fprop, chars, READ_CHUNK_SIZE, sw->truncateDocSize )) )
335     {
336     if ( parse_data->parsing_html )
337     htmlParseChunk((htmlParserCtxtPtr)ctxt, chars, res, 0);
338     else
339     xmlParseChunk(ctxt, chars, res, 0);
340    
341     /* Doesn't seem to make much difference to flush here */
342     //flush_buffer( parse_data, 0 ); // flush upto whitespace
343     }
344    
345    
346    
347     /* Tell the parser we are done, and free it */
348     if ( parse_data->parsing_html )
349     {
350     if ( !parse_data->abort ) // bug in libxml 2.4.5
351     htmlParseChunk( (htmlParserCtxtPtr)ctxt, chars, 0, 1 );
352     htmlFreeParserCtxt( (htmlParserCtxtPtr)ctxt);
353     }
354     else
355     {
356     if ( !parse_data->abort ) // bug in libxml
357     xmlParseChunk(ctxt, chars, 0, 1);
358     xmlFreeParserCtxt(ctxt);
359     }
360    
361     /* Daniel Veillard on Nov 21, 2001 says this should not be called for every doc. */
362     // But, it probably should be called when done parsing.
363     // xmlCleanupParser();
364    
365     /* Check for abort condition set while parsing (isoktitle, NoContents) */
366    
367     if ( parse_data->abort && fprop->index_no_content && !parse_data->total_words )
368     {
369     append_buffer( &parse_data->text_buffer, fprop->real_path, strlen(fprop->real_path) );
370    
371     parse_data->meta_stack.ignore_flag = 0; /* make sure we can write */
372     flush_buffer( parse_data, 3 );
373     }
374    
375    
376     /* Flush any text left in the buffer */
377    
378     if ( !parse_data->abort )
379     flush_buffer( parse_data, 3 );
380    
381    
382    
383     free_parse_data( parse_data );
384    
385    
386     // $$$ This doesn't work since the file (and maybe some words) already added
387     // $$$ need a way to "remove" the file entry and words already added
388    
389     if ( parse_data->abort < 0 )
390     return parse_data->abort;
391    
392     return parse_data->total_words;
393     }
394    
395     /*********************************************************************
396     * read_next_chunk - read another chunk from the stream
397     *
398     * Call with:
399     * fprop
400     * *buf - where to save the data
401     * *buf_size - max size of buffer
402     * *max_size - limit of *total* bytes read from this stream (for truncate)
403     *
404     * Returns:
405     * number of bytes read (as returned from fread)
406     *
407     *
408     *********************************************************************/
409     static int read_next_chunk( FileProp *fprop, char *buf, int buf_size, int max_size )
410     {
411     int size;
412     int res;
413    
414     if ( fprop->done )
415     return 0;
416    
417     /* For -S prog, only read in the right amount of data */
418     if ( fprop->external_program && (fprop->bytes_read >= fprop->fsize ))
419     return 0;
420    
421    
422     /* fprop->external_program is set if -S prog and NOT reading from a filter */
423    
424     size = fprop->external_program && (( fprop->fsize - fprop->bytes_read ) < buf_size)
425     ? fprop->fsize - fprop->bytes_read
426     : buf_size;
427    
428     if ( !fprop->bytes_read && size > 4 )
429     size = 4;
430    
431    
432    
433     /* Truncate -- safety feature from Rainer. No attempt is made to backup to a whole word */
434     if ( max_size && fprop->bytes_read + size > max_size )
435     {
436     fprop->done++; // flag that we are done
437     size = max_size - fprop->bytes_read;
438     }
439    
440    
441     res = fread(buf, 1, size, fprop->fp);
442    
443     fprop->bytes_read += res;
444    
445     return res;
446     }
447    
448    
449    
450     /*********************************************************************
451     * Init a sax handler structure
452     * Must pass in the structure
453     *
454     *********************************************************************/
455     static void init_sax_handler( xmlSAXHandlerPtr SAXHandler, SWISH * sw )
456     {
457     /* Set event handlers for libxml2 parser */
458     memset( SAXHandler, 0, sizeof( xmlSAXHandler ) );
459    
460     SAXHandler->startElement = (startElementSAXFunc)&start_hndl;
461     SAXHandler->endElement = (endElementSAXFunc)&end_hndl;
462     SAXHandler->characters = (charactersSAXFunc)&char_hndl;
463     SAXHandler->cdataBlock = (charactersSAXFunc)&char_hndl;
464     SAXHandler->ignorableWhitespace = (ignorableWhitespaceSAXFunc)&Whitespace;
465    
466     SAXHandler->comment = (commentSAXFunc)&comment_hndl;
467    
468     if ( sw->parser_warn_level >= 1 )
469     SAXHandler->fatalError = (fatalErrorSAXFunc)&error;
470    
471     if ( sw->parser_warn_level >= 2 )
472     SAXHandler->error = (errorSAXFunc)&error;
473    
474     if ( sw->parser_warn_level >= 3 )
475     SAXHandler->warning = (warningSAXFunc)&warning;
476    
477     }
478    
479    
480     /*********************************************************************
481     * Init the parer data structure
482     * Must pass in the structure
483     *
484     *********************************************************************/
485     static void init_parse_data( PARSE_DATA *parse_data, SWISH * sw, FileProp * fprop, FileRec *fi, xmlSAXHandlerPtr SAXHandler )
486     {
487     IndexFILE *indexf = sw->indexlist;
488     struct StoreDescription *stordesc = fprop->stordesc;
489    
490     /* Set defaults */
491     memset( parse_data, 0, sizeof(PARSE_DATA));
492    
493     parse_data->header = &indexf->header;
494     parse_data->sw = sw;
495     parse_data->fprop = fprop;
496     parse_data->filenum = fi->filenum;
497     parse_data->word_pos = 1; /* compress doesn't like zero */
498     parse_data->SAXHandler = SAXHandler;
499     parse_data->thisFileEntry = fi;
500    
501    
502     /* Don't really like this, as mentioned above */
503     if ( stordesc && (parse_data->summary.meta = getPropNameByName(parse_data->header, AUTOPROPERTY_SUMMARY)))
504     {
505     /* Set property limit size for this document type, and store previous size limit */
506     parse_data->summary.save_size = parse_data->summary.meta->max_len;
507     parse_data->summary.meta->max_len = stordesc->size;
508     parse_data->summary.tag = stordesc->field;
509     if ( parse_data->summary.tag )
510     strtolower(parse_data->summary.tag);
511     }
512    
513    
514     /* Initialize the meta and property stacks */
515     /* Not needed for TXT processing, of course */
516     {
517     MetaStack *s;
518    
519     s = &parse_data->meta_stack;
520     s->is_meta = 1;
521     s->maxsize = STACK_SIZE;
522    
523     s->stack = (MetaStackElementPtr *)emalloc( sizeof( MetaStackElementPtr ) * s->maxsize );
524     if ( fprop->index_no_content )
525     s->ignore_flag++;
526    
527     s = &parse_data->prop_stack;
528     s->is_meta = 0;
529     s->maxsize = STACK_SIZE;
530     s->stack = (MetaStackElementPtr *)emalloc( sizeof( MetaStackElementPtr ) * s->maxsize );
531     if ( fprop->index_no_content ) /* only works for HTML */
532     s->ignore_flag++;
533     }
534    
535     addCommonProperties(sw, fprop, fi, NULL, NULL, 0);
536     }
537    
538    
539     /*********************************************************************
540     * Free any data used by the parse_data struct
541     *
542     *********************************************************************/
543     static void free_parse_data( PARSE_DATA *parse_data )
544     {
545    
546     if ( parse_data->ISO_Latin1.buffer )
547     efree( parse_data->ISO_Latin1.buffer );
548    
549     if ( parse_data->text_buffer.buffer )
550     efree( parse_data->text_buffer.buffer );
551    
552     if ( parse_data->baseURL )
553     efree( parse_data->baseURL );
554    
555    
556     /* Pop the stacks */
557     while( pop_stack( &parse_data->meta_stack ) );
558     while( pop_stack( &parse_data->prop_stack ) );
559    
560     /* Free the stacks */
561     if ( parse_data->meta_stack.stack )
562     efree( parse_data->meta_stack.stack );
563    
564     if ( parse_data->prop_stack.stack )
565     efree( parse_data->prop_stack.stack );
566    
567    
568    
569     /* Restore the size in the StoreDescription property */
570     if ( parse_data->summary.save_size )
571     parse_data->summary.meta->max_len = parse_data->summary.save_size;
572    
573     }
574    
575     /*********************************************************************
576     * Start Tag Event Handler
577     *
578     * This is called by libxml2. It normally just calls start_metaTag()
579     * and that decides how to deal with that meta tag.
580     * It also converts <meta> and <tag class=foo> into meta tags as swish
581     * would expect them (and then calls start_metaTag().
582     *
583     * To Do:
584     * deal with attributes!
585     *
586     *********************************************************************/
587    
588    
589     static void start_hndl(void *data, const char *el, const char **attr)
590     {
591     PARSE_DATA *parse_data = (PARSE_DATA *)data;
592     char tag[MAXSTRLEN + 1];
593     int is_html_tag = 0; // to allow <foo> type of meta tags in HTML
594     int meta_append = 0; // used to allow siblings metanames
595     int prop_append = 0;
596    
597    
598     /* disabeld by a comment? */
599     if ( parse_data->swish_noindex )
600     return;
601    
602     if(strlen(el) >= MAXSTRLEN) // easy way out
603     {
604     warning("Warning: Tag found in %s is too long: '%s'\n", parse_data->fprop->real_path, el );
605     return;
606     }
607    
608     strcpy(tag,(char *)el);
609     strtolower( tag ); // xml?
610    
611    
612     if ( parse_data->parsing_html )
613     {
614    
615     /* handle <meta name="metaname" content="foo"> */
616     if ( (strcmp( tag, "meta") == 0) && attr )
617     {
618     process_htmlmeta( parse_data, attr );
619     return;
620     }
621    
622    
623     /* Deal with structure */
624     if ( (is_html_tag = check_html_tag( parse_data, tag, 1 )) )
625     {
626     /** Special handling for <A>, <IMG>, and <BASE> tags **/
627    
628     /* Extract out links - currently only keep <a> links */
629     if ( strcmp( tag, "a") == 0 )
630     extract_html_links( parse_data, attr, parse_data->sw->links_meta, "href" );
631    
632    
633     /* Extract out links from images */
634     else if ( strcmp( tag, "img") == 0 )
635     {
636     if (parse_data->sw->IndexAltTag)
637     index_alt_tab( parse_data, attr );
638    
639     extract_html_links( parse_data, attr, parse_data->sw->images_meta, "src" );
640     }
641    
642    
643     /* Extract out the BASE URL for fixups */
644     else if ( strcmp( tag, "base") == 0 )
645     parse_data->baseURL = estrdup( extract_html_links( parse_data, attr, NULL, "href" ) );
646     }
647    
648     }
649    
650    
651     /* Now check if we are in a meta tag */
652     start_metaTag( parse_data, tag, tag, &meta_append, &prop_append, is_html_tag );
653    
654    
655    
656     /* Index the content of attributes */
657    
658     if ( !parse_data->parsing_html && attr )
659     {
660     int class_found = 0;
661    
662     /* Allow <foo class="bar"> to look like <foo.bar> */
663    
664     if ( parse_data->sw->XMLClassAttributes )
665     class_found = start_XML_ClassAttributes( parse_data, tag, attr, &meta_append, &prop_append );
666    
667    
668     /* Index XML attributes */
669    
670     if ( !class_found && parse_data->sw->UndefinedXMLAttributes != UNDEF_META_DISABLE )
671     index_XML_attributes( parse_data, tag, attr );
672     }
673    
674     }
675    
676    
677    
678    
679    
680     /*********************************************************************
681     * End Tag Event Handler
682     *
683     * Called by libxml2.
684     *
685     *
686     *
687     *********************************************************************/
688    
689    
690     static void end_hndl(void *data, const char *el)
691     {
692     PARSE_DATA *parse_data = (PARSE_DATA *)data;
693     char tag[MAXSTRLEN + 1];
694     int is_html_tag = 0; // to allow <foo> type of metatags in html.
695    
696    
697     /* disabeld by a comment? */
698     if ( parse_data->swish_noindex )
699     return;
700    
701     if(strlen(el) > MAXSTRLEN)
702     {
703     warning("Warning: Tag found in %s is too long: '%s'\n", parse_data->fprop->real_path, el );
704     return;
705     }
706    
707     strcpy(tag,(char *)el);
708     strtolower( tag );
709    
710    
711    
712     if ( parse_data->parsing_html )
713     {
714    
715     /* <meta> tags are closed in start_hndl */
716    
717     if ( (strcmp( tag, "meta") == 0) )
718     return; // this was flushed at end tag
719    
720    
721    
722     /* Deal with structure */
723     is_html_tag = check_html_tag( parse_data, tag, 0 );
724     }
725    
726    
727     end_metaTag( parse_data, tag, is_html_tag );
728     }
729    
730    
731    
732     /*********************************************************************
733     * Character Data Event Handler
734     *
735     * This does the actual adding of text to the index and adding properties
736     * if any tags have been found to index
737     *
738     *
739     *********************************************************************/
740    
741     static void char_hndl(void *data, const char *txt, int txtlen)
742     {
743     PARSE_DATA *parse_data = (PARSE_DATA *)data;
744    
745    
746     /* Have we been disabled? */
747     if ( !parse_data->SAXHandler->characters )
748     return;
749    
750     /* disabeld by a comment? */
751     if ( parse_data->swish_noindex )
752     return;
753    
754    
755     /* If currently in an ignore block, then return */
756     if ( parse_data->meta_stack.ignore_flag && parse_data->prop_stack.ignore_flag )
757     return;
758    
759     /* $$$ this was added to limit the buffer size */
760     if ( parse_data->text_buffer.cur + txtlen >= BUFFER_CHUNK_SIZE )
761     flush_buffer( parse_data, 0 ); // flush upto last word - somewhat expensive
762    
763    
764    
765     Convert_to_latin1( parse_data, (char *)txt, txtlen );
766    
767    
768     if ( DEBUG_MASK & DEBUG_PARSED_TEXT )
769     debug_show_parsed_text( parse_data, parse_data->ISO_Latin1.buffer, parse_data->ISO_Latin1.cur );
770    
771    
772    
773    
774     /* Check if we are waiting for a word boundry, and there is white space in the text */
775     /* If so, write the word, then reset the structure, then write the rest of the text. */
776    
777     if ( parse_data->flush_word )
778     {
779     /* look for whitespace */
780     char *c = parse_data->ISO_Latin1.buffer;
781     int i;
782     for ( i=0; i < parse_data->ISO_Latin1.cur; i++ )
783     if ( isspace( (int)c[i] ) )
784     {
785     append_buffer( &parse_data->text_buffer, parse_data->ISO_Latin1.buffer, i );
786     flush_buffer( parse_data, 1 ); // Flush the entire buffer
787    
788     parse_data->structure[parse_data->flush_word-1]--; // now it's ok to turn of the structure bit
789     parse_data->flush_word = 0;
790    
791     /* flush the rest */
792     append_buffer( &parse_data->text_buffer, &c[i], parse_data->ISO_Latin1.cur - i );
793    
794     return;
795     }
796     }
797    
798    
799    
800     /* Buffer the text */
801     append_buffer( &parse_data->text_buffer, parse_data->ISO_Latin1.buffer, parse_data->ISO_Latin1.cur );
802    
803     /* Some day, might want to have a separate property buffer if need to collect more than plain text */
804     // append_buffer( &parse_data->prop_buffer, txt, txtlen );
805    
806    
807    
808     }
809    
810     /*********************************************************************
811     * ignorableWhitespace handler
812     *
813     * Just adds a space to the buffer
814     *
815     *
816     *********************************************************************/
817    
818     static void Whitespace(void *data, const xmlChar *txt, int txtlen)
819     {
820     PARSE_DATA *parse_data = (PARSE_DATA *)data;
821    
822     append_buffer( &parse_data->text_buffer, " ", 1 ); // could flush buffer, I suppose
823     }
824    
825    
826    
827    
828     /*********************************************************************
829     * Convert UTF-8 to Latin-1
830     *
831     * Buffer is extended/created if needed
832     *
833     *********************************************************************/
834    
835     static void Convert_to_latin1( PARSE_DATA *parse_data, char *txt, int txtlen )
836     {
837     CHAR_BUFFER *buf = &parse_data->ISO_Latin1;
838     int inlen = txtlen;
839     int ret;
840     char *start_buf;
841     char *end_buf = txt + txtlen - 1;
842     int used;
843    
844    
845     /* (re)allocate buf if needed */
846    
847     if ( txtlen >= buf->max )
848     {
849     buf->max = ( buf->max + BUFFER_CHUNK_SIZE+1 < txtlen )
850     ? buf->max + txtlen+1
851     : buf->max + BUFFER_CHUNK_SIZE+1;
852    
853     buf->buffer = erealloc( buf->buffer, buf->max );
854     }
855    
856     buf->cur = 0; /* start at the beginning of the buffer */
857    
858     while( 1 )
859     {
860     used = buf->max - buf->cur; /* size available in buffer */
861     start_buf = &buf->buffer[buf->cur]; /* offset into buffer */
862    
863     /* Returns 0 for OK */
864     ret = UTF8Toisolat1( (unsigned char *)start_buf, &used, (const unsigned char *)txt, &inlen );
865    
866     if ( used > 0 ) // tally up total bytes consumed
867     buf->cur += used;
868    
869     if ( ret == 0 ) // all done
870     return;
871    
872     if ( ret == -2 ) // encoding failed
873     {
874     if ( parse_data->sw->parser_warn_level >= 1 )
875     xmlParserWarning(parse_data->ctxt, "Failed to convert internal UTF-8 to Latin-1.\nReplacing non ISO-8859-1 char with char '%c'\n", ENCODE_ERROR_CHAR);
876    
877    
878     buf->buffer[buf->cur++] = ENCODE_ERROR_CHAR;
879    
880    
881     /* Skip one UTF-8 character -- returns null if not pointing to a UTF-8 char */
882     if ( !(txt = (char *)xmlUTF8Strpos( (const xmlChar *)(&txt[inlen]), 1) ))
883     return;
884    
885     /* Calculate the remaining length of the input string */
886     inlen = (unsigned long)end_buf - (unsigned long)txt + 1;
887    
888     if ( inlen <= 0 )
889     return;
890    
891     start_buf += buf->cur-1;
892     }
893     else
894     {
895     xmlParserWarning(parse_data->ctxt, "Error '%d' converting internal UTF-8 to Latin-1.\n", ret );
896     return;
897     }
898     }
899     }
900    
901    
902     /*********************************************************************
903     * Start of a MetaTag
904     * All XML tags are metatags, but for HTML there's special handling.
905     *
906     * Call with:
907     * parse_data
908     * tag = tag to look for as a metaname/property
909     * endtag = tag to look for as the ending tag (since might be different from start tag)
910     * meta_append = if zero, tells push that this is a new meta
911     * prop_append otherwise, says it's a sibling of a previous call
912     * (Argh Jan 29, 2001 -- now I don't remember what that _append does!)
913     * (it's for working with xml attributes)
914     * is_html_tag = prevents UndefinedMetaTags from being applied to html tags
915     *
916     * <foo class=bar> can start two meta tags "foo" and "foo.bar". But "bar"
917     * will end both tags.
918     *
919     *
920     *********************************************************************/
921     static void start_metaTag( PARSE_DATA *parse_data, char * tag, char *endtag, int *meta_append, int *prop_append, int is_html_tag )
922     {
923     SWISH *sw = parse_data->sw;
924     struct metaEntry *m = NULL;
925    
926    
927     /* Bump on all meta names, unless overridden */
928     if (!is_html_tag && !isDontBumpMetaName(sw->dontbumpstarttagslist, tag))
929     parse_data->word_pos++;
930    
931    
932     /* check for ignore tag (should probably remove char handler for speed) */
933     // Should specific property names and meta names override this?
934    
935     if ( isIgnoreMetaName( sw, tag ) )
936     {
937     /* shouldn't need to flush buffer since it's just blocking out a section and should be balanced */
938     /* but need to due to the weird way the char buffer is used (and shared with props) and how metatags are assigned to the buffer */
939     /* basically, since flush_buffer looks at the ignore flag and always clears the buffer, need to do it now */
940     /* flush_buffer really should not be in the business of checking the ignore flag, and rather we need to keep two buffers -- or maybe just always flush with any change */
941    
942     flush_buffer( parse_data, 1 );
943    
944     push_stack( &parse_data->meta_stack, endtag, NULL, meta_append, 1 );
945     push_stack( &parse_data->prop_stack, endtag, NULL, prop_append, 1 );
946     parse_data->structure[IN_META_BIT]++; // so we are in balance with pop_stack
947     return;
948     }
949    
950    
951     /* Check for metaNames */
952    
953     if ( !(m = getMetaNameByName( parse_data->header, tag)) )
954     {
955    
956     if ( !is_html_tag )
957     {
958     if ( sw->UndefinedMetaTags == UNDEF_META_AUTO )
959     {
960     if (sw->verbose)
961     printf("**Adding automatic MetaName '%s' found in file '%s'\n", tag, parse_data->fprop->real_path);
962    
963     m = addMetaEntry( parse_data->header, tag, META_INDEX, 0);
964     }
965    
966    
967     else if ( sw->UndefinedMetaTags == UNDEF_META_IGNORE ) /* Ignore this block of text for metanames only (props ok) */
968     {
969     flush_buffer( parse_data, 66 ); // flush because we must still continue to process, and structures might change
970     push_stack( &parse_data->meta_stack, endtag, NULL, meta_append, 1 );
971     parse_data->structure[IN_META_BIT]++; // so we are in balance with pop_stack
972     /* must fall though to property check */
973     }
974     }
975     }
976    
977    
978    
979     if ( m ) /* Is a meta name */
980     {
981     flush_buffer( parse_data, 6 ); /* new meta tag, so must flush */
982     push_stack( &parse_data->meta_stack, endtag, m, meta_append, 0 );
983     parse_data->structure[IN_META_BIT]++;
984     }
985    
986     else if ( !is_html_tag )
987     {
988     /* If set to "error" on undefined meta tags, then error */
989     if ( sw->UndefinedMetaTags == UNDEF_META_ERROR )
990     progerr("Found meta name '%s' in file '%s', not listed as a MetaNames in config", tag, parse_data->fprop->real_path);
991    
992     else if ( DEBUG_MASK & DEBUG_PARSED_TAGS )
993     debug_show_tag( tag, parse_data, 1, "(undefined meta name - no action)" );
994     }
995    
996    
997     /* Check property names -- allows HTML tags as property names */
998    
999    
1000     if ( (m = getPropNameByName( parse_data->header, tag)) )
1001     {
1002     flush_buffer( parse_data, 7 ); // flush since it's a new meta tag
1003     push_stack( &parse_data->prop_stack, endtag, m, prop_append, 0 );
1004     }
1005    
1006    
1007    
1008     /* Look to enable StoreDescription - allow any tag */
1009     /* Don't need to flush since this has it's own buffer */
1010    
1011     // This should really be a property, and use aliasing as needed
1012     {
1013     SUMMARY_INFO *summary = &parse_data->summary;
1014    
1015     if ( summary->tag && (strcmp( tag, summary->tag ) == 0 ))
1016     {
1017     /* Flush data in buffer */
1018     if ( 0 == summary->active )
1019     flush_buffer( parse_data, 1 );
1020    
1021     summary->active++;
1022     }
1023     }
1024    
1025     }
1026    
1027    
1028     /*********************************************************************
1029     * End of a MetaTag
1030     * All XML tags are metatags, but for HTML there's special handling.
1031     *
1032     *********************************************************************/
1033     static void end_metaTag( PARSE_DATA *parse_data, char * tag, int is_html_tag )
1034     {
1035    
1036     if ( pop_stack_ifMatch( parse_data, &parse_data->meta_stack, tag ) )
1037     parse_data->structure[IN_META_BIT]--;
1038    
1039    
1040     /* Out of a property? */
1041     pop_stack_ifMatch( parse_data, &parse_data->prop_stack, tag );
1042    
1043    
1044     /* Don't allow matching across tag boundry */
1045     if (!is_html_tag && !isDontBumpMetaName(parse_data->sw->dontbumpendtagslist, tag))
1046     parse_data->word_pos++;
1047    
1048    
1049    
1050     /* Look to disable StoreDescription */
1051     {
1052     SUMMARY_INFO *summary = &parse_data->summary;
1053     if ( summary->tag && (strcasecmp( tag, summary->tag ) == 0 ))
1054     {
1055     /* Flush data in buffer */
1056     if ( 1 == summary->active )
1057     flush_buffer( parse_data, 1 ); // do first since flush buffer looks at summary->active
1058    
1059     summary->active--;
1060     }
1061     }
1062    
1063     }
1064    
1065    
1066     /*********************************************************************
1067     * Checks the HTML tag, and sets the "structure"
1068     * Also deals with FileRules title
1069     * In general, flushes the character buffer due to the change in structure.
1070     *
1071     * returns false if not a valid HTML tag (which might be a "fake" metaname)
1072     *
1073     *********************************************************************/
1074    
1075     static int check_html_tag( PARSE_DATA *parse_data, char * tag, int start )
1076     {
1077     int is_html_tag = 1;
1078     int bump = start ? +1 : -1;
1079    
1080     /* Check for structure bits */
1081    
1082    
1083     /** HEAD **/
1084    
1085     if ( strcmp( tag, "head" ) == 0 )
1086     {
1087     flush_buffer( parse_data, 10 );
1088     parse_data->structure[IN_HEAD_BIT] += bump;
1089    
1090     /* Check for NoContents - can quit looking now once out of <head> block */
1091    
1092     if ( !start && parse_data->fprop->index_no_content )
1093     abort_parsing( parse_data, 1 );
1094    
1095     }
1096    
1097    
1098    
1099     /** TITLE **/
1100    
1101     // Note: I think storing the title words by default should be optional.
1102     // Someone might not want to search title tags, if if they don't they are
1103     // screwed since title by default ranks higher than body words.
1104    
1105    
1106     else if ( strcmp( tag, "title" ) == 0 )
1107     {
1108     /* Can't flush buffer until we have looked at the title */
1109    
1110     if ( !start )
1111     {
1112     struct MOD_FS *fs = parse_data->sw->FS;
1113    
1114     /* Check isoktitle - before NoContents? */
1115     if ( match_regex_list( parse_data->text_buffer.buffer, fs->filerules.title) )
1116     {
1117     abort_parsing( parse_data, -2 );
1118     return 1;
1119     }
1120    
1121     /* Check for NoContents - abort since all we need is the title text */
1122     if ( parse_data->fprop->index_no_content )
1123     abort_parsing( parse_data, 1 );
1124    
1125    
1126     }
1127     else
1128     /* In start tag, allow capture of text (NoContents sets ignore_flag at start) */
1129     if ( parse_data->fprop->index_no_content )
1130     parse_data->meta_stack.ignore_flag--;
1131    
1132    
1133     /* Now it's ok to flush */
1134     flush_buffer( parse_data, 11 );
1135    
1136    
1137     /* If title is a property, turn on the property flag */
1138     if ( parse_data->titleProp )
1139     parse_data->titleProp->in_tag = start ? 1 : 0;
1140    
1141    
1142     /* If title is a metaname, turn on the indexing flag */
1143     if ( parse_data->titleMeta )
1144     {
1145     parse_data->titleMeta->in_tag = start ? 1 : 0;
1146     parse_data->swishdefaultMeta->in_tag = start ? 1 : 0;
1147     }
1148    
1149    
1150    
1151     parse_data->word_pos++;
1152     parse_data->structure[IN_TITLE_BIT] += bump;
1153     }
1154    
1155    
1156    
1157     /** BODY **/
1158    
1159     else if ( strcmp( tag, "body" ) == 0 )
1160     {
1161     flush_buffer( parse_data, 12 );
1162     parse_data->structure[IN_BODY_BIT] += bump;
1163     parse_data->word_pos++;
1164     }
1165    
1166    
1167    
1168     /** H1 HEADINGS **/
1169    
1170     /* This should be split so know different level for ranking */
1171     else if ( tag[0] == 'h' && isdigit((int) tag[1]))
1172     {
1173     flush_buffer( parse_data, 13 );
1174     parse_data->structure[IN_HEADER_BIT] += bump;
1175     }
1176    
1177    
1178    
1179     /** EMPHASIZED **/
1180    
1181     /* These should not be hard coded */
1182    
1183     else if ( !strcmp( tag, "em" ) || !strcmp( tag, "b" ) || !strcmp( tag, "strong" ) || !strcmp( tag, "i" ) )
1184     {
1185     /* This is hard. The idea is to not break up words. But messes up the structure
1186     * ie: "this is b<b>O</b>ld word" so this would only flush "this is" on <b>,
1187     * and </b> would not flush anything. The PROBLEM is that then will make the next words
1188     * have a IN_EMPHASIZED structure. To "fix", I set a flag to flush at next word boundry.
1189     */
1190     flush_buffer( parse_data, 0 ); // flush up to current word (leaving any leading chars in buffer)
1191    
1192     if ( start )
1193     parse_data->structure[IN_EMPHASIZED_BIT]++;
1194     else
1195     {
1196     /* If there is something in the buffer then delay turning off the flag until whitespace is found */
1197     if ( parse_data->text_buffer.cur )
1198     /* Flag to flush at next word boundry */
1199     parse_data->flush_word = IN_EMPHASIZED_BIT + 1; // + 1 because we might need to use zero some day
1200     else
1201     parse_data->structure[IN_EMPHASIZED_BIT]--;
1202     }
1203    
1204    
1205     }
1206    
1207    
1208    
1209    
1210     /* Now, look for reasons to add whitespace
1211     * img is not really, as someone might use an image to make up a word, but
1212     * commonly an image would split up text.
1213     * other tags: frame?
1214     */
1215    
1216     if ( !strcmp( tag, "br" ) || !strcmp( tag, "img" ) )
1217     append_buffer( &parse_data->text_buffer, " ", 1 ); // could flush buffer, I suppose
1218     else
1219     {
1220     const htmlElemDesc *element = htmlTagLookup( (const xmlChar *)tag );
1221    
1222     if ( !element )
1223     is_html_tag = 0; // flag that this might be a meta name
1224    
1225     else if ( !element->isinline )
1226     append_buffer( &parse_data->text_buffer, " ", 1 ); // could flush buffer, I suppose
1227     }
1228    
1229    
1230    
1231    
1232     return is_html_tag;
1233     }
1234    
1235     /*********************************************************************
1236     * Allow <foo class="bar"> to start "foo.bar" meta tag
1237     *
1238     * Returns true if any found
1239     *
1240     *********************************************************************/
1241     static int start_XML_ClassAttributes( PARSE_DATA *parse_data, char *tag, const char **attr, int *meta_append, int *prop_append )
1242     {
1243     char tagbuf[256]; /* we have our limits */
1244     char *t;
1245     int i;
1246     int taglen = strlen( tag );
1247     SWISH *sw = parse_data->sw;
1248     int found = 0;
1249    
1250     strcpy( tagbuf, tag );
1251     t = tagbuf + taglen;
1252     *t = '.'; /* hard coded! */
1253     t++;
1254    
1255    
1256     for ( i = 0; attr[i] && attr[i+1]; i+=2 )
1257     {
1258     if ( !isXMLClassAttribute( sw, (char *)attr[i]) )
1259     continue;
1260    
1261    
1262     /* Is the tag going to be too long? */
1263     if ( strlen( (char *)attr[i+1] ) + taglen + 2 > 256 )
1264     {
1265     warning("ClassAttribute on tag '%s' too long\n", tag );
1266     continue;
1267     }
1268    
1269    
1270     /* All metanames are currently lowercase -- would be better to force this in metanames.c */
1271     strtolower( tagbuf );
1272    
1273     strcpy( t, (char *)attr[i+1] ); /* create tag.attribute metaname */
1274     start_metaTag( parse_data, tagbuf, tag, meta_append, prop_append, 0 );
1275     found++;
1276    
1277     /* Now, nest attributes */
1278     if ( sw->UndefinedXMLAttributes != UNDEF_META_DISABLE )
1279     index_XML_attributes( parse_data, tagbuf, attr );
1280    
1281     }
1282    
1283     return found;
1284    
1285     }
1286    
1287     /*********************************************************************
1288     * check if a tag is an XMLClassAttributes
1289     *
1290     * Note: this returns a pointer to the config set tag, so don't free it!
1291     * Duplicate code!
1292     *
1293     * This does a case-insensitive lookup
1294     *
1295     *
1296     *********************************************************************/
1297    
1298     static char *isXMLClassAttribute(SWISH * sw, char *tag)
1299     {
1300     struct swline *tmplist = sw->XMLClassAttributes;
1301    
1302     if (!tmplist)
1303     return 0;
1304    
1305     while (tmplist)
1306     {
1307     if (strcasecmp(tag, tmplist->line) == 0)
1308     return tmplist->line;
1309    
1310     tmplist = tmplist->next;
1311     }
1312    
1313     return NULL;
1314     }
1315    
1316    
1317    
1318     /*********************************************************************
1319     * This extracts out the attributes and contents and indexes them
1320     *
1321     *********************************************************************/
1322     static void index_XML_attributes( PARSE_DATA *parse_data, char *tag, const char **attr )
1323     {
1324     char tagbuf[256]; /* we have our limits */
1325     char *content;
1326     char *t;
1327     int i;
1328     int meta_append;
1329     int prop_append;
1330     int taglen = strlen( tag );
1331     SWISH *sw = parse_data->sw;
1332     UndefMetaFlag tmp_undef = sw->UndefinedMetaTags; // save
1333    
1334     sw->UndefinedMetaTags = sw->UndefinedXMLAttributes;
1335    
1336    
1337     strcpy( tagbuf, tag );
1338     t = tagbuf + taglen;
1339     *t = '.'; /* hard coded! */
1340     t++;
1341    
1342     for ( i = 0; attr[i] && attr[i+1]; i+=2 )
1343     {
1344     meta_append = 0;
1345     prop_append = 0;
1346    
1347     /* Skip attributes that are XMLClassAttribues */
1348     if ( isXMLClassAttribute( sw, (char *)attr[i] ) )
1349     continue;
1350    
1351    
1352     if ( strlen( (char *)attr[i] ) + taglen + 2 > 256 )
1353     {
1354     warning("Attribute '%s' on tag '%s' too long to build metaname\n", (char *)attr[i], tag );
1355     continue;
1356     }
1357    
1358     strcpy( t, (char *)attr[i] ); /* create tag.attribute metaname */
1359     content = (char *)attr[i+1];
1360    
1361     if ( !*content )
1362     continue;
1363    
1364     strtolower( tagbuf );
1365    
1366    
1367    
1368     flush_buffer( parse_data, 1 ); // isn't needed, right?
1369     start_metaTag( parse_data, tagbuf, tagbuf, &meta_append, &prop_append, 0 );
1370     char_hndl( parse_data, content, strlen( content ) );
1371     end_metaTag( parse_data, tagbuf, 0 );
1372     }
1373    
1374     sw->UndefinedMetaTags = tmp_undef;
1375     }
1376    
1377    
1378    
1379     /*********************************************************************
1380     * Deal with html's <meta name="foo" content="bar">
1381     * Simply calls start and end meta, and passes content
1382     *
1383     *********************************************************************/
1384    
1385     static void process_htmlmeta( PARSE_DATA *parse_data, const char **attr )
1386     {
1387     char *metatag = NULL;
1388     char *content = NULL;
1389     int meta_append = 0;
1390     int prop_append = 0;
1391    
1392     int i;
1393    
1394     /* Don't add any meta data while looking for just the title */
1395     if ( parse_data->fprop->index_no_content )
1396     return;
1397    
1398     for ( i = 0; attr[i] && attr[i+1]; i+=2 )
1399     {
1400     if ( (strcmp( attr[i], "name" ) == 0 ) && attr[i+1] )
1401     metatag = (char *)attr[i+1];
1402    
1403     else if ( (strcmp( attr[i], "content" ) == 0 ) && attr[i+1] )
1404     content = (char *)attr[i+1];
1405     }
1406    
1407    
1408     if ( metatag && content )
1409     {
1410    
1411     /* Robots exclusion: http://www.robotstxt.org/wc/exclusion.html#meta */
1412     if ( !strcasecmp( metatag, "ROBOTS") && lstrstr( content, "NOINDEX" ) )
1413     {
1414     if ( parse_data->sw->obeyRobotsNoIndex )
1415     abort_parsing( parse_data, -3 );
1416    
1417     return;
1418     }
1419    
1420     /* Process as a start -> end tag sequence */
1421     strtolower( metatag );
1422    
1423     flush_buffer( parse_data, 111 );
1424     start_metaTag( parse_data, metatag, metatag, &meta_append, &prop_append, 0 );
1425     char_hndl( parse_data, content, strlen( content ) );
1426     end_metaTag( parse_data, metatag, 0 );
1427     flush_buffer( parse_data, 112 );
1428     }
1429    
1430     }
1431    
1432    
1433     /*********************************************************************
1434     * Append character data to the end of the buffer
1435     *
1436     * Buffer is extended/created if needed
1437     *
1438     * ToDo: Flush buffer if it gets too large
1439     *
1440     *
1441     *********************************************************************/
1442    
1443     static void append_buffer( CHAR_BUFFER *buf, const char *txt, int txtlen )
1444     {
1445    
1446    
1447     if ( !txtlen ) // shouldn't happen
1448     return;
1449    
1450     /* (re)allocate buf if needed */
1451    
1452     if ( buf->cur + txtlen >= buf->max )
1453     {
1454     buf->max = ( buf->max + BUFFER_CHUNK_SIZE+1 < buf->cur + txtlen )
1455     ? buf->cur + txtlen+1
1456     : buf->max + BUFFER_CHUNK_SIZE+1;
1457    
1458     buf->buffer = erealloc( buf->buffer, buf->max+1 );
1459     }
1460    
1461     memcpy( (void *) &(buf->buffer[buf->cur]), txt, txtlen );
1462     buf->cur += txtlen;
1463     buf->buffer[buf->cur] = '\0'; /* seems like a nice thing to do -- only used now in title check */
1464     }
1465    
1466    
1467    
1468    
1469     /*********************************************************************
1470     * Flush buffer - adds words to index, and properties
1471     *
1472     * If the clear flag is set then the entire buffer is flushed.
1473     * Otherwise, every thing up to the last *partial* word is flushed.
1474     * It's partial if there is not white-space at the very end of the buffer.
1475     *
1476     * This prevents some<b>long</b>word from being flushed into part words.
1477     *
1478     *********************************************************************/
1479     static void flush_buffer( PARSE_DATA *parse_data, int clear )
1480     {
1481     CHAR_BUFFER *buf = &parse_data->text_buffer;
1482     SWISH *sw = parse_data->sw;
1483     int structure = get_structure( parse_data );
1484     int orig_end = buf->cur;
1485     char save_char = '?';
1486     char *c;
1487    
1488     /* anything to do? */
1489     if ( !buf->cur )
1490     return;
1491    
1492     /* look back for word boundry when "clear" is not set */
1493    
1494     if ( !clear && !isspace( (int)buf->buffer[buf->cur-1] ) ) // flush up to current word
1495     {
1496     while ( buf->cur > 0 && !isspace( (int)buf->buffer[buf->cur-1] ) )
1497     buf->cur--;
1498    
1499     if ( !buf->cur ) // then there's only a single word in the buffer
1500     {
1501     buf->cur = orig_end;
1502     if ( buf->cur < BUFFER_CHUNK_SIZE ) // should reall look at indexf->header.maxwordlimit
1503     return; // but just trying to keep the buffer from growing too large
1504     }
1505    
1506     save_char = buf->buffer[buf->cur];
1507     }
1508    
1509    
1510     /* Mark the end of the buffer - should switch over to using a length to avoid strlen */
1511    
1512     buf->buffer[buf->cur] = '\0';
1513    
1514    
1515     /* Make sure there some non-whitespace chars to print */
1516    
1517     c = buf->buffer;
1518     while ( *c && isspace( (int)*c ) )
1519     c++;
1520    
1521    
1522     if ( *c )
1523     {
1524     /* Index the text */
1525     if ( !parse_data->meta_stack.ignore_flag ) // this really is wrong -- should not check ignore here. Fix should be to use two buffers
1526     parse_data->total_words +=
1527     indexstring( sw, c, parse_data->filenum, structure, 0, NULL, &(parse_data->word_pos) );
1528    
1529     /* Add the properties */
1530     addDocProperties( parse_data->header, &(parse_data->thisFileEntry->docProperties), (unsigned char *)buf->buffer, buf->cur, parse_data->fprop->real_path );
1531    
1532    
1533     /* yuck - addDocProperties should do this. Ok, add to summary, if active */
1534     {
1535     SUMMARY_INFO *summary = &parse_data->summary;
1536     if ( summary->active )
1537     addDocProperty( &(parse_data->thisFileEntry->docProperties), summary->meta, (unsigned char *)buf->buffer, buf->cur, 0 );
1538     }
1539     }
1540    
1541    
1542     /* clear the buffer */
1543    
1544     if ( orig_end && orig_end > buf->cur )
1545     {
1546     buf->buffer[buf->cur] = save_char; // put back the char where null was placed
1547     memmove( buf->buffer, &buf->buffer[buf->cur], orig_end - buf->cur );
1548     buf->cur = orig_end - buf->cur;
1549     }
1550     else
1551     buf->cur = 0;
1552    
1553     }
1554    
1555    
1556    
1557     /*********************************************************************
1558     * Comments
1559     *
1560     * Should be able to call the char_hndl
1561     * Allows comments to enable/disable indexing a block by either:
1562     *
1563     * <!-- noindex -->
1564     * <!-- index -->
1565     * <!-- SwishCommand noindex -->
1566     * <!-- SwishCommand index -->
1567     *
1568     *
1569     *
1570     * To Do:
1571     * Can't use DontBump with comments. Might need a config variable for that.
1572     *
1573     *********************************************************************/
1574     static void comment_hndl(void *data, const char *txt)
1575     {
1576     PARSE_DATA *parse_data = (PARSE_DATA *)data;
1577     SWISH *sw = parse_data->sw;
1578     int structure = get_structure( parse_data );
1579     char *swishcmd;
1580     char *comment_text = str_skip_ws( (char *)txt );
1581     int found = 0;
1582    
1583    
1584     str_trim_ws( comment_text );
1585     if ( ! *comment_text )
1586     return;
1587    
1588    
1589     /* Strip off SwishCommand - might be for future use */
1590     if ( ( swishcmd = lstrstr( comment_text, "SwishCommand" )) && swishcmd == comment_text )
1591     {
1592     comment_text = str_skip_ws( comment_text + strlen( "SwishCommand" ) );
1593     found++;
1594     }
1595    
1596     if ( !strcasecmp( comment_text, "noindex" ) )
1597     {
1598     parse_data->swish_noindex++;
1599     return;
1600     }
1601     else if ( !strcasecmp( comment_text, "index" ) )
1602     {
1603     if ( parse_data->swish_noindex )
1604     parse_data->swish_noindex--;
1605    
1606     return;
1607     }
1608    
1609    
1610     if( found || !sw->indexComments )
1611     return;
1612    
1613    
1614     /* Bump position around comments - hard coded, always done to prevent phrase matching */
1615     parse_data->word_pos++;
1616    
1617     /* Index the text */
1618     parse_data->total_words +=
1619     indexstring( sw, comment_text, parse_data->filenum, structure | IN_COMMENTS, 0, NULL, &(parse_data->word_pos) );
1620    
1621    
1622     parse_data->word_pos++;
1623    
1624     }
1625    
1626    
1627    
1628     /*********************************************************************
1629     * check if a tag is an IgnoreTag
1630     *
1631     * Note: this returns a pointer to the config set tag, so don't free it!
1632     *
1633     *
1634     *********************************************************************/
1635    
1636     static char *isIgnoreMetaName(SWISH * sw, char *tag)
1637     {
1638     struct swline *tmplist = sw->ignoremetalist;
1639    
1640     if (!tmplist)
1641     return 0;
1642    
1643     while (tmplist)
1644     {
1645     if (strcmp(tag, tmplist->line) == 0)
1646     return tmplist->line;
1647    
1648     tmplist = tmplist->next;
1649     }
1650    
1651     return NULL;
1652     }
1653    
1654     /******************************************************************
1655     * Warning and Error Messages
1656     *
1657     ******************************************************************/
1658    
1659     static void error(void *data, const char *msg, ...)
1660     {
1661     va_list args;
1662     PARSE_DATA *parse_data = (PARSE_DATA *)data;
1663     char str[1000];
1664    
1665     va_start(args, msg);
1666     vsnprintf(str, 1000, msg, args );
1667     va_end(args);
1668     xmlParserError(parse_data->ctxt, str);
1669     }
1670    
1671     static void warning(void *data, const char *msg, ...)
1672     {
1673     va_list args;
1674     PARSE_DATA *parse_data = (PARSE_DATA *)data;
1675     char str[1000];
1676    
1677     va_start(args, msg);
1678     vsnprintf(str, 1000, msg, args );
1679     va_end(args);
1680     xmlParserWarning(parse_data->ctxt, str);
1681     }
1682    
1683    
1684     /*********************************************************************
1685     * Index ALT tabs
1686     *
1687     *
1688     *********************************************************************/
1689     static void index_alt_tab( PARSE_DATA *parse_data, const char **attr )
1690     {
1691     int meta_append = 0;
1692     int prop_append = 0;
1693     char *tagbuf = parse_data->sw->IndexAltTagMeta;
1694     char *alt_text = extract_html_links( parse_data, attr, NULL, "alt");
1695    
1696    
1697     if ( !alt_text )
1698     return;
1699    
1700     /* Index as regular text? */
1701     if ( !parse_data->sw->IndexAltTagMeta )
1702     {
1703     char_hndl( parse_data, alt_text, strlen( alt_text ) );
1704     return;
1705     }
1706    
1707     flush_buffer( parse_data, 1 );
1708     start_metaTag( parse_data, tagbuf, tagbuf, &meta_append, &prop_append, 0 );
1709     char_hndl( parse_data, alt_text, strlen( alt_text ) );
1710     end_metaTag( parse_data, tagbuf, 0 );
1711     }
1712    
1713    
1714    
1715    
1716     /*********************************************************************
1717     * Extract out links for indexing
1718     *
1719     * Pass in a metaname, and a tag
1720     *
1721     *********************************************************************/
1722    
1723     static char *extract_html_links( PARSE_DATA *parse_data, const char **attr, struct metaEntry *meta_entry, char *tag )
1724     {
1725     char *href = NULL;
1726     int i;
1727     int structure = get_structure( parse_data );
1728     char *absoluteURL;
1729     SWISH *sw = parse_data->sw;
1730    
1731    
1732     if ( !attr )
1733     return NULL;
1734    
1735     for ( i = 0; attr[i] && attr[i+1]; i+=2 )
1736     if ( (strcmp( attr[i], tag ) == 0 ) && attr[i+1] )
1737     href = (char *)attr[i+1];
1738    
1739     if ( !href )
1740     return NULL;
1741    
1742     if ( !meta_entry ) /* The case for <BASE> */
1743     return href;
1744    
1745    
1746     /* Now, fixup the URL, if possible */
1747    
1748     if ( sw->AbsoluteLinks ) // ?? || parse_data->baseURL??? always fix up if a <BASE> tag?
1749     {
1750     char *base = parse_data->baseURL
1751     ? parse_data->baseURL
1752     : parse_data->fprop->real_path;
1753    
1754     absoluteURL = (char *)xmlBuildURI( (xmlChar *)href, (xmlChar *)base );
1755     }
1756     else
1757     absoluteURL = NULL;
1758    
1759    
1760    
1761     /* Index the text */
1762     parse_data->total_words +=
1763     indexstring( sw, absoluteURL ? absoluteURL : href, parse_data->filenum, structure, 1, &meta_entry->metaID, &(parse_data->word_pos) );
1764    
1765     if ( absoluteURL )
1766     xmlFree( absoluteURL );
1767    
1768     return href;
1769     }
1770    
1771    
1772    
1773     /* This doesn't look like the best method */
1774    
1775     static void abort_parsing( PARSE_DATA *parse_data, int abort_code )
1776     {
1777     parse_data->abort = abort_code; /* Flag that the we are all done */
1778     /* Disable parser */
1779     parse_data->SAXHandler->startElement = (startElementSAXFunc)NULL;
1780     parse_data->SAXHandler->endElement = (endElementSAXFunc)NULL;
1781     parse_data->SAXHandler->characters = (charactersSAXFunc)NULL;
1782     }
1783    
1784    
1785     /* This sets the current structure context (IN_HEAD, IN_BODY, etc) */
1786    
1787     static int get_structure( PARSE_DATA *parse_data )
1788     {
1789     int structure = IN_FILE;
1790    
1791     /* Set structure bits */
1792     if ( parse_data->parsing_html )
1793     {
1794     int i;
1795     for ( i = 0; i <= STRUCTURE_END; i++ )
1796     if ( parse_data->structure[i] )
1797     structure |= ( 1 << i );
1798     }
1799     return structure;
1800     }
1801    
1802     /*********************************************************************
1803     * Push a meta entry onto the stack
1804     *
1805     * Call With:
1806     * stack = which stack to use
1807     * tag = Element (tag name) to be used to match end tag
1808     * met = metaEntry to save
1809     * append = append to current if one (will be incremented)
1810     * ignore = if true, then flag as an ignore block and bump ignore counter
1811     *
1812     * Returns:
1813     * void
1814     *
1815     * ToDo:
1816     * move to Mem_Zone?
1817     *
1818     *
1819     *********************************************************************/
1820    
1821     static void push_stack( MetaStack *stack, char *tag, struct metaEntry *meta, int *append, int ignore )
1822     {
1823     MetaStackElementPtr node;
1824    
1825    
1826     if ( DEBUG_MASK & DEBUG_PARSED_TAGS )
1827     {
1828     int i;
1829     for (i=0; i<stack->pointer; i++)
1830     printf(" ");
1831    
1832     printf("<%s> (%s [%s]%s)\n", tag, stack->is_meta ? "meta" : "property", !meta ? "no meta name defined" : meta->metaName, ignore ? " *Start Ignore*" : "" );
1833     }
1834    
1835    
1836     /* Create a new node ( MetaStackElement already has one byte allocated for string ) */
1837     node = (MetaStackElementPtr) emalloc( sizeof( MetaStackElement ) + strlen( tag ) );
1838     node->next = NULL;
1839    
1840     /* Turn on the meta */
1841     if ( (node->meta = meta) )
1842     meta->in_tag++;
1843    
1844     if ( ( node->ignore = ignore ) ) /* entering a block to ignore */
1845     stack->ignore_flag++;
1846    
1847    
1848     strcpy( node->tag, tag );
1849    
1850    
1851    
1852    
1853     if ( !(*append)++ )
1854     {
1855     /* reallocate stack buffer if needed */
1856     if ( stack->pointer >= stack->maxsize )
1857     {
1858     progwarn("swish parser adding more stack space for tag %s. from %d to %d", tag, stack->maxsize, stack->maxsize+STACK_SIZE );
1859    
1860     stack->maxsize += STACK_SIZE;
1861     stack->stack = (MetaStackElementPtr *)erealloc( stack->stack, sizeof( MetaStackElementPtr ) * stack->maxsize );
1862     }
1863    
1864     stack->stack[stack->pointer++] = node;
1865     }
1866     else // prepend to the list
1867     {
1868     if ( !stack->pointer )
1869     progerr("Tried to append tag %s to stack, but stack is empty", tag );
1870    
1871     node->next = stack->stack[stack->pointer - 1];
1872     stack->stack[stack->pointer - 1] = node;
1873     }
1874     }
1875    
1876     /*********************************************************************
1877     * Pop the stack if the tag matches the last entry
1878     * Will turn off all metas associated with this tag level
1879     *
1880     * Call With:
1881     * parse_data = to automatically flush
1882     * stack = which stack to use
1883     * tag = Element (tag name) to be used for removal
1884     *
1885     * Returns:
1886     * true if tag matched
1887     *
1888     *********************************************************************/
1889    
1890     static int pop_stack_ifMatch( PARSE_DATA *parse_data, MetaStack *stack, char *tag )
1891     {
1892    
1893     /* return if stack is empty */
1894     if ( !stack->pointer )
1895     return 0;
1896    
1897    
1898    
1899     /* return if doesn't match the tag at the top of the stack */
1900    
1901     if ( strcmp( stack->stack[stack->pointer - 1]->tag, tag ) != 0 )
1902     return 0;
1903    
1904    
1905     flush_buffer( parse_data, 1 );
1906     pop_stack( stack );
1907    
1908     return 1;
1909     }
1910    
1911     /*********************************************************************
1912     * Pop the stack
1913     * Will turn off all metas associated with this tag level
1914     *
1915     * Call With:
1916     * stack = which stack to use
1917     *
1918     * Returns:
1919     * the stack pointer
1920     *
1921     *********************************************************************/
1922    
1923     static int pop_stack( MetaStack *stack )
1924     {
1925     MetaStackElementPtr node, this;
1926    
1927    
1928     /* return if stack is empty */
1929     if ( !stack->pointer )
1930     return 0;
1931    
1932     node = stack->stack[--stack->pointer];
1933    
1934     /* Now pop the stack. */
1935    
1936     // Note that some end tags can pop more than one tag
1937     // <foo class="bar"> can be to starting metanames <foo> and <foo:bar>, and </foo> pops all.
1938    
1939     while ( node )
1940     {
1941     this = node;
1942    
1943     if ( node->meta )
1944     node->meta->in_tag--;
1945    
1946     if ( node->ignore )
1947     stack->ignore_flag--;
1948    
1949    
1950     if ( DEBUG_MASK & DEBUG_PARSED_TAGS )
1951     {
1952     int i;
1953     for (i=0; i<stack->pointer; i++)
1954     printf(" ");
1955    
1956     printf("</%s> (%s)%s\n", node->tag, stack->is_meta ? "meta" : "property", node->ignore ? " end ignore" : "" );
1957     }
1958    
1959    
1960     node = node->next;
1961     efree( this );
1962     }
1963    
1964     return stack->pointer;
1965     }
1966    
1967     static int debug_get_indent( INDEXDATAHEADER *header )
1968     {
1969     int i;
1970     int indent = 0;
1971    
1972     for (i = 0; i < header->metaCounter; i++)
1973     if ( is_meta_index(header->metaEntryArray[i]) )
1974     indent += header->metaEntryArray[i]->in_tag;
1975    
1976     return indent;
1977     }
1978    
1979    
1980    
1981     static void debug_show_tag( char *tag, PARSE_DATA *parse_data, int start, char *message )
1982     {
1983     int indent = debug_get_indent( &parse_data->sw->indexlist->header);
1984     int i;
1985    
1986     for (i=0; i<indent; i++)
1987     printf(" ");
1988    
1989     printf("<%s%s> %s\n", start ? "" : "/", tag, message );
1990     }
1991    
1992     static void debug_show_parsed_text( PARSE_DATA *parse_data, char *txt, int len )
1993     {
1994     int indent = debug_get_indent( &parse_data->sw->indexlist->header);
1995     int i;
1996     char indent_buf[1000];
1997     int last_newline = 0;
1998     int col = 0;
1999    
2000    
2001     indent_buf[0] = '\0';
2002    
2003     for (i=0; i<indent; i++)
2004     strcat( indent_buf, " ");
2005    
2006    
2007     i = 0;
2008     while ( i < len )
2009     {
2010     printf("%s", indent_buf );
2011     col = 0;
2012     last_newline = 0;
2013    
2014     /* skip leading space */
2015     while ( i < len && isspace((int)txt[i] ) )
2016     i++;
2017    
2018     /* print text */
2019     while ( i < len )
2020     {
2021     col++;
2022    
2023    
2024     if ( txt[i] == '\n' )
2025     {
2026     while ( i < len && isspace((int)txt[i] ))
2027     i++;
2028     }
2029    
2030     if ( !isprint((int)txt[i] ))
2031     {
2032     i++;
2033     continue;
2034     }
2035    
2036     printf("%c", txt[i] );
2037     i++;
2038    
2039     if ( (col + strlen( indent_buf ) > 60 && isspace((int)txt[i])) || col + strlen( indent_buf ) > 78 )
2040     {
2041     printf("\n");
2042     last_newline=1;
2043     break;
2044     }
2045     }
2046     }
2047    
2048    
2049     if ( !last_newline )
2050     printf("\n");
2051     }
2052    

  ViewVC Help
Powered by ViewVC 1.1.22