/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/src/parser.c
ViewVC logotype

Contents of /mitgcm.org/devel/buildweb/pkg/swish-e/src/parser.c

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1.1.1 - (show annotations) (download) (vendor branch)
Fri Sep 20 19:47:29 2002 UTC (22 years, 10 months ago) by adcroft
Branch: Import, MAIN
CVS Tags: baseline, HEAD
Changes since 1.1: +0 -0 lines
File MIME type: text/plain
Error occurred while calculating annotation data.
Importing web-site building process.

1 /*
2 $Id: parser.c,v 1.42 2002/08/14 22:08:48 whmoseley Exp $
3 **
4 **
5 ** This program and library is free software; you can redistribute it and/or
6 ** modify it under the terms of the GNU General Public License
7 ** as published by the Free Software Foundation; either version 2
8 ** of the License, or any later version.
9 **
10 ** This program is distributed in the hope that it will be useful,
11 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
12 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 ** GNU General Public License for more details.
14 **
15 **
16 ** 2001-09-21 new HTML parser using libxml2 (http://www.xmlsoft.org/) Bill Moseley
17 **
18 ** Parser reads from the stream in READ_CHUNK_SIZE (after an initial 4 byte chunk
19 ** to determine the document encoding). Text is accumulated in a buffer of
20 ** BUFFER_CHUNK_SIZE (10K?) size. The buffer is flushed when a new metatag
21 ** is found. That buffer will grow, if needed, but now it will attempt
22 ** to flush up to the last word boundary if > BUFFER_CHUNK_SIZE.
23 **
24 ** The buffer is really only flushed when a real metaName or PropertyName is
25 ** found, or when the structure changes -- anything that changes the
26 ** properties of the text that might be in the buffer.
27 **
28 ** An optional arrangement might be to flush the buffer after processing each
29 ** READ_CHUNK_SIZE from the stream (flush to last word). This would limit the
30 ** character buffer size. It might be nice to flush on any meta tag (not just
31 ** tags listed as PropertyNames or MetaNames), but for large XML files one would
32 ** expect some use of Meta/PropertyNames. HTML files should flush more often
33 ** since the structure will change often. Exceptions to this are large <pre>
34 ** sections, but then the append_buffer() routine will force a flush when the buffer
35 ** exceeds BUFFER_CHUNK_SIZE.
36 **
37 ** The TXT buffer does flush after every chunk read.
38 **
39 ** I doubt messing with any of these would change much...
40 **
41 **
42 ** TODO:
43 **
44 ** - FileRules title (and all abort_parsing calls - define some constants)
45 **
46 ** - There's a lot of mixing of xmlChar and char, which will generate warnings.
47 **
48 ** - Add a fprop->orig_path (before ReplaceRules) and a directive BaseURI to be used
49 ** to fixup relative urls (if no <BASE>).
50 ** This would save space in the property file, but probably not enough to worry about.
51 **
52 **
53 ** - UndefinedMetaTags ignore might throw things like structure off since
54 ** processing continues (unlike IgnoreMetaTags). But everything should balance out.
55 **
56 ** - There are two buffers that are created for every file, but these could be done once
57 ** and only expanded when needed. If that would make any difference in indexing speed.
58 **
59 ** - Note that these parse_*() functions get passed a "buffer" which is not used
60 ** (to be compatible with the old swish-e buffer-based parsers)
61 **
62 ** - XML elements and attributes are all converted to lowercase.
63 **
64 */
65
66 /* libxml2 */
67 #include <libxml/HTMLparser.h>
68 #include <libxml/xmlerror.h>
69 #include <libxml/uri.h>
70
71
72 #include <stdarg.h> // for va_list
73 #ifdef HAVE_VARARGS_H
74 #include <varargs.h> // va_list on Win32
75 #endif
76 #include "swish.h"
77 #include "fs.h" // for the title check
78 #include "merge.h"
79 #include "mem.h"
80 #include "string.h"
81 #include "docprop.h"
82 #include "error.h"
83 #include "index.h"
84 #include "metanames.h"
85
86
/*
 * Tunable sizes for this parser (these should eventually live in config.h).
 *
 *   BUFFER_CHUNK_SIZE - size of the buffers used to accumulate text.
 *   READ_CHUNK_SIZE   - size of each chunk read from the input stream
 *                       (4096 seems to cause problems).
 */
#define BUFFER_CHUNK_SIZE 10000
#define READ_CHUNK_SIZE   2048
91
/*
 * Growable character buffer, used to hold text until an end tag is found.
 */
typedef struct
{
    char *buffer;       /* the accumulated text */
    int   cur;          /* number of bytes currently held */
    int   max;          /* allocated size of `buffer` */
    int   defaultID;    /* default ID to use when there are no meta names */
} CHAR_BUFFER;
100
101
102
/*
 * State for the StoreDescription ("summary") feature.
 *
 * The property system could probably deal with StoreDescription in a
 * cleaner way; ideally this parser would not need to know about it at all.
 */
typedef struct
{
    struct metaEntry *meta;         /* the summary property, if any */
    int               save_size;    /* max size saved for later restore */
    char             *tag;          /* tag that delimits the summary */
    int               active;       /* non-zero while inside the summary */
} SUMMARY_INFO;
112
/* Initial number of slots in a meta/property stack (the stack can grow). */
#define STACK_SIZE 255
114
/*
 * One entry on a meta/property stack.  `tag` is declared with a single
 * element and is the last member of the struct.
 */
typedef struct MetaStackElement
{
    struct MetaStackElement *next;      /* chain of *siblings*, if any */
    struct metaEntry        *meta;      /* the meta entry that is in use */
    int                      ignore;    /* set when this meta turned on ignore */
    char                     tag[1];    /* tag to look for */
} MetaStackElement, *MetaStackElementPtr;
121
122 typedef struct {
123 int pointer; // next empty slot in stack
124 int maxsize; // size of stack
125 int ignore_flag; // count of ignores
126 MetaStackElementPtr *stack; // pointer to an array of stack data
127 int is_meta; // is this a metaname or property stack?
128 } MetaStack;
129
130
131
132
133
134
135 /* This struct is returned in all call-back functions as user data */
136
137 typedef struct {
138 CHAR_BUFFER text_buffer; // buffer for collecting text
139 // CHAR_BUFFER prop_buffer; // someday, may want a separate property buffer if want to collect tags within props
140 SUMMARY_INFO summary; // argh.
141 MetaStack meta_stack; // stacks for tracking the nested metas
142 MetaStack prop_stack;
143 int total_words;
144 int word_pos;
145 int filenum;
146 INDEXDATAHEADER *header;
147 SWISH *sw;
148 FileProp *fprop;
149 FileRec *thisFileEntry;
150 int structure[STRUCTURE_END+1];
151 int parsing_html;
152 struct metaEntry *titleProp;
153 struct metaEntry *titleMeta;
154 struct metaEntry *swishdefaultMeta;
155 int flush_word; // flag to flush buffer next time there's a white space.
156 xmlSAXHandlerPtr SAXHandler; // for aborting, I guess.
157 xmlParserCtxtPtr ctxt;
158 CHAR_BUFFER ISO_Latin1; // buffer to hold UTF-8 -> ISO Latin-1 converted text
159 int abort; // flag to stop parsing
160 char *baseURL; // for fixing up relative links
161 int swish_noindex; // swishindex swishnoindex -- for hiding blocks with comments
162 } PARSE_DATA;
163
164
165 /* Prototypes */
166 static void start_hndl(void *data, const char *el, const char **attr);
167 static void end_hndl(void *data, const char *el);
168 static void char_hndl(void *data, const char *txt, int txtlen);
169 static void Whitespace(void *data, const xmlChar *txt, int txtlen);
170 static void append_buffer( CHAR_BUFFER *buf, const char *txt, int txtlen );
171 static void flush_buffer( PARSE_DATA *parse_data, int clear );
172 static void comment_hndl(void *data, const char *txt);
173 static char *isIgnoreMetaName(SWISH * sw, char *tag);
174 static void error(void *data, const char *msg, ...);
175 static void warning(void *data, const char *msg, ...);
176 static void process_htmlmeta( PARSE_DATA *parse_data, const char ** attr );
177 static int check_html_tag( PARSE_DATA *parse_data, char * tag, int start );
178 static void start_metaTag( PARSE_DATA *parse_data, char * tag, char *endtag, int *meta_append, int *prop_append , int is_html_tag );
179 static void end_metaTag( PARSE_DATA *parse_data, char * tag, int is_html_tag );
180 static void init_sax_handler( xmlSAXHandlerPtr SAXHandler, SWISH * sw );
181 static void init_parse_data( PARSE_DATA *parse_data, SWISH * sw, FileProp * fprop, FileRec *fi, xmlSAXHandlerPtr SAXHandler );
182 static void free_parse_data( PARSE_DATA *parse_data );
183 static void Convert_to_latin1( PARSE_DATA *parse_data, char *txt, int txtlen );
184 static int parse_chunks( PARSE_DATA *parse_data );
185
186 static void index_alt_tab( PARSE_DATA *parse_data, const char **attr );
187 static char *extract_html_links( PARSE_DATA *parse_data, const char **attr, struct metaEntry *meta_entry, char *tag );
188 static int read_next_chunk( FileProp *fprop, char *buf, int buf_size, int max_size );
189 static void abort_parsing( PARSE_DATA *parse_data, int abort_code );
190 static int get_structure( PARSE_DATA *parse_data );
191
192 static void push_stack( MetaStack *stack, char *tag, struct metaEntry *meta, int *append, int ignore );
193 static int pop_stack_ifMatch( PARSE_DATA *parse_data, MetaStack *stack, char *tag );
194 static int pop_stack( MetaStack *stack );
195
196 static void index_XML_attributes( PARSE_DATA *parse_data, char *tag, const char **attr );
197 static int start_XML_ClassAttributes( PARSE_DATA *parse_data, char *tag, const char **attr, int *meta_append, int *prop_append );
198 static char *isXMLClassAttribute(SWISH * sw, char *tag);
199
200 static void debug_show_tag( char *tag, PARSE_DATA *parse_data, int start, char *message );
201 static void debug_show_parsed_text( PARSE_DATA *parse_data, char *txt, int len );
202
203
204
205 /*********************************************************************
206 * XML Push parser
207 *
208 * Returns:
209 * Count of words indexed
210 *
211 *
212 *********************************************************************/
213
214 int parse_XML(SWISH * sw, FileProp * fprop, FileRec *fi, char *buffer)
215
216 {
217 xmlSAXHandler SAXHandlerStruct;
218 xmlSAXHandlerPtr SAXHandler = &SAXHandlerStruct;
219 PARSE_DATA parse_data;
220
221
222 init_sax_handler( SAXHandler, sw );
223 init_parse_data( &parse_data, sw, fprop, fi, SAXHandler );
224
225
226 /* Now parse the XML file */
227 return parse_chunks( &parse_data );
228
229 }
230
231 /*********************************************************************
232 * HTML Push parser
233 *
234 * Returns:
235 * Count of words indexed
236 *
237 *********************************************************************/
238
239 int parse_HTML(SWISH * sw, FileProp * fprop, FileRec *fi, char *buffer)
240 {
241 htmlSAXHandler SAXHandlerStruct;
242 htmlSAXHandlerPtr SAXHandler = &SAXHandlerStruct;
243 PARSE_DATA parse_data;
244
245 init_sax_handler( (xmlSAXHandlerPtr)SAXHandler, sw );
246 init_parse_data( &parse_data, sw, fprop, fi, (xmlSAXHandlerPtr)SAXHandler );
247
248
249 parse_data.parsing_html = 1;
250 parse_data.titleProp = getPropNameByName( parse_data.header, AUTOPROPERTY_TITLE );
251 parse_data.titleMeta = getMetaNameByName( parse_data.header, AUTOPROPERTY_TITLE );
252 parse_data.swishdefaultMeta = getMetaNameByName( parse_data.header, AUTOPROPERTY_DEFAULT );
253
254 /* Now parse the HTML file */
255 return parse_chunks( &parse_data );
256
257 }
258
259 /*********************************************************************
260 * TXT "Push" parser
261 *
262 * Returns:
263 * Count of words indexed
264 *
265 *********************************************************************/
266
267 int parse_TXT(SWISH * sw, FileProp * fprop, FileRec *fi, char *buffer)
268 {
269 PARSE_DATA parse_data;
270 int res;
271 char chars[READ_CHUNK_SIZE];
272
273
274
275 /* This does stuff that's not needed for txt */
276 init_parse_data( &parse_data, sw, fprop, fi, NULL );
277
278
279 /* Document Summary */
280 if ( parse_data.summary.meta && parse_data.summary.meta->max_len )
281 parse_data.summary.active++;
282
283
284 while ( (res = read_next_chunk( fprop, chars, READ_CHUNK_SIZE, sw->truncateDocSize )) )
285 {
286 append_buffer( &parse_data.text_buffer, chars, res );
287 flush_buffer( &parse_data, 0 ); // flush upto whitespace
288
289
290 /* turn off summary when we exceed size */
291 if ( parse_data.summary.meta && parse_data.summary.meta->max_len && fprop->bytes_read > parse_data.summary.meta->max_len )
292 parse_data.summary.active = 0;
293
294 }
295
296 flush_buffer( &parse_data, 1 );
297 free_parse_data( &parse_data );
298 return parse_data.total_words;
299 }
300
301
302 /*********************************************************************
303 * Parse chunks (used for both XML and HTML parsing)
304 * Creates the parsers, reads in chunks as one might expect
305 *
306 *
307 *********************************************************************/
308 static int parse_chunks( PARSE_DATA *parse_data )
309 {
310 SWISH *sw = parse_data->sw;
311 FileProp *fprop = parse_data->fprop;
312 xmlSAXHandlerPtr SAXHandler = parse_data->SAXHandler;
313 int res;
314 char chars[READ_CHUNK_SIZE];
315 xmlParserCtxtPtr ctxt;
316
317
318 /* Now start pulling into the libxml2 parser */
319
320 res = read_next_chunk( fprop, chars, READ_CHUNK_SIZE, sw->truncateDocSize );
321 if (res == 0)
322 return 0;
323
324 /* Create parser */
325 if ( parse_data->parsing_html )
326 ctxt = (xmlParserCtxtPtr)htmlCreatePushParserCtxt((htmlSAXHandlerPtr)SAXHandler, parse_data, chars, res, fprop->real_path,0);
327 else
328 ctxt = xmlCreatePushParserCtxt(SAXHandler, parse_data, chars, res, fprop->real_path);
329
330 parse_data->ctxt = ctxt; // save
331
332
333
334 while ( !parse_data->abort && (res = read_next_chunk( fprop, chars, READ_CHUNK_SIZE, sw->truncateDocSize )) )
335 {
336 if ( parse_data->parsing_html )
337 htmlParseChunk((htmlParserCtxtPtr)ctxt, chars, res, 0);
338 else
339 xmlParseChunk(ctxt, chars, res, 0);
340
341 /* Doesn't seem to make much difference to flush here */
342 //flush_buffer( parse_data, 0 ); // flush upto whitespace
343 }
344
345
346
347 /* Tell the parser we are done, and free it */
348 if ( parse_data->parsing_html )
349 {
350 if ( !parse_data->abort ) // bug in libxml 2.4.5
351 htmlParseChunk( (htmlParserCtxtPtr)ctxt, chars, 0, 1 );
352 htmlFreeParserCtxt( (htmlParserCtxtPtr)ctxt);
353 }
354 else
355 {
356 if ( !parse_data->abort ) // bug in libxml
357 xmlParseChunk(ctxt, chars, 0, 1);
358 xmlFreeParserCtxt(ctxt);
359 }
360
361 /* Daniel Veillard on Nov 21, 2001 says this should not be called for every doc. */
362 // But, it probably should be called when done parsing.
363 // xmlCleanupParser();
364
365 /* Check for abort condition set while parsing (isoktitle, NoContents) */
366
367 if ( parse_data->abort && fprop->index_no_content && !parse_data->total_words )
368 {
369 append_buffer( &parse_data->text_buffer, fprop->real_path, strlen(fprop->real_path) );
370
371 parse_data->meta_stack.ignore_flag = 0; /* make sure we can write */
372 flush_buffer( parse_data, 3 );
373 }
374
375
376 /* Flush any text left in the buffer */
377
378 if ( !parse_data->abort )
379 flush_buffer( parse_data, 3 );
380
381
382
383 free_parse_data( parse_data );
384
385
386 // $$$ This doesn't work since the file (and maybe some words) already added
387 // $$$ need a way to "remove" the file entry and words already added
388
389 if ( parse_data->abort < 0 )
390 return parse_data->abort;
391
392 return parse_data->total_words;
393 }
394
395 /*********************************************************************
396 * read_next_chunk - read another chunk from the stream
397 *
398 * Call with:
399 * fprop
400 * *buf - where to save the data
401 * *buf_size - max size of buffer
402 * *max_size - limit of *total* bytes read from this stream (for truncate)
403 *
404 * Returns:
405 * number of bytes read (as returned from fread)
406 *
407 *
408 *********************************************************************/
409 static int read_next_chunk( FileProp *fprop, char *buf, int buf_size, int max_size )
410 {
411 int size;
412 int res;
413
414 if ( fprop->done )
415 return 0;
416
417 /* For -S prog, only read in the right amount of data */
418 if ( fprop->external_program && (fprop->bytes_read >= fprop->fsize ))
419 return 0;
420
421
422 /* fprop->external_program is set if -S prog and NOT reading from a filter */
423
424 size = fprop->external_program && (( fprop->fsize - fprop->bytes_read ) < buf_size)
425 ? fprop->fsize - fprop->bytes_read
426 : buf_size;
427
428 if ( !fprop->bytes_read && size > 4 )
429 size = 4;
430
431
432
433 /* Truncate -- safety feature from Rainer. No attempt is made to backup to a whole word */
434 if ( max_size && fprop->bytes_read + size > max_size )
435 {
436 fprop->done++; // flag that we are done
437 size = max_size - fprop->bytes_read;
438 }
439
440
441 res = fread(buf, 1, size, fprop->fp);
442
443 fprop->bytes_read += res;
444
445 return res;
446 }
447
448
449
450 /*********************************************************************
451 * Init a sax handler structure
452 * Must pass in the structure
453 *
454 *********************************************************************/
455 static void init_sax_handler( xmlSAXHandlerPtr SAXHandler, SWISH * sw )
456 {
457 /* Set event handlers for libxml2 parser */
458 memset( SAXHandler, 0, sizeof( xmlSAXHandler ) );
459
460 SAXHandler->startElement = (startElementSAXFunc)&start_hndl;
461 SAXHandler->endElement = (endElementSAXFunc)&end_hndl;
462 SAXHandler->characters = (charactersSAXFunc)&char_hndl;
463 SAXHandler->cdataBlock = (charactersSAXFunc)&char_hndl;
464 SAXHandler->ignorableWhitespace = (ignorableWhitespaceSAXFunc)&Whitespace;
465
466 SAXHandler->comment = (commentSAXFunc)&comment_hndl;
467
468 if ( sw->parser_warn_level >= 1 )
469 SAXHandler->fatalError = (fatalErrorSAXFunc)&error;
470
471 if ( sw->parser_warn_level >= 2 )
472 SAXHandler->error = (errorSAXFunc)&error;
473
474 if ( sw->parser_warn_level >= 3 )
475 SAXHandler->warning = (warningSAXFunc)&warning;
476
477 }
478
479
480 /*********************************************************************
481 * Init the parer data structure
482 * Must pass in the structure
483 *
484 *********************************************************************/
485 static void init_parse_data( PARSE_DATA *parse_data, SWISH * sw, FileProp * fprop, FileRec *fi, xmlSAXHandlerPtr SAXHandler )
486 {
487 IndexFILE *indexf = sw->indexlist;
488 struct StoreDescription *stordesc = fprop->stordesc;
489
490 /* Set defaults */
491 memset( parse_data, 0, sizeof(PARSE_DATA));
492
493 parse_data->header = &indexf->header;
494 parse_data->sw = sw;
495 parse_data->fprop = fprop;
496 parse_data->filenum = fi->filenum;
497 parse_data->word_pos = 1; /* compress doesn't like zero */
498 parse_data->SAXHandler = SAXHandler;
499 parse_data->thisFileEntry = fi;
500
501
502 /* Don't really like this, as mentioned above */
503 if ( stordesc && (parse_data->summary.meta = getPropNameByName(parse_data->header, AUTOPROPERTY_SUMMARY)))
504 {
505 /* Set property limit size for this document type, and store previous size limit */
506 parse_data->summary.save_size = parse_data->summary.meta->max_len;
507 parse_data->summary.meta->max_len = stordesc->size;
508 parse_data->summary.tag = stordesc->field;
509 if ( parse_data->summary.tag )
510 strtolower(parse_data->summary.tag);
511 }
512
513
514 /* Initialize the meta and property stacks */
515 /* Not needed for TXT processing, of course */
516 {
517 MetaStack *s;
518
519 s = &parse_data->meta_stack;
520 s->is_meta = 1;
521 s->maxsize = STACK_SIZE;
522
523 s->stack = (MetaStackElementPtr *)emalloc( sizeof( MetaStackElementPtr ) * s->maxsize );
524 if ( fprop->index_no_content )
525 s->ignore_flag++;
526
527 s = &parse_data->prop_stack;
528 s->is_meta = 0;
529 s->maxsize = STACK_SIZE;
530 s->stack = (MetaStackElementPtr *)emalloc( sizeof( MetaStackElementPtr ) * s->maxsize );
531 if ( fprop->index_no_content ) /* only works for HTML */
532 s->ignore_flag++;
533 }
534
535 addCommonProperties(sw, fprop, fi, NULL, NULL, 0);
536 }
537
538
539 /*********************************************************************
540 * Free any data used by the parse_data struct
541 *
542 *********************************************************************/
543 static void free_parse_data( PARSE_DATA *parse_data )
544 {
545
546 if ( parse_data->ISO_Latin1.buffer )
547 efree( parse_data->ISO_Latin1.buffer );
548
549 if ( parse_data->text_buffer.buffer )
550 efree( parse_data->text_buffer.buffer );
551
552 if ( parse_data->baseURL )
553 efree( parse_data->baseURL );
554
555
556 /* Pop the stacks */
557 while( pop_stack( &parse_data->meta_stack ) );
558 while( pop_stack( &parse_data->prop_stack ) );
559
560 /* Free the stacks */
561 if ( parse_data->meta_stack.stack )
562 efree( parse_data->meta_stack.stack );
563
564 if ( parse_data->prop_stack.stack )
565 efree( parse_data->prop_stack.stack );
566
567
568
569 /* Restore the size in the StoreDescription property */
570 if ( parse_data->summary.save_size )
571 parse_data->summary.meta->max_len = parse_data->summary.save_size;
572
573 }
574
575 /*********************************************************************
576 * Start Tag Event Handler
577 *
578 * This is called by libxml2. It normally just calls start_metaTag()
579 * and that decides how to deal with that meta tag.
580 * It also converts <meta> and <tag class=foo> into meta tags as swish
581 * would expect them (and then calls start_metaTag().
582 *
583 * To Do:
584 * deal with attributes!
585 *
586 *********************************************************************/
587
588
589 static void start_hndl(void *data, const char *el, const char **attr)
590 {
591 PARSE_DATA *parse_data = (PARSE_DATA *)data;
592 char tag[MAXSTRLEN + 1];
593 int is_html_tag = 0; // to allow <foo> type of meta tags in HTML
594 int meta_append = 0; // used to allow siblings metanames
595 int prop_append = 0;
596
597
598 /* disabeld by a comment? */
599 if ( parse_data->swish_noindex )
600 return;
601
602 if(strlen(el) >= MAXSTRLEN) // easy way out
603 {
604 warning("Warning: Tag found in %s is too long: '%s'\n", parse_data->fprop->real_path, el );
605 return;
606 }
607
608 strcpy(tag,(char *)el);
609 strtolower( tag ); // xml?
610
611
612 if ( parse_data->parsing_html )
613 {
614
615 /* handle <meta name="metaname" content="foo"> */
616 if ( (strcmp( tag, "meta") == 0) && attr )
617 {
618 process_htmlmeta( parse_data, attr );
619 return;
620 }
621
622
623 /* Deal with structure */
624 if ( (is_html_tag = check_html_tag( parse_data, tag, 1 )) )
625 {
626 /** Special handling for <A>, <IMG>, and <BASE> tags **/
627
628 /* Extract out links - currently only keep <a> links */
629 if ( strcmp( tag, "a") == 0 )
630 extract_html_links( parse_data, attr, parse_data->sw->links_meta, "href" );
631
632
633 /* Extract out links from images */
634 else if ( strcmp( tag, "img") == 0 )
635 {
636 if (parse_data->sw->IndexAltTag)
637 index_alt_tab( parse_data, attr );
638
639 extract_html_links( parse_data, attr, parse_data->sw->images_meta, "src" );
640 }
641
642
643 /* Extract out the BASE URL for fixups */
644 else if ( strcmp( tag, "base") == 0 )
645 parse_data->baseURL = estrdup( extract_html_links( parse_data, attr, NULL, "href" ) );
646 }
647
648 }
649
650
651 /* Now check if we are in a meta tag */
652 start_metaTag( parse_data, tag, tag, &meta_append, &prop_append, is_html_tag );
653
654
655
656 /* Index the content of attributes */
657
658 if ( !parse_data->parsing_html && attr )
659 {
660 int class_found = 0;
661
662 /* Allow <foo class="bar"> to look like <foo.bar> */
663
664 if ( parse_data->sw->XMLClassAttributes )
665 class_found = start_XML_ClassAttributes( parse_data, tag, attr, &meta_append, &prop_append );
666
667
668 /* Index XML attributes */
669
670 if ( !class_found && parse_data->sw->UndefinedXMLAttributes != UNDEF_META_DISABLE )
671 index_XML_attributes( parse_data, tag, attr );
672 }
673
674 }
675
676
677
678
679
680 /*********************************************************************
681 * End Tag Event Handler
682 *
683 * Called by libxml2.
684 *
685 *
686 *
687 *********************************************************************/
688
689
690 static void end_hndl(void *data, const char *el)
691 {
692 PARSE_DATA *parse_data = (PARSE_DATA *)data;
693 char tag[MAXSTRLEN + 1];
694 int is_html_tag = 0; // to allow <foo> type of metatags in html.
695
696
697 /* disabeld by a comment? */
698 if ( parse_data->swish_noindex )
699 return;
700
701 if(strlen(el) > MAXSTRLEN)
702 {
703 warning("Warning: Tag found in %s is too long: '%s'\n", parse_data->fprop->real_path, el );
704 return;
705 }
706
707 strcpy(tag,(char *)el);
708 strtolower( tag );
709
710
711
712 if ( parse_data->parsing_html )
713 {
714
715 /* <meta> tags are closed in start_hndl */
716
717 if ( (strcmp( tag, "meta") == 0) )
718 return; // this was flushed at end tag
719
720
721
722 /* Deal with structure */
723 is_html_tag = check_html_tag( parse_data, tag, 0 );
724 }
725
726
727 end_metaTag( parse_data, tag, is_html_tag );
728 }
729
730
731
732 /*********************************************************************
733 * Character Data Event Handler
734 *
735 * This does the actual adding of text to the index and adding properties
736 * if any tags have been found to index
737 *
738 *
739 *********************************************************************/
740
741 static void char_hndl(void *data, const char *txt, int txtlen)
742 {
743 PARSE_DATA *parse_data = (PARSE_DATA *)data;
744
745
746 /* Have we been disabled? */
747 if ( !parse_data->SAXHandler->characters )
748 return;
749
750 /* disabeld by a comment? */
751 if ( parse_data->swish_noindex )
752 return;
753
754
755 /* If currently in an ignore block, then return */
756 if ( parse_data->meta_stack.ignore_flag && parse_data->prop_stack.ignore_flag )
757 return;
758
759 /* $$$ this was added to limit the buffer size */
760 if ( parse_data->text_buffer.cur + txtlen >= BUFFER_CHUNK_SIZE )
761 flush_buffer( parse_data, 0 ); // flush upto last word - somewhat expensive
762
763
764
765 Convert_to_latin1( parse_data, (char *)txt, txtlen );
766
767
768 if ( DEBUG_MASK & DEBUG_PARSED_TEXT )
769 debug_show_parsed_text( parse_data, parse_data->ISO_Latin1.buffer, parse_data->ISO_Latin1.cur );
770
771
772
773
774 /* Check if we are waiting for a word boundry, and there is white space in the text */
775 /* If so, write the word, then reset the structure, then write the rest of the text. */
776
777 if ( parse_data->flush_word )
778 {
779 /* look for whitespace */
780 char *c = parse_data->ISO_Latin1.buffer;
781 int i;
782 for ( i=0; i < parse_data->ISO_Latin1.cur; i++ )
783 if ( isspace( (int)c[i] ) )
784 {
785 append_buffer( &parse_data->text_buffer, parse_data->ISO_Latin1.buffer, i );
786 flush_buffer( parse_data, 1 ); // Flush the entire buffer
787
788 parse_data->structure[parse_data->flush_word-1]--; // now it's ok to turn of the structure bit
789 parse_data->flush_word = 0;
790
791 /* flush the rest */
792 append_buffer( &parse_data->text_buffer, &c[i], parse_data->ISO_Latin1.cur - i );
793
794 return;
795 }
796 }
797
798
799
800 /* Buffer the text */
801 append_buffer( &parse_data->text_buffer, parse_data->ISO_Latin1.buffer, parse_data->ISO_Latin1.cur );
802
803 /* Some day, might want to have a separate property buffer if need to collect more than plain text */
804 // append_buffer( &parse_data->prop_buffer, txt, txtlen );
805
806
807
808 }
809
810 /*********************************************************************
811 * ignorableWhitespace handler
812 *
813 * Just adds a space to the buffer
814 *
815 *
816 *********************************************************************/
817
818 static void Whitespace(void *data, const xmlChar *txt, int txtlen)
819 {
820 PARSE_DATA *parse_data = (PARSE_DATA *)data;
821
822 append_buffer( &parse_data->text_buffer, " ", 1 ); // could flush buffer, I suppose
823 }
824
825
826
827
828 /*********************************************************************
829 * Convert UTF-8 to Latin-1
830 *
831 * Buffer is extended/created if needed
832 *
833 *********************************************************************/
834
835 static void Convert_to_latin1( PARSE_DATA *parse_data, char *txt, int txtlen )
836 {
837 CHAR_BUFFER *buf = &parse_data->ISO_Latin1;
838 int inlen = txtlen;
839 int ret;
840 char *start_buf;
841 char *end_buf = txt + txtlen - 1;
842 int used;
843
844
845 /* (re)allocate buf if needed */
846
847 if ( txtlen >= buf->max )
848 {
849 buf->max = ( buf->max + BUFFER_CHUNK_SIZE+1 < txtlen )
850 ? buf->max + txtlen+1
851 : buf->max + BUFFER_CHUNK_SIZE+1;
852
853 buf->buffer = erealloc( buf->buffer, buf->max );
854 }
855
856 buf->cur = 0; /* start at the beginning of the buffer */
857
858 while( 1 )
859 {
860 used = buf->max - buf->cur; /* size available in buffer */
861 start_buf = &buf->buffer[buf->cur]; /* offset into buffer */
862
863 /* Returns 0 for OK */
864 ret = UTF8Toisolat1( (unsigned char *)start_buf, &used, (const unsigned char *)txt, &inlen );
865
866 if ( used > 0 ) // tally up total bytes consumed
867 buf->cur += used;
868
869 if ( ret == 0 ) // all done
870 return;
871
872 if ( ret == -2 ) // encoding failed
873 {
874 if ( parse_data->sw->parser_warn_level >= 1 )
875 xmlParserWarning(parse_data->ctxt, "Failed to convert internal UTF-8 to Latin-1.\nReplacing non ISO-8859-1 char with char '%c'\n", ENCODE_ERROR_CHAR);
876
877
878 buf->buffer[buf->cur++] = ENCODE_ERROR_CHAR;
879
880
881 /* Skip one UTF-8 character -- returns null if not pointing to a UTF-8 char */
882 if ( !(txt = (char *)xmlUTF8Strpos( (const xmlChar *)(&txt[inlen]), 1) ))
883 return;
884
885 /* Calculate the remaining length of the input string */
886 inlen = (unsigned long)end_buf - (unsigned long)txt + 1;
887
888 if ( inlen <= 0 )
889 return;
890
891 start_buf += buf->cur-1;
892 }
893 else
894 {
895 xmlParserWarning(parse_data->ctxt, "Error '%d' converting internal UTF-8 to Latin-1.\n", ret );
896 return;
897 }
898 }
899 }
900
901
902 /*********************************************************************
903 * Start of a MetaTag
904 * All XML tags are metatags, but for HTML there's special handling.
905 *
906 * Call with:
907 * parse_data
908 * tag = tag to look for as a metaname/property
909 * endtag = tag to look for as the ending tag (since might be different from start tag)
910 * meta_append = if zero, tells push that this is a new meta
911 * prop_append otherwise, says it's a sibling of a previous call
912 * (Argh Jan 29, 2001 -- now I don't remember what that _append does!)
913 * (it's for working with xml attributes)
914 * is_html_tag = prevents UndefinedMetaTags from being applied to html tags
915 *
916 * <foo class=bar> can start two meta tags "foo" and "foo.bar". But "bar"
917 * will end both tags.
918 *
919 *
920 *********************************************************************/
921 static void start_metaTag( PARSE_DATA *parse_data, char * tag, char *endtag, int *meta_append, int *prop_append, int is_html_tag )
922 {
923 SWISH *sw = parse_data->sw;
924 struct metaEntry *m = NULL;
925
926
927 /* Bump on all meta names, unless overridden */
928 if (!is_html_tag && !isDontBumpMetaName(sw->dontbumpstarttagslist, tag))
929 parse_data->word_pos++;
930
931
932 /* check for ignore tag (should probably remove char handler for speed) */
933 // Should specific property names and meta names override this?
934
935 if ( isIgnoreMetaName( sw, tag ) )
936 {
937 /* shouldn't need to flush buffer since it's just blocking out a section and should be balanced */
938 /* but need to due to the weird way the char buffer is used (and shared with props) and how metatags are assigned to the buffer */
939 /* basically, since flush_buffer looks at the ignore flag and always clears the buffer, need to do it now */
940 /* flush_buffer really should not be in the business of checking the ignore flag, and rather we need to keep two buffers -- or maybe just always flush with any change */
941
942 flush_buffer( parse_data, 1 );
943
944 push_stack( &parse_data->meta_stack, endtag, NULL, meta_append, 1 );
945 push_stack( &parse_data->prop_stack, endtag, NULL, prop_append, 1 );
946 parse_data->structure[IN_META_BIT]++; // so we are in balance with pop_stack
947 return;
948 }
949
950
951 /* Check for metaNames */
952
953 if ( !(m = getMetaNameByName( parse_data->header, tag)) )
954 {
955
956 if ( !is_html_tag )
957 {
958 if ( sw->UndefinedMetaTags == UNDEF_META_AUTO )
959 {
960 if (sw->verbose)
961 printf("**Adding automatic MetaName '%s' found in file '%s'\n", tag, parse_data->fprop->real_path);
962
963 m = addMetaEntry( parse_data->header, tag, META_INDEX, 0);
964 }
965
966
967 else if ( sw->UndefinedMetaTags == UNDEF_META_IGNORE ) /* Ignore this block of text for metanames only (props ok) */
968 {
969 flush_buffer( parse_data, 66 ); // flush because we must still continue to process, and structures might change
970 push_stack( &parse_data->meta_stack, endtag, NULL, meta_append, 1 );
971 parse_data->structure[IN_META_BIT]++; // so we are in balance with pop_stack
972 /* must fall though to property check */
973 }
974 }
975 }
976
977
978
979 if ( m ) /* Is a meta name */
980 {
981 flush_buffer( parse_data, 6 ); /* new meta tag, so must flush */
982 push_stack( &parse_data->meta_stack, endtag, m, meta_append, 0 );
983 parse_data->structure[IN_META_BIT]++;
984 }
985
986 else if ( !is_html_tag )
987 {
988 /* If set to "error" on undefined meta tags, then error */
989 if ( sw->UndefinedMetaTags == UNDEF_META_ERROR )
990 progerr("Found meta name '%s' in file '%s', not listed as a MetaNames in config", tag, parse_data->fprop->real_path);
991
992 else if ( DEBUG_MASK & DEBUG_PARSED_TAGS )
993 debug_show_tag( tag, parse_data, 1, "(undefined meta name - no action)" );
994 }
995
996
997 /* Check property names -- allows HTML tags as property names */
998
999
1000 if ( (m = getPropNameByName( parse_data->header, tag)) )
1001 {
1002 flush_buffer( parse_data, 7 ); // flush since it's a new meta tag
1003 push_stack( &parse_data->prop_stack, endtag, m, prop_append, 0 );
1004 }
1005
1006
1007
1008 /* Look to enable StoreDescription - allow any tag */
1009 /* Don't need to flush since this has it's own buffer */
1010
1011 // This should really be a property, and use aliasing as needed
1012 {
1013 SUMMARY_INFO *summary = &parse_data->summary;
1014
1015 if ( summary->tag && (strcmp( tag, summary->tag ) == 0 ))
1016 {
1017 /* Flush data in buffer */
1018 if ( 0 == summary->active )
1019 flush_buffer( parse_data, 1 );
1020
1021 summary->active++;
1022 }
1023 }
1024
1025 }
1026
1027
1028 /*********************************************************************
1029 * End of a MetaTag
1030 * All XML tags are metatags, but for HTML there's special handling.
1031 *
1032 *********************************************************************/
1033 static void end_metaTag( PARSE_DATA *parse_data, char * tag, int is_html_tag )
1034 {
1035
1036 if ( pop_stack_ifMatch( parse_data, &parse_data->meta_stack, tag ) )
1037 parse_data->structure[IN_META_BIT]--;
1038
1039
1040 /* Out of a property? */
1041 pop_stack_ifMatch( parse_data, &parse_data->prop_stack, tag );
1042
1043
1044 /* Don't allow matching across tag boundry */
1045 if (!is_html_tag && !isDontBumpMetaName(parse_data->sw->dontbumpendtagslist, tag))
1046 parse_data->word_pos++;
1047
1048
1049
1050 /* Look to disable StoreDescription */
1051 {
1052 SUMMARY_INFO *summary = &parse_data->summary;
1053 if ( summary->tag && (strcasecmp( tag, summary->tag ) == 0 ))
1054 {
1055 /* Flush data in buffer */
1056 if ( 1 == summary->active )
1057 flush_buffer( parse_data, 1 ); // do first since flush buffer looks at summary->active
1058
1059 summary->active--;
1060 }
1061 }
1062
1063 }
1064
1065
1066 /*********************************************************************
1067 * Checks the HTML tag, and sets the "structure"
1068 * Also deals with FileRules title
1069 * In general, flushes the character buffer due to the change in structure.
1070 *
1071 * returns false if not a valid HTML tag (which might be a "fake" metaname)
1072 *
1073 *********************************************************************/
1074
1075 static int check_html_tag( PARSE_DATA *parse_data, char * tag, int start )
1076 {
1077 int is_html_tag = 1;
1078 int bump = start ? +1 : -1;
1079
1080 /* Check for structure bits */
1081
1082
1083 /** HEAD **/
1084
1085 if ( strcmp( tag, "head" ) == 0 )
1086 {
1087 flush_buffer( parse_data, 10 );
1088 parse_data->structure[IN_HEAD_BIT] += bump;
1089
1090 /* Check for NoContents - can quit looking now once out of <head> block */
1091
1092 if ( !start && parse_data->fprop->index_no_content )
1093 abort_parsing( parse_data, 1 );
1094
1095 }
1096
1097
1098
1099 /** TITLE **/
1100
1101 // Note: I think storing the title words by default should be optional.
1102 // Someone might not want to search title tags, if if they don't they are
1103 // screwed since title by default ranks higher than body words.
1104
1105
1106 else if ( strcmp( tag, "title" ) == 0 )
1107 {
1108 /* Can't flush buffer until we have looked at the title */
1109
1110 if ( !start )
1111 {
1112 struct MOD_FS *fs = parse_data->sw->FS;
1113
1114 /* Check isoktitle - before NoContents? */
1115 if ( match_regex_list( parse_data->text_buffer.buffer, fs->filerules.title) )
1116 {
1117 abort_parsing( parse_data, -2 );
1118 return 1;
1119 }
1120
1121 /* Check for NoContents - abort since all we need is the title text */
1122 if ( parse_data->fprop->index_no_content )
1123 abort_parsing( parse_data, 1 );
1124
1125
1126 }
1127 else
1128 /* In start tag, allow capture of text (NoContents sets ignore_flag at start) */
1129 if ( parse_data->fprop->index_no_content )
1130 parse_data->meta_stack.ignore_flag--;
1131
1132
1133 /* Now it's ok to flush */
1134 flush_buffer( parse_data, 11 );
1135
1136
1137 /* If title is a property, turn on the property flag */
1138 if ( parse_data->titleProp )
1139 parse_data->titleProp->in_tag = start ? 1 : 0;
1140
1141
1142 /* If title is a metaname, turn on the indexing flag */
1143 if ( parse_data->titleMeta )
1144 {
1145 parse_data->titleMeta->in_tag = start ? 1 : 0;
1146 parse_data->swishdefaultMeta->in_tag = start ? 1 : 0;
1147 }
1148
1149
1150
1151 parse_data->word_pos++;
1152 parse_data->structure[IN_TITLE_BIT] += bump;
1153 }
1154
1155
1156
1157 /** BODY **/
1158
1159 else if ( strcmp( tag, "body" ) == 0 )
1160 {
1161 flush_buffer( parse_data, 12 );
1162 parse_data->structure[IN_BODY_BIT] += bump;
1163 parse_data->word_pos++;
1164 }
1165
1166
1167
1168 /** H1 HEADINGS **/
1169
1170 /* This should be split so know different level for ranking */
1171 else if ( tag[0] == 'h' && isdigit((int) tag[1]))
1172 {
1173 flush_buffer( parse_data, 13 );
1174 parse_data->structure[IN_HEADER_BIT] += bump;
1175 }
1176
1177
1178
1179 /** EMPHASIZED **/
1180
1181 /* These should not be hard coded */
1182
1183 else if ( !strcmp( tag, "em" ) || !strcmp( tag, "b" ) || !strcmp( tag, "strong" ) || !strcmp( tag, "i" ) )
1184 {
1185 /* This is hard. The idea is to not break up words. But messes up the structure
1186 * ie: "this is b<b>O</b>ld word" so this would only flush "this is" on <b>,
1187 * and </b> would not flush anything. The PROBLEM is that then will make the next words
1188 * have a IN_EMPHASIZED structure. To "fix", I set a flag to flush at next word boundry.
1189 */
1190 flush_buffer( parse_data, 0 ); // flush up to current word (leaving any leading chars in buffer)
1191
1192 if ( start )
1193 parse_data->structure[IN_EMPHASIZED_BIT]++;
1194 else
1195 {
1196 /* If there is something in the buffer then delay turning off the flag until whitespace is found */
1197 if ( parse_data->text_buffer.cur )
1198 /* Flag to flush at next word boundry */
1199 parse_data->flush_word = IN_EMPHASIZED_BIT + 1; // + 1 because we might need to use zero some day
1200 else
1201 parse_data->structure[IN_EMPHASIZED_BIT]--;
1202 }
1203
1204
1205 }
1206
1207
1208
1209
1210 /* Now, look for reasons to add whitespace
1211 * img is not really, as someone might use an image to make up a word, but
1212 * commonly an image would split up text.
1213 * other tags: frame?
1214 */
1215
1216 if ( !strcmp( tag, "br" ) || !strcmp( tag, "img" ) )
1217 append_buffer( &parse_data->text_buffer, " ", 1 ); // could flush buffer, I suppose
1218 else
1219 {
1220 const htmlElemDesc *element = htmlTagLookup( (const xmlChar *)tag );
1221
1222 if ( !element )
1223 is_html_tag = 0; // flag that this might be a meta name
1224
1225 else if ( !element->isinline )
1226 append_buffer( &parse_data->text_buffer, " ", 1 ); // could flush buffer, I suppose
1227 }
1228
1229
1230
1231
1232 return is_html_tag;
1233 }
1234
1235 /*********************************************************************
1236 * Allow <foo class="bar"> to start "foo.bar" meta tag
1237 *
1238 * Returns true if any found
1239 *
1240 *********************************************************************/
1241 static int start_XML_ClassAttributes( PARSE_DATA *parse_data, char *tag, const char **attr, int *meta_append, int *prop_append )
1242 {
1243 char tagbuf[256]; /* we have our limits */
1244 char *t;
1245 int i;
1246 int taglen = strlen( tag );
1247 SWISH *sw = parse_data->sw;
1248 int found = 0;
1249
1250 strcpy( tagbuf, tag );
1251 t = tagbuf + taglen;
1252 *t = '.'; /* hard coded! */
1253 t++;
1254
1255
1256 for ( i = 0; attr[i] && attr[i+1]; i+=2 )
1257 {
1258 if ( !isXMLClassAttribute( sw, (char *)attr[i]) )
1259 continue;
1260
1261
1262 /* Is the tag going to be too long? */
1263 if ( strlen( (char *)attr[i+1] ) + taglen + 2 > 256 )
1264 {
1265 warning("ClassAttribute on tag '%s' too long\n", tag );
1266 continue;
1267 }
1268
1269
1270 /* All metanames are currently lowercase -- would be better to force this in metanames.c */
1271 strtolower( tagbuf );
1272
1273 strcpy( t, (char *)attr[i+1] ); /* create tag.attribute metaname */
1274 start_metaTag( parse_data, tagbuf, tag, meta_append, prop_append, 0 );
1275 found++;
1276
1277 /* Now, nest attributes */
1278 if ( sw->UndefinedXMLAttributes != UNDEF_META_DISABLE )
1279 index_XML_attributes( parse_data, tagbuf, attr );
1280
1281 }
1282
1283 return found;
1284
1285 }
1286
1287 /*********************************************************************
1288 * check if a tag is an XMLClassAttributes
1289 *
1290 * Note: this returns a pointer to the config set tag, so don't free it!
1291 * Duplicate code!
1292 *
1293 * This does a case-insensitive lookup
1294 *
1295 *
1296 *********************************************************************/
1297
1298 static char *isXMLClassAttribute(SWISH * sw, char *tag)
1299 {
1300 struct swline *tmplist = sw->XMLClassAttributes;
1301
1302 if (!tmplist)
1303 return 0;
1304
1305 while (tmplist)
1306 {
1307 if (strcasecmp(tag, tmplist->line) == 0)
1308 return tmplist->line;
1309
1310 tmplist = tmplist->next;
1311 }
1312
1313 return NULL;
1314 }
1315
1316
1317
1318 /*********************************************************************
1319 * This extracts out the attributes and contents and indexes them
1320 *
1321 *********************************************************************/
1322 static void index_XML_attributes( PARSE_DATA *parse_data, char *tag, const char **attr )
1323 {
1324 char tagbuf[256]; /* we have our limits */
1325 char *content;
1326 char *t;
1327 int i;
1328 int meta_append;
1329 int prop_append;
1330 int taglen = strlen( tag );
1331 SWISH *sw = parse_data->sw;
1332 UndefMetaFlag tmp_undef = sw->UndefinedMetaTags; // save
1333
1334 sw->UndefinedMetaTags = sw->UndefinedXMLAttributes;
1335
1336
1337 strcpy( tagbuf, tag );
1338 t = tagbuf + taglen;
1339 *t = '.'; /* hard coded! */
1340 t++;
1341
1342 for ( i = 0; attr[i] && attr[i+1]; i+=2 )
1343 {
1344 meta_append = 0;
1345 prop_append = 0;
1346
1347 /* Skip attributes that are XMLClassAttribues */
1348 if ( isXMLClassAttribute( sw, (char *)attr[i] ) )
1349 continue;
1350
1351
1352 if ( strlen( (char *)attr[i] ) + taglen + 2 > 256 )
1353 {
1354 warning("Attribute '%s' on tag '%s' too long to build metaname\n", (char *)attr[i], tag );
1355 continue;
1356 }
1357
1358 strcpy( t, (char *)attr[i] ); /* create tag.attribute metaname */
1359 content = (char *)attr[i+1];
1360
1361 if ( !*content )
1362 continue;
1363
1364 strtolower( tagbuf );
1365
1366
1367
1368 flush_buffer( parse_data, 1 ); // isn't needed, right?
1369 start_metaTag( parse_data, tagbuf, tagbuf, &meta_append, &prop_append, 0 );
1370 char_hndl( parse_data, content, strlen( content ) );
1371 end_metaTag( parse_data, tagbuf, 0 );
1372 }
1373
1374 sw->UndefinedMetaTags = tmp_undef;
1375 }
1376
1377
1378
1379 /*********************************************************************
1380 * Deal with html's <meta name="foo" content="bar">
1381 * Simply calls start and end meta, and passes content
1382 *
1383 *********************************************************************/
1384
1385 static void process_htmlmeta( PARSE_DATA *parse_data, const char **attr )
1386 {
1387 char *metatag = NULL;
1388 char *content = NULL;
1389 int meta_append = 0;
1390 int prop_append = 0;
1391
1392 int i;
1393
1394 /* Don't add any meta data while looking for just the title */
1395 if ( parse_data->fprop->index_no_content )
1396 return;
1397
1398 for ( i = 0; attr[i] && attr[i+1]; i+=2 )
1399 {
1400 if ( (strcmp( attr[i], "name" ) == 0 ) && attr[i+1] )
1401 metatag = (char *)attr[i+1];
1402
1403 else if ( (strcmp( attr[i], "content" ) == 0 ) && attr[i+1] )
1404 content = (char *)attr[i+1];
1405 }
1406
1407
1408 if ( metatag && content )
1409 {
1410
1411 /* Robots exclusion: http://www.robotstxt.org/wc/exclusion.html#meta */
1412 if ( !strcasecmp( metatag, "ROBOTS") && lstrstr( content, "NOINDEX" ) )
1413 {
1414 if ( parse_data->sw->obeyRobotsNoIndex )
1415 abort_parsing( parse_data, -3 );
1416
1417 return;
1418 }
1419
1420 /* Process as a start -> end tag sequence */
1421 strtolower( metatag );
1422
1423 flush_buffer( parse_data, 111 );
1424 start_metaTag( parse_data, metatag, metatag, &meta_append, &prop_append, 0 );
1425 char_hndl( parse_data, content, strlen( content ) );
1426 end_metaTag( parse_data, metatag, 0 );
1427 flush_buffer( parse_data, 112 );
1428 }
1429
1430 }
1431
1432
1433 /*********************************************************************
1434 * Append character data to the end of the buffer
1435 *
1436 * Buffer is extended/created if needed
1437 *
1438 * ToDo: Flush buffer if it gets too large
1439 *
1440 *
1441 *********************************************************************/
1442
1443 static void append_buffer( CHAR_BUFFER *buf, const char *txt, int txtlen )
1444 {
1445
1446
1447 if ( !txtlen ) // shouldn't happen
1448 return;
1449
1450 /* (re)allocate buf if needed */
1451
1452 if ( buf->cur + txtlen >= buf->max )
1453 {
1454 buf->max = ( buf->max + BUFFER_CHUNK_SIZE+1 < buf->cur + txtlen )
1455 ? buf->cur + txtlen+1
1456 : buf->max + BUFFER_CHUNK_SIZE+1;
1457
1458 buf->buffer = erealloc( buf->buffer, buf->max+1 );
1459 }
1460
1461 memcpy( (void *) &(buf->buffer[buf->cur]), txt, txtlen );
1462 buf->cur += txtlen;
1463 buf->buffer[buf->cur] = '\0'; /* seems like a nice thing to do -- only used now in title check */
1464 }
1465
1466
1467
1468
1469 /*********************************************************************
1470 * Flush buffer - adds words to index, and properties
1471 *
1472 * If the clear flag is set then the entire buffer is flushed.
1473 * Otherwise, every thing up to the last *partial* word is flushed.
1474 * It's partial if there is not white-space at the very end of the buffer.
1475 *
1476 * This prevents some<b>long</b>word from being flushed into part words.
1477 *
1478 *********************************************************************/
1479 static void flush_buffer( PARSE_DATA *parse_data, int clear )
1480 {
1481 CHAR_BUFFER *buf = &parse_data->text_buffer;
1482 SWISH *sw = parse_data->sw;
1483 int structure = get_structure( parse_data );
1484 int orig_end = buf->cur;
1485 char save_char = '?';
1486 char *c;
1487
1488 /* anything to do? */
1489 if ( !buf->cur )
1490 return;
1491
1492 /* look back for word boundry when "clear" is not set */
1493
1494 if ( !clear && !isspace( (int)buf->buffer[buf->cur-1] ) ) // flush up to current word
1495 {
1496 while ( buf->cur > 0 && !isspace( (int)buf->buffer[buf->cur-1] ) )
1497 buf->cur--;
1498
1499 if ( !buf->cur ) // then there's only a single word in the buffer
1500 {
1501 buf->cur = orig_end;
1502 if ( buf->cur < BUFFER_CHUNK_SIZE ) // should reall look at indexf->header.maxwordlimit
1503 return; // but just trying to keep the buffer from growing too large
1504 }
1505
1506 save_char = buf->buffer[buf->cur];
1507 }
1508
1509
1510 /* Mark the end of the buffer - should switch over to using a length to avoid strlen */
1511
1512 buf->buffer[buf->cur] = '\0';
1513
1514
1515 /* Make sure there some non-whitespace chars to print */
1516
1517 c = buf->buffer;
1518 while ( *c && isspace( (int)*c ) )
1519 c++;
1520
1521
1522 if ( *c )
1523 {
1524 /* Index the text */
1525 if ( !parse_data->meta_stack.ignore_flag ) // this really is wrong -- should not check ignore here. Fix should be to use two buffers
1526 parse_data->total_words +=
1527 indexstring( sw, c, parse_data->filenum, structure, 0, NULL, &(parse_data->word_pos) );
1528
1529 /* Add the properties */
1530 addDocProperties( parse_data->header, &(parse_data->thisFileEntry->docProperties), (unsigned char *)buf->buffer, buf->cur, parse_data->fprop->real_path );
1531
1532
1533 /* yuck - addDocProperties should do this. Ok, add to summary, if active */
1534 {
1535 SUMMARY_INFO *summary = &parse_data->summary;
1536 if ( summary->active )
1537 addDocProperty( &(parse_data->thisFileEntry->docProperties), summary->meta, (unsigned char *)buf->buffer, buf->cur, 0 );
1538 }
1539 }
1540
1541
1542 /* clear the buffer */
1543
1544 if ( orig_end && orig_end > buf->cur )
1545 {
1546 buf->buffer[buf->cur] = save_char; // put back the char where null was placed
1547 memmove( buf->buffer, &buf->buffer[buf->cur], orig_end - buf->cur );
1548 buf->cur = orig_end - buf->cur;
1549 }
1550 else
1551 buf->cur = 0;
1552
1553 }
1554
1555
1556
1557 /*********************************************************************
1558 * Comments
1559 *
1560 * Should be able to call the char_hndl
1561 * Allows comments to enable/disable indexing a block by either:
1562 *
1563 * <!-- noindex -->
1564 * <!-- index -->
1565 * <!-- SwishCommand noindex -->
1566 * <!-- SwishCommand index -->
1567 *
1568 *
1569 *
1570 * To Do:
1571 * Can't use DontBump with comments. Might need a config variable for that.
1572 *
1573 *********************************************************************/
1574 static void comment_hndl(void *data, const char *txt)
1575 {
1576 PARSE_DATA *parse_data = (PARSE_DATA *)data;
1577 SWISH *sw = parse_data->sw;
1578 int structure = get_structure( parse_data );
1579 char *swishcmd;
1580 char *comment_text = str_skip_ws( (char *)txt );
1581 int found = 0;
1582
1583
1584 str_trim_ws( comment_text );
1585 if ( ! *comment_text )
1586 return;
1587
1588
1589 /* Strip off SwishCommand - might be for future use */
1590 if ( ( swishcmd = lstrstr( comment_text, "SwishCommand" )) && swishcmd == comment_text )
1591 {
1592 comment_text = str_skip_ws( comment_text + strlen( "SwishCommand" ) );
1593 found++;
1594 }
1595
1596 if ( !strcasecmp( comment_text, "noindex" ) )
1597 {
1598 parse_data->swish_noindex++;
1599 return;
1600 }
1601 else if ( !strcasecmp( comment_text, "index" ) )
1602 {
1603 if ( parse_data->swish_noindex )
1604 parse_data->swish_noindex--;
1605
1606 return;
1607 }
1608
1609
1610 if( found || !sw->indexComments )
1611 return;
1612
1613
1614 /* Bump position around comments - hard coded, always done to prevent phrase matching */
1615 parse_data->word_pos++;
1616
1617 /* Index the text */
1618 parse_data->total_words +=
1619 indexstring( sw, comment_text, parse_data->filenum, structure | IN_COMMENTS, 0, NULL, &(parse_data->word_pos) );
1620
1621
1622 parse_data->word_pos++;
1623
1624 }
1625
1626
1627
1628 /*********************************************************************
1629 * check if a tag is an IgnoreTag
1630 *
1631 * Note: this returns a pointer to the config set tag, so don't free it!
1632 *
1633 *
1634 *********************************************************************/
1635
1636 static char *isIgnoreMetaName(SWISH * sw, char *tag)
1637 {
1638 struct swline *tmplist = sw->ignoremetalist;
1639
1640 if (!tmplist)
1641 return 0;
1642
1643 while (tmplist)
1644 {
1645 if (strcmp(tag, tmplist->line) == 0)
1646 return tmplist->line;
1647
1648 tmplist = tmplist->next;
1649 }
1650
1651 return NULL;
1652 }
1653
1654 /******************************************************************
1655 * Warning and Error Messages
1656 *
1657 ******************************************************************/
1658
1659 static void error(void *data, const char *msg, ...)
1660 {
1661 va_list args;
1662 PARSE_DATA *parse_data = (PARSE_DATA *)data;
1663 char str[1000];
1664
1665 va_start(args, msg);
1666 vsnprintf(str, 1000, msg, args );
1667 va_end(args);
1668 xmlParserError(parse_data->ctxt, str);
1669 }
1670
1671 static void warning(void *data, const char *msg, ...)
1672 {
1673 va_list args;
1674 PARSE_DATA *parse_data = (PARSE_DATA *)data;
1675 char str[1000];
1676
1677 va_start(args, msg);
1678 vsnprintf(str, 1000, msg, args );
1679 va_end(args);
1680 xmlParserWarning(parse_data->ctxt, str);
1681 }
1682
1683
1684 /*********************************************************************
1685 * Index ALT tabs
1686 *
1687 *
1688 *********************************************************************/
1689 static void index_alt_tab( PARSE_DATA *parse_data, const char **attr )
1690 {
1691 int meta_append = 0;
1692 int prop_append = 0;
1693 char *tagbuf = parse_data->sw->IndexAltTagMeta;
1694 char *alt_text = extract_html_links( parse_data, attr, NULL, "alt");
1695
1696
1697 if ( !alt_text )
1698 return;
1699
1700 /* Index as regular text? */
1701 if ( !parse_data->sw->IndexAltTagMeta )
1702 {
1703 char_hndl( parse_data, alt_text, strlen( alt_text ) );
1704 return;
1705 }
1706
1707 flush_buffer( parse_data, 1 );
1708 start_metaTag( parse_data, tagbuf, tagbuf, &meta_append, &prop_append, 0 );
1709 char_hndl( parse_data, alt_text, strlen( alt_text ) );
1710 end_metaTag( parse_data, tagbuf, 0 );
1711 }
1712
1713
1714
1715
1716 /*********************************************************************
1717 * Extract out links for indexing
1718 *
1719 * Pass in a metaname, and a tag
1720 *
1721 *********************************************************************/
1722
1723 static char *extract_html_links( PARSE_DATA *parse_data, const char **attr, struct metaEntry *meta_entry, char *tag )
1724 {
1725 char *href = NULL;
1726 int i;
1727 int structure = get_structure( parse_data );
1728 char *absoluteURL;
1729 SWISH *sw = parse_data->sw;
1730
1731
1732 if ( !attr )
1733 return NULL;
1734
1735 for ( i = 0; attr[i] && attr[i+1]; i+=2 )
1736 if ( (strcmp( attr[i], tag ) == 0 ) && attr[i+1] )
1737 href = (char *)attr[i+1];
1738
1739 if ( !href )
1740 return NULL;
1741
1742 if ( !meta_entry ) /* The case for <BASE> */
1743 return href;
1744
1745
1746 /* Now, fixup the URL, if possible */
1747
1748 if ( sw->AbsoluteLinks ) // ?? || parse_data->baseURL??? always fix up if a <BASE> tag?
1749 {
1750 char *base = parse_data->baseURL
1751 ? parse_data->baseURL
1752 : parse_data->fprop->real_path;
1753
1754 absoluteURL = (char *)xmlBuildURI( (xmlChar *)href, (xmlChar *)base );
1755 }
1756 else
1757 absoluteURL = NULL;
1758
1759
1760
1761 /* Index the text */
1762 parse_data->total_words +=
1763 indexstring( sw, absoluteURL ? absoluteURL : href, parse_data->filenum, structure, 1, &meta_entry->metaID, &(parse_data->word_pos) );
1764
1765 if ( absoluteURL )
1766 xmlFree( absoluteURL );
1767
1768 return href;
1769 }
1770
1771
1772
1773 /* This doesn't look like the best method */
1774
1775 static void abort_parsing( PARSE_DATA *parse_data, int abort_code )
1776 {
1777 parse_data->abort = abort_code; /* Flag that the we are all done */
1778 /* Disable parser */
1779 parse_data->SAXHandler->startElement = (startElementSAXFunc)NULL;
1780 parse_data->SAXHandler->endElement = (endElementSAXFunc)NULL;
1781 parse_data->SAXHandler->characters = (charactersSAXFunc)NULL;
1782 }
1783
1784
1785 /* This sets the current structure context (IN_HEAD, IN_BODY, etc) */
1786
1787 static int get_structure( PARSE_DATA *parse_data )
1788 {
1789 int structure = IN_FILE;
1790
1791 /* Set structure bits */
1792 if ( parse_data->parsing_html )
1793 {
1794 int i;
1795 for ( i = 0; i <= STRUCTURE_END; i++ )
1796 if ( parse_data->structure[i] )
1797 structure |= ( 1 << i );
1798 }
1799 return structure;
1800 }
1801
1802 /*********************************************************************
1803 * Push a meta entry onto the stack
1804 *
1805 * Call With:
1806 * stack = which stack to use
1807 * tag = Element (tag name) to be used to match end tag
1808 * met = metaEntry to save
1809 * append = append to current if one (will be incremented)
1810 * ignore = if true, then flag as an ignore block and bump ignore counter
1811 *
1812 * Returns:
1813 * void
1814 *
1815 * ToDo:
1816 * move to Mem_Zone?
1817 *
1818 *
1819 *********************************************************************/
1820
1821 static void push_stack( MetaStack *stack, char *tag, struct metaEntry *meta, int *append, int ignore )
1822 {
1823 MetaStackElementPtr node;
1824
1825
1826 if ( DEBUG_MASK & DEBUG_PARSED_TAGS )
1827 {
1828 int i;
1829 for (i=0; i<stack->pointer; i++)
1830 printf(" ");
1831
1832 printf("<%s> (%s [%s]%s)\n", tag, stack->is_meta ? "meta" : "property", !meta ? "no meta name defined" : meta->metaName, ignore ? " *Start Ignore*" : "" );
1833 }
1834
1835
1836 /* Create a new node ( MetaStackElement already has one byte allocated for string ) */
1837 node = (MetaStackElementPtr) emalloc( sizeof( MetaStackElement ) + strlen( tag ) );
1838 node->next = NULL;
1839
1840 /* Turn on the meta */
1841 if ( (node->meta = meta) )
1842 meta->in_tag++;
1843
1844 if ( ( node->ignore = ignore ) ) /* entering a block to ignore */
1845 stack->ignore_flag++;
1846
1847
1848 strcpy( node->tag, tag );
1849
1850
1851
1852
1853 if ( !(*append)++ )
1854 {
1855 /* reallocate stack buffer if needed */
1856 if ( stack->pointer >= stack->maxsize )
1857 {
1858 progwarn("swish parser adding more stack space for tag %s. from %d to %d", tag, stack->maxsize, stack->maxsize+STACK_SIZE );
1859
1860 stack->maxsize += STACK_SIZE;
1861 stack->stack = (MetaStackElementPtr *)erealloc( stack->stack, sizeof( MetaStackElementPtr ) * stack->maxsize );
1862 }
1863
1864 stack->stack[stack->pointer++] = node;
1865 }
1866 else // prepend to the list
1867 {
1868 if ( !stack->pointer )
1869 progerr("Tried to append tag %s to stack, but stack is empty", tag );
1870
1871 node->next = stack->stack[stack->pointer - 1];
1872 stack->stack[stack->pointer - 1] = node;
1873 }
1874 }
1875
1876 /*********************************************************************
1877 * Pop the stack if the tag matches the last entry
1878 * Will turn off all metas associated with this tag level
1879 *
1880 * Call With:
1881 * parse_data = to automatically flush
1882 * stack = which stack to use
1883 * tag = Element (tag name) to be used for removal
1884 *
1885 * Returns:
1886 * true if tag matched
1887 *
1888 *********************************************************************/
1889
1890 static int pop_stack_ifMatch( PARSE_DATA *parse_data, MetaStack *stack, char *tag )
1891 {
1892
1893 /* return if stack is empty */
1894 if ( !stack->pointer )
1895 return 0;
1896
1897
1898
1899 /* return if doesn't match the tag at the top of the stack */
1900
1901 if ( strcmp( stack->stack[stack->pointer - 1]->tag, tag ) != 0 )
1902 return 0;
1903
1904
1905 flush_buffer( parse_data, 1 );
1906 pop_stack( stack );
1907
1908 return 1;
1909 }
1910
1911 /*********************************************************************
1912 * Pop the stack
1913 * Will turn off all metas associated with this tag level
1914 *
1915 * Call With:
1916 * stack = which stack to use
1917 *
1918 * Returns:
1919 * the stack pointer
1920 *
1921 *********************************************************************/
1922
1923 static int pop_stack( MetaStack *stack )
1924 {
1925 MetaStackElementPtr node, this;
1926
1927
1928 /* return if stack is empty */
1929 if ( !stack->pointer )
1930 return 0;
1931
1932 node = stack->stack[--stack->pointer];
1933
1934 /* Now pop the stack. */
1935
1936 // Note that some end tags can pop more than one tag
1937 // <foo class="bar"> can be to starting metanames <foo> and <foo:bar>, and </foo> pops all.
1938
1939 while ( node )
1940 {
1941 this = node;
1942
1943 if ( node->meta )
1944 node->meta->in_tag--;
1945
1946 if ( node->ignore )
1947 stack->ignore_flag--;
1948
1949
1950 if ( DEBUG_MASK & DEBUG_PARSED_TAGS )
1951 {
1952 int i;
1953 for (i=0; i<stack->pointer; i++)
1954 printf(" ");
1955
1956 printf("</%s> (%s)%s\n", node->tag, stack->is_meta ? "meta" : "property", node->ignore ? " end ignore" : "" );
1957 }
1958
1959
1960 node = node->next;
1961 efree( this );
1962 }
1963
1964 return stack->pointer;
1965 }
1966
1967 static int debug_get_indent( INDEXDATAHEADER *header )
1968 {
1969 int i;
1970 int indent = 0;
1971
1972 for (i = 0; i < header->metaCounter; i++)
1973 if ( is_meta_index(header->metaEntryArray[i]) )
1974 indent += header->metaEntryArray[i]->in_tag;
1975
1976 return indent;
1977 }
1978
1979
1980
1981 static void debug_show_tag( char *tag, PARSE_DATA *parse_data, int start, char *message )
1982 {
1983 int indent = debug_get_indent( &parse_data->sw->indexlist->header);
1984 int i;
1985
1986 for (i=0; i<indent; i++)
1987 printf(" ");
1988
1989 printf("<%s%s> %s\n", start ? "" : "/", tag, message );
1990 }
1991
1992 static void debug_show_parsed_text( PARSE_DATA *parse_data, char *txt, int len )
1993 {
1994 int indent = debug_get_indent( &parse_data->sw->indexlist->header);
1995 int i;
1996 char indent_buf[1000];
1997 int last_newline = 0;
1998 int col = 0;
1999
2000
2001 indent_buf[0] = '\0';
2002
2003 for (i=0; i<indent; i++)
2004 strcat( indent_buf, " ");
2005
2006
2007 i = 0;
2008 while ( i < len )
2009 {
2010 printf("%s", indent_buf );
2011 col = 0;
2012 last_newline = 0;
2013
2014 /* skip leading space */
2015 while ( i < len && isspace((int)txt[i] ) )
2016 i++;
2017
2018 /* print text */
2019 while ( i < len )
2020 {
2021 col++;
2022
2023
2024 if ( txt[i] == '\n' )
2025 {
2026 while ( i < len && isspace((int)txt[i] ))
2027 i++;
2028 }
2029
2030 if ( !isprint((int)txt[i] ))
2031 {
2032 i++;
2033 continue;
2034 }
2035
2036 printf("%c", txt[i] );
2037 i++;
2038
2039 if ( (col + strlen( indent_buf ) > 60 && isspace((int)txt[i])) || col + strlen( indent_buf ) > 78 )
2040 {
2041 printf("\n");
2042 last_newline=1;
2043 break;
2044 }
2045 }
2046 }
2047
2048
2049 if ( !last_newline )
2050 printf("\n");
2051 }
2052

  ViewVC Help
Powered by ViewVC 1.1.22