1 |
adcroft |
1.1 |
/* |
2 |
|
|
$Id: parser.c,v 1.42 2002/08/14 22:08:48 whmoseley Exp $ |
3 |
|
|
** |
4 |
|
|
** |
5 |
|
|
** This program and library is free software; you can redistribute it and/or |
6 |
|
|
** modify it under the terms of the GNU General Public License |
7 |
|
|
** as published by the Free Software Foundation; either version 2 |
8 |
|
|
** of the License, or any later version. |
9 |
|
|
** |
10 |
|
|
** This program is distributed in the hope that it will be useful, |
11 |
|
|
** but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 |
|
|
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 |
|
|
** GNU General Public License for more details. |
14 |
|
|
** |
15 |
|
|
** |
16 |
|
|
** 2001-09-21 new HTML parser using libxml2 (http://www.xmlsoft.org/) Bill Moseley |
17 |
|
|
** |
18 |
|
|
** Parser reads from the stream in READ_CHUNK_SIZE (after an initial 4 byte chunk |
19 |
|
|
** to determine the document encoding). Text is accumulated in a buffer of |
20 |
|
|
** BUFFER_CHUNK_SIZE (10K?) size. The buffer is flushed when a new metatag |
21 |
|
|
** is found. That buffer will grow, if needed, but now it will attempt |
22 |
|
|
** to flush up to the last word boundary if > BUFFER_CHUNK_SIZE. |
23 |
|
|
** |
24 |
|
|
** The buffer is really only flushed when a real metaName or PropertyName is |
25 |
|
|
** found, or when the structure changes -- anything that changes the |
26 |
|
|
** properties of the text that might be in the buffer. |
27 |
|
|
** |
28 |
|
|
** An optional arrangement might be to flush the buffer after processing each |
29 |
|
|
** READ_CHUNK_SIZE from the stream (flush to last word). This would limit the |
30 |
|
|
** character buffer size. It might be nice to flush on any meta tag (not just |
31 |
|
|
** tags listed as PropertyNames or MetaNames), but for large XML files one would |
32 |
|
|
** expect some use of Meta/PropertyNames. HTML files should flush more often |
33 |
|
|
** since the structure will change often. Exceptions to this are large <pre> |
34 |
|
|
** sections, but then the append_buffer() routine will force a flush when the buffer |
35 |
|
|
** exceeds BUFFER_CHUNK_SIZE. |
36 |
|
|
** |
37 |
|
|
** The TXT buffer does flush after every chunk read. |
38 |
|
|
** |
39 |
|
|
** I doubt messing with any of these would change much... |
40 |
|
|
** |
41 |
|
|
** |
42 |
|
|
** TODO: |
43 |
|
|
** |
44 |
|
|
** - FileRules title (and all abort_parsing calls - define some constants) |
45 |
|
|
** |
46 |
|
|
** - There's a lot of mixing of xmlChar and char, which will generate warnings. |
47 |
|
|
** |
48 |
|
|
** - Add a fprop->orig_path (before ReplaceRules) and a directive BaseURI to be used |
49 |
|
|
** to fixup relative urls (if no <BASE>). |
50 |
|
|
** This would save space in the property file, but probably not enough to worry about. |
51 |
|
|
** |
52 |
|
|
** |
53 |
|
|
** - UndefinedMetaTags ignore might throw things like structure off since |
54 |
|
|
** processing continues (unlike IgnoreMetaTags). But everything should balance out. |
55 |
|
|
** |
56 |
|
|
** - There are two buffers that are created for every file, but these could be done once |
57 |
|
|
** and only expanded when needed. If that would make any difference in indexing speed. |
58 |
|
|
** |
59 |
|
|
** - Note that these parse_*() functions get passed a "buffer" which is not used |
60 |
|
|
** (to be compatible with old swish-e buffer-based parsers) |
61 |
|
|
** |
62 |
|
|
** - XML elements and attributes are all converted to lowercase. |
63 |
|
|
** |
64 |
|
|
*/ |
65 |
|
|
|
66 |
|
|
/* libxml2 */ |
67 |
|
|
#include <libxml/HTMLparser.h> |
68 |
|
|
#include <libxml/xmlerror.h> |
69 |
|
|
#include <libxml/uri.h> |
70 |
|
|
|
71 |
|
|
|
72 |
|
|
#include <stdarg.h> // for va_list |
73 |
|
|
#ifdef HAVE_VARARGS_H |
74 |
|
|
#include <varargs.h> // va_list on Win32 |
75 |
|
|
#endif |
76 |
|
|
#include "swish.h" |
77 |
|
|
#include "fs.h" // for the title check |
78 |
|
|
#include "merge.h" |
79 |
|
|
#include "mem.h" |
80 |
|
|
#include "string.h" |
81 |
|
|
#include "docprop.h" |
82 |
|
|
#include "error.h" |
83 |
|
|
#include "index.h" |
84 |
|
|
#include "metanames.h" |
85 |
|
|
|
86 |
|
|
|
87 |
|
|
/* Tunables -- these should really live in config.h */

/* Size of the buffers used to accumulate text */
#define BUFFER_CHUNK_SIZE 10000

/* Size of the chunks read from the stream (4096 seems to cause problems) */
#define READ_CHUNK_SIZE 2048
91 |
|
|
|
92 |
|
|
/*
 * CHAR_BUFFER - holds text that is being collected until an end tag
 * is found.
 */
typedef struct {
    char   *buffer;         /* the collected text */
    int     cur;            /* length of the text currently in buffer */
    int     max;            /* max size of buffer */
    int     defaultID;      /* default ID used when there are no meta names */
} CHAR_BUFFER;
100 |
|
|
|
101 |
|
|
|
102 |
|
|
|
103 |
|
|
/*
 * SUMMARY_INFO - state for the StoreDescription (document summary) feature.
 *
 * The property system could probably deal with StoreDescription in a
 * cleaner way; ideally this code would not need to know about it at all.
 */
typedef struct {
    struct metaEntry   *meta;       /* summary property, NULL when not in use */
    int                 save_size;  /* max size saved so it can be restored later */
    char               *tag;        /* summary tag */
    int                 active;     /* non-zero while inside the summary */
} SUMMARY_INFO;
112 |
|
|
|
113 |
|
|
/* Initial stack size -- the stack can grow beyond this */
#define STACK_SIZE 255
114 |
|
|
|
115 |
|
|
/*
 * MetaStackElement - one entry on a meta/property stack.
 *
 * NOTE(review): tag[] is declared with a single element; presumably the
 * code that creates elements allocates extra room for the full tag name.
 * Confirm against push_stack() before changing the declaration.
 */
typedef struct MetaStackElement {
    struct MetaStackElement    *next;       /* pointer to *siblings*, if any */
    struct metaEntry           *meta;       /* pointer to the meta that is in use */
    int                         ignore;     /* set when this meta turned on ignore */
    char                        tag[1];     /* tag to look for */
} MetaStackElement, *MetaStackElementPtr;
121 |
|
|
|
122 |
|
|
typedef struct { |
123 |
|
|
int pointer; // next empty slot in stack |
124 |
|
|
int maxsize; // size of stack |
125 |
|
|
int ignore_flag; // count of ignores |
126 |
|
|
MetaStackElementPtr *stack; // pointer to an array of stack data |
127 |
|
|
int is_meta; // is this a metaname or property stack? |
128 |
|
|
} MetaStack; |
129 |
|
|
|
130 |
|
|
|
131 |
|
|
|
132 |
|
|
|
133 |
|
|
|
134 |
|
|
|
135 |
|
|
/* This struct is returned in all call-back functions as user data */ |
136 |
|
|
|
137 |
|
|
typedef struct { |
138 |
|
|
CHAR_BUFFER text_buffer; // buffer for collecting text |
139 |
|
|
// CHAR_BUFFER prop_buffer; // someday, may want a separate property buffer if want to collect tags within props |
140 |
|
|
SUMMARY_INFO summary; // argh. |
141 |
|
|
MetaStack meta_stack; // stacks for tracking the nested metas |
142 |
|
|
MetaStack prop_stack; |
143 |
|
|
int total_words; |
144 |
|
|
int word_pos; |
145 |
|
|
int filenum; |
146 |
|
|
INDEXDATAHEADER *header; |
147 |
|
|
SWISH *sw; |
148 |
|
|
FileProp *fprop; |
149 |
|
|
FileRec *thisFileEntry; |
150 |
|
|
int structure[STRUCTURE_END+1]; |
151 |
|
|
int parsing_html; |
152 |
|
|
struct metaEntry *titleProp; |
153 |
|
|
struct metaEntry *titleMeta; |
154 |
|
|
struct metaEntry *swishdefaultMeta; |
155 |
|
|
int flush_word; // flag to flush buffer next time there's a white space. |
156 |
|
|
xmlSAXHandlerPtr SAXHandler; // for aborting, I guess. |
157 |
|
|
xmlParserCtxtPtr ctxt; |
158 |
|
|
CHAR_BUFFER ISO_Latin1; // buffer to hold UTF-8 -> ISO Latin-1 converted text |
159 |
|
|
int abort; // flag to stop parsing |
160 |
|
|
char *baseURL; // for fixing up relative links |
161 |
|
|
int swish_noindex; // swishindex swishnoindex -- for hiding blocks with comments |
162 |
|
|
} PARSE_DATA; |
163 |
|
|
|
164 |
|
|
|
165 |
|
|
/* Prototypes */ |
166 |
|
|
static void start_hndl(void *data, const char *el, const char **attr); |
167 |
|
|
static void end_hndl(void *data, const char *el); |
168 |
|
|
static void char_hndl(void *data, const char *txt, int txtlen); |
169 |
|
|
static void Whitespace(void *data, const xmlChar *txt, int txtlen); |
170 |
|
|
static void append_buffer( CHAR_BUFFER *buf, const char *txt, int txtlen ); |
171 |
|
|
static void flush_buffer( PARSE_DATA *parse_data, int clear ); |
172 |
|
|
static void comment_hndl(void *data, const char *txt); |
173 |
|
|
static char *isIgnoreMetaName(SWISH * sw, char *tag); |
174 |
|
|
static void error(void *data, const char *msg, ...); |
175 |
|
|
static void warning(void *data, const char *msg, ...); |
176 |
|
|
static void process_htmlmeta( PARSE_DATA *parse_data, const char ** attr ); |
177 |
|
|
static int check_html_tag( PARSE_DATA *parse_data, char * tag, int start ); |
178 |
|
|
static void start_metaTag( PARSE_DATA *parse_data, char * tag, char *endtag, int *meta_append, int *prop_append , int is_html_tag ); |
179 |
|
|
static void end_metaTag( PARSE_DATA *parse_data, char * tag, int is_html_tag ); |
180 |
|
|
static void init_sax_handler( xmlSAXHandlerPtr SAXHandler, SWISH * sw ); |
181 |
|
|
static void init_parse_data( PARSE_DATA *parse_data, SWISH * sw, FileProp * fprop, FileRec *fi, xmlSAXHandlerPtr SAXHandler ); |
182 |
|
|
static void free_parse_data( PARSE_DATA *parse_data ); |
183 |
|
|
static void Convert_to_latin1( PARSE_DATA *parse_data, char *txt, int txtlen ); |
184 |
|
|
static int parse_chunks( PARSE_DATA *parse_data ); |
185 |
|
|
|
186 |
|
|
static void index_alt_tab( PARSE_DATA *parse_data, const char **attr ); |
187 |
|
|
static char *extract_html_links( PARSE_DATA *parse_data, const char **attr, struct metaEntry *meta_entry, char *tag ); |
188 |
|
|
static int read_next_chunk( FileProp *fprop, char *buf, int buf_size, int max_size ); |
189 |
|
|
static void abort_parsing( PARSE_DATA *parse_data, int abort_code ); |
190 |
|
|
static int get_structure( PARSE_DATA *parse_data ); |
191 |
|
|
|
192 |
|
|
static void push_stack( MetaStack *stack, char *tag, struct metaEntry *meta, int *append, int ignore ); |
193 |
|
|
static int pop_stack_ifMatch( PARSE_DATA *parse_data, MetaStack *stack, char *tag ); |
194 |
|
|
static int pop_stack( MetaStack *stack ); |
195 |
|
|
|
196 |
|
|
static void index_XML_attributes( PARSE_DATA *parse_data, char *tag, const char **attr ); |
197 |
|
|
static int start_XML_ClassAttributes( PARSE_DATA *parse_data, char *tag, const char **attr, int *meta_append, int *prop_append ); |
198 |
|
|
static char *isXMLClassAttribute(SWISH * sw, char *tag); |
199 |
|
|
|
200 |
|
|
static void debug_show_tag( char *tag, PARSE_DATA *parse_data, int start, char *message ); |
201 |
|
|
static void debug_show_parsed_text( PARSE_DATA *parse_data, char *txt, int len ); |
202 |
|
|
|
203 |
|
|
|
204 |
|
|
|
205 |
|
|
/********************************************************************* |
206 |
|
|
* XML Push parser |
207 |
|
|
* |
208 |
|
|
* Returns: |
209 |
|
|
* Count of words indexed |
210 |
|
|
* |
211 |
|
|
* |
212 |
|
|
*********************************************************************/ |
213 |
|
|
|
214 |
|
|
int parse_XML(SWISH * sw, FileProp * fprop, FileRec *fi, char *buffer) |
215 |
|
|
|
216 |
|
|
{ |
217 |
|
|
xmlSAXHandler SAXHandlerStruct; |
218 |
|
|
xmlSAXHandlerPtr SAXHandler = &SAXHandlerStruct; |
219 |
|
|
PARSE_DATA parse_data; |
220 |
|
|
|
221 |
|
|
|
222 |
|
|
init_sax_handler( SAXHandler, sw ); |
223 |
|
|
init_parse_data( &parse_data, sw, fprop, fi, SAXHandler ); |
224 |
|
|
|
225 |
|
|
|
226 |
|
|
/* Now parse the XML file */ |
227 |
|
|
return parse_chunks( &parse_data ); |
228 |
|
|
|
229 |
|
|
} |
230 |
|
|
|
231 |
|
|
/********************************************************************* |
232 |
|
|
* HTML Push parser |
233 |
|
|
* |
234 |
|
|
* Returns: |
235 |
|
|
* Count of words indexed |
236 |
|
|
* |
237 |
|
|
*********************************************************************/ |
238 |
|
|
|
239 |
|
|
int parse_HTML(SWISH * sw, FileProp * fprop, FileRec *fi, char *buffer) |
240 |
|
|
{ |
241 |
|
|
htmlSAXHandler SAXHandlerStruct; |
242 |
|
|
htmlSAXHandlerPtr SAXHandler = &SAXHandlerStruct; |
243 |
|
|
PARSE_DATA parse_data; |
244 |
|
|
|
245 |
|
|
init_sax_handler( (xmlSAXHandlerPtr)SAXHandler, sw ); |
246 |
|
|
init_parse_data( &parse_data, sw, fprop, fi, (xmlSAXHandlerPtr)SAXHandler ); |
247 |
|
|
|
248 |
|
|
|
249 |
|
|
parse_data.parsing_html = 1; |
250 |
|
|
parse_data.titleProp = getPropNameByName( parse_data.header, AUTOPROPERTY_TITLE ); |
251 |
|
|
parse_data.titleMeta = getMetaNameByName( parse_data.header, AUTOPROPERTY_TITLE ); |
252 |
|
|
parse_data.swishdefaultMeta = getMetaNameByName( parse_data.header, AUTOPROPERTY_DEFAULT ); |
253 |
|
|
|
254 |
|
|
/* Now parse the HTML file */ |
255 |
|
|
return parse_chunks( &parse_data ); |
256 |
|
|
|
257 |
|
|
} |
258 |
|
|
|
259 |
|
|
/********************************************************************* |
260 |
|
|
* TXT "Push" parser |
261 |
|
|
* |
262 |
|
|
* Returns: |
263 |
|
|
* Count of words indexed |
264 |
|
|
* |
265 |
|
|
*********************************************************************/ |
266 |
|
|
|
267 |
|
|
int parse_TXT(SWISH * sw, FileProp * fprop, FileRec *fi, char *buffer) |
268 |
|
|
{ |
269 |
|
|
PARSE_DATA parse_data; |
270 |
|
|
int res; |
271 |
|
|
char chars[READ_CHUNK_SIZE]; |
272 |
|
|
|
273 |
|
|
|
274 |
|
|
|
275 |
|
|
/* This does stuff that's not needed for txt */ |
276 |
|
|
init_parse_data( &parse_data, sw, fprop, fi, NULL ); |
277 |
|
|
|
278 |
|
|
|
279 |
|
|
/* Document Summary */ |
280 |
|
|
if ( parse_data.summary.meta && parse_data.summary.meta->max_len ) |
281 |
|
|
parse_data.summary.active++; |
282 |
|
|
|
283 |
|
|
|
284 |
|
|
while ( (res = read_next_chunk( fprop, chars, READ_CHUNK_SIZE, sw->truncateDocSize )) ) |
285 |
|
|
{ |
286 |
|
|
append_buffer( &parse_data.text_buffer, chars, res ); |
287 |
|
|
flush_buffer( &parse_data, 0 ); // flush upto whitespace |
288 |
|
|
|
289 |
|
|
|
290 |
|
|
/* turn off summary when we exceed size */ |
291 |
|
|
if ( parse_data.summary.meta && parse_data.summary.meta->max_len && fprop->bytes_read > parse_data.summary.meta->max_len ) |
292 |
|
|
parse_data.summary.active = 0; |
293 |
|
|
|
294 |
|
|
} |
295 |
|
|
|
296 |
|
|
flush_buffer( &parse_data, 1 ); |
297 |
|
|
free_parse_data( &parse_data ); |
298 |
|
|
return parse_data.total_words; |
299 |
|
|
} |
300 |
|
|
|
301 |
|
|
|
302 |
|
|
/********************************************************************* |
303 |
|
|
* Parse chunks (used for both XML and HTML parsing) |
304 |
|
|
* Creates the parsers, reads in chunks as one might expect |
305 |
|
|
* |
306 |
|
|
* |
307 |
|
|
*********************************************************************/ |
308 |
|
|
static int parse_chunks( PARSE_DATA *parse_data ) |
309 |
|
|
{ |
310 |
|
|
SWISH *sw = parse_data->sw; |
311 |
|
|
FileProp *fprop = parse_data->fprop; |
312 |
|
|
xmlSAXHandlerPtr SAXHandler = parse_data->SAXHandler; |
313 |
|
|
int res; |
314 |
|
|
char chars[READ_CHUNK_SIZE]; |
315 |
|
|
xmlParserCtxtPtr ctxt; |
316 |
|
|
|
317 |
|
|
|
318 |
|
|
/* Now start pulling into the libxml2 parser */ |
319 |
|
|
|
320 |
|
|
res = read_next_chunk( fprop, chars, READ_CHUNK_SIZE, sw->truncateDocSize ); |
321 |
|
|
if (res == 0) |
322 |
|
|
return 0; |
323 |
|
|
|
324 |
|
|
/* Create parser */ |
325 |
|
|
if ( parse_data->parsing_html ) |
326 |
|
|
ctxt = (xmlParserCtxtPtr)htmlCreatePushParserCtxt((htmlSAXHandlerPtr)SAXHandler, parse_data, chars, res, fprop->real_path,0); |
327 |
|
|
else |
328 |
|
|
ctxt = xmlCreatePushParserCtxt(SAXHandler, parse_data, chars, res, fprop->real_path); |
329 |
|
|
|
330 |
|
|
parse_data->ctxt = ctxt; // save |
331 |
|
|
|
332 |
|
|
|
333 |
|
|
|
334 |
|
|
while ( !parse_data->abort && (res = read_next_chunk( fprop, chars, READ_CHUNK_SIZE, sw->truncateDocSize )) ) |
335 |
|
|
{ |
336 |
|
|
if ( parse_data->parsing_html ) |
337 |
|
|
htmlParseChunk((htmlParserCtxtPtr)ctxt, chars, res, 0); |
338 |
|
|
else |
339 |
|
|
xmlParseChunk(ctxt, chars, res, 0); |
340 |
|
|
|
341 |
|
|
/* Doesn't seem to make much difference to flush here */ |
342 |
|
|
//flush_buffer( parse_data, 0 ); // flush upto whitespace |
343 |
|
|
} |
344 |
|
|
|
345 |
|
|
|
346 |
|
|
|
347 |
|
|
/* Tell the parser we are done, and free it */ |
348 |
|
|
if ( parse_data->parsing_html ) |
349 |
|
|
{ |
350 |
|
|
if ( !parse_data->abort ) // bug in libxml 2.4.5 |
351 |
|
|
htmlParseChunk( (htmlParserCtxtPtr)ctxt, chars, 0, 1 ); |
352 |
|
|
htmlFreeParserCtxt( (htmlParserCtxtPtr)ctxt); |
353 |
|
|
} |
354 |
|
|
else |
355 |
|
|
{ |
356 |
|
|
if ( !parse_data->abort ) // bug in libxml |
357 |
|
|
xmlParseChunk(ctxt, chars, 0, 1); |
358 |
|
|
xmlFreeParserCtxt(ctxt); |
359 |
|
|
} |
360 |
|
|
|
361 |
|
|
/* Daniel Veillard on Nov 21, 2001 says this should not be called for every doc. */ |
362 |
|
|
// But, it probably should be called when done parsing. |
363 |
|
|
// xmlCleanupParser(); |
364 |
|
|
|
365 |
|
|
/* Check for abort condition set while parsing (isoktitle, NoContents) */ |
366 |
|
|
|
367 |
|
|
if ( parse_data->abort && fprop->index_no_content && !parse_data->total_words ) |
368 |
|
|
{ |
369 |
|
|
append_buffer( &parse_data->text_buffer, fprop->real_path, strlen(fprop->real_path) ); |
370 |
|
|
|
371 |
|
|
parse_data->meta_stack.ignore_flag = 0; /* make sure we can write */ |
372 |
|
|
flush_buffer( parse_data, 3 ); |
373 |
|
|
} |
374 |
|
|
|
375 |
|
|
|
376 |
|
|
/* Flush any text left in the buffer */ |
377 |
|
|
|
378 |
|
|
if ( !parse_data->abort ) |
379 |
|
|
flush_buffer( parse_data, 3 ); |
380 |
|
|
|
381 |
|
|
|
382 |
|
|
|
383 |
|
|
free_parse_data( parse_data ); |
384 |
|
|
|
385 |
|
|
|
386 |
|
|
// $$$ This doesn't work since the file (and maybe some words) already added |
387 |
|
|
// $$$ need a way to "remove" the file entry and words already added |
388 |
|
|
|
389 |
|
|
if ( parse_data->abort < 0 ) |
390 |
|
|
return parse_data->abort; |
391 |
|
|
|
392 |
|
|
return parse_data->total_words; |
393 |
|
|
} |
394 |
|
|
|
395 |
|
|
/********************************************************************* |
396 |
|
|
* read_next_chunk - read another chunk from the stream |
397 |
|
|
* |
398 |
|
|
* Call with: |
399 |
|
|
* fprop |
400 |
|
|
* *buf - where to save the data |
401 |
|
|
* *buf_size - max size of buffer |
402 |
|
|
* *max_size - limit of *total* bytes read from this stream (for truncate) |
403 |
|
|
* |
404 |
|
|
* Returns: |
405 |
|
|
* number of bytes read (as returned from fread) |
406 |
|
|
* |
407 |
|
|
* |
408 |
|
|
*********************************************************************/ |
409 |
|
|
static int read_next_chunk( FileProp *fprop, char *buf, int buf_size, int max_size ) |
410 |
|
|
{ |
411 |
|
|
int size; |
412 |
|
|
int res; |
413 |
|
|
|
414 |
|
|
if ( fprop->done ) |
415 |
|
|
return 0; |
416 |
|
|
|
417 |
|
|
/* For -S prog, only read in the right amount of data */ |
418 |
|
|
if ( fprop->external_program && (fprop->bytes_read >= fprop->fsize )) |
419 |
|
|
return 0; |
420 |
|
|
|
421 |
|
|
|
422 |
|
|
/* fprop->external_program is set if -S prog and NOT reading from a filter */ |
423 |
|
|
|
424 |
|
|
size = fprop->external_program && (( fprop->fsize - fprop->bytes_read ) < buf_size) |
425 |
|
|
? fprop->fsize - fprop->bytes_read |
426 |
|
|
: buf_size; |
427 |
|
|
|
428 |
|
|
if ( !fprop->bytes_read && size > 4 ) |
429 |
|
|
size = 4; |
430 |
|
|
|
431 |
|
|
|
432 |
|
|
|
433 |
|
|
/* Truncate -- safety feature from Rainer. No attempt is made to backup to a whole word */ |
434 |
|
|
if ( max_size && fprop->bytes_read + size > max_size ) |
435 |
|
|
{ |
436 |
|
|
fprop->done++; // flag that we are done |
437 |
|
|
size = max_size - fprop->bytes_read; |
438 |
|
|
} |
439 |
|
|
|
440 |
|
|
|
441 |
|
|
res = fread(buf, 1, size, fprop->fp); |
442 |
|
|
|
443 |
|
|
fprop->bytes_read += res; |
444 |
|
|
|
445 |
|
|
return res; |
446 |
|
|
} |
447 |
|
|
|
448 |
|
|
|
449 |
|
|
|
450 |
|
|
/********************************************************************* |
451 |
|
|
* Init a sax handler structure |
452 |
|
|
* Must pass in the structure |
453 |
|
|
* |
454 |
|
|
*********************************************************************/ |
455 |
|
|
static void init_sax_handler( xmlSAXHandlerPtr SAXHandler, SWISH * sw ) |
456 |
|
|
{ |
457 |
|
|
/* Set event handlers for libxml2 parser */ |
458 |
|
|
memset( SAXHandler, 0, sizeof( xmlSAXHandler ) ); |
459 |
|
|
|
460 |
|
|
SAXHandler->startElement = (startElementSAXFunc)&start_hndl; |
461 |
|
|
SAXHandler->endElement = (endElementSAXFunc)&end_hndl; |
462 |
|
|
SAXHandler->characters = (charactersSAXFunc)&char_hndl; |
463 |
|
|
SAXHandler->cdataBlock = (charactersSAXFunc)&char_hndl; |
464 |
|
|
SAXHandler->ignorableWhitespace = (ignorableWhitespaceSAXFunc)&Whitespace; |
465 |
|
|
|
466 |
|
|
SAXHandler->comment = (commentSAXFunc)&comment_hndl; |
467 |
|
|
|
468 |
|
|
if ( sw->parser_warn_level >= 1 ) |
469 |
|
|
SAXHandler->fatalError = (fatalErrorSAXFunc)&error; |
470 |
|
|
|
471 |
|
|
if ( sw->parser_warn_level >= 2 ) |
472 |
|
|
SAXHandler->error = (errorSAXFunc)&error; |
473 |
|
|
|
474 |
|
|
if ( sw->parser_warn_level >= 3 ) |
475 |
|
|
SAXHandler->warning = (warningSAXFunc)&warning; |
476 |
|
|
|
477 |
|
|
} |
478 |
|
|
|
479 |
|
|
|
480 |
|
|
/********************************************************************* |
481 |
|
|
* Init the parer data structure |
482 |
|
|
* Must pass in the structure |
483 |
|
|
* |
484 |
|
|
*********************************************************************/ |
485 |
|
|
static void init_parse_data( PARSE_DATA *parse_data, SWISH * sw, FileProp * fprop, FileRec *fi, xmlSAXHandlerPtr SAXHandler ) |
486 |
|
|
{ |
487 |
|
|
IndexFILE *indexf = sw->indexlist; |
488 |
|
|
struct StoreDescription *stordesc = fprop->stordesc; |
489 |
|
|
|
490 |
|
|
/* Set defaults */ |
491 |
|
|
memset( parse_data, 0, sizeof(PARSE_DATA)); |
492 |
|
|
|
493 |
|
|
parse_data->header = &indexf->header; |
494 |
|
|
parse_data->sw = sw; |
495 |
|
|
parse_data->fprop = fprop; |
496 |
|
|
parse_data->filenum = fi->filenum; |
497 |
|
|
parse_data->word_pos = 1; /* compress doesn't like zero */ |
498 |
|
|
parse_data->SAXHandler = SAXHandler; |
499 |
|
|
parse_data->thisFileEntry = fi; |
500 |
|
|
|
501 |
|
|
|
502 |
|
|
/* Don't really like this, as mentioned above */ |
503 |
|
|
if ( stordesc && (parse_data->summary.meta = getPropNameByName(parse_data->header, AUTOPROPERTY_SUMMARY))) |
504 |
|
|
{ |
505 |
|
|
/* Set property limit size for this document type, and store previous size limit */ |
506 |
|
|
parse_data->summary.save_size = parse_data->summary.meta->max_len; |
507 |
|
|
parse_data->summary.meta->max_len = stordesc->size; |
508 |
|
|
parse_data->summary.tag = stordesc->field; |
509 |
|
|
if ( parse_data->summary.tag ) |
510 |
|
|
strtolower(parse_data->summary.tag); |
511 |
|
|
} |
512 |
|
|
|
513 |
|
|
|
514 |
|
|
/* Initialize the meta and property stacks */ |
515 |
|
|
/* Not needed for TXT processing, of course */ |
516 |
|
|
{ |
517 |
|
|
MetaStack *s; |
518 |
|
|
|
519 |
|
|
s = &parse_data->meta_stack; |
520 |
|
|
s->is_meta = 1; |
521 |
|
|
s->maxsize = STACK_SIZE; |
522 |
|
|
|
523 |
|
|
s->stack = (MetaStackElementPtr *)emalloc( sizeof( MetaStackElementPtr ) * s->maxsize ); |
524 |
|
|
if ( fprop->index_no_content ) |
525 |
|
|
s->ignore_flag++; |
526 |
|
|
|
527 |
|
|
s = &parse_data->prop_stack; |
528 |
|
|
s->is_meta = 0; |
529 |
|
|
s->maxsize = STACK_SIZE; |
530 |
|
|
s->stack = (MetaStackElementPtr *)emalloc( sizeof( MetaStackElementPtr ) * s->maxsize ); |
531 |
|
|
if ( fprop->index_no_content ) /* only works for HTML */ |
532 |
|
|
s->ignore_flag++; |
533 |
|
|
} |
534 |
|
|
|
535 |
|
|
addCommonProperties(sw, fprop, fi, NULL, NULL, 0); |
536 |
|
|
} |
537 |
|
|
|
538 |
|
|
|
539 |
|
|
/********************************************************************* |
540 |
|
|
* Free any data used by the parse_data struct |
541 |
|
|
* |
542 |
|
|
*********************************************************************/ |
543 |
|
|
static void free_parse_data( PARSE_DATA *parse_data ) |
544 |
|
|
{ |
545 |
|
|
|
546 |
|
|
if ( parse_data->ISO_Latin1.buffer ) |
547 |
|
|
efree( parse_data->ISO_Latin1.buffer ); |
548 |
|
|
|
549 |
|
|
if ( parse_data->text_buffer.buffer ) |
550 |
|
|
efree( parse_data->text_buffer.buffer ); |
551 |
|
|
|
552 |
|
|
if ( parse_data->baseURL ) |
553 |
|
|
efree( parse_data->baseURL ); |
554 |
|
|
|
555 |
|
|
|
556 |
|
|
/* Pop the stacks */ |
557 |
|
|
while( pop_stack( &parse_data->meta_stack ) ); |
558 |
|
|
while( pop_stack( &parse_data->prop_stack ) ); |
559 |
|
|
|
560 |
|
|
/* Free the stacks */ |
561 |
|
|
if ( parse_data->meta_stack.stack ) |
562 |
|
|
efree( parse_data->meta_stack.stack ); |
563 |
|
|
|
564 |
|
|
if ( parse_data->prop_stack.stack ) |
565 |
|
|
efree( parse_data->prop_stack.stack ); |
566 |
|
|
|
567 |
|
|
|
568 |
|
|
|
569 |
|
|
/* Restore the size in the StoreDescription property */ |
570 |
|
|
if ( parse_data->summary.save_size ) |
571 |
|
|
parse_data->summary.meta->max_len = parse_data->summary.save_size; |
572 |
|
|
|
573 |
|
|
} |
574 |
|
|
|
575 |
|
|
/********************************************************************* |
576 |
|
|
* Start Tag Event Handler |
577 |
|
|
* |
578 |
|
|
* This is called by libxml2. It normally just calls start_metaTag() |
579 |
|
|
* and that decides how to deal with that meta tag. |
580 |
|
|
* It also converts <meta> and <tag class=foo> into meta tags as swish |
581 |
|
|
* would expect them (and then calls start_metaTag(). |
582 |
|
|
* |
583 |
|
|
* To Do: |
584 |
|
|
* deal with attributes! |
585 |
|
|
* |
586 |
|
|
*********************************************************************/ |
587 |
|
|
|
588 |
|
|
|
589 |
|
|
static void start_hndl(void *data, const char *el, const char **attr) |
590 |
|
|
{ |
591 |
|
|
PARSE_DATA *parse_data = (PARSE_DATA *)data; |
592 |
|
|
char tag[MAXSTRLEN + 1]; |
593 |
|
|
int is_html_tag = 0; // to allow <foo> type of meta tags in HTML |
594 |
|
|
int meta_append = 0; // used to allow siblings metanames |
595 |
|
|
int prop_append = 0; |
596 |
|
|
|
597 |
|
|
|
598 |
|
|
/* disabeld by a comment? */ |
599 |
|
|
if ( parse_data->swish_noindex ) |
600 |
|
|
return; |
601 |
|
|
|
602 |
|
|
if(strlen(el) >= MAXSTRLEN) // easy way out |
603 |
|
|
{ |
604 |
|
|
warning("Warning: Tag found in %s is too long: '%s'\n", parse_data->fprop->real_path, el ); |
605 |
|
|
return; |
606 |
|
|
} |
607 |
|
|
|
608 |
|
|
strcpy(tag,(char *)el); |
609 |
|
|
strtolower( tag ); // xml? |
610 |
|
|
|
611 |
|
|
|
612 |
|
|
if ( parse_data->parsing_html ) |
613 |
|
|
{ |
614 |
|
|
|
615 |
|
|
/* handle <meta name="metaname" content="foo"> */ |
616 |
|
|
if ( (strcmp( tag, "meta") == 0) && attr ) |
617 |
|
|
{ |
618 |
|
|
process_htmlmeta( parse_data, attr ); |
619 |
|
|
return; |
620 |
|
|
} |
621 |
|
|
|
622 |
|
|
|
623 |
|
|
/* Deal with structure */ |
624 |
|
|
if ( (is_html_tag = check_html_tag( parse_data, tag, 1 )) ) |
625 |
|
|
{ |
626 |
|
|
/** Special handling for <A>, <IMG>, and <BASE> tags **/ |
627 |
|
|
|
628 |
|
|
/* Extract out links - currently only keep <a> links */ |
629 |
|
|
if ( strcmp( tag, "a") == 0 ) |
630 |
|
|
extract_html_links( parse_data, attr, parse_data->sw->links_meta, "href" ); |
631 |
|
|
|
632 |
|
|
|
633 |
|
|
/* Extract out links from images */ |
634 |
|
|
else if ( strcmp( tag, "img") == 0 ) |
635 |
|
|
{ |
636 |
|
|
if (parse_data->sw->IndexAltTag) |
637 |
|
|
index_alt_tab( parse_data, attr ); |
638 |
|
|
|
639 |
|
|
extract_html_links( parse_data, attr, parse_data->sw->images_meta, "src" ); |
640 |
|
|
} |
641 |
|
|
|
642 |
|
|
|
643 |
|
|
/* Extract out the BASE URL for fixups */ |
644 |
|
|
else if ( strcmp( tag, "base") == 0 ) |
645 |
|
|
parse_data->baseURL = estrdup( extract_html_links( parse_data, attr, NULL, "href" ) ); |
646 |
|
|
} |
647 |
|
|
|
648 |
|
|
} |
649 |
|
|
|
650 |
|
|
|
651 |
|
|
/* Now check if we are in a meta tag */ |
652 |
|
|
start_metaTag( parse_data, tag, tag, &meta_append, &prop_append, is_html_tag ); |
653 |
|
|
|
654 |
|
|
|
655 |
|
|
|
656 |
|
|
/* Index the content of attributes */ |
657 |
|
|
|
658 |
|
|
if ( !parse_data->parsing_html && attr ) |
659 |
|
|
{ |
660 |
|
|
int class_found = 0; |
661 |
|
|
|
662 |
|
|
/* Allow <foo class="bar"> to look like <foo.bar> */ |
663 |
|
|
|
664 |
|
|
if ( parse_data->sw->XMLClassAttributes ) |
665 |
|
|
class_found = start_XML_ClassAttributes( parse_data, tag, attr, &meta_append, &prop_append ); |
666 |
|
|
|
667 |
|
|
|
668 |
|
|
/* Index XML attributes */ |
669 |
|
|
|
670 |
|
|
if ( !class_found && parse_data->sw->UndefinedXMLAttributes != UNDEF_META_DISABLE ) |
671 |
|
|
index_XML_attributes( parse_data, tag, attr ); |
672 |
|
|
} |
673 |
|
|
|
674 |
|
|
} |
675 |
|
|
|
676 |
|
|
|
677 |
|
|
|
678 |
|
|
|
679 |
|
|
|
680 |
|
|
/********************************************************************* |
681 |
|
|
* End Tag Event Handler |
682 |
|
|
* |
683 |
|
|
* Called by libxml2. |
684 |
|
|
* |
685 |
|
|
* |
686 |
|
|
* |
687 |
|
|
*********************************************************************/ |
688 |
|
|
|
689 |
|
|
|
690 |
|
|
static void end_hndl(void *data, const char *el) |
691 |
|
|
{ |
692 |
|
|
PARSE_DATA *parse_data = (PARSE_DATA *)data; |
693 |
|
|
char tag[MAXSTRLEN + 1]; |
694 |
|
|
int is_html_tag = 0; // to allow <foo> type of metatags in html. |
695 |
|
|
|
696 |
|
|
|
697 |
|
|
/* disabeld by a comment? */ |
698 |
|
|
if ( parse_data->swish_noindex ) |
699 |
|
|
return; |
700 |
|
|
|
701 |
|
|
if(strlen(el) > MAXSTRLEN) |
702 |
|
|
{ |
703 |
|
|
warning("Warning: Tag found in %s is too long: '%s'\n", parse_data->fprop->real_path, el ); |
704 |
|
|
return; |
705 |
|
|
} |
706 |
|
|
|
707 |
|
|
strcpy(tag,(char *)el); |
708 |
|
|
strtolower( tag ); |
709 |
|
|
|
710 |
|
|
|
711 |
|
|
|
712 |
|
|
if ( parse_data->parsing_html ) |
713 |
|
|
{ |
714 |
|
|
|
715 |
|
|
/* <meta> tags are closed in start_hndl */ |
716 |
|
|
|
717 |
|
|
if ( (strcmp( tag, "meta") == 0) ) |
718 |
|
|
return; // this was flushed at end tag |
719 |
|
|
|
720 |
|
|
|
721 |
|
|
|
722 |
|
|
/* Deal with structure */ |
723 |
|
|
is_html_tag = check_html_tag( parse_data, tag, 0 ); |
724 |
|
|
} |
725 |
|
|
|
726 |
|
|
|
727 |
|
|
end_metaTag( parse_data, tag, is_html_tag ); |
728 |
|
|
} |
729 |
|
|
|
730 |
|
|
|
731 |
|
|
|
732 |
|
|
/********************************************************************* |
733 |
|
|
* Character Data Event Handler |
734 |
|
|
* |
735 |
|
|
* This does the actual adding of text to the index and adding properties |
736 |
|
|
* if any tags have been found to index |
737 |
|
|
* |
738 |
|
|
* |
739 |
|
|
*********************************************************************/ |
740 |
|
|
|
741 |
|
|
static void char_hndl(void *data, const char *txt, int txtlen) |
742 |
|
|
{ |
743 |
|
|
PARSE_DATA *parse_data = (PARSE_DATA *)data; |
744 |
|
|
|
745 |
|
|
|
746 |
|
|
/* Have we been disabled? */ |
747 |
|
|
if ( !parse_data->SAXHandler->characters ) |
748 |
|
|
return; |
749 |
|
|
|
750 |
|
|
/* disabeld by a comment? */ |
751 |
|
|
if ( parse_data->swish_noindex ) |
752 |
|
|
return; |
753 |
|
|
|
754 |
|
|
|
755 |
|
|
/* If currently in an ignore block, then return */ |
756 |
|
|
if ( parse_data->meta_stack.ignore_flag && parse_data->prop_stack.ignore_flag ) |
757 |
|
|
return; |
758 |
|
|
|
759 |
|
|
/* $$$ this was added to limit the buffer size */ |
760 |
|
|
if ( parse_data->text_buffer.cur + txtlen >= BUFFER_CHUNK_SIZE ) |
761 |
|
|
flush_buffer( parse_data, 0 ); // flush upto last word - somewhat expensive |
762 |
|
|
|
763 |
|
|
|
764 |
|
|
|
765 |
|
|
Convert_to_latin1( parse_data, (char *)txt, txtlen ); |
766 |
|
|
|
767 |
|
|
|
768 |
|
|
if ( DEBUG_MASK & DEBUG_PARSED_TEXT ) |
769 |
|
|
debug_show_parsed_text( parse_data, parse_data->ISO_Latin1.buffer, parse_data->ISO_Latin1.cur ); |
770 |
|
|
|
771 |
|
|
|
772 |
|
|
|
773 |
|
|
|
774 |
|
|
/* Check if we are waiting for a word boundry, and there is white space in the text */ |
775 |
|
|
/* If so, write the word, then reset the structure, then write the rest of the text. */ |
776 |
|
|
|
777 |
|
|
if ( parse_data->flush_word ) |
778 |
|
|
{ |
779 |
|
|
/* look for whitespace */ |
780 |
|
|
char *c = parse_data->ISO_Latin1.buffer; |
781 |
|
|
int i; |
782 |
|
|
for ( i=0; i < parse_data->ISO_Latin1.cur; i++ ) |
783 |
|
|
if ( isspace( (int)c[i] ) ) |
784 |
|
|
{ |
785 |
|
|
append_buffer( &parse_data->text_buffer, parse_data->ISO_Latin1.buffer, i ); |
786 |
|
|
flush_buffer( parse_data, 1 ); // Flush the entire buffer |
787 |
|
|
|
788 |
|
|
parse_data->structure[parse_data->flush_word-1]--; // now it's ok to turn of the structure bit |
789 |
|
|
parse_data->flush_word = 0; |
790 |
|
|
|
791 |
|
|
/* flush the rest */ |
792 |
|
|
append_buffer( &parse_data->text_buffer, &c[i], parse_data->ISO_Latin1.cur - i ); |
793 |
|
|
|
794 |
|
|
return; |
795 |
|
|
} |
796 |
|
|
} |
797 |
|
|
|
798 |
|
|
|
799 |
|
|
|
800 |
|
|
/* Buffer the text */ |
801 |
|
|
append_buffer( &parse_data->text_buffer, parse_data->ISO_Latin1.buffer, parse_data->ISO_Latin1.cur ); |
802 |
|
|
|
803 |
|
|
/* Some day, might want to have a separate property buffer if need to collect more than plain text */ |
804 |
|
|
// append_buffer( &parse_data->prop_buffer, txt, txtlen ); |
805 |
|
|
|
806 |
|
|
|
807 |
|
|
|
808 |
|
|
} |
809 |
|
|
|
810 |
|
|
/********************************************************************* |
811 |
|
|
* ignorableWhitespace handler |
812 |
|
|
* |
813 |
|
|
* Just adds a space to the buffer |
814 |
|
|
* |
815 |
|
|
* |
816 |
|
|
*********************************************************************/ |
817 |
|
|
|
818 |
|
|
static void Whitespace(void *data, const xmlChar *txt, int txtlen) |
819 |
|
|
{ |
820 |
|
|
PARSE_DATA *parse_data = (PARSE_DATA *)data; |
821 |
|
|
|
822 |
|
|
append_buffer( &parse_data->text_buffer, " ", 1 ); // could flush buffer, I suppose |
823 |
|
|
} |
824 |
|
|
|
825 |
|
|
|
826 |
|
|
|
827 |
|
|
|
828 |
|
|
/********************************************************************* |
829 |
|
|
* Convert UTF-8 to Latin-1 |
830 |
|
|
* |
831 |
|
|
* Buffer is extended/created if needed |
832 |
|
|
* |
833 |
|
|
*********************************************************************/ |
834 |
|
|
|
835 |
|
|
static void Convert_to_latin1( PARSE_DATA *parse_data, char *txt, int txtlen ) |
836 |
|
|
{ |
837 |
|
|
CHAR_BUFFER *buf = &parse_data->ISO_Latin1; |
838 |
|
|
int inlen = txtlen; |
839 |
|
|
int ret; |
840 |
|
|
char *start_buf; |
841 |
|
|
char *end_buf = txt + txtlen - 1; |
842 |
|
|
int used; |
843 |
|
|
|
844 |
|
|
|
845 |
|
|
/* (re)allocate buf if needed */ |
846 |
|
|
|
847 |
|
|
if ( txtlen >= buf->max ) |
848 |
|
|
{ |
849 |
|
|
buf->max = ( buf->max + BUFFER_CHUNK_SIZE+1 < txtlen ) |
850 |
|
|
? buf->max + txtlen+1 |
851 |
|
|
: buf->max + BUFFER_CHUNK_SIZE+1; |
852 |
|
|
|
853 |
|
|
buf->buffer = erealloc( buf->buffer, buf->max ); |
854 |
|
|
} |
855 |
|
|
|
856 |
|
|
buf->cur = 0; /* start at the beginning of the buffer */ |
857 |
|
|
|
858 |
|
|
while( 1 ) |
859 |
|
|
{ |
860 |
|
|
used = buf->max - buf->cur; /* size available in buffer */ |
861 |
|
|
start_buf = &buf->buffer[buf->cur]; /* offset into buffer */ |
862 |
|
|
|
863 |
|
|
/* Returns 0 for OK */ |
864 |
|
|
ret = UTF8Toisolat1( (unsigned char *)start_buf, &used, (const unsigned char *)txt, &inlen ); |
865 |
|
|
|
866 |
|
|
if ( used > 0 ) // tally up total bytes consumed |
867 |
|
|
buf->cur += used; |
868 |
|
|
|
869 |
|
|
if ( ret == 0 ) // all done |
870 |
|
|
return; |
871 |
|
|
|
872 |
|
|
if ( ret == -2 ) // encoding failed |
873 |
|
|
{ |
874 |
|
|
if ( parse_data->sw->parser_warn_level >= 1 ) |
875 |
|
|
xmlParserWarning(parse_data->ctxt, "Failed to convert internal UTF-8 to Latin-1.\nReplacing non ISO-8859-1 char with char '%c'\n", ENCODE_ERROR_CHAR); |
876 |
|
|
|
877 |
|
|
|
878 |
|
|
buf->buffer[buf->cur++] = ENCODE_ERROR_CHAR; |
879 |
|
|
|
880 |
|
|
|
881 |
|
|
/* Skip one UTF-8 character -- returns null if not pointing to a UTF-8 char */ |
882 |
|
|
if ( !(txt = (char *)xmlUTF8Strpos( (const xmlChar *)(&txt[inlen]), 1) )) |
883 |
|
|
return; |
884 |
|
|
|
885 |
|
|
/* Calculate the remaining length of the input string */ |
886 |
|
|
inlen = (unsigned long)end_buf - (unsigned long)txt + 1; |
887 |
|
|
|
888 |
|
|
if ( inlen <= 0 ) |
889 |
|
|
return; |
890 |
|
|
|
891 |
|
|
start_buf += buf->cur-1; |
892 |
|
|
} |
893 |
|
|
else |
894 |
|
|
{ |
895 |
|
|
xmlParserWarning(parse_data->ctxt, "Error '%d' converting internal UTF-8 to Latin-1.\n", ret ); |
896 |
|
|
return; |
897 |
|
|
} |
898 |
|
|
} |
899 |
|
|
} |
900 |
|
|
|
901 |
|
|
|
902 |
|
|
/********************************************************************* |
903 |
|
|
* Start of a MetaTag |
904 |
|
|
* All XML tags are metatags, but for HTML there's special handling. |
905 |
|
|
* |
906 |
|
|
* Call with: |
907 |
|
|
* parse_data |
908 |
|
|
* tag = tag to look for as a metaname/property |
909 |
|
|
* endtag = tag to look for as the ending tag (since might be different from start tag) |
910 |
|
|
* meta_append = if zero, tells push that this is a new meta |
911 |
|
|
* prop_append otherwise, says it's a sibling of a previous call |
912 |
|
|
* (Argh Jan 29, 2001 -- now I don't remember what that _append does!) |
913 |
|
|
* (it's for working with xml attributes) |
914 |
|
|
* is_html_tag = prevents UndefinedMetaTags from being applied to html tags |
915 |
|
|
* |
916 |
|
|
* <foo class=bar> can start two meta tags "foo" and "foo.bar". But "bar" |
917 |
|
|
* will end both tags. |
918 |
|
|
* |
919 |
|
|
* |
920 |
|
|
*********************************************************************/ |
921 |
|
|
static void start_metaTag( PARSE_DATA *parse_data, char * tag, char *endtag, int *meta_append, int *prop_append, int is_html_tag ) |
922 |
|
|
{ |
923 |
|
|
SWISH *sw = parse_data->sw; |
924 |
|
|
struct metaEntry *m = NULL; |
925 |
|
|
|
926 |
|
|
|
927 |
|
|
/* Bump on all meta names, unless overridden */ |
928 |
|
|
if (!is_html_tag && !isDontBumpMetaName(sw->dontbumpstarttagslist, tag)) |
929 |
|
|
parse_data->word_pos++; |
930 |
|
|
|
931 |
|
|
|
932 |
|
|
/* check for ignore tag (should probably remove char handler for speed) */ |
933 |
|
|
// Should specific property names and meta names override this? |
934 |
|
|
|
935 |
|
|
if ( isIgnoreMetaName( sw, tag ) ) |
936 |
|
|
{ |
937 |
|
|
/* shouldn't need to flush buffer since it's just blocking out a section and should be balanced */ |
938 |
|
|
/* but need to due to the weird way the char buffer is used (and shared with props) and how metatags are assigned to the buffer */ |
939 |
|
|
/* basically, since flush_buffer looks at the ignore flag and always clears the buffer, need to do it now */ |
940 |
|
|
/* flush_buffer really should not be in the business of checking the ignore flag, and rather we need to keep two buffers -- or maybe just always flush with any change */ |
941 |
|
|
|
942 |
|
|
flush_buffer( parse_data, 1 ); |
943 |
|
|
|
944 |
|
|
push_stack( &parse_data->meta_stack, endtag, NULL, meta_append, 1 ); |
945 |
|
|
push_stack( &parse_data->prop_stack, endtag, NULL, prop_append, 1 ); |
946 |
|
|
parse_data->structure[IN_META_BIT]++; // so we are in balance with pop_stack |
947 |
|
|
return; |
948 |
|
|
} |
949 |
|
|
|
950 |
|
|
|
951 |
|
|
/* Check for metaNames */ |
952 |
|
|
|
953 |
|
|
if ( !(m = getMetaNameByName( parse_data->header, tag)) ) |
954 |
|
|
{ |
955 |
|
|
|
956 |
|
|
if ( !is_html_tag ) |
957 |
|
|
{ |
958 |
|
|
if ( sw->UndefinedMetaTags == UNDEF_META_AUTO ) |
959 |
|
|
{ |
960 |
|
|
if (sw->verbose) |
961 |
|
|
printf("**Adding automatic MetaName '%s' found in file '%s'\n", tag, parse_data->fprop->real_path); |
962 |
|
|
|
963 |
|
|
m = addMetaEntry( parse_data->header, tag, META_INDEX, 0); |
964 |
|
|
} |
965 |
|
|
|
966 |
|
|
|
967 |
|
|
else if ( sw->UndefinedMetaTags == UNDEF_META_IGNORE ) /* Ignore this block of text for metanames only (props ok) */ |
968 |
|
|
{ |
969 |
|
|
flush_buffer( parse_data, 66 ); // flush because we must still continue to process, and structures might change |
970 |
|
|
push_stack( &parse_data->meta_stack, endtag, NULL, meta_append, 1 ); |
971 |
|
|
parse_data->structure[IN_META_BIT]++; // so we are in balance with pop_stack |
972 |
|
|
/* must fall though to property check */ |
973 |
|
|
} |
974 |
|
|
} |
975 |
|
|
} |
976 |
|
|
|
977 |
|
|
|
978 |
|
|
|
979 |
|
|
if ( m ) /* Is a meta name */ |
980 |
|
|
{ |
981 |
|
|
flush_buffer( parse_data, 6 ); /* new meta tag, so must flush */ |
982 |
|
|
push_stack( &parse_data->meta_stack, endtag, m, meta_append, 0 ); |
983 |
|
|
parse_data->structure[IN_META_BIT]++; |
984 |
|
|
} |
985 |
|
|
|
986 |
|
|
else if ( !is_html_tag ) |
987 |
|
|
{ |
988 |
|
|
/* If set to "error" on undefined meta tags, then error */ |
989 |
|
|
if ( sw->UndefinedMetaTags == UNDEF_META_ERROR ) |
990 |
|
|
progerr("Found meta name '%s' in file '%s', not listed as a MetaNames in config", tag, parse_data->fprop->real_path); |
991 |
|
|
|
992 |
|
|
else if ( DEBUG_MASK & DEBUG_PARSED_TAGS ) |
993 |
|
|
debug_show_tag( tag, parse_data, 1, "(undefined meta name - no action)" ); |
994 |
|
|
} |
995 |
|
|
|
996 |
|
|
|
997 |
|
|
/* Check property names -- allows HTML tags as property names */ |
998 |
|
|
|
999 |
|
|
|
1000 |
|
|
if ( (m = getPropNameByName( parse_data->header, tag)) ) |
1001 |
|
|
{ |
1002 |
|
|
flush_buffer( parse_data, 7 ); // flush since it's a new meta tag |
1003 |
|
|
push_stack( &parse_data->prop_stack, endtag, m, prop_append, 0 ); |
1004 |
|
|
} |
1005 |
|
|
|
1006 |
|
|
|
1007 |
|
|
|
1008 |
|
|
/* Look to enable StoreDescription - allow any tag */ |
1009 |
|
|
/* Don't need to flush since this has it's own buffer */ |
1010 |
|
|
|
1011 |
|
|
// This should really be a property, and use aliasing as needed |
1012 |
|
|
{ |
1013 |
|
|
SUMMARY_INFO *summary = &parse_data->summary; |
1014 |
|
|
|
1015 |
|
|
if ( summary->tag && (strcmp( tag, summary->tag ) == 0 )) |
1016 |
|
|
{ |
1017 |
|
|
/* Flush data in buffer */ |
1018 |
|
|
if ( 0 == summary->active ) |
1019 |
|
|
flush_buffer( parse_data, 1 ); |
1020 |
|
|
|
1021 |
|
|
summary->active++; |
1022 |
|
|
} |
1023 |
|
|
} |
1024 |
|
|
|
1025 |
|
|
} |
1026 |
|
|
|
1027 |
|
|
|
1028 |
|
|
/********************************************************************* |
1029 |
|
|
* End of a MetaTag |
1030 |
|
|
* All XML tags are metatags, but for HTML there's special handling. |
1031 |
|
|
* |
1032 |
|
|
*********************************************************************/ |
1033 |
|
|
static void end_metaTag( PARSE_DATA *parse_data, char * tag, int is_html_tag ) |
1034 |
|
|
{ |
1035 |
|
|
|
1036 |
|
|
if ( pop_stack_ifMatch( parse_data, &parse_data->meta_stack, tag ) ) |
1037 |
|
|
parse_data->structure[IN_META_BIT]--; |
1038 |
|
|
|
1039 |
|
|
|
1040 |
|
|
/* Out of a property? */ |
1041 |
|
|
pop_stack_ifMatch( parse_data, &parse_data->prop_stack, tag ); |
1042 |
|
|
|
1043 |
|
|
|
1044 |
|
|
/* Don't allow matching across tag boundry */ |
1045 |
|
|
if (!is_html_tag && !isDontBumpMetaName(parse_data->sw->dontbumpendtagslist, tag)) |
1046 |
|
|
parse_data->word_pos++; |
1047 |
|
|
|
1048 |
|
|
|
1049 |
|
|
|
1050 |
|
|
/* Look to disable StoreDescription */ |
1051 |
|
|
{ |
1052 |
|
|
SUMMARY_INFO *summary = &parse_data->summary; |
1053 |
|
|
if ( summary->tag && (strcasecmp( tag, summary->tag ) == 0 )) |
1054 |
|
|
{ |
1055 |
|
|
/* Flush data in buffer */ |
1056 |
|
|
if ( 1 == summary->active ) |
1057 |
|
|
flush_buffer( parse_data, 1 ); // do first since flush buffer looks at summary->active |
1058 |
|
|
|
1059 |
|
|
summary->active--; |
1060 |
|
|
} |
1061 |
|
|
} |
1062 |
|
|
|
1063 |
|
|
} |
1064 |
|
|
|
1065 |
|
|
|
1066 |
|
|
/********************************************************************* |
1067 |
|
|
* Checks the HTML tag, and sets the "structure" |
1068 |
|
|
* Also deals with FileRules title |
1069 |
|
|
* In general, flushes the character buffer due to the change in structure. |
1070 |
|
|
* |
1071 |
|
|
* returns false if not a valid HTML tag (which might be a "fake" metaname) |
1072 |
|
|
* |
1073 |
|
|
*********************************************************************/ |
1074 |
|
|
|
1075 |
|
|
static int check_html_tag( PARSE_DATA *parse_data, char * tag, int start ) |
1076 |
|
|
{ |
1077 |
|
|
int is_html_tag = 1; |
1078 |
|
|
int bump = start ? +1 : -1; |
1079 |
|
|
|
1080 |
|
|
/* Check for structure bits */ |
1081 |
|
|
|
1082 |
|
|
|
1083 |
|
|
/** HEAD **/ |
1084 |
|
|
|
1085 |
|
|
if ( strcmp( tag, "head" ) == 0 ) |
1086 |
|
|
{ |
1087 |
|
|
flush_buffer( parse_data, 10 ); |
1088 |
|
|
parse_data->structure[IN_HEAD_BIT] += bump; |
1089 |
|
|
|
1090 |
|
|
/* Check for NoContents - can quit looking now once out of <head> block */ |
1091 |
|
|
|
1092 |
|
|
if ( !start && parse_data->fprop->index_no_content ) |
1093 |
|
|
abort_parsing( parse_data, 1 ); |
1094 |
|
|
|
1095 |
|
|
} |
1096 |
|
|
|
1097 |
|
|
|
1098 |
|
|
|
1099 |
|
|
/** TITLE **/ |
1100 |
|
|
|
1101 |
|
|
// Note: I think storing the title words by default should be optional. |
1102 |
|
|
// Someone might not want to search title tags, if if they don't they are |
1103 |
|
|
// screwed since title by default ranks higher than body words. |
1104 |
|
|
|
1105 |
|
|
|
1106 |
|
|
else if ( strcmp( tag, "title" ) == 0 ) |
1107 |
|
|
{ |
1108 |
|
|
/* Can't flush buffer until we have looked at the title */ |
1109 |
|
|
|
1110 |
|
|
if ( !start ) |
1111 |
|
|
{ |
1112 |
|
|
struct MOD_FS *fs = parse_data->sw->FS; |
1113 |
|
|
|
1114 |
|
|
/* Check isoktitle - before NoContents? */ |
1115 |
|
|
if ( match_regex_list( parse_data->text_buffer.buffer, fs->filerules.title) ) |
1116 |
|
|
{ |
1117 |
|
|
abort_parsing( parse_data, -2 ); |
1118 |
|
|
return 1; |
1119 |
|
|
} |
1120 |
|
|
|
1121 |
|
|
/* Check for NoContents - abort since all we need is the title text */ |
1122 |
|
|
if ( parse_data->fprop->index_no_content ) |
1123 |
|
|
abort_parsing( parse_data, 1 ); |
1124 |
|
|
|
1125 |
|
|
|
1126 |
|
|
} |
1127 |
|
|
else |
1128 |
|
|
/* In start tag, allow capture of text (NoContents sets ignore_flag at start) */ |
1129 |
|
|
if ( parse_data->fprop->index_no_content ) |
1130 |
|
|
parse_data->meta_stack.ignore_flag--; |
1131 |
|
|
|
1132 |
|
|
|
1133 |
|
|
/* Now it's ok to flush */ |
1134 |
|
|
flush_buffer( parse_data, 11 ); |
1135 |
|
|
|
1136 |
|
|
|
1137 |
|
|
/* If title is a property, turn on the property flag */ |
1138 |
|
|
if ( parse_data->titleProp ) |
1139 |
|
|
parse_data->titleProp->in_tag = start ? 1 : 0; |
1140 |
|
|
|
1141 |
|
|
|
1142 |
|
|
/* If title is a metaname, turn on the indexing flag */ |
1143 |
|
|
if ( parse_data->titleMeta ) |
1144 |
|
|
{ |
1145 |
|
|
parse_data->titleMeta->in_tag = start ? 1 : 0; |
1146 |
|
|
parse_data->swishdefaultMeta->in_tag = start ? 1 : 0; |
1147 |
|
|
} |
1148 |
|
|
|
1149 |
|
|
|
1150 |
|
|
|
1151 |
|
|
parse_data->word_pos++; |
1152 |
|
|
parse_data->structure[IN_TITLE_BIT] += bump; |
1153 |
|
|
} |
1154 |
|
|
|
1155 |
|
|
|
1156 |
|
|
|
1157 |
|
|
/** BODY **/ |
1158 |
|
|
|
1159 |
|
|
else if ( strcmp( tag, "body" ) == 0 ) |
1160 |
|
|
{ |
1161 |
|
|
flush_buffer( parse_data, 12 ); |
1162 |
|
|
parse_data->structure[IN_BODY_BIT] += bump; |
1163 |
|
|
parse_data->word_pos++; |
1164 |
|
|
} |
1165 |
|
|
|
1166 |
|
|
|
1167 |
|
|
|
1168 |
|
|
/** H1 HEADINGS **/ |
1169 |
|
|
|
1170 |
|
|
/* This should be split so know different level for ranking */ |
1171 |
|
|
else if ( tag[0] == 'h' && isdigit((int) tag[1])) |
1172 |
|
|
{ |
1173 |
|
|
flush_buffer( parse_data, 13 ); |
1174 |
|
|
parse_data->structure[IN_HEADER_BIT] += bump; |
1175 |
|
|
} |
1176 |
|
|
|
1177 |
|
|
|
1178 |
|
|
|
1179 |
|
|
/** EMPHASIZED **/ |
1180 |
|
|
|
1181 |
|
|
/* These should not be hard coded */ |
1182 |
|
|
|
1183 |
|
|
else if ( !strcmp( tag, "em" ) || !strcmp( tag, "b" ) || !strcmp( tag, "strong" ) || !strcmp( tag, "i" ) ) |
1184 |
|
|
{ |
1185 |
|
|
/* This is hard. The idea is to not break up words. But messes up the structure |
1186 |
|
|
* ie: "this is b<b>O</b>ld word" so this would only flush "this is" on <b>, |
1187 |
|
|
* and </b> would not flush anything. The PROBLEM is that then will make the next words |
1188 |
|
|
* have a IN_EMPHASIZED structure. To "fix", I set a flag to flush at next word boundry. |
1189 |
|
|
*/ |
1190 |
|
|
flush_buffer( parse_data, 0 ); // flush up to current word (leaving any leading chars in buffer) |
1191 |
|
|
|
1192 |
|
|
if ( start ) |
1193 |
|
|
parse_data->structure[IN_EMPHASIZED_BIT]++; |
1194 |
|
|
else |
1195 |
|
|
{ |
1196 |
|
|
/* If there is something in the buffer then delay turning off the flag until whitespace is found */ |
1197 |
|
|
if ( parse_data->text_buffer.cur ) |
1198 |
|
|
/* Flag to flush at next word boundry */ |
1199 |
|
|
parse_data->flush_word = IN_EMPHASIZED_BIT + 1; // + 1 because we might need to use zero some day |
1200 |
|
|
else |
1201 |
|
|
parse_data->structure[IN_EMPHASIZED_BIT]--; |
1202 |
|
|
} |
1203 |
|
|
|
1204 |
|
|
|
1205 |
|
|
} |
1206 |
|
|
|
1207 |
|
|
|
1208 |
|
|
|
1209 |
|
|
|
1210 |
|
|
/* Now, look for reasons to add whitespace |
1211 |
|
|
* img is not really, as someone might use an image to make up a word, but |
1212 |
|
|
* commonly an image would split up text. |
1213 |
|
|
* other tags: frame? |
1214 |
|
|
*/ |
1215 |
|
|
|
1216 |
|
|
if ( !strcmp( tag, "br" ) || !strcmp( tag, "img" ) ) |
1217 |
|
|
append_buffer( &parse_data->text_buffer, " ", 1 ); // could flush buffer, I suppose |
1218 |
|
|
else |
1219 |
|
|
{ |
1220 |
|
|
const htmlElemDesc *element = htmlTagLookup( (const xmlChar *)tag ); |
1221 |
|
|
|
1222 |
|
|
if ( !element ) |
1223 |
|
|
is_html_tag = 0; // flag that this might be a meta name |
1224 |
|
|
|
1225 |
|
|
else if ( !element->isinline ) |
1226 |
|
|
append_buffer( &parse_data->text_buffer, " ", 1 ); // could flush buffer, I suppose |
1227 |
|
|
} |
1228 |
|
|
|
1229 |
|
|
|
1230 |
|
|
|
1231 |
|
|
|
1232 |
|
|
return is_html_tag; |
1233 |
|
|
} |
1234 |
|
|
|
1235 |
|
|
/********************************************************************* |
1236 |
|
|
* Allow <foo class="bar"> to start "foo.bar" meta tag |
1237 |
|
|
* |
1238 |
|
|
* Returns true if any found |
1239 |
|
|
* |
1240 |
|
|
*********************************************************************/ |
1241 |
|
|
static int start_XML_ClassAttributes( PARSE_DATA *parse_data, char *tag, const char **attr, int *meta_append, int *prop_append ) |
1242 |
|
|
{ |
1243 |
|
|
char tagbuf[256]; /* we have our limits */ |
1244 |
|
|
char *t; |
1245 |
|
|
int i; |
1246 |
|
|
int taglen = strlen( tag ); |
1247 |
|
|
SWISH *sw = parse_data->sw; |
1248 |
|
|
int found = 0; |
1249 |
|
|
|
1250 |
|
|
strcpy( tagbuf, tag ); |
1251 |
|
|
t = tagbuf + taglen; |
1252 |
|
|
*t = '.'; /* hard coded! */ |
1253 |
|
|
t++; |
1254 |
|
|
|
1255 |
|
|
|
1256 |
|
|
for ( i = 0; attr[i] && attr[i+1]; i+=2 ) |
1257 |
|
|
{ |
1258 |
|
|
if ( !isXMLClassAttribute( sw, (char *)attr[i]) ) |
1259 |
|
|
continue; |
1260 |
|
|
|
1261 |
|
|
|
1262 |
|
|
/* Is the tag going to be too long? */ |
1263 |
|
|
if ( strlen( (char *)attr[i+1] ) + taglen + 2 > 256 ) |
1264 |
|
|
{ |
1265 |
|
|
warning("ClassAttribute on tag '%s' too long\n", tag ); |
1266 |
|
|
continue; |
1267 |
|
|
} |
1268 |
|
|
|
1269 |
|
|
|
1270 |
|
|
/* All metanames are currently lowercase -- would be better to force this in metanames.c */ |
1271 |
|
|
strtolower( tagbuf ); |
1272 |
|
|
|
1273 |
|
|
strcpy( t, (char *)attr[i+1] ); /* create tag.attribute metaname */ |
1274 |
|
|
start_metaTag( parse_data, tagbuf, tag, meta_append, prop_append, 0 ); |
1275 |
|
|
found++; |
1276 |
|
|
|
1277 |
|
|
/* Now, nest attributes */ |
1278 |
|
|
if ( sw->UndefinedXMLAttributes != UNDEF_META_DISABLE ) |
1279 |
|
|
index_XML_attributes( parse_data, tagbuf, attr ); |
1280 |
|
|
|
1281 |
|
|
} |
1282 |
|
|
|
1283 |
|
|
return found; |
1284 |
|
|
|
1285 |
|
|
} |
1286 |
|
|
|
1287 |
|
|
/********************************************************************* |
1288 |
|
|
* check if a tag is an XMLClassAttributes |
1289 |
|
|
* |
1290 |
|
|
* Note: this returns a pointer to the config set tag, so don't free it! |
1291 |
|
|
* Duplicate code! |
1292 |
|
|
* |
1293 |
|
|
* This does a case-insensitive lookup |
1294 |
|
|
* |
1295 |
|
|
* |
1296 |
|
|
*********************************************************************/ |
1297 |
|
|
|
1298 |
|
|
static char *isXMLClassAttribute(SWISH * sw, char *tag) |
1299 |
|
|
{ |
1300 |
|
|
struct swline *tmplist = sw->XMLClassAttributes; |
1301 |
|
|
|
1302 |
|
|
if (!tmplist) |
1303 |
|
|
return 0; |
1304 |
|
|
|
1305 |
|
|
while (tmplist) |
1306 |
|
|
{ |
1307 |
|
|
if (strcasecmp(tag, tmplist->line) == 0) |
1308 |
|
|
return tmplist->line; |
1309 |
|
|
|
1310 |
|
|
tmplist = tmplist->next; |
1311 |
|
|
} |
1312 |
|
|
|
1313 |
|
|
return NULL; |
1314 |
|
|
} |
1315 |
|
|
|
1316 |
|
|
|
1317 |
|
|
|
1318 |
|
|
/********************************************************************* |
1319 |
|
|
* This extracts out the attributes and contents and indexes them |
1320 |
|
|
* |
1321 |
|
|
*********************************************************************/ |
1322 |
|
|
static void index_XML_attributes( PARSE_DATA *parse_data, char *tag, const char **attr ) |
1323 |
|
|
{ |
1324 |
|
|
char tagbuf[256]; /* we have our limits */ |
1325 |
|
|
char *content; |
1326 |
|
|
char *t; |
1327 |
|
|
int i; |
1328 |
|
|
int meta_append; |
1329 |
|
|
int prop_append; |
1330 |
|
|
int taglen = strlen( tag ); |
1331 |
|
|
SWISH *sw = parse_data->sw; |
1332 |
|
|
UndefMetaFlag tmp_undef = sw->UndefinedMetaTags; // save |
1333 |
|
|
|
1334 |
|
|
sw->UndefinedMetaTags = sw->UndefinedXMLAttributes; |
1335 |
|
|
|
1336 |
|
|
|
1337 |
|
|
strcpy( tagbuf, tag ); |
1338 |
|
|
t = tagbuf + taglen; |
1339 |
|
|
*t = '.'; /* hard coded! */ |
1340 |
|
|
t++; |
1341 |
|
|
|
1342 |
|
|
for ( i = 0; attr[i] && attr[i+1]; i+=2 ) |
1343 |
|
|
{ |
1344 |
|
|
meta_append = 0; |
1345 |
|
|
prop_append = 0; |
1346 |
|
|
|
1347 |
|
|
/* Skip attributes that are XMLClassAttribues */ |
1348 |
|
|
if ( isXMLClassAttribute( sw, (char *)attr[i] ) ) |
1349 |
|
|
continue; |
1350 |
|
|
|
1351 |
|
|
|
1352 |
|
|
if ( strlen( (char *)attr[i] ) + taglen + 2 > 256 ) |
1353 |
|
|
{ |
1354 |
|
|
warning("Attribute '%s' on tag '%s' too long to build metaname\n", (char *)attr[i], tag ); |
1355 |
|
|
continue; |
1356 |
|
|
} |
1357 |
|
|
|
1358 |
|
|
strcpy( t, (char *)attr[i] ); /* create tag.attribute metaname */ |
1359 |
|
|
content = (char *)attr[i+1]; |
1360 |
|
|
|
1361 |
|
|
if ( !*content ) |
1362 |
|
|
continue; |
1363 |
|
|
|
1364 |
|
|
strtolower( tagbuf ); |
1365 |
|
|
|
1366 |
|
|
|
1367 |
|
|
|
1368 |
|
|
flush_buffer( parse_data, 1 ); // isn't needed, right? |
1369 |
|
|
start_metaTag( parse_data, tagbuf, tagbuf, &meta_append, &prop_append, 0 ); |
1370 |
|
|
char_hndl( parse_data, content, strlen( content ) ); |
1371 |
|
|
end_metaTag( parse_data, tagbuf, 0 ); |
1372 |
|
|
} |
1373 |
|
|
|
1374 |
|
|
sw->UndefinedMetaTags = tmp_undef; |
1375 |
|
|
} |
1376 |
|
|
|
1377 |
|
|
|
1378 |
|
|
|
1379 |
|
|
/********************************************************************* |
1380 |
|
|
* Deal with html's <meta name="foo" content="bar"> |
1381 |
|
|
* Simply calls start and end meta, and passes content |
1382 |
|
|
* |
1383 |
|
|
*********************************************************************/ |
1384 |
|
|
|
1385 |
|
|
static void process_htmlmeta( PARSE_DATA *parse_data, const char **attr ) |
1386 |
|
|
{ |
1387 |
|
|
char *metatag = NULL; |
1388 |
|
|
char *content = NULL; |
1389 |
|
|
int meta_append = 0; |
1390 |
|
|
int prop_append = 0; |
1391 |
|
|
|
1392 |
|
|
int i; |
1393 |
|
|
|
1394 |
|
|
/* Don't add any meta data while looking for just the title */ |
1395 |
|
|
if ( parse_data->fprop->index_no_content ) |
1396 |
|
|
return; |
1397 |
|
|
|
1398 |
|
|
for ( i = 0; attr[i] && attr[i+1]; i+=2 ) |
1399 |
|
|
{ |
1400 |
|
|
if ( (strcmp( attr[i], "name" ) == 0 ) && attr[i+1] ) |
1401 |
|
|
metatag = (char *)attr[i+1]; |
1402 |
|
|
|
1403 |
|
|
else if ( (strcmp( attr[i], "content" ) == 0 ) && attr[i+1] ) |
1404 |
|
|
content = (char *)attr[i+1]; |
1405 |
|
|
} |
1406 |
|
|
|
1407 |
|
|
|
1408 |
|
|
if ( metatag && content ) |
1409 |
|
|
{ |
1410 |
|
|
|
1411 |
|
|
/* Robots exclusion: http://www.robotstxt.org/wc/exclusion.html#meta */ |
1412 |
|
|
if ( !strcasecmp( metatag, "ROBOTS") && lstrstr( content, "NOINDEX" ) ) |
1413 |
|
|
{ |
1414 |
|
|
if ( parse_data->sw->obeyRobotsNoIndex ) |
1415 |
|
|
abort_parsing( parse_data, -3 ); |
1416 |
|
|
|
1417 |
|
|
return; |
1418 |
|
|
} |
1419 |
|
|
|
1420 |
|
|
/* Process as a start -> end tag sequence */ |
1421 |
|
|
strtolower( metatag ); |
1422 |
|
|
|
1423 |
|
|
flush_buffer( parse_data, 111 ); |
1424 |
|
|
start_metaTag( parse_data, metatag, metatag, &meta_append, &prop_append, 0 ); |
1425 |
|
|
char_hndl( parse_data, content, strlen( content ) ); |
1426 |
|
|
end_metaTag( parse_data, metatag, 0 ); |
1427 |
|
|
flush_buffer( parse_data, 112 ); |
1428 |
|
|
} |
1429 |
|
|
|
1430 |
|
|
} |
1431 |
|
|
|
1432 |
|
|
|
1433 |
|
|
/********************************************************************* |
1434 |
|
|
* Append character data to the end of the buffer |
1435 |
|
|
* |
1436 |
|
|
* Buffer is extended/created if needed |
1437 |
|
|
* |
1438 |
|
|
* ToDo: Flush buffer if it gets too large |
1439 |
|
|
* |
1440 |
|
|
* |
1441 |
|
|
*********************************************************************/ |
1442 |
|
|
|
1443 |
|
|
static void append_buffer( CHAR_BUFFER *buf, const char *txt, int txtlen ) |
1444 |
|
|
{ |
1445 |
|
|
|
1446 |
|
|
|
1447 |
|
|
if ( !txtlen ) // shouldn't happen |
1448 |
|
|
return; |
1449 |
|
|
|
1450 |
|
|
/* (re)allocate buf if needed */ |
1451 |
|
|
|
1452 |
|
|
if ( buf->cur + txtlen >= buf->max ) |
1453 |
|
|
{ |
1454 |
|
|
buf->max = ( buf->max + BUFFER_CHUNK_SIZE+1 < buf->cur + txtlen ) |
1455 |
|
|
? buf->cur + txtlen+1 |
1456 |
|
|
: buf->max + BUFFER_CHUNK_SIZE+1; |
1457 |
|
|
|
1458 |
|
|
buf->buffer = erealloc( buf->buffer, buf->max+1 ); |
1459 |
|
|
} |
1460 |
|
|
|
1461 |
|
|
memcpy( (void *) &(buf->buffer[buf->cur]), txt, txtlen ); |
1462 |
|
|
buf->cur += txtlen; |
1463 |
|
|
buf->buffer[buf->cur] = '\0'; /* seems like a nice thing to do -- only used now in title check */ |
1464 |
|
|
} |
1465 |
|
|
|
1466 |
|
|
|
1467 |
|
|
|
1468 |
|
|
|
1469 |
|
|
/********************************************************************* |
1470 |
|
|
* Flush buffer - adds words to index, and properties |
1471 |
|
|
* |
1472 |
|
|
* If the clear flag is set then the entire buffer is flushed. |
1473 |
|
|
* Otherwise, every thing up to the last *partial* word is flushed. |
1474 |
|
|
* It's partial if there is not white-space at the very end of the buffer. |
1475 |
|
|
* |
1476 |
|
|
* This prevents some<b>long</b>word from being flushed into part words. |
1477 |
|
|
* |
1478 |
|
|
*********************************************************************/ |
1479 |
|
|
static void flush_buffer( PARSE_DATA *parse_data, int clear ) |
1480 |
|
|
{ |
1481 |
|
|
CHAR_BUFFER *buf = &parse_data->text_buffer; |
1482 |
|
|
SWISH *sw = parse_data->sw; |
1483 |
|
|
int structure = get_structure( parse_data ); |
1484 |
|
|
int orig_end = buf->cur; |
1485 |
|
|
char save_char = '?'; |
1486 |
|
|
char *c; |
1487 |
|
|
|
1488 |
|
|
/* anything to do? */ |
1489 |
|
|
if ( !buf->cur ) |
1490 |
|
|
return; |
1491 |
|
|
|
1492 |
|
|
/* look back for word boundry when "clear" is not set */ |
1493 |
|
|
|
1494 |
|
|
if ( !clear && !isspace( (int)buf->buffer[buf->cur-1] ) ) // flush up to current word |
1495 |
|
|
{ |
1496 |
|
|
while ( buf->cur > 0 && !isspace( (int)buf->buffer[buf->cur-1] ) ) |
1497 |
|
|
buf->cur--; |
1498 |
|
|
|
1499 |
|
|
if ( !buf->cur ) // then there's only a single word in the buffer |
1500 |
|
|
{ |
1501 |
|
|
buf->cur = orig_end; |
1502 |
|
|
if ( buf->cur < BUFFER_CHUNK_SIZE ) // should reall look at indexf->header.maxwordlimit |
1503 |
|
|
return; // but just trying to keep the buffer from growing too large |
1504 |
|
|
} |
1505 |
|
|
|
1506 |
|
|
save_char = buf->buffer[buf->cur]; |
1507 |
|
|
} |
1508 |
|
|
|
1509 |
|
|
|
1510 |
|
|
/* Mark the end of the buffer - should switch over to using a length to avoid strlen */ |
1511 |
|
|
|
1512 |
|
|
buf->buffer[buf->cur] = '\0'; |
1513 |
|
|
|
1514 |
|
|
|
1515 |
|
|
/* Make sure there some non-whitespace chars to print */ |
1516 |
|
|
|
1517 |
|
|
c = buf->buffer; |
1518 |
|
|
while ( *c && isspace( (int)*c ) ) |
1519 |
|
|
c++; |
1520 |
|
|
|
1521 |
|
|
|
1522 |
|
|
if ( *c ) |
1523 |
|
|
{ |
1524 |
|
|
/* Index the text */ |
1525 |
|
|
if ( !parse_data->meta_stack.ignore_flag ) // this really is wrong -- should not check ignore here. Fix should be to use two buffers |
1526 |
|
|
parse_data->total_words += |
1527 |
|
|
indexstring( sw, c, parse_data->filenum, structure, 0, NULL, &(parse_data->word_pos) ); |
1528 |
|
|
|
1529 |
|
|
/* Add the properties */ |
1530 |
|
|
addDocProperties( parse_data->header, &(parse_data->thisFileEntry->docProperties), (unsigned char *)buf->buffer, buf->cur, parse_data->fprop->real_path ); |
1531 |
|
|
|
1532 |
|
|
|
1533 |
|
|
/* yuck - addDocProperties should do this. Ok, add to summary, if active */ |
1534 |
|
|
{ |
1535 |
|
|
SUMMARY_INFO *summary = &parse_data->summary; |
1536 |
|
|
if ( summary->active ) |
1537 |
|
|
addDocProperty( &(parse_data->thisFileEntry->docProperties), summary->meta, (unsigned char *)buf->buffer, buf->cur, 0 ); |
1538 |
|
|
} |
1539 |
|
|
} |
1540 |
|
|
|
1541 |
|
|
|
1542 |
|
|
/* clear the buffer */ |
1543 |
|
|
|
1544 |
|
|
if ( orig_end && orig_end > buf->cur ) |
1545 |
|
|
{ |
1546 |
|
|
buf->buffer[buf->cur] = save_char; // put back the char where null was placed |
1547 |
|
|
memmove( buf->buffer, &buf->buffer[buf->cur], orig_end - buf->cur ); |
1548 |
|
|
buf->cur = orig_end - buf->cur; |
1549 |
|
|
} |
1550 |
|
|
else |
1551 |
|
|
buf->cur = 0; |
1552 |
|
|
|
1553 |
|
|
} |
1554 |
|
|
|
1555 |
|
|
|
1556 |
|
|
|
1557 |
|
|
/********************************************************************* |
1558 |
|
|
* Comments |
1559 |
|
|
* |
1560 |
|
|
* Should be able to call the char_hndl |
1561 |
|
|
* Allows comments to enable/disable indexing a block by either: |
1562 |
|
|
* |
1563 |
|
|
* <!-- noindex --> |
1564 |
|
|
* <!-- index --> |
1565 |
|
|
* <!-- SwishCommand noindex --> |
1566 |
|
|
* <!-- SwishCommand index --> |
1567 |
|
|
* |
1568 |
|
|
* |
1569 |
|
|
* |
1570 |
|
|
* To Do: |
1571 |
|
|
* Can't use DontBump with comments. Might need a config variable for that. |
1572 |
|
|
* |
1573 |
|
|
*********************************************************************/ |
1574 |
|
|
static void comment_hndl(void *data, const char *txt) |
1575 |
|
|
{ |
1576 |
|
|
PARSE_DATA *parse_data = (PARSE_DATA *)data; |
1577 |
|
|
SWISH *sw = parse_data->sw; |
1578 |
|
|
int structure = get_structure( parse_data ); |
1579 |
|
|
char *swishcmd; |
1580 |
|
|
char *comment_text = str_skip_ws( (char *)txt ); |
1581 |
|
|
int found = 0; |
1582 |
|
|
|
1583 |
|
|
|
1584 |
|
|
str_trim_ws( comment_text ); |
1585 |
|
|
if ( ! *comment_text ) |
1586 |
|
|
return; |
1587 |
|
|
|
1588 |
|
|
|
1589 |
|
|
/* Strip off SwishCommand - might be for future use */ |
1590 |
|
|
if ( ( swishcmd = lstrstr( comment_text, "SwishCommand" )) && swishcmd == comment_text ) |
1591 |
|
|
{ |
1592 |
|
|
comment_text = str_skip_ws( comment_text + strlen( "SwishCommand" ) ); |
1593 |
|
|
found++; |
1594 |
|
|
} |
1595 |
|
|
|
1596 |
|
|
if ( !strcasecmp( comment_text, "noindex" ) ) |
1597 |
|
|
{ |
1598 |
|
|
parse_data->swish_noindex++; |
1599 |
|
|
return; |
1600 |
|
|
} |
1601 |
|
|
else if ( !strcasecmp( comment_text, "index" ) ) |
1602 |
|
|
{ |
1603 |
|
|
if ( parse_data->swish_noindex ) |
1604 |
|
|
parse_data->swish_noindex--; |
1605 |
|
|
|
1606 |
|
|
return; |
1607 |
|
|
} |
1608 |
|
|
|
1609 |
|
|
|
1610 |
|
|
if( found || !sw->indexComments ) |
1611 |
|
|
return; |
1612 |
|
|
|
1613 |
|
|
|
1614 |
|
|
/* Bump position around comments - hard coded, always done to prevent phrase matching */ |
1615 |
|
|
parse_data->word_pos++; |
1616 |
|
|
|
1617 |
|
|
/* Index the text */ |
1618 |
|
|
parse_data->total_words += |
1619 |
|
|
indexstring( sw, comment_text, parse_data->filenum, structure | IN_COMMENTS, 0, NULL, &(parse_data->word_pos) ); |
1620 |
|
|
|
1621 |
|
|
|
1622 |
|
|
parse_data->word_pos++; |
1623 |
|
|
|
1624 |
|
|
} |
1625 |
|
|
|
1626 |
|
|
|
1627 |
|
|
|
1628 |
|
|
/********************************************************************* |
1629 |
|
|
* check if a tag is an IgnoreTag |
1630 |
|
|
* |
1631 |
|
|
* Note: this returns a pointer to the config set tag, so don't free it! |
1632 |
|
|
* |
1633 |
|
|
* |
1634 |
|
|
*********************************************************************/ |
1635 |
|
|
|
1636 |
|
|
static char *isIgnoreMetaName(SWISH * sw, char *tag) |
1637 |
|
|
{ |
1638 |
|
|
struct swline *tmplist = sw->ignoremetalist; |
1639 |
|
|
|
1640 |
|
|
if (!tmplist) |
1641 |
|
|
return 0; |
1642 |
|
|
|
1643 |
|
|
while (tmplist) |
1644 |
|
|
{ |
1645 |
|
|
if (strcmp(tag, tmplist->line) == 0) |
1646 |
|
|
return tmplist->line; |
1647 |
|
|
|
1648 |
|
|
tmplist = tmplist->next; |
1649 |
|
|
} |
1650 |
|
|
|
1651 |
|
|
return NULL; |
1652 |
|
|
} |
1653 |
|
|
|
1654 |
|
|
/****************************************************************** |
1655 |
|
|
* Warning and Error Messages |
1656 |
|
|
* |
1657 |
|
|
******************************************************************/ |
1658 |
|
|
|
1659 |
|
|
static void error(void *data, const char *msg, ...) |
1660 |
|
|
{ |
1661 |
|
|
va_list args; |
1662 |
|
|
PARSE_DATA *parse_data = (PARSE_DATA *)data; |
1663 |
|
|
char str[1000]; |
1664 |
|
|
|
1665 |
|
|
va_start(args, msg); |
1666 |
|
|
vsnprintf(str, 1000, msg, args ); |
1667 |
|
|
va_end(args); |
1668 |
|
|
xmlParserError(parse_data->ctxt, str); |
1669 |
|
|
} |
1670 |
|
|
|
1671 |
|
|
static void warning(void *data, const char *msg, ...) |
1672 |
|
|
{ |
1673 |
|
|
va_list args; |
1674 |
|
|
PARSE_DATA *parse_data = (PARSE_DATA *)data; |
1675 |
|
|
char str[1000]; |
1676 |
|
|
|
1677 |
|
|
va_start(args, msg); |
1678 |
|
|
vsnprintf(str, 1000, msg, args ); |
1679 |
|
|
va_end(args); |
1680 |
|
|
xmlParserWarning(parse_data->ctxt, str); |
1681 |
|
|
} |
1682 |
|
|
|
1683 |
|
|
|
1684 |
|
|
/********************************************************************* |
1685 |
|
|
* Index ALT tabs |
1686 |
|
|
* |
1687 |
|
|
* |
1688 |
|
|
*********************************************************************/ |
1689 |
|
|
static void index_alt_tab( PARSE_DATA *parse_data, const char **attr ) |
1690 |
|
|
{ |
1691 |
|
|
int meta_append = 0; |
1692 |
|
|
int prop_append = 0; |
1693 |
|
|
char *tagbuf = parse_data->sw->IndexAltTagMeta; |
1694 |
|
|
char *alt_text = extract_html_links( parse_data, attr, NULL, "alt"); |
1695 |
|
|
|
1696 |
|
|
|
1697 |
|
|
if ( !alt_text ) |
1698 |
|
|
return; |
1699 |
|
|
|
1700 |
|
|
/* Index as regular text? */ |
1701 |
|
|
if ( !parse_data->sw->IndexAltTagMeta ) |
1702 |
|
|
{ |
1703 |
|
|
char_hndl( parse_data, alt_text, strlen( alt_text ) ); |
1704 |
|
|
return; |
1705 |
|
|
} |
1706 |
|
|
|
1707 |
|
|
flush_buffer( parse_data, 1 ); |
1708 |
|
|
start_metaTag( parse_data, tagbuf, tagbuf, &meta_append, &prop_append, 0 ); |
1709 |
|
|
char_hndl( parse_data, alt_text, strlen( alt_text ) ); |
1710 |
|
|
end_metaTag( parse_data, tagbuf, 0 ); |
1711 |
|
|
} |
1712 |
|
|
|
1713 |
|
|
|
1714 |
|
|
|
1715 |
|
|
|
1716 |
|
|
/********************************************************************* |
1717 |
|
|
* Extract out links for indexing |
1718 |
|
|
* |
1719 |
|
|
* Pass in a metaname, and a tag |
1720 |
|
|
* |
1721 |
|
|
*********************************************************************/ |
1722 |
|
|
|
1723 |
|
|
static char *extract_html_links( PARSE_DATA *parse_data, const char **attr, struct metaEntry *meta_entry, char *tag ) |
1724 |
|
|
{ |
1725 |
|
|
char *href = NULL; |
1726 |
|
|
int i; |
1727 |
|
|
int structure = get_structure( parse_data ); |
1728 |
|
|
char *absoluteURL; |
1729 |
|
|
SWISH *sw = parse_data->sw; |
1730 |
|
|
|
1731 |
|
|
|
1732 |
|
|
if ( !attr ) |
1733 |
|
|
return NULL; |
1734 |
|
|
|
1735 |
|
|
for ( i = 0; attr[i] && attr[i+1]; i+=2 ) |
1736 |
|
|
if ( (strcmp( attr[i], tag ) == 0 ) && attr[i+1] ) |
1737 |
|
|
href = (char *)attr[i+1]; |
1738 |
|
|
|
1739 |
|
|
if ( !href ) |
1740 |
|
|
return NULL; |
1741 |
|
|
|
1742 |
|
|
if ( !meta_entry ) /* The case for <BASE> */ |
1743 |
|
|
return href; |
1744 |
|
|
|
1745 |
|
|
|
1746 |
|
|
/* Now, fixup the URL, if possible */ |
1747 |
|
|
|
1748 |
|
|
if ( sw->AbsoluteLinks ) // ?? || parse_data->baseURL??? always fix up if a <BASE> tag? |
1749 |
|
|
{ |
1750 |
|
|
char *base = parse_data->baseURL |
1751 |
|
|
? parse_data->baseURL |
1752 |
|
|
: parse_data->fprop->real_path; |
1753 |
|
|
|
1754 |
|
|
absoluteURL = (char *)xmlBuildURI( (xmlChar *)href, (xmlChar *)base ); |
1755 |
|
|
} |
1756 |
|
|
else |
1757 |
|
|
absoluteURL = NULL; |
1758 |
|
|
|
1759 |
|
|
|
1760 |
|
|
|
1761 |
|
|
/* Index the text */ |
1762 |
|
|
parse_data->total_words += |
1763 |
|
|
indexstring( sw, absoluteURL ? absoluteURL : href, parse_data->filenum, structure, 1, &meta_entry->metaID, &(parse_data->word_pos) ); |
1764 |
|
|
|
1765 |
|
|
if ( absoluteURL ) |
1766 |
|
|
xmlFree( absoluteURL ); |
1767 |
|
|
|
1768 |
|
|
return href; |
1769 |
|
|
} |
1770 |
|
|
|
1771 |
|
|
|
1772 |
|
|
|
1773 |
|
|
/* This doesn't look like the best method */ |
1774 |
|
|
|
1775 |
|
|
static void abort_parsing( PARSE_DATA *parse_data, int abort_code ) |
1776 |
|
|
{ |
1777 |
|
|
parse_data->abort = abort_code; /* Flag that the we are all done */ |
1778 |
|
|
/* Disable parser */ |
1779 |
|
|
parse_data->SAXHandler->startElement = (startElementSAXFunc)NULL; |
1780 |
|
|
parse_data->SAXHandler->endElement = (endElementSAXFunc)NULL; |
1781 |
|
|
parse_data->SAXHandler->characters = (charactersSAXFunc)NULL; |
1782 |
|
|
} |
1783 |
|
|
|
1784 |
|
|
|
1785 |
|
|
/* This sets the current structure context (IN_HEAD, IN_BODY, etc) */ |
1786 |
|
|
|
1787 |
|
|
static int get_structure( PARSE_DATA *parse_data ) |
1788 |
|
|
{ |
1789 |
|
|
int structure = IN_FILE; |
1790 |
|
|
|
1791 |
|
|
/* Set structure bits */ |
1792 |
|
|
if ( parse_data->parsing_html ) |
1793 |
|
|
{ |
1794 |
|
|
int i; |
1795 |
|
|
for ( i = 0; i <= STRUCTURE_END; i++ ) |
1796 |
|
|
if ( parse_data->structure[i] ) |
1797 |
|
|
structure |= ( 1 << i ); |
1798 |
|
|
} |
1799 |
|
|
return structure; |
1800 |
|
|
} |
1801 |
|
|
|
1802 |
|
|
/********************************************************************* |
1803 |
|
|
* Push a meta entry onto the stack |
1804 |
|
|
* |
1805 |
|
|
* Call With: |
1806 |
|
|
* stack = which stack to use |
1807 |
|
|
* tag = Element (tag name) to be used to match end tag |
1808 |
|
|
* met = metaEntry to save |
1809 |
|
|
* append = append to current if one (will be incremented) |
1810 |
|
|
* ignore = if true, then flag as an ignore block and bump ignore counter |
1811 |
|
|
* |
1812 |
|
|
* Returns: |
1813 |
|
|
* void |
1814 |
|
|
* |
1815 |
|
|
* ToDo: |
1816 |
|
|
* move to Mem_Zone? |
1817 |
|
|
* |
1818 |
|
|
* |
1819 |
|
|
*********************************************************************/ |
1820 |
|
|
|
1821 |
|
|
static void push_stack( MetaStack *stack, char *tag, struct metaEntry *meta, int *append, int ignore ) |
1822 |
|
|
{ |
1823 |
|
|
MetaStackElementPtr node; |
1824 |
|
|
|
1825 |
|
|
|
1826 |
|
|
if ( DEBUG_MASK & DEBUG_PARSED_TAGS ) |
1827 |
|
|
{ |
1828 |
|
|
int i; |
1829 |
|
|
for (i=0; i<stack->pointer; i++) |
1830 |
|
|
printf(" "); |
1831 |
|
|
|
1832 |
|
|
printf("<%s> (%s [%s]%s)\n", tag, stack->is_meta ? "meta" : "property", !meta ? "no meta name defined" : meta->metaName, ignore ? " *Start Ignore*" : "" ); |
1833 |
|
|
} |
1834 |
|
|
|
1835 |
|
|
|
1836 |
|
|
/* Create a new node ( MetaStackElement already has one byte allocated for string ) */ |
1837 |
|
|
node = (MetaStackElementPtr) emalloc( sizeof( MetaStackElement ) + strlen( tag ) ); |
1838 |
|
|
node->next = NULL; |
1839 |
|
|
|
1840 |
|
|
/* Turn on the meta */ |
1841 |
|
|
if ( (node->meta = meta) ) |
1842 |
|
|
meta->in_tag++; |
1843 |
|
|
|
1844 |
|
|
if ( ( node->ignore = ignore ) ) /* entering a block to ignore */ |
1845 |
|
|
stack->ignore_flag++; |
1846 |
|
|
|
1847 |
|
|
|
1848 |
|
|
strcpy( node->tag, tag ); |
1849 |
|
|
|
1850 |
|
|
|
1851 |
|
|
|
1852 |
|
|
|
1853 |
|
|
if ( !(*append)++ ) |
1854 |
|
|
{ |
1855 |
|
|
/* reallocate stack buffer if needed */ |
1856 |
|
|
if ( stack->pointer >= stack->maxsize ) |
1857 |
|
|
{ |
1858 |
|
|
progwarn("swish parser adding more stack space for tag %s. from %d to %d", tag, stack->maxsize, stack->maxsize+STACK_SIZE ); |
1859 |
|
|
|
1860 |
|
|
stack->maxsize += STACK_SIZE; |
1861 |
|
|
stack->stack = (MetaStackElementPtr *)erealloc( stack->stack, sizeof( MetaStackElementPtr ) * stack->maxsize ); |
1862 |
|
|
} |
1863 |
|
|
|
1864 |
|
|
stack->stack[stack->pointer++] = node; |
1865 |
|
|
} |
1866 |
|
|
else // prepend to the list |
1867 |
|
|
{ |
1868 |
|
|
if ( !stack->pointer ) |
1869 |
|
|
progerr("Tried to append tag %s to stack, but stack is empty", tag ); |
1870 |
|
|
|
1871 |
|
|
node->next = stack->stack[stack->pointer - 1]; |
1872 |
|
|
stack->stack[stack->pointer - 1] = node; |
1873 |
|
|
} |
1874 |
|
|
} |
1875 |
|
|
|
1876 |
|
|
/********************************************************************* |
1877 |
|
|
* Pop the stack if the tag matches the last entry |
1878 |
|
|
* Will turn off all metas associated with this tag level |
1879 |
|
|
* |
1880 |
|
|
* Call With: |
1881 |
|
|
* parse_data = to automatically flush |
1882 |
|
|
* stack = which stack to use |
1883 |
|
|
* tag = Element (tag name) to be used for removal |
1884 |
|
|
* |
1885 |
|
|
* Returns: |
1886 |
|
|
* true if tag matched |
1887 |
|
|
* |
1888 |
|
|
*********************************************************************/ |
1889 |
|
|
|
1890 |
|
|
static int pop_stack_ifMatch( PARSE_DATA *parse_data, MetaStack *stack, char *tag ) |
1891 |
|
|
{ |
1892 |
|
|
|
1893 |
|
|
/* return if stack is empty */ |
1894 |
|
|
if ( !stack->pointer ) |
1895 |
|
|
return 0; |
1896 |
|
|
|
1897 |
|
|
|
1898 |
|
|
|
1899 |
|
|
/* return if doesn't match the tag at the top of the stack */ |
1900 |
|
|
|
1901 |
|
|
if ( strcmp( stack->stack[stack->pointer - 1]->tag, tag ) != 0 ) |
1902 |
|
|
return 0; |
1903 |
|
|
|
1904 |
|
|
|
1905 |
|
|
flush_buffer( parse_data, 1 ); |
1906 |
|
|
pop_stack( stack ); |
1907 |
|
|
|
1908 |
|
|
return 1; |
1909 |
|
|
} |
1910 |
|
|
|
1911 |
|
|
/********************************************************************* |
1912 |
|
|
* Pop the stack |
1913 |
|
|
* Will turn off all metas associated with this tag level |
1914 |
|
|
* |
1915 |
|
|
* Call With: |
1916 |
|
|
* stack = which stack to use |
1917 |
|
|
* |
1918 |
|
|
* Returns: |
1919 |
|
|
* the stack pointer |
1920 |
|
|
* |
1921 |
|
|
*********************************************************************/ |
1922 |
|
|
|
1923 |
|
|
static int pop_stack( MetaStack *stack ) |
1924 |
|
|
{ |
1925 |
|
|
MetaStackElementPtr node, this; |
1926 |
|
|
|
1927 |
|
|
|
1928 |
|
|
/* return if stack is empty */ |
1929 |
|
|
if ( !stack->pointer ) |
1930 |
|
|
return 0; |
1931 |
|
|
|
1932 |
|
|
node = stack->stack[--stack->pointer]; |
1933 |
|
|
|
1934 |
|
|
/* Now pop the stack. */ |
1935 |
|
|
|
1936 |
|
|
// Note that some end tags can pop more than one tag |
1937 |
|
|
// <foo class="bar"> can be to starting metanames <foo> and <foo:bar>, and </foo> pops all. |
1938 |
|
|
|
1939 |
|
|
while ( node ) |
1940 |
|
|
{ |
1941 |
|
|
this = node; |
1942 |
|
|
|
1943 |
|
|
if ( node->meta ) |
1944 |
|
|
node->meta->in_tag--; |
1945 |
|
|
|
1946 |
|
|
if ( node->ignore ) |
1947 |
|
|
stack->ignore_flag--; |
1948 |
|
|
|
1949 |
|
|
|
1950 |
|
|
if ( DEBUG_MASK & DEBUG_PARSED_TAGS ) |
1951 |
|
|
{ |
1952 |
|
|
int i; |
1953 |
|
|
for (i=0; i<stack->pointer; i++) |
1954 |
|
|
printf(" "); |
1955 |
|
|
|
1956 |
|
|
printf("</%s> (%s)%s\n", node->tag, stack->is_meta ? "meta" : "property", node->ignore ? " end ignore" : "" ); |
1957 |
|
|
} |
1958 |
|
|
|
1959 |
|
|
|
1960 |
|
|
node = node->next; |
1961 |
|
|
efree( this ); |
1962 |
|
|
} |
1963 |
|
|
|
1964 |
|
|
return stack->pointer; |
1965 |
|
|
} |
1966 |
|
|
|
1967 |
|
|
/*
 * Debugging aid: add up the in_tag counts of every entry in the header's
 * meta array that is_meta_index() accepts.  The total is used by the
 * debug output routines as an indentation depth.
 */
static int debug_get_indent( INDEXDATAHEADER *header )
{
    int total = 0;
    int slot = 0;

    while ( slot < header->metaCounter )
    {
        if ( is_meta_index( header->metaEntryArray[slot] ) )
            total += header->metaEntryArray[slot]->in_tag;

        slot++;
    }

    return total;
}
1978 |
|
|
|
1979 |
|
|
|
1980 |
|
|
|
1981 |
|
|
/*
 * Debugging aid: print one tag to stdout, indented by debug_get_indent().
 *
 *   tag        = tag name to show
 *   parse_data = current parser state (supplies the index header)
 *   start      = non-zero prints "<tag>", zero prints "</tag>"
 *   message    = text printed after the tag
 */
static void debug_show_tag( char *tag, PARSE_DATA *parse_data, int start, char *message )
{
    int remaining;

    for ( remaining = debug_get_indent( &parse_data->sw->indexlist->header ); remaining > 0; remaining-- )
        printf(" ");

    printf("<%s%s> %s\n", start ? "" : "/", tag, message );
}
1991 |
|
|
|
1992 |
|
|
/*
 * Debugging aid: print a run of parsed text to stdout.
 *
 *   parse_data = current parser state (supplies the indent depth)
 *   txt        = text to show; only txt[0] .. txt[len-1] is read, so it
 *                does not need to be NUL terminated
 *   len        = number of bytes in txt
 *
 * Every output line starts with the indent from debug_get_indent().
 * Non-printable characters are skipped, and a newline together with the
 * whitespace that follows it is dropped.  A line is broken at whitespace
 * once it is wider than 60 columns, and unconditionally past 78.
 * The output always ends with a newline.
 */
static void debug_show_parsed_text( PARSE_DATA *parse_data, char *txt, int len )
{
    int  indent = debug_get_indent( &parse_data->sw->indexlist->header);
    char indent_buf[1000];
    int  indent_len = 0;
    int  i = 0;
    int  last_newline = 0;

    /* Build the indent prefix, clamped so deep nesting cannot overrun the buffer */
    while ( indent_len < indent && indent_len < (int)sizeof( indent_buf ) - 1 )
        indent_buf[indent_len++] = ' ';
    indent_buf[indent_len] = '\0';

    while ( i < len )
    {
        int col = 0;

        printf("%s", indent_buf );
        last_newline = 0;

        /* skip leading space (cast: <ctype.h> needs an unsigned char value) */
        while ( i < len && isspace( (unsigned char)txt[i] ) )
            i++;

        /* print text */
        while ( i < len )
        {
            col++;

            if ( txt[i] == '\n' )
            {
                while ( i < len && isspace( (unsigned char)txt[i] ) )
                    i++;

                /* The whitespace ran to the end of the text; txt[len] must not be read */
                if ( i >= len )
                    break;
            }

            if ( !isprint( (unsigned char)txt[i] ) )
            {
                i++;
                continue;
            }

            printf("%c", txt[i] );
            i++;

            /* Only peek at the next character if there is one */
            if ( col + indent_len > 78
                 || ( col + indent_len > 60 && i < len && isspace( (unsigned char)txt[i] ) ) )
            {
                printf("\n");
                last_newline = 1;
                break;
            }
        }
    }

    if ( !last_newline )
        printf("\n");
}
2052 |
|
|
|