/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/src/xml.c
ViewVC logotype

Contents of /mitgcm.org/devel/buildweb/pkg/swish-e/src/xml.c

Parent Directory | Revision Log | View Revision Graph


Revision 1.1.1.1 - (show annotations) (download) (vendor branch)
Fri Sep 20 19:47:29 2002 UTC (22 years, 10 months ago) by adcroft
Branch: Import, MAIN
CVS Tags: baseline, HEAD
Changes since 1.1: +0 -0 lines
File MIME type: text/plain
Importing web-site building process.

1 /*
2 $Id: xml.c,v 1.55 2001/10/11 22:21:14 whmoseley Exp $
3 **
4 **
5 ** This program and library is free software; you can redistribute it and/or
6 ** modify it under the terms of the GNU (Library) General Public License
7 ** as published by the Free Software Foundation; either version 2
8 ** of the License, or any later version.
9 **
10 ** This program is distributed in the hope that it will be useful,
11 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
12 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 ** GNU (Library) General Public License for more details.
14 **
15 **
16 ** 2001-03-17 rasc save real_filename as title (instead full real_path)
17 ** was: compatibility issue to v 1.x.x
18 ** 2001-05-09 rasc entities changed (new module)
19 **
20 ** 2001-07-25 moseley complete rewrite to use James Clark's Expat parser
21 **
22 ** BUGS:
23 ** UndefinedMetaTags ignore is not coded
24 */
25
26 #include "swish.h"
27 #include "merge.h"
28 #include "mem.h"
29 #include "string.h"
30 #include "docprop.h"
31 #include "error.h"
32 #include "index.h"
33 #include "metanames.h"
34
35 #include "xmlparse.h" // James Clark's Expat
36
37 #define BUFFER_CHUNK_SIZE 20000
38
/* Growable accumulator for the character data collected between tags. */
typedef struct {
    char *buffer;       /* text for buffer */
    int   cur;          /* offset of the end of the collected text */
    int   max;          /* max (allocated) size of buffer */
    int   defaultID;    /* default ID for no meta names */
} CHAR_BUFFER;
45
46
/*
 * StoreDescription (summary) bookkeeping.
 *
 * The property system could probably deal with StoreDescription in a
 * cleaner way; ideally this code would not need to know about it at all.
 */
typedef struct {
    struct metaEntry *meta;         /* property entry the summary text is added to */
    int               save_size;    /* saved max size, restored after parsing */
    char             *tag;          /* summary tag */
    int               active;       /* non-zero while inside the summary tag */
} SUMMARY_INFO;
56
57
58 typedef struct {
59 CHAR_BUFFER text_buffer; // buffer for collecting text
60
61 // CHAR_BUFFER prop_buffer; // someday, may want a separate property buffer if want to collect tags within props
62
63 SUMMARY_INFO summary; // argh.
64
65 char *ignore_tag; // tag that triggered ignore (currently used for both)
66 int total_words;
67 int word_pos;
68 int filenum;
69 XML_Parser *parser;
70 INDEXDATAHEADER *header;
71 SWISH *sw;
72 FileProp *fprop;
73 FileRec *thisFileEntry;
74
75 } PARSE_DATA;
76
77
78 /* Prototypes */
79 static void start_hndl(void *data, const char *el, const char **attr);
80 static void end_hndl(void *data, const char *el);
81 static void char_hndl(void *data, const char *txt, int txtlen);
82 static void append_buffer( CHAR_BUFFER *buf, const char *txt, int txtlen );
83 static void flush_buffer( PARSE_DATA *parse_data );
84 static void comment_hndl(void *data, const char *txt);
85 static char *isIgnoreMetaName(SWISH * sw, char *tag);
86
87
88
89
90 /*********************************************************************
91 * Entry to index an XML file.
92 *
93 * Creates an XML_Parser object and parses buffer
94 *
95 * Returns:
96 * Count of words indexed
97 *
98 * ToDo:
99 * This is a stream parser, so could avoid loading entire document into RAM before parsing
100 *
101 *********************************************************************/
102
103 int countwords_XML (SWISH *sw, FileProp *fprop, FileRec *fi, char *buffer)
104 {
105 PARSE_DATA parse_data;
106 XML_Parser p = XML_ParserCreate(NULL);
107 IndexFILE *indexf = sw->indexlist;
108 struct StoreDescription *stordesc = fprop->stordesc;
109
110
111 /* Set defaults */
112 memset(&parse_data, 0, sizeof(parse_data));
113
114 parse_data.header = &indexf->header;
115 parse_data.parser = p;
116 parse_data.sw = sw;
117 parse_data.fprop = fprop;
118 parse_data.filenum = fi->filenum;
119 parse_data.word_pos= 1; /* compress doesn't like zero */
120 parse_data.thisFileEntry = fi;
121
122
123 /* Don't really like this, as mentioned above */
124 if ( stordesc && (parse_data.summary.meta = getPropNameByName(parse_data.header, AUTOPROPERTY_SUMMARY)))
125 {
126 /* Set property limit size for this document type, and store previous size limit */
127 parse_data.summary.save_size = parse_data.summary.meta->max_len;
128 parse_data.summary.meta->max_len = stordesc->size;
129 parse_data.summary.tag = stordesc->field;
130 }
131
132
133 addCommonProperties(sw, fprop, fi, NULL,NULL, 0);
134
135
136
137 if (!p)
138 progerr("Failed to create XML parser object for '%s'", fprop->real_path );
139
140
141 /* Set event handlers */
142 XML_SetUserData( p, (void *)&parse_data ); // local data to pass around
143 XML_SetElementHandler(p, start_hndl, end_hndl);
144 XML_SetCharacterDataHandler(p, char_hndl);
145
146 if( sw->indexComments )
147 XML_SetCommentHandler( p, comment_hndl );
148
149 //XML_SetProcessingInstructionHandler(p, proc_hndl);
150
151 if ( !XML_Parse(p, buffer, fprop->fsize, 1) )
152 progwarn("XML parse error in file '%s' line %d. Error: %s",
153 fprop->real_path, XML_GetCurrentLineNumber(p),XML_ErrorString(XML_GetErrorCode(p)));
154
155
156 /* clean up */
157 XML_ParserFree(p);
158
159 /* Flush any text left in the buffer, and free the buffer */
160 flush_buffer( &parse_data );
161
162 if ( parse_data.text_buffer.buffer )
163 efree( parse_data.text_buffer.buffer );
164
165
166 /* Restore the size in the StoreDescription property */
167 if ( parse_data.summary.save_size )
168 parse_data.summary.meta->max_len = parse_data.summary.save_size;
169
170 return parse_data.total_words;
171 }
172
173 /*********************************************************************
174 * Start Tag Event Handler
175 *
176 * These routines check to see if a given meta tag should be indexed
177 * and if the tags should be added as a property
178 *
179 * To Do:
180 * deal with attributes!
181 *
182 *********************************************************************/
183
184
185 static void start_hndl(void *data, const char *el, const char **attr)
186 {
187 PARSE_DATA *parse_data = (PARSE_DATA *)data;
188 struct metaEntry *m;
189 SWISH *sw = parse_data->sw;
190 char tag[MAXSTRLEN + 1];
191
192
193 /* return if within an ignore block */
194 if ( parse_data->ignore_tag )
195 return;
196
197 /* Flush any text in the buffer */
198 flush_buffer( parse_data );
199
200
201 if(strlen(el) >= MAXSTRLEN) // easy way out
202 {
203 progwarn("Warning: Tag found in %s is too long: '%s'", parse_data->fprop->real_path, el );
204 return;
205 }
206
207 strcpy(tag,(char *)el);
208 strtolower( tag ); // $$$ swish ignores case in xml tags!
209
210
211
212 /* Bump on all meta names, unless overridden */
213 /* Done before the ignore tag check since still need to bump */
214
215 if (!isDontBumpMetaName(sw->dontbumpstarttagslist, tag))
216 parse_data->word_pos++;
217
218
219 /* check for ignore tag (should propably remove char handler for speed) */
220 if ( (parse_data->ignore_tag = isIgnoreMetaName( sw, tag )))
221 return;
222
223
224 /* Check for metaNames */
225
226 if ( (m = getMetaNameByName( parse_data->header, tag)) )
227 m->in_tag++;
228
229 else
230 {
231 if (sw->UndefinedMetaTags == UNDEF_META_AUTO)
232 {
233 if (sw->verbose)
234 printf("!!!Adding automatic MetaName '%s' found in file '%s'\n", tag, parse_data->fprop->real_path);
235
236 addMetaEntry( parse_data->header, tag, META_INDEX, 0)->in_tag++;
237 }
238
239
240 /* If set to "error" on undefined meta tags, then error */
241 if (sw->UndefinedMetaTags == UNDEF_META_ERROR)
242 progerr("UndefinedMetaNames=error. Found meta name '%s' in file '%s', not listed as a MetaNames in config", tag, parse_data->fprop->real_path);
243 }
244
245
246 /* Check property names */
247
248 if ( (m = getPropNameByName( parse_data->header, tag)) )
249 m->in_tag++;
250
251
252 /* Look to enable StoreDescription */
253 {
254 SUMMARY_INFO *summary = &parse_data->summary;
255 if ( summary->tag && (strcasecmp( tag, summary->tag ) == 0 ))
256 summary->active++;
257 }
258
259 }
260
261
262 /*********************************************************************
263 * End Tag Event Handler
264 *
265 *
266 *
267 *********************************************************************/
268
269
270 static void end_hndl(void *data, const char *el)
271 {
272 PARSE_DATA *parse_data = (PARSE_DATA *)data;
273 char tag[MAXSTRLEN + 1];
274 struct metaEntry *m;
275
276 if(strlen(el) > MAXSTRLEN)
277 {
278 progwarn("Warning: Tag found in %s is too long: '%s'", parse_data->fprop->real_path, el );
279 return;
280 }
281
282 strcpy(tag,(char *)el);
283 strtolower( tag );
284
285 if ( parse_data->ignore_tag )
286 {
287 if (strcmp( parse_data->ignore_tag, tag ) == 0)
288 parse_data->ignore_tag = NULL; // don't free since it's a pointer to the config setting
289 return;
290 }
291
292 /* Flush any text in the buffer */
293 flush_buffer( parse_data );
294
295
296 /* Don't allow matching across tag boundry */
297 if (!isDontBumpMetaName(parse_data->sw->dontbumpendtagslist, tag))
298 parse_data->word_pos++;
299
300
301
302 /* Flag that we are not in tag anymore - tags must be balanced, of course. */
303
304 if ( ( m = getMetaNameByName( parse_data->header, tag) ) )
305 if ( m->in_tag )
306 m->in_tag--;
307
308
309 if ( ( m = getPropNameByName( parse_data->header, tag) ) )
310 if ( m->in_tag )
311 m->in_tag--;
312
313
314 /* Look to disable StoreDescription */
315 {
316 SUMMARY_INFO *summary = &parse_data->summary;
317 if ( summary->tag && (strcasecmp( tag, summary->tag ) == 0 ))
318 summary->active--;
319 }
320
321 }
322
323 /*********************************************************************
324 * Character Data Event Handler
325 *
326 * This does the actual adding of text to the index and adding properties
327 * if any tags have been found to index
328 *
329 *
330 *********************************************************************/
331
332 static void char_hndl(void *data, const char *txt, int txtlen)
333 {
334 PARSE_DATA *parse_data = (PARSE_DATA *)data;
335
336
337 /* If currently in an ignore block, then return */
338 if ( parse_data->ignore_tag )
339 return;
340
341 /* Buffer the text */
342 append_buffer( &parse_data->text_buffer, txt, txtlen );
343
344 /* Some day, might want to have a separate property buffer if need to collect more than plain text */
345 // append_buffer( parse_data->prop_buffer, txt, txtlen );
346
347 }
348
349 /*********************************************************************
350 * Append character data to the end of the buffer
351 *
352 * Buffer is extended/created if needed
353 *
354 * ToDo: Flush buffer if it gets too large
355 *
356 *
357 *********************************************************************/
358
359 static void append_buffer( CHAR_BUFFER *buf, const char *txt, int txtlen )
360 {
361
362 if ( !txtlen ) // shouldn't happen
363 return;
364
365
366 /* (re)allocate buf if needed */
367
368 if ( buf->cur + txtlen >= buf->max )
369 buf->buffer = erealloc( buf->buffer, ( buf->max += BUFFER_CHUNK_SIZE+1 ) );
370
371
372 memcpy( (void *) &(buf->buffer[buf->cur]), txt, txtlen );
373 buf->cur += txtlen;
374 }
375
376
377
378
379 /*********************************************************************
380 * Flush buffer - adds words to index, and properties
381 *
382 * 2001-08 jmruiz Change structure from IN_FILE | IN_META to IN_FILE
383 * Since structure does not have much sense in XML, if we use only IN_FILE
384 * we will save memory and disk space (one byte per location)
385 *
386 *
387 *********************************************************************/
388 static void flush_buffer( PARSE_DATA *parse_data )
389 {
390 CHAR_BUFFER *buf = &parse_data->text_buffer;
391 SWISH *sw = parse_data->sw;
392
393 /* anything to do? */
394 if ( !buf->cur )
395 return;
396
397 buf->buffer[buf->cur] = '\0';
398
399
400 /* Index the text */
401 parse_data->total_words +=
402 indexstring( sw, buf->buffer, parse_data->filenum, IN_FILE, 0, NULL, &(parse_data->word_pos) );
403
404
405 /* Add the properties */
406 addDocProperties( parse_data->header, &(parse_data->thisFileEntry->docProperties), (unsigned char *)buf->buffer, buf->cur, parse_data->fprop->real_path );
407
408
409 /* yuck. Ok, add to summary, if active */
410 {
411 SUMMARY_INFO *summary = &parse_data->summary;
412 if ( summary->active )
413 addDocProperty( &(parse_data->thisFileEntry->docProperties), summary->meta, (unsigned char *)buf->buffer, buf->cur, 0 );
414 }
415
416
417 /* clear the buffer */
418 buf->cur = 0;
419 }
420
421
422
423 /*********************************************************************
424 * Comments
425 *
426 * Should be able to call the char_hndl
427 *
428 * To Do:
429 * Can't use DontBump with comments. Might need a config variable for that.
430 *
431 *********************************************************************/
432 static void comment_hndl(void *data, const char *txt)
433 {
434 PARSE_DATA *parse_data = (PARSE_DATA *)data;
435 SWISH *sw = parse_data->sw;
436
437
438 /* Bump position around comments - hard coded, always done to prevent phrase matching */
439 parse_data->word_pos++;
440
441 /* Index the text */
442 parse_data->total_words +=
443 indexstring( sw, (char *)txt, parse_data->filenum, IN_COMMENTS, 0, NULL, &(parse_data->word_pos) );
444
445
446 parse_data->word_pos++;
447
448 }
449
450
451
452 /*********************************************************************
453 * check if a tag is an IgnoreTag
454 *
455 * Note: this returns a pointer to the config set tag, so don't free it!
456 *
457 *
458 *********************************************************************/
459
460 static char *isIgnoreMetaName(SWISH * sw, char *tag)
461 {
462 struct swline *tmplist = sw->ignoremetalist;
463
464 if (!tmplist)
465 return 0;
466
467 while (tmplist)
468 {
469 if (strcmp(tag, tmplist->line) == 0)
470 return tmplist->line;
471
472 tmplist = tmplist->next;
473 }
474
475 return NULL;
476 }
477
478

  ViewVC Help
Powered by ViewVC 1.1.22