/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/src/xml.c
ViewVC logotype

Annotation of /mitgcm.org/devel/buildweb/pkg/swish-e/src/xml.c

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1 - (hide annotations) (download)
Fri Sep 20 19:47:29 2002 UTC (22 years, 10 months ago) by adcroft
Branch point for: Import, MAIN
File MIME type: text/plain
Initial revision

1 adcroft 1.1 /*
2     $Id: xml.c,v 1.55 2001/10/11 22:21:14 whmoseley Exp $
3     **
4     **
5     ** This program and library is free software; you can redistribute it and/or
6     ** modify it under the terms of the GNU (Library) General Public License
7     ** as published by the Free Software Foundation; either version 2
8     ** of the License, or any later version.
9     **
10     ** This program is distributed in the hope that it will be useful,
11     ** but WITHOUT ANY WARRANTY; without even the implied warranty of
12     ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13     ** GNU (Library) General Public License for more details.
14     **
15     **
16     ** 2001-03-17 rasc save real_filename as title (instead full real_path)
17     ** was: compatibility issue to v 1.x.x
18     ** 2001-05-09 rasc entities changed (new module)
19     **
20     ** 2001-07-25 moseley complete rewrite to use James Clark's Expat parser
21     **
22     ** BUGS:
23     ** UndefinedMetaTags ignore is not coded
24     */
25    
26     #include "swish.h"
27     #include "merge.h"
28     #include "mem.h"
29     #include "string.h"
30     #include "docprop.h"
31     #include "error.h"
32     #include "index.h"
33     #include "metanames.h"
34    
35     #include "xmlparse.h" // James Clark's Expat
36    
37     #define BUFFER_CHUNK_SIZE 20000
38    
/* Growable accumulator for character data between tag events. */
typedef struct
{
    char   *buffer;     /* collected text; (re)allocated by append_buffer() */
    int     cur;        /* bytes currently held == index where the next append lands */
    int     max;        /* allocated size of buffer, in bytes */
    int     defaultID;  /* default ID for no meta names. */
} CHAR_BUFFER;
45    
46    
/*
 * State for StoreDescription (summary) handling.
 *
 * The property system could probably deal with StoreDescription in a
 * cleaner way; ideally this parser would not need to know about it.
 */
typedef struct
{
    struct metaEntry   *meta;       /* summary property entry, NULL when not in use */
    int                 save_size;  /* property's previous max_len, kept so it can be restored */
    char               *tag;        /* name of the tag whose text becomes the summary */
    int                 active;     /* non-zero while inside the summary tag */
} SUMMARY_INFO;
56    
57    
58     typedef struct {
59     CHAR_BUFFER text_buffer; // buffer for collecting text
60    
61     // CHAR_BUFFER prop_buffer; // someday, may want a separate property buffer if want to collect tags within props
62    
63     SUMMARY_INFO summary; // argh.
64    
65     char *ignore_tag; // tag that triggered ignore (currently used for both)
66     int total_words;
67     int word_pos;
68     int filenum;
69     XML_Parser *parser;
70     INDEXDATAHEADER *header;
71     SWISH *sw;
72     FileProp *fprop;
73     FileRec *thisFileEntry;
74    
75     } PARSE_DATA;
76    
77    
78     /* Prototypes */
79     static void start_hndl(void *data, const char *el, const char **attr);
80     static void end_hndl(void *data, const char *el);
81     static void char_hndl(void *data, const char *txt, int txtlen);
82     static void append_buffer( CHAR_BUFFER *buf, const char *txt, int txtlen );
83     static void flush_buffer( PARSE_DATA *parse_data );
84     static void comment_hndl(void *data, const char *txt);
85     static char *isIgnoreMetaName(SWISH * sw, char *tag);
86    
87    
88    
89    
90     /*********************************************************************
91     * Entry to index an XML file.
92     *
93     * Creates an XML_Parser object and parses buffer
94     *
95     * Returns:
96     * Count of words indexed
97     *
98     * ToDo:
99     * This is a stream parser, so could avoid loading entire document into RAM before parsing
100     *
101     *********************************************************************/
102    
103     int countwords_XML (SWISH *sw, FileProp *fprop, FileRec *fi, char *buffer)
104     {
105     PARSE_DATA parse_data;
106     XML_Parser p = XML_ParserCreate(NULL);
107     IndexFILE *indexf = sw->indexlist;
108     struct StoreDescription *stordesc = fprop->stordesc;
109    
110    
111     /* Set defaults */
112     memset(&parse_data, 0, sizeof(parse_data));
113    
114     parse_data.header = &indexf->header;
115     parse_data.parser = p;
116     parse_data.sw = sw;
117     parse_data.fprop = fprop;
118     parse_data.filenum = fi->filenum;
119     parse_data.word_pos= 1; /* compress doesn't like zero */
120     parse_data.thisFileEntry = fi;
121    
122    
123     /* Don't really like this, as mentioned above */
124     if ( stordesc && (parse_data.summary.meta = getPropNameByName(parse_data.header, AUTOPROPERTY_SUMMARY)))
125     {
126     /* Set property limit size for this document type, and store previous size limit */
127     parse_data.summary.save_size = parse_data.summary.meta->max_len;
128     parse_data.summary.meta->max_len = stordesc->size;
129     parse_data.summary.tag = stordesc->field;
130     }
131    
132    
133     addCommonProperties(sw, fprop, fi, NULL,NULL, 0);
134    
135    
136    
137     if (!p)
138     progerr("Failed to create XML parser object for '%s'", fprop->real_path );
139    
140    
141     /* Set event handlers */
142     XML_SetUserData( p, (void *)&parse_data ); // local data to pass around
143     XML_SetElementHandler(p, start_hndl, end_hndl);
144     XML_SetCharacterDataHandler(p, char_hndl);
145    
146     if( sw->indexComments )
147     XML_SetCommentHandler( p, comment_hndl );
148    
149     //XML_SetProcessingInstructionHandler(p, proc_hndl);
150    
151     if ( !XML_Parse(p, buffer, fprop->fsize, 1) )
152     progwarn("XML parse error in file '%s' line %d. Error: %s",
153     fprop->real_path, XML_GetCurrentLineNumber(p),XML_ErrorString(XML_GetErrorCode(p)));
154    
155    
156     /* clean up */
157     XML_ParserFree(p);
158    
159     /* Flush any text left in the buffer, and free the buffer */
160     flush_buffer( &parse_data );
161    
162     if ( parse_data.text_buffer.buffer )
163     efree( parse_data.text_buffer.buffer );
164    
165    
166     /* Restore the size in the StoreDescription property */
167     if ( parse_data.summary.save_size )
168     parse_data.summary.meta->max_len = parse_data.summary.save_size;
169    
170     return parse_data.total_words;
171     }
172    
173     /*********************************************************************
174     * Start Tag Event Handler
175     *
176     * These routines check to see if a given meta tag should be indexed
177     * and if the tags should be added as a property
178     *
179     * To Do:
180     * deal with attributes!
181     *
182     *********************************************************************/
183    
184    
185     static void start_hndl(void *data, const char *el, const char **attr)
186     {
187     PARSE_DATA *parse_data = (PARSE_DATA *)data;
188     struct metaEntry *m;
189     SWISH *sw = parse_data->sw;
190     char tag[MAXSTRLEN + 1];
191    
192    
193     /* return if within an ignore block */
194     if ( parse_data->ignore_tag )
195     return;
196    
197     /* Flush any text in the buffer */
198     flush_buffer( parse_data );
199    
200    
201     if(strlen(el) >= MAXSTRLEN) // easy way out
202     {
203     progwarn("Warning: Tag found in %s is too long: '%s'", parse_data->fprop->real_path, el );
204     return;
205     }
206    
207     strcpy(tag,(char *)el);
208     strtolower( tag ); // $$$ swish ignores case in xml tags!
209    
210    
211    
212     /* Bump on all meta names, unless overridden */
213     /* Done before the ignore tag check since still need to bump */
214    
215     if (!isDontBumpMetaName(sw->dontbumpstarttagslist, tag))
216     parse_data->word_pos++;
217    
218    
219     /* check for ignore tag (should propably remove char handler for speed) */
220     if ( (parse_data->ignore_tag = isIgnoreMetaName( sw, tag )))
221     return;
222    
223    
224     /* Check for metaNames */
225    
226     if ( (m = getMetaNameByName( parse_data->header, tag)) )
227     m->in_tag++;
228    
229     else
230     {
231     if (sw->UndefinedMetaTags == UNDEF_META_AUTO)
232     {
233     if (sw->verbose)
234     printf("!!!Adding automatic MetaName '%s' found in file '%s'\n", tag, parse_data->fprop->real_path);
235    
236     addMetaEntry( parse_data->header, tag, META_INDEX, 0)->in_tag++;
237     }
238    
239    
240     /* If set to "error" on undefined meta tags, then error */
241     if (sw->UndefinedMetaTags == UNDEF_META_ERROR)
242     progerr("UndefinedMetaNames=error. Found meta name '%s' in file '%s', not listed as a MetaNames in config", tag, parse_data->fprop->real_path);
243     }
244    
245    
246     /* Check property names */
247    
248     if ( (m = getPropNameByName( parse_data->header, tag)) )
249     m->in_tag++;
250    
251    
252     /* Look to enable StoreDescription */
253     {
254     SUMMARY_INFO *summary = &parse_data->summary;
255     if ( summary->tag && (strcasecmp( tag, summary->tag ) == 0 ))
256     summary->active++;
257     }
258    
259     }
260    
261    
262     /*********************************************************************
263     * End Tag Event Handler
264     *
265     *
266     *
267     *********************************************************************/
268    
269    
270     static void end_hndl(void *data, const char *el)
271     {
272     PARSE_DATA *parse_data = (PARSE_DATA *)data;
273     char tag[MAXSTRLEN + 1];
274     struct metaEntry *m;
275    
276     if(strlen(el) > MAXSTRLEN)
277     {
278     progwarn("Warning: Tag found in %s is too long: '%s'", parse_data->fprop->real_path, el );
279     return;
280     }
281    
282     strcpy(tag,(char *)el);
283     strtolower( tag );
284    
285     if ( parse_data->ignore_tag )
286     {
287     if (strcmp( parse_data->ignore_tag, tag ) == 0)
288     parse_data->ignore_tag = NULL; // don't free since it's a pointer to the config setting
289     return;
290     }
291    
292     /* Flush any text in the buffer */
293     flush_buffer( parse_data );
294    
295    
296     /* Don't allow matching across tag boundry */
297     if (!isDontBumpMetaName(parse_data->sw->dontbumpendtagslist, tag))
298     parse_data->word_pos++;
299    
300    
301    
302     /* Flag that we are not in tag anymore - tags must be balanced, of course. */
303    
304     if ( ( m = getMetaNameByName( parse_data->header, tag) ) )
305     if ( m->in_tag )
306     m->in_tag--;
307    
308    
309     if ( ( m = getPropNameByName( parse_data->header, tag) ) )
310     if ( m->in_tag )
311     m->in_tag--;
312    
313    
314     /* Look to disable StoreDescription */
315     {
316     SUMMARY_INFO *summary = &parse_data->summary;
317     if ( summary->tag && (strcasecmp( tag, summary->tag ) == 0 ))
318     summary->active--;
319     }
320    
321     }
322    
323     /*********************************************************************
324     * Character Data Event Handler
325     *
326     * This does the actual adding of text to the index and adding properties
327     * if any tags have been found to index
328     *
329     *
330     *********************************************************************/
331    
332     static void char_hndl(void *data, const char *txt, int txtlen)
333     {
334     PARSE_DATA *parse_data = (PARSE_DATA *)data;
335    
336    
337     /* If currently in an ignore block, then return */
338     if ( parse_data->ignore_tag )
339     return;
340    
341     /* Buffer the text */
342     append_buffer( &parse_data->text_buffer, txt, txtlen );
343    
344     /* Some day, might want to have a separate property buffer if need to collect more than plain text */
345     // append_buffer( parse_data->prop_buffer, txt, txtlen );
346    
347     }
348    
349     /*********************************************************************
350     * Append character data to the end of the buffer
351     *
352     * Buffer is extended/created if needed
353     *
354     * ToDo: Flush buffer if it gets too large
355     *
356     *
357     *********************************************************************/
358    
359     static void append_buffer( CHAR_BUFFER *buf, const char *txt, int txtlen )
360     {
361    
362     if ( !txtlen ) // shouldn't happen
363     return;
364    
365    
366     /* (re)allocate buf if needed */
367    
368     if ( buf->cur + txtlen >= buf->max )
369     buf->buffer = erealloc( buf->buffer, ( buf->max += BUFFER_CHUNK_SIZE+1 ) );
370    
371    
372     memcpy( (void *) &(buf->buffer[buf->cur]), txt, txtlen );
373     buf->cur += txtlen;
374     }
375    
376    
377    
378    
379     /*********************************************************************
380     * Flush buffer - adds words to index, and properties
381     *
382     * 2001-08 jmruiz Change structure from IN_FILE | IN_META to IN_FILE
383     * Since structure does not have much sense in XML, if we use only IN_FILE
384     * we will save memory and disk space (one byte per location)
385     *
386     *
387     *********************************************************************/
388     static void flush_buffer( PARSE_DATA *parse_data )
389     {
390     CHAR_BUFFER *buf = &parse_data->text_buffer;
391     SWISH *sw = parse_data->sw;
392    
393     /* anything to do? */
394     if ( !buf->cur )
395     return;
396    
397     buf->buffer[buf->cur] = '\0';
398    
399    
400     /* Index the text */
401     parse_data->total_words +=
402     indexstring( sw, buf->buffer, parse_data->filenum, IN_FILE, 0, NULL, &(parse_data->word_pos) );
403    
404    
405     /* Add the properties */
406     addDocProperties( parse_data->header, &(parse_data->thisFileEntry->docProperties), (unsigned char *)buf->buffer, buf->cur, parse_data->fprop->real_path );
407    
408    
409     /* yuck. Ok, add to summary, if active */
410     {
411     SUMMARY_INFO *summary = &parse_data->summary;
412     if ( summary->active )
413     addDocProperty( &(parse_data->thisFileEntry->docProperties), summary->meta, (unsigned char *)buf->buffer, buf->cur, 0 );
414     }
415    
416    
417     /* clear the buffer */
418     buf->cur = 0;
419     }
420    
421    
422    
423     /*********************************************************************
424     * Comments
425     *
426     * Should be able to call the char_hndl
427     *
428     * To Do:
429     * Can't use DontBump with comments. Might need a config variable for that.
430     *
431     *********************************************************************/
432     static void comment_hndl(void *data, const char *txt)
433     {
434     PARSE_DATA *parse_data = (PARSE_DATA *)data;
435     SWISH *sw = parse_data->sw;
436    
437    
438     /* Bump position around comments - hard coded, always done to prevent phrase matching */
439     parse_data->word_pos++;
440    
441     /* Index the text */
442     parse_data->total_words +=
443     indexstring( sw, (char *)txt, parse_data->filenum, IN_COMMENTS, 0, NULL, &(parse_data->word_pos) );
444    
445    
446     parse_data->word_pos++;
447    
448     }
449    
450    
451    
452     /*********************************************************************
453     * check if a tag is an IgnoreTag
454     *
455     * Note: this returns a pointer to the config set tag, so don't free it!
456     *
457     *
458     *********************************************************************/
459    
460     static char *isIgnoreMetaName(SWISH * sw, char *tag)
461     {
462     struct swline *tmplist = sw->ignoremetalist;
463    
464     if (!tmplist)
465     return 0;
466    
467     while (tmplist)
468     {
469     if (strcmp(tag, tmplist->line) == 0)
470     return tmplist->line;
471    
472     tmplist = tmplist->next;
473     }
474    
475     return NULL;
476     }
477    
478    

  ViewVC Help
Powered by ViewVC 1.1.22