/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/src/html.c
ViewVC logotype

Annotation of /mitgcm.org/devel/buildweb/pkg/swish-e/src/html.c

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1.1.1 - (hide annotations) (download) (vendor branch)
Fri Sep 20 19:47:29 2002 UTC (22 years, 10 months ago) by adcroft
Branch: Import, MAIN
CVS Tags: baseline, HEAD
Changes since 1.1: +0 -0 lines
File MIME type: text/plain
Importing web-site building process.

1 adcroft 1.1 /*
2     $Id: html.c,v 1.65 2002/05/16 18:51:52 whmoseley Exp $
3     ** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
4     ** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
5     **
6     ** This program and library is free software; you can redistribute it and/or
7     ** modify it under the terms of the GNU (Library) General Public License
8     ** as published by the Free Software Foundation; either version 2
9     ** of the License, or any later version.
10     **
11     ** This program is distributed in the hope that it will be useful,
12     ** but WITHOUT ANY WARRANTY; without even the implied warranty of
13     ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14     ** GNU (Library) General Public License for more details.
15     **
16     ** You should have received a copy of the GNU (Library) General Public License
17     ** along with this program; if not, write to the Free Software
18     ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19     **---------------------------------------------------------
20     ** ** ** PATCHED 5/13/96, CJC
21     ** Added MatchAndChange for regex in replace rule G.Hill 2/10/98
22     **
23     ** change sprintf to snprintf to avoid corruption
24     ** added safestrcpy() macro to avoid corruption from strcpy overflow
25     ** SRE 11/17/99
26     **
27     ** fixed cast to int problems pointed out by "gcc -Wall"
28     ** SRE 2/22/00
29     **
30     ** 2001-03-17 rasc save real_filename as title (instead full real_path)
31     ** was: compatibility issue to v 1.x.x
32     **
33     ** 2001-05-09 rasc entities completly rewritten (new module)
34     ** small fix in parseHTMLsummary
35     **
36     **
37     */
38    
39     #include "swish.h"
40     #include "mem.h"
41     #include "string.h"
42     #include "index.h"
43     #include "compress.h"
44     #include "merge.h"
45     #include "docprop.h"
46     #include "metanames.h"
47     #include "html.h"
48     #include "entities.h"
49     #include "fs.h"
50     #include "error.h"
51    
52     /* #### */
53    
54     static char *parsetag(SWISH *sw, char *parsetag, char *buffer, int max_lines, int case_sensitive);
55    
56     static struct metaEntry *getHTMLMeta(IndexFILE * indexf, char *tag, SWISH *sw, char *name,
57     char **parsed_tag, char *filename)
58     {
59     char *temp;
60     int lenword = 0;
61     char *word = NULL;
62     char buffer[MAXSTRLEN + 1];
63     int i;
64     struct metaEntry *e = NULL;
65    
66    
67     word = buffer;
68     lenword = sizeof(buffer) - 1;
69    
70     if (!name)
71     {
72     if (!(temp = (char *) lstrstr((char *) tag, (char *) "NAME")))
73     return NULL;
74     }
75     else
76     temp = name;
77     temp += 4; /* strlen("NAME") */
78    
79     /* Get to the '=' sign disreguarding any other char */
80     while (*temp)
81     {
82     if (*temp && (*temp != '=')) /* TAB */
83     temp++;
84     else
85     {
86     temp++;
87     break;
88     }
89     }
90    
91     /* Get to the beginning of the word disreguarding blanks and quotes */
92     /* TAB */
93     while (*temp)
94     {
95     if (*temp == ' ' || *temp == '"')
96     temp++;
97     else
98     break;
99     }
100    
101     /* Copy the word and convert to lowercase */
102     /* TAB */
103     /* while (temp !=NULL && strncmp(temp," ",1) */
104     /* && strncmp(temp,"\"",1) && i<= MAXWORDLEN ) { */
105    
106     /* and the above <= was wrong, should be < which caused the
107     null insertion below to be off by two bytes */
108    
109    
110     for (i = 0; temp != NULL && *temp && *temp != ' ' && *temp != '"';)
111     {
112     if (i == lenword)
113     {
114     lenword *= 2;
115     if(word == buffer)
116     {
117     word = (char *) emalloc(lenword + 1);
118     memcpy(word,buffer,sizeof(buffer));
119     }
120     else
121     word = (char *) erealloc(word, lenword + 1);
122     }
123     word[i] = *temp++;
124     i++;
125     }
126     if (i == lenword)
127     {
128     lenword *= 2;
129     word = (char *) erealloc(word, lenword + 1);
130     }
131     word[i] = '\0';
132    
133     /* Use Rainer's routine */
134     strtolower(word);
135    
136     *parsed_tag = word;
137    
138     if ((e = getMetaNameByName(&indexf->header, word)))
139     return e;
140    
141     if ( (sw->UndefinedMetaTags == UNDEF_META_AUTO) && word && *word)
142     {
143     if (sw->verbose)
144     printf("Adding automatic MetaName '%s' found in file '%s'\n", word, filename);
145    
146     return addMetaEntry(&indexf->header, word, META_INDEX, 0);
147     }
148    
149     /* If it is ok not to have the name listed, just index as no-name */
150     if (sw->UndefinedMetaTags == UNDEF_META_ERROR)
151     progerr("UndefinedMetaNames=error. Found meta name '%s' in file '%s', not listed as a MetaNames in config", word, filename);
152    
153     if(word != buffer)
154     efree(word);
155    
156     return NULL;
157    
158     }
159    
160    
161     /* Parses the Meta tag */
162     static int parseMetaData(SWISH * sw, IndexFILE * indexf, char *tag, int filenum, int structure, char *name, char *content, FileRec *thisFileEntry,
163     int *position, char *filename)
164     {
165     int metaName;
166     struct metaEntry *metaNameEntry;
167     char *temp,
168     *start,
169     *convtag;
170     int wordcount = 0; /* Word count */
171     char *parsed_tag;
172    
173    
174     /* Lookup (or add if "auto") meta name for tag */
175    
176     metaNameEntry = getHTMLMeta(indexf, tag, sw, name, &parsed_tag, filename);
177     metaName = metaNameEntry ? metaNameEntry->metaID : 1;
178    
179    
180     temp = content + 7; /* 7 is strlen("CONTENT") */
181    
182     /* Get to the " sign disreguarding other characters */
183     if ((temp = strchr(temp, '\"')))
184     {
185     structure |= IN_META;
186    
187     start = temp + 1;
188    
189     /* Jump escaped \" */
190     temp = strchr(start, '\"');
191     while (temp)
192     {
193     if (*(temp - 1) == '\\')
194     temp = strchr(temp + 1, '\"');
195     else
196     break;
197     }
198    
199     if (temp)
200     *temp = '\0'; /* terminate CONTENT, temporarily */
201    
202    
203     /* Convert entities, if requested, and remove newlines */
204     convtag = (char *)sw_ConvHTMLEntities2ISO(sw, (unsigned char *)start);
205     remove_newlines(convtag); /** why isn't this just done for the entire doc? */
206    
207    
208    
209     /* Index only if a metaEntry was found, or if not not ReqMetaName */
210     if ( sw->UndefinedMetaTags != UNDEF_META_IGNORE || metaNameEntry)
211     {
212     /* Meta tags get bummped */
213     /* I'm not clear this works as well as I'd like because it always bumps on a new Meta tag,
214     * but in order to disable this behavior the name MUST be a meta name.
215     * Probably better to let getHTMLMeta() return the name as a string.
216     */
217    
218    
219     if (!metaNameEntry || !isDontBumpMetaName(sw->dontbumpstarttagslist, metaNameEntry->metaName))
220     position[0]++;
221    
222     wordcount = indexstring(sw, convtag, filenum, structure, 1, &metaName, position);
223    
224     if (!metaNameEntry || !isDontBumpMetaName(sw->dontbumpendtagslist, metaNameEntry->metaName))
225     position[0]++;
226    
227     }
228    
229    
230     /* If it is a property store it */
231    
232     if ((metaNameEntry = getPropNameByName(&indexf->header, parsed_tag)))
233     if (!addDocProperty(&thisFileEntry->docProperties, metaNameEntry, (unsigned char*)convtag, strlen(convtag), 0))
234     progwarn("property '%s' not added for document '%s'\n", metaNameEntry->metaName, filename);
235    
236     if (temp)
237     *temp = '\"'; /* restore string */
238     }
239    
240     return wordcount;
241     }
242    
243    
244     /* Extracts anything in <title> tags from an HTML file and returns it.
245     ** Otherwise, only the file name without its path is returned.
246     */
247    
248     char *parseHTMLtitle(SWISH *sw, char *buffer)
249     {
250     char *title;
251     char *empty_title;
252    
253     empty_title = (char *)Mem_ZoneAlloc(sw->Index->perDocTmpZone,1);
254     *empty_title = '\0';
255    
256     if (!buffer)
257     return empty_title;
258    
259     if ((title = parsetag(sw, "title", buffer, TITLETOPLINES, CASE_SENSITIVE_OFF)))
260     return title;
261    
262     return empty_title;
263     }
264    
265    
266     /* Check if a particular title (read: file!) should be ignored
267     ** according to the settings in the configuration file.
268     */
269     /* This is to check "title contains" option in config file */
270    
271     int isoktitle(SWISH * sw, char *title)
272     {
273     struct MOD_FS *fs = sw->FS;
274    
275     return !match_regex_list(title, fs->filerules.title);
276     }
277    
278    
279    
280    
281     /* This returns the value corresponding to the HTML structures
282     ** a word is in.
283     */
284    
285     static int getstructure(char *tag, int structure)
286     {
287    
288     /* int len; *//* not used - 2/22/00 */
289     char oldChar = 0;
290     char *endOfTag = NULL;
291     char *pos;
292    
293     pos = tag;
294     while (*pos)
295     {
296     if (isspace((int) ((unsigned char) *pos)))
297     {
298     endOfTag = pos; /* remember where we are... */
299     oldChar = *pos; /* ...and what we saw */
300     *pos = '\0'; /* truncate string, for now */
301     }
302     else
303     pos++;
304     }
305     /* Store Word Context
306     ** Modified DLN 1999-10-24 - Comments and Cleaning
307     ** TODO: Make sure that these allow for HTML attributes
308     * */
309    
310     /* HEAD */
311     if (strcasecmp(tag, "/head") == 0)
312     structure &= ~IN_HEAD; /* Out */
313     else if (strcasecmp(tag, "head") == 0)
314     structure |= IN_HEAD; /* In */
315     /* TITLE */
316     else if (strcasecmp(tag, "/title") == 0)
317     structure &= ~IN_TITLE;
318     else if (strcasecmp(tag, "title") == 0)
319     structure |= IN_TITLE;
320     /* BODY */
321     else if (strcasecmp(tag, "/body") == 0)
322     structure &= ~IN_BODY; /* In */
323     else if (strcasecmp(tag, "body") == 0)
324     structure |= IN_BODY; /* Out */
325     /* H1, H2, H3, H4, H5, H6 */
326     else if (tag[0] == '/' && tolower((int)((unsigned char)tag[1])) == 'h' && isdigit((int)((unsigned char)tag[2]))) /* cast to int - 2/22/00 */
327     structure &= ~IN_HEADER; /* In */
328     else if (tolower((int)((unsigned char)tag[0])) == 'h' && isdigit((int)(unsigned char)tag[1])) /* cast to int - 2/22/00 */
329     structure |= IN_HEADER; /* Out */
330     /* EM, STRONG */
331     else if ((strcasecmp(tag, "/em") == 0) || (strcasecmp(tag, "/strong") == 0))
332     structure &= ~IN_EMPHASIZED; /* Out */
333     else if ((strcasecmp(tag, "em") == 0) || (strcasecmp(tag, "strong") == 0))
334     structure |= IN_EMPHASIZED; /* In */
335     /* B, I are seperate for semantics */
336     else if ((strcasecmp(tag, "/b") == 0) || (strcasecmp(tag, "/i") == 0))
337     structure &= ~IN_EMPHASIZED; /* Out */
338     else if ((strcasecmp(tag, "b") == 0) || (strcasecmp(tag, "i") == 0))
339     structure |= IN_EMPHASIZED; /* In */
340     /* The End */
341    
342     if (endOfTag != NULL)
343     {
344     *endOfTag = oldChar;
345     }
346     return structure;
347     }
348    
349    
350    
351     /* Get the MetaData index when the whole tag is passed */
352    
353     /* Patch by Tom Brown */
354     /* TAB, this routine is/was somewhat pathetic... but it was pathetic in
355     1.2.4 too ... someone needed a course in defensive programming... there are
356     lots of tests below for temp != NULL, but what is desired is *temp != '\0'
357     (e.g. simply *temp) ... I'm going to remove some strncmp(temp,constant,1)
358     which are much faster as *temp != constant ...
359    
360     Anyhow, the test case I've got that's core dumping is:
361     <META content=3D"MSHTML 5.00.2614.3401" name=3DGENERATOR>
362     no trailing quote, no trailing space... and with the missing/broken check for end of string it scribbles over the stack...
363    
364     */
365    
366    
367    
368     static char *parseHtmlSummary(char *buffer, char *field, int size, SWISH * sw)
369     {
370     char *p,
371     *q,
372     *tag,
373     *endtag,
374     c = '\0';
375     char *summary,
376     *beginsum,
377     *endsum,
378     *tmp,
379     *tmp2,
380     *tmp3;
381     int found,
382     lensummary;
383    
384     /* Get the summary if no metaname/field is given */
385     if (!field && size)
386     {
387     /* Jump title if it exists */
388     if ((p = lstrstr(buffer, "</title>")))
389     {
390     p += 8;
391     }
392     else
393     p = buffer;
394     /* Let us try to find <body> */
395     if ((q = lstrstr(p, "<body")))
396     {
397     q = strchr(q, '>');
398     }
399     else
400     q = p;
401     summary = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone,strlen(p)+1);
402     strcpy(summary,p);
403     remove_newlines(summary);
404    
405     //$$$$ Todo: remove tag and content of scripts, css, java, embeddedobjects, comments, etc
406    
407     remove_tags(summary);
408    
409     summary = (char *)sw_ConvHTMLEntities2ISO(sw, (unsigned char *)summary);
410    
411    
412     /* use only the required memory -save those not used */
413     /* 2001-03-13 rasc copy only <size> bytes of string */
414     if((int) strlen(summary) > size)
415     summary[size]='\0';
416     return summary;
417     }
418    
419     for (p = buffer, summary = NULL, found = 0, beginsum = NULL, endsum = NULL; p && *p;)
420     {
421     if ((tag = strchr(p, '<')) && ((tag == p) || (*(tag - 1) != '\\')))
422     { /* Look for non escaped '<' */
423     tag++;
424     for (endtag = tag;;)
425     if ((endtag = strchr(endtag, '>')))
426     {
427     if (*(endtag - 1) != '\\')
428     break;
429     else
430     endtag++;
431     }
432     else
433     break;
434     if (endtag)
435     {
436     c = *endtag;
437     *endtag++ = '\0';
438     if ((tag[0] == '!') && lstrstr(tag, "META") && (lstrstr(tag, "START") || lstrstr(tag, "END")))
439     { /* Check for META TAG TYPE 1 */
440     if (lstrstr(tag, "START"))
441     {
442     if ((tmp = lstrstr(tag, "NAME")))
443     {
444     tmp += 4;
445     if (lstrstr(tmp, field))
446     {
447     beginsum = endtag;
448     found = 1;
449     }
450     p = endtag;
451     }
452     else
453     p = endtag;
454     }
455     else if (lstrstr(tag, "END"))
456     {
457     if (!found)
458     {
459     p = endtag;
460     }
461     else
462     {
463     endsum = tag - 1;
464     *(endtag - 1) = c;
465     break;
466     }
467     }
468     } /* Check for META TAG TYPE 2 */
469     else if ((tag[0] != '!') && lstrstr(tag, "META") && (tmp = lstrstr(tag, "NAME")) && (tmp2 = lstrstr(tag, "CONTENT")))
470     {
471     tmp += 4;
472     tmp3 = lstrstr(tmp, field);
473     if (tmp3 && tmp3 < tmp2)
474     {
475     tmp2 += 7;
476     if ((tmp = strchr(tmp2, '=')))
477     {
478     for (++tmp; isspace((int) ((unsigned char) *tmp)); tmp++);
479     if (*tmp == '\"')
480     {
481     beginsum = tmp + 1;
482     for (tmp = endtag - 1; tmp > beginsum; tmp--)
483     if (*tmp == '\"')
484     break;
485     if (tmp == beginsum)
486     endsum = endtag - 1;
487     else
488     endsum = tmp;
489     }
490     else
491     {
492     beginsum = tmp;
493     endsum = endtag - 1;
494     }
495     found = 1;
496     *(endtag - 1) = c;
497     break;
498    
499     }
500     }
501     p = endtag;
502     } /* Default: Continue */
503     else
504     {
505     p = endtag;
506     }
507     }
508     else
509     p = NULL; /* tag not closed ->END */
510     if (endtag)
511     *(endtag - 1) = c;
512     }
513     else
514     { /* No more '<' */
515     p = NULL;
516     }
517     }
518     if (found && beginsum && endsum && endsum > beginsum)
519     {
520     lensummary = endsum - beginsum;
521     summary = (char *)Mem_ZoneAlloc(sw->Index->perDocTmpZone, lensummary + 1);
522     memcpy(summary, beginsum, lensummary);
523     summary[lensummary] = '\0';
524     }
525     /* If field is set an no metaname is found, let us search */
526     /* for something like <field>bla bla </field> */
527     if (!summary && field)
528     {
529     summary = parsetag(sw, field, buffer, 0, CASE_SENSITIVE_OFF);
530     }
531     /* Finally check for something after title (if exists) and */
532     /* after <body> (if exists) */
533    
534     if (!summary)
535     {
536     /* Jump title if it exists */
537     if ((p = lstrstr(buffer, "</title>")))
538     {
539     p += 8;
540     }
541     else
542     p = buffer;
543    
544     /* Let us try to find <body> */
545     if ((q = lstrstr(p, "<body")))
546     {
547     q = strchr(q, '>');
548     }
549     else
550     q = p;
551    
552     summary = (char *)Mem_ZoneAlloc(sw->Index->perDocTmpZone,strlen(q) + 1);
553     strcpy(summary,q);
554     }
555    
556     if (summary)
557     {
558     remove_newlines(summary);
559     remove_tags(summary);
560     summary = (char *)sw_ConvHTMLEntities2ISO(sw, (unsigned char *)summary);
561     }
562    
563     if (summary && size && ((int) strlen(summary)) > size)
564     summary[size] = '\0';
565     return summary;
566     }
567    
568    
569     #define NO_TAG 0
570     #define TAG_CLOSE 1
571     #define TAG_FOUND 2
572    
573     /* Gets the content between "<parsetag>" and "</parsetag>" from buffer
574     limiting the scan to the first max_lines lines (0 means all lines) */
575     static char *parsetag(SWISH *sw, char *parsetag, char *buffer, int max_lines, int case_sensitive)
576     {
577     register int c,
578     d;
579     register char *p,
580     *r;
581     char *tag;
582     int lencontent;
583     char *content;
584     int i,
585     j,
586     lines,
587     status,
588     tagbuflen,
589     totaltaglen,
590     curlencontent;
591     char *begintag;
592     char *endtag;
593     char *newbuf;
594     char *(*f_strstr) ();
595    
596    
597    
598     if (case_sensitive)
599     f_strstr = strstr;
600     else
601     f_strstr = lstrstr;
602    
603     lencontent = strlen(parsetag);
604     begintag = (char *)Mem_ZoneAlloc(sw->Index->perDocTmpZone, lencontent + 3);
605     endtag = (char *)Mem_ZoneAlloc(sw->Index->perDocTmpZone, lencontent + 4);
606     sprintf(begintag, "<%s>", parsetag);
607     sprintf(endtag, "</%s>", parsetag);
608    
609     tag = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone, 1);
610     tag[0] = '\0';
611    
612     content = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone, (lencontent = MAXSTRLEN) + 1);
613     lines = 0;
614     status = NO_TAG;
615     p = content;
616     *p = '\0';
617    
618    
619     for (r = buffer;;)
620     {
621     c = *r++;
622     if (c == '\n')
623     {
624     lines++;
625     if (max_lines && lines == max_lines)
626     break;
627     }
628     if (!c)
629     return NULL;
630    
631     switch (c)
632     {
633     case '<':
634     tag = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone, (tagbuflen = MAXSTRLEN) + 1);
635     totaltaglen = 0;
636     tag[totaltaglen++] = '<';
637    
638     /* Collect until find '>' */
639     while (1)
640     {
641     d = *r++;
642     if (!d)
643     return NULL;
644     if (totaltaglen == tagbuflen)
645     {
646     newbuf = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone, tagbuflen + 200 + 1);
647     memcpy(newbuf,tag,tagbuflen + 1);
648     tag = newbuf;
649     tagbuflen += 200;
650     }
651     tag[totaltaglen++] = d;
652     if (d == '>')
653     {
654     tag[totaltaglen] = '\0';
655     break;
656     }
657     }
658    
659    
660     if (f_strstr(tag, endtag))
661     {
662     status = TAG_CLOSE;
663     *p = '\0';
664    
665     /* nulls to spaces */
666     for (i = 0; content[i]; i++)
667     if (content[i] == '\n')
668     content[i] = ' ';
669    
670     /* skip over initial spaces and quotes */
671     for (i = 0; isspace((int) ((unsigned char) content[i])) || content[i] == '\"'; i++)
672     ;
673    
674     /* shift buffer to left */
675     for (j = 0; content[i]; j++)
676     content[j] = content[i++];
677    
678     content[j] = '\0';
679    
680    
681     /* remove trailing spaces, nulls, quotes */
682     for (j = strlen(content) - 1; ( j >= 0 ) && ( isspace((int) ((unsigned char) content[j])) || content[j] == '\0' || content[j] == '\"'); j--)
683     content[j] = '\0';
684    
685     /* replace double quotes with single quotes -- why? */
686     for (j = 0; content[j]; j++)
687     if (content[j] == '\"')
688     content[j] = '\'';
689    
690     if (*content)
691     return (content);
692     else
693     return NULL;
694     }
695     else if (f_strstr(tag, begintag))
696     {
697     status = TAG_FOUND;
698     }
699     break;
700     default:
701     if (status == TAG_FOUND)
702     {
703     curlencontent = p - content;
704     if (curlencontent == lencontent)
705     {
706     newbuf = Mem_ZoneAlloc(sw->Index->perDocTmpZone,(lencontent * 2) + 1);
707     memcpy(newbuf,content,lencontent + 1);
708     lencontent *= 2;
709     content = newbuf;
710     p = content + curlencontent;
711     }
712     *p = c;
713     p++;
714     }
715     }
716     }
717     return NULL;
718     }
719    
720    
721    
722    
723    
724    
725    
726     /* Parses the words in a comment.
727     */
728    
729     int parsecomment(SWISH * sw, char *tag, int filenum, int structure, int metaID, int *position)
730     {
731     structure |= IN_COMMENTS;
732     return indexstring(sw, tag + 1, filenum, structure, 1, &metaID, position);
733     }
734    
735     /* Indexes all the words in a html file and adds the appropriate information
736     ** to the appropriate structures.
737     */
738    
739     /* Indexes all the words in a html file and adds the appropriate information
740     ** to the appropriate structures.
741     */
742    
743     int countwords_HTML(SWISH *sw, FileProp *fprop, FileRec *fi, char *buffer)
744     {
745     int ftotalwords;
746     int *metaID;
747     int metaIDlen;
748     int position; /* Position of word in file */
749     int currentmetanames;
750     char *p,
751     *newp,
752     *tag,
753     *endtag;
754     int structure;
755     FileRec *thisFileEntry = fi;
756     struct metaEntry *metaNameEntry;
757     IndexFILE *indexf = sw->indexlist;
758     struct MOD_Index *idx = sw->Index;
759     char *Content = NULL,
760     *Name = NULL,
761     *summary = NULL;
762     char *title = (char *)sw_ConvHTMLEntities2ISO(sw, (unsigned char *)parseHTMLtitle(sw, buffer));
763    
764     if (!isoktitle(sw, title))
765     return -2;
766    
767    
768     if (fprop->stordesc)
769     summary = parseHtmlSummary(buffer, fprop->stordesc->field, fprop->stordesc->size, sw);
770    
771     addCommonProperties( sw, fprop, fi, title, summary, 0 );
772    
773     /* Init meta info */
774     metaID = (int *) Mem_ZoneAlloc(sw->Index->perDocTmpZone,(metaIDlen = 16) * sizeof(int));
775    
776     currentmetanames = 0;
777     ftotalwords = 0;
778     structure = IN_FILE;
779     metaID[0] = 1;
780     position = 1;
781    
782    
783     for (p = buffer; p && *p;)
784     {
785    
786     /* Look for non escaped '<' */
787     if ((tag = strchr(p, '<')) && ((tag == p) || (*(tag - 1) != '\\')))
788     {
789     /* Index up to the tag */
790     *tag++ = '\0';
791    
792     newp = (char *)sw_ConvHTMLEntities2ISO(sw, (unsigned char *)p);
793    
794     if ( ! currentmetanames )
795     currentmetanames++;
796     ftotalwords += indexstring(sw, newp, idx->filenum, structure, currentmetanames, metaID, &position);
797    
798     /* Now let us look for a not escaped '>' */
799     for (endtag = tag;;)
800     if ((endtag = strchr(endtag, '>')))
801     {
802     if (*(endtag - 1) != '\\')
803     break;
804     else
805     endtag++;
806     }
807     else
808     break;
809    
810    
811     if (endtag)
812     {
813     *endtag++ = '\0';
814    
815     if ((tag[0] == '!') && lstrstr(tag, "META") && (lstrstr(tag, "START") || lstrstr(tag, "END")))
816     {
817     /* Check for META TAG TYPE 1 */
818     structure |= IN_META;
819     if (lstrstr(tag, "START"))
820     {
821     char *parsed_tag;
822    
823     if (
824     (metaNameEntry =
825     getHTMLMeta(indexf, tag, sw, NULL, &parsed_tag, fprop->real_path)))
826     {
827     /* realloc memory if needed */
828     if (currentmetanames == metaIDlen)
829     {
830     int *newbuf = (int *)Mem_ZoneAlloc(sw->Index->perDocTmpZone, metaIDlen * 2 * sizeof(int));
831     memcpy((char *)newbuf,(char *)metaID,metaIDlen * sizeof(int));
832     metaID = newbuf;
833     metaIDlen *= 2;
834     }
835    
836     /* add metaname to array of current metanames */
837     metaID[currentmetanames] = metaNameEntry->metaID;
838    
839     /* Bump position for all metanames unless metaname in dontbumppositionOnmetatags */
840     if (!isDontBumpMetaName(sw->dontbumpstarttagslist, metaNameEntry->metaName))
841     position++;
842    
843     currentmetanames++;
844    
845    
846     p = endtag;
847    
848     /* If it is also a property store it until a < is found */
849     if ((metaNameEntry = getPropNameByName(&indexf->header, parsed_tag)))
850     {
851     if ((endtag = strchr(p, '<')))
852     *endtag = '\0';
853    
854    
855     p = (char *)sw_ConvHTMLEntities2ISO(sw, (unsigned char *)p);
856    
857     remove_newlines(p); /** why isn't this just done for the entire doc? */
858    
859     if (!addDocProperty(&thisFileEntry->docProperties, metaNameEntry, (unsigned char *)p, strlen(p), 0))
860     progwarn("property '%s' not added for document '%s'\n", metaNameEntry->metaName, fprop->real_path);
861    
862    
863     if (endtag)
864     *endtag = '<';
865    
866     continue;
867     }
868     }
869    
870     }
871    
872     else if (lstrstr(tag, "END"))
873     {
874     /* this will close the last metaname */
875     if (currentmetanames)
876     {
877     currentmetanames--;
878     if (!currentmetanames)
879     metaID[0] = 1;
880     }
881     }
882    
883     p = endtag;
884     }
885    
886     /* Check for META TAG TYPE 2 */
887     else if ((tag[0] != '!') && lstrstr(tag, "META") && (Name = lstrstr(tag, "NAME")) && (Content = lstrstr(tag, "CONTENT")))
888     {
889     ftotalwords += parseMetaData(sw, indexf, tag, idx->filenum, structure, Name, Content, thisFileEntry, &position, fprop->real_path);
890     p = endtag;
891     } /* Check for COMMENT */
892    
893     else if ((tag[0] == '!') && sw->indexComments)
894     {
895     ftotalwords += parsecomment(sw, tag, idx->filenum, structure, 1, &position);
896     p = endtag;
897     } /* Default: Continue */
898    
899     else
900     {
901     structure = getstructure(tag, structure);
902     p = endtag;
903     }
904     }
905     else
906     p = tag; /* tag not closed: continue */
907     }
908    
909     else
910     { /* No more '<' */
911    
912     newp = (char *)sw_ConvHTMLEntities2ISO(sw, (unsigned char *)p);
913    
914     if ( ! currentmetanames )
915     currentmetanames++;
916     ftotalwords += indexstring(sw, newp, idx->filenum, structure, currentmetanames, metaID, &position);
917    
918     p = NULL;
919     }
920     }
921     return ftotalwords;
922     }

  ViewVC Help
Powered by ViewVC 1.1.22