/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/src/html.c
ViewVC logotype

Contents of /mitgcm.org/devel/buildweb/pkg/swish-e/src/html.c

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1.1.1 - (show annotations) (download) (vendor branch)
Fri Sep 20 19:47:29 2002 UTC (22 years, 10 months ago) by adcroft
Branch: Import, MAIN
CVS Tags: baseline, HEAD
Changes since 1.1: +0 -0 lines
File MIME type: text/plain
Importing web-site building process.

1 /*
2 $Id: html.c,v 1.65 2002/05/16 18:51:52 whmoseley Exp $
3 ** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
4 ** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
5 **
6 ** This program and library is free software; you can redistribute it and/or
7 ** modify it under the terms of the GNU (Library) General Public License
8 ** as published by the Free Software Foundation; either version 2
9 ** of the License, or any later version.
10 **
11 ** This program is distributed in the hope that it will be useful,
12 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 ** GNU (Library) General Public License for more details.
15 **
16 ** You should have received a copy of the GNU (Library) General Public License
17 ** along with this program; if not, write to the Free Software
18 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 **---------------------------------------------------------
20 ** ** ** PATCHED 5/13/96, CJC
21 ** Added MatchAndChange for regex in replace rule G.Hill 2/10/98
22 **
23 ** change sprintf to snprintf to avoid corruption
24 ** added safestrcpy() macro to avoid corruption from strcpy overflow
25 ** SRE 11/17/99
26 **
27 ** fixed cast to int problems pointed out by "gcc -Wall"
28 ** SRE 2/22/00
29 **
30 ** 2001-03-17 rasc save real_filename as title (instead full real_path)
31 ** was: compatibility issue to v 1.x.x
32 **
33 ** 2001-05-09 rasc entities completely rewritten (new module)
34 ** small fix in parseHTMLsummary
35 **
36 **
37 */
38
39 #include "swish.h"
40 #include "mem.h"
41 #include "string.h"
42 #include "index.h"
43 #include "compress.h"
44 #include "merge.h"
45 #include "docprop.h"
46 #include "metanames.h"
47 #include "html.h"
48 #include "entities.h"
49 #include "fs.h"
50 #include "error.h"
51
52 /* #### */
53
54 static char *parsetag(SWISH *sw, char *parsetag, char *buffer, int max_lines, int case_sensitive); /* forward declaration: parseHTMLtitle() and parseHtmlSummary() call it before its definition further down */
55
56 static struct metaEntry *getHTMLMeta(IndexFILE * indexf, char *tag, SWISH *sw, char *name,
57 char **parsed_tag, char *filename)
58 {
59 char *temp;
60 int lenword = 0;
61 char *word = NULL;
62 char buffer[MAXSTRLEN + 1];
63 int i;
64 struct metaEntry *e = NULL;
65
66
67 word = buffer;
68 lenword = sizeof(buffer) - 1;
69
70 if (!name)
71 {
72 if (!(temp = (char *) lstrstr((char *) tag, (char *) "NAME")))
73 return NULL;
74 }
75 else
76 temp = name;
77 temp += 4; /* strlen("NAME") */
78
79 /* Get to the '=' sign disreguarding any other char */
80 while (*temp)
81 {
82 if (*temp && (*temp != '=')) /* TAB */
83 temp++;
84 else
85 {
86 temp++;
87 break;
88 }
89 }
90
91 /* Get to the beginning of the word disreguarding blanks and quotes */
92 /* TAB */
93 while (*temp)
94 {
95 if (*temp == ' ' || *temp == '"')
96 temp++;
97 else
98 break;
99 }
100
101 /* Copy the word and convert to lowercase */
102 /* TAB */
103 /* while (temp !=NULL && strncmp(temp," ",1) */
104 /* && strncmp(temp,"\"",1) && i<= MAXWORDLEN ) { */
105
106 /* and the above <= was wrong, should be < which caused the
107 null insertion below to be off by two bytes */
108
109
110 for (i = 0; temp != NULL && *temp && *temp != ' ' && *temp != '"';)
111 {
112 if (i == lenword)
113 {
114 lenword *= 2;
115 if(word == buffer)
116 {
117 word = (char *) emalloc(lenword + 1);
118 memcpy(word,buffer,sizeof(buffer));
119 }
120 else
121 word = (char *) erealloc(word, lenword + 1);
122 }
123 word[i] = *temp++;
124 i++;
125 }
126 if (i == lenword)
127 {
128 lenword *= 2;
129 word = (char *) erealloc(word, lenword + 1);
130 }
131 word[i] = '\0';
132
133 /* Use Rainer's routine */
134 strtolower(word);
135
136 *parsed_tag = word;
137
138 if ((e = getMetaNameByName(&indexf->header, word)))
139 return e;
140
141 if ( (sw->UndefinedMetaTags == UNDEF_META_AUTO) && word && *word)
142 {
143 if (sw->verbose)
144 printf("Adding automatic MetaName '%s' found in file '%s'\n", word, filename);
145
146 return addMetaEntry(&indexf->header, word, META_INDEX, 0);
147 }
148
149 /* If it is ok not to have the name listed, just index as no-name */
150 if (sw->UndefinedMetaTags == UNDEF_META_ERROR)
151 progerr("UndefinedMetaNames=error. Found meta name '%s' in file '%s', not listed as a MetaNames in config", word, filename);
152
153 if(word != buffer)
154 efree(word);
155
156 return NULL;
157
158 }
159
160
161 /* Parses the Meta tag */
162 static int parseMetaData(SWISH * sw, IndexFILE * indexf, char *tag, int filenum, int structure, char *name, char *content, FileRec *thisFileEntry,
163 int *position, char *filename)
164 {
165 int metaName;
166 struct metaEntry *metaNameEntry;
167 char *temp,
168 *start,
169 *convtag;
170 int wordcount = 0; /* Word count */
171 char *parsed_tag;
172
173
174 /* Lookup (or add if "auto") meta name for tag */
175
176 metaNameEntry = getHTMLMeta(indexf, tag, sw, name, &parsed_tag, filename);
177 metaName = metaNameEntry ? metaNameEntry->metaID : 1;
178
179
180 temp = content + 7; /* 7 is strlen("CONTENT") */
181
182 /* Get to the " sign disreguarding other characters */
183 if ((temp = strchr(temp, '\"')))
184 {
185 structure |= IN_META;
186
187 start = temp + 1;
188
189 /* Jump escaped \" */
190 temp = strchr(start, '\"');
191 while (temp)
192 {
193 if (*(temp - 1) == '\\')
194 temp = strchr(temp + 1, '\"');
195 else
196 break;
197 }
198
199 if (temp)
200 *temp = '\0'; /* terminate CONTENT, temporarily */
201
202
203 /* Convert entities, if requested, and remove newlines */
204 convtag = (char *)sw_ConvHTMLEntities2ISO(sw, (unsigned char *)start);
205 remove_newlines(convtag); /** why isn't this just done for the entire doc? */
206
207
208
209 /* Index only if a metaEntry was found, or if not not ReqMetaName */
210 if ( sw->UndefinedMetaTags != UNDEF_META_IGNORE || metaNameEntry)
211 {
212 /* Meta tags get bummped */
213 /* I'm not clear this works as well as I'd like because it always bumps on a new Meta tag,
214 * but in order to disable this behavior the name MUST be a meta name.
215 * Probably better to let getHTMLMeta() return the name as a string.
216 */
217
218
219 if (!metaNameEntry || !isDontBumpMetaName(sw->dontbumpstarttagslist, metaNameEntry->metaName))
220 position[0]++;
221
222 wordcount = indexstring(sw, convtag, filenum, structure, 1, &metaName, position);
223
224 if (!metaNameEntry || !isDontBumpMetaName(sw->dontbumpendtagslist, metaNameEntry->metaName))
225 position[0]++;
226
227 }
228
229
230 /* If it is a property store it */
231
232 if ((metaNameEntry = getPropNameByName(&indexf->header, parsed_tag)))
233 if (!addDocProperty(&thisFileEntry->docProperties, metaNameEntry, (unsigned char*)convtag, strlen(convtag), 0))
234 progwarn("property '%s' not added for document '%s'\n", metaNameEntry->metaName, filename);
235
236 if (temp)
237 *temp = '\"'; /* restore string */
238 }
239
240 return wordcount;
241 }
242
243
244 /* Extracts anything in <title> tags from an HTML file and returns it.
245 ** Otherwise, only the file name without its path is returned.
246 */
247
248 char *parseHTMLtitle(SWISH *sw, char *buffer)
249 {
250 char *title;
251 char *empty_title;
252
253 empty_title = (char *)Mem_ZoneAlloc(sw->Index->perDocTmpZone,1);
254 *empty_title = '\0';
255
256 if (!buffer)
257 return empty_title;
258
259 if ((title = parsetag(sw, "title", buffer, TITLETOPLINES, CASE_SENSITIVE_OFF)))
260 return title;
261
262 return empty_title;
263 }
264
265
266 /* Check if a particular title (read: file!) should be ignored
267 ** according to the settings in the configuration file.
268 */
269 /* This is to check "title contains" option in config file */
270
271 int isoktitle(SWISH * sw, char *title)
272 {
273 struct MOD_FS *fs = sw->FS;
274
275 return !match_regex_list(title, fs->filerules.title);
276 }
277
278
279
280
281 /* This returns the value corresponding to the HTML structures
282 ** a word is in.
283 */
284
285 static int getstructure(char *tag, int structure)
286 {
287
288 /* int len; *//* not used - 2/22/00 */
289 char oldChar = 0;
290 char *endOfTag = NULL;
291 char *pos;
292
293 pos = tag;
294 while (*pos)
295 {
296 if (isspace((int) ((unsigned char) *pos)))
297 {
298 endOfTag = pos; /* remember where we are... */
299 oldChar = *pos; /* ...and what we saw */
300 *pos = '\0'; /* truncate string, for now */
301 }
302 else
303 pos++;
304 }
305 /* Store Word Context
306 ** Modified DLN 1999-10-24 - Comments and Cleaning
307 ** TODO: Make sure that these allow for HTML attributes
308 * */
309
310 /* HEAD */
311 if (strcasecmp(tag, "/head") == 0)
312 structure &= ~IN_HEAD; /* Out */
313 else if (strcasecmp(tag, "head") == 0)
314 structure |= IN_HEAD; /* In */
315 /* TITLE */
316 else if (strcasecmp(tag, "/title") == 0)
317 structure &= ~IN_TITLE;
318 else if (strcasecmp(tag, "title") == 0)
319 structure |= IN_TITLE;
320 /* BODY */
321 else if (strcasecmp(tag, "/body") == 0)
322 structure &= ~IN_BODY; /* In */
323 else if (strcasecmp(tag, "body") == 0)
324 structure |= IN_BODY; /* Out */
325 /* H1, H2, H3, H4, H5, H6 */
326 else if (tag[0] == '/' && tolower((int)((unsigned char)tag[1])) == 'h' && isdigit((int)((unsigned char)tag[2]))) /* cast to int - 2/22/00 */
327 structure &= ~IN_HEADER; /* In */
328 else if (tolower((int)((unsigned char)tag[0])) == 'h' && isdigit((int)(unsigned char)tag[1])) /* cast to int - 2/22/00 */
329 structure |= IN_HEADER; /* Out */
330 /* EM, STRONG */
331 else if ((strcasecmp(tag, "/em") == 0) || (strcasecmp(tag, "/strong") == 0))
332 structure &= ~IN_EMPHASIZED; /* Out */
333 else if ((strcasecmp(tag, "em") == 0) || (strcasecmp(tag, "strong") == 0))
334 structure |= IN_EMPHASIZED; /* In */
335 /* B, I are seperate for semantics */
336 else if ((strcasecmp(tag, "/b") == 0) || (strcasecmp(tag, "/i") == 0))
337 structure &= ~IN_EMPHASIZED; /* Out */
338 else if ((strcasecmp(tag, "b") == 0) || (strcasecmp(tag, "i") == 0))
339 structure |= IN_EMPHASIZED; /* In */
340 /* The End */
341
342 if (endOfTag != NULL)
343 {
344 *endOfTag = oldChar;
345 }
346 return structure;
347 }
348
349
350
351 /* Get the MetaData index when the whole tag is passed */
352
353 /* Patch by Tom Brown */
354 /* TAB, this routine is/was somewhat pathetic... but it was pathetic in
355 1.2.4 too ... someone needed a course in defensive programming... there are
356 lots of tests below for temp != NULL, but what is desired is *temp != '\0'
357 (e.g. simply *temp) ... I'm going to remove some strncmp(temp,constant,1)
358 which are much faster as *temp != constant ...
359
360 Anyhow, the test case I've got that's core dumping is:
361 <META content=3D"MSHTML 5.00.2614.3401" name=3DGENERATOR>
362 no trailing quote, no trailing space... and with the missing/broken check for end of string it scribbles over the stack...
363
364 */
365
366
367
368 static char *parseHtmlSummary(char *buffer, char *field, int size, SWISH * sw)
369 {
370 char *p,
371 *q,
372 *tag,
373 *endtag,
374 c = '\0';
375 char *summary,
376 *beginsum,
377 *endsum,
378 *tmp,
379 *tmp2,
380 *tmp3;
381 int found,
382 lensummary;
383
384 /* Get the summary if no metaname/field is given */
385 if (!field && size)
386 {
387 /* Jump title if it exists */
388 if ((p = lstrstr(buffer, "</title>")))
389 {
390 p += 8;
391 }
392 else
393 p = buffer;
394 /* Let us try to find <body> */
395 if ((q = lstrstr(p, "<body")))
396 {
397 q = strchr(q, '>');
398 }
399 else
400 q = p;
401 summary = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone,strlen(p)+1);
402 strcpy(summary,p);
403 remove_newlines(summary);
404
405 //$$$$ Todo: remove tag and content of scripts, css, java, embeddedobjects, comments, etc
406
407 remove_tags(summary);
408
409 summary = (char *)sw_ConvHTMLEntities2ISO(sw, (unsigned char *)summary);
410
411
412 /* use only the required memory -save those not used */
413 /* 2001-03-13 rasc copy only <size> bytes of string */
414 if((int) strlen(summary) > size)
415 summary[size]='\0';
416 return summary;
417 }
418
419 for (p = buffer, summary = NULL, found = 0, beginsum = NULL, endsum = NULL; p && *p;)
420 {
421 if ((tag = strchr(p, '<')) && ((tag == p) || (*(tag - 1) != '\\')))
422 { /* Look for non escaped '<' */
423 tag++;
424 for (endtag = tag;;)
425 if ((endtag = strchr(endtag, '>')))
426 {
427 if (*(endtag - 1) != '\\')
428 break;
429 else
430 endtag++;
431 }
432 else
433 break;
434 if (endtag)
435 {
436 c = *endtag;
437 *endtag++ = '\0';
438 if ((tag[0] == '!') && lstrstr(tag, "META") && (lstrstr(tag, "START") || lstrstr(tag, "END")))
439 { /* Check for META TAG TYPE 1 */
440 if (lstrstr(tag, "START"))
441 {
442 if ((tmp = lstrstr(tag, "NAME")))
443 {
444 tmp += 4;
445 if (lstrstr(tmp, field))
446 {
447 beginsum = endtag;
448 found = 1;
449 }
450 p = endtag;
451 }
452 else
453 p = endtag;
454 }
455 else if (lstrstr(tag, "END"))
456 {
457 if (!found)
458 {
459 p = endtag;
460 }
461 else
462 {
463 endsum = tag - 1;
464 *(endtag - 1) = c;
465 break;
466 }
467 }
468 } /* Check for META TAG TYPE 2 */
469 else if ((tag[0] != '!') && lstrstr(tag, "META") && (tmp = lstrstr(tag, "NAME")) && (tmp2 = lstrstr(tag, "CONTENT")))
470 {
471 tmp += 4;
472 tmp3 = lstrstr(tmp, field);
473 if (tmp3 && tmp3 < tmp2)
474 {
475 tmp2 += 7;
476 if ((tmp = strchr(tmp2, '=')))
477 {
478 for (++tmp; isspace((int) ((unsigned char) *tmp)); tmp++);
479 if (*tmp == '\"')
480 {
481 beginsum = tmp + 1;
482 for (tmp = endtag - 1; tmp > beginsum; tmp--)
483 if (*tmp == '\"')
484 break;
485 if (tmp == beginsum)
486 endsum = endtag - 1;
487 else
488 endsum = tmp;
489 }
490 else
491 {
492 beginsum = tmp;
493 endsum = endtag - 1;
494 }
495 found = 1;
496 *(endtag - 1) = c;
497 break;
498
499 }
500 }
501 p = endtag;
502 } /* Default: Continue */
503 else
504 {
505 p = endtag;
506 }
507 }
508 else
509 p = NULL; /* tag not closed ->END */
510 if (endtag)
511 *(endtag - 1) = c;
512 }
513 else
514 { /* No more '<' */
515 p = NULL;
516 }
517 }
518 if (found && beginsum && endsum && endsum > beginsum)
519 {
520 lensummary = endsum - beginsum;
521 summary = (char *)Mem_ZoneAlloc(sw->Index->perDocTmpZone, lensummary + 1);
522 memcpy(summary, beginsum, lensummary);
523 summary[lensummary] = '\0';
524 }
525 /* If field is set an no metaname is found, let us search */
526 /* for something like <field>bla bla </field> */
527 if (!summary && field)
528 {
529 summary = parsetag(sw, field, buffer, 0, CASE_SENSITIVE_OFF);
530 }
531 /* Finally check for something after title (if exists) and */
532 /* after <body> (if exists) */
533
534 if (!summary)
535 {
536 /* Jump title if it exists */
537 if ((p = lstrstr(buffer, "</title>")))
538 {
539 p += 8;
540 }
541 else
542 p = buffer;
543
544 /* Let us try to find <body> */
545 if ((q = lstrstr(p, "<body")))
546 {
547 q = strchr(q, '>');
548 }
549 else
550 q = p;
551
552 summary = (char *)Mem_ZoneAlloc(sw->Index->perDocTmpZone,strlen(q) + 1);
553 strcpy(summary,q);
554 }
555
556 if (summary)
557 {
558 remove_newlines(summary);
559 remove_tags(summary);
560 summary = (char *)sw_ConvHTMLEntities2ISO(sw, (unsigned char *)summary);
561 }
562
563 if (summary && size && ((int) strlen(summary)) > size)
564 summary[size] = '\0';
565 return summary;
566 }
567
568
569 #define NO_TAG 0
570 #define TAG_CLOSE 1
571 #define TAG_FOUND 2
572
573 /* Gets the content between "<parsetag>" and "</parsetag>" from buffer
574 limiting the scan to the first max_lines lines (0 means all lines) */
575 static char *parsetag(SWISH *sw, char *parsetag, char *buffer, int max_lines, int case_sensitive)
576 {
577 register int c,
578 d;
579 register char *p,
580 *r;
581 char *tag;
582 int lencontent;
583 char *content;
584 int i,
585 j,
586 lines,
587 status,
588 tagbuflen,
589 totaltaglen,
590 curlencontent;
591 char *begintag;
592 char *endtag;
593 char *newbuf;
594 char *(*f_strstr) ();
595
596
597
598 if (case_sensitive)
599 f_strstr = strstr;
600 else
601 f_strstr = lstrstr;
602
603 lencontent = strlen(parsetag);
604 begintag = (char *)Mem_ZoneAlloc(sw->Index->perDocTmpZone, lencontent + 3);
605 endtag = (char *)Mem_ZoneAlloc(sw->Index->perDocTmpZone, lencontent + 4);
606 sprintf(begintag, "<%s>", parsetag);
607 sprintf(endtag, "</%s>", parsetag);
608
609 tag = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone, 1);
610 tag[0] = '\0';
611
612 content = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone, (lencontent = MAXSTRLEN) + 1);
613 lines = 0;
614 status = NO_TAG;
615 p = content;
616 *p = '\0';
617
618
619 for (r = buffer;;)
620 {
621 c = *r++;
622 if (c == '\n')
623 {
624 lines++;
625 if (max_lines && lines == max_lines)
626 break;
627 }
628 if (!c)
629 return NULL;
630
631 switch (c)
632 {
633 case '<':
634 tag = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone, (tagbuflen = MAXSTRLEN) + 1);
635 totaltaglen = 0;
636 tag[totaltaglen++] = '<';
637
638 /* Collect until find '>' */
639 while (1)
640 {
641 d = *r++;
642 if (!d)
643 return NULL;
644 if (totaltaglen == tagbuflen)
645 {
646 newbuf = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone, tagbuflen + 200 + 1);
647 memcpy(newbuf,tag,tagbuflen + 1);
648 tag = newbuf;
649 tagbuflen += 200;
650 }
651 tag[totaltaglen++] = d;
652 if (d == '>')
653 {
654 tag[totaltaglen] = '\0';
655 break;
656 }
657 }
658
659
660 if (f_strstr(tag, endtag))
661 {
662 status = TAG_CLOSE;
663 *p = '\0';
664
665 /* nulls to spaces */
666 for (i = 0; content[i]; i++)
667 if (content[i] == '\n')
668 content[i] = ' ';
669
670 /* skip over initial spaces and quotes */
671 for (i = 0; isspace((int) ((unsigned char) content[i])) || content[i] == '\"'; i++)
672 ;
673
674 /* shift buffer to left */
675 for (j = 0; content[i]; j++)
676 content[j] = content[i++];
677
678 content[j] = '\0';
679
680
681 /* remove trailing spaces, nulls, quotes */
682 for (j = strlen(content) - 1; ( j >= 0 ) && ( isspace((int) ((unsigned char) content[j])) || content[j] == '\0' || content[j] == '\"'); j--)
683 content[j] = '\0';
684
685 /* replace double quotes with single quotes -- why? */
686 for (j = 0; content[j]; j++)
687 if (content[j] == '\"')
688 content[j] = '\'';
689
690 if (*content)
691 return (content);
692 else
693 return NULL;
694 }
695 else if (f_strstr(tag, begintag))
696 {
697 status = TAG_FOUND;
698 }
699 break;
700 default:
701 if (status == TAG_FOUND)
702 {
703 curlencontent = p - content;
704 if (curlencontent == lencontent)
705 {
706 newbuf = Mem_ZoneAlloc(sw->Index->perDocTmpZone,(lencontent * 2) + 1);
707 memcpy(newbuf,content,lencontent + 1);
708 lencontent *= 2;
709 content = newbuf;
710 p = content + curlencontent;
711 }
712 *p = c;
713 p++;
714 }
715 }
716 }
717 return NULL;
718 }
719
720
721
722
723
724
725
726 /* Parses the words in a comment.
727 */
728
729 int parsecomment(SWISH * sw, char *tag, int filenum, int structure, int metaID, int *position)
730 {
731 structure |= IN_COMMENTS;
732 return indexstring(sw, tag + 1, filenum, structure, 1, &metaID, position);
733 }
734
735 /* Indexes all the words in a html file and adds the appropriate information
736 ** to the appropriate structures.
737 */
738
739 /* Indexes all the words in a html file and adds the appropriate information
740 ** to the appropriate structures.
741 */
742
743 int countwords_HTML(SWISH *sw, FileProp *fprop, FileRec *fi, char *buffer)
744 {
745 int ftotalwords;
746 int *metaID;
747 int metaIDlen;
748 int position; /* Position of word in file */
749 int currentmetanames;
750 char *p,
751 *newp,
752 *tag,
753 *endtag;
754 int structure;
755 FileRec *thisFileEntry = fi;
756 struct metaEntry *metaNameEntry;
757 IndexFILE *indexf = sw->indexlist;
758 struct MOD_Index *idx = sw->Index;
759 char *Content = NULL,
760 *Name = NULL,
761 *summary = NULL;
762 char *title = (char *)sw_ConvHTMLEntities2ISO(sw, (unsigned char *)parseHTMLtitle(sw, buffer));
763
764 if (!isoktitle(sw, title))
765 return -2;
766
767
768 if (fprop->stordesc)
769 summary = parseHtmlSummary(buffer, fprop->stordesc->field, fprop->stordesc->size, sw);
770
771 addCommonProperties( sw, fprop, fi, title, summary, 0 );
772
773 /* Init meta info */
774 metaID = (int *) Mem_ZoneAlloc(sw->Index->perDocTmpZone,(metaIDlen = 16) * sizeof(int));
775
776 currentmetanames = 0;
777 ftotalwords = 0;
778 structure = IN_FILE;
779 metaID[0] = 1;
780 position = 1;
781
782
783 for (p = buffer; p && *p;)
784 {
785
786 /* Look for non escaped '<' */
787 if ((tag = strchr(p, '<')) && ((tag == p) || (*(tag - 1) != '\\')))
788 {
789 /* Index up to the tag */
790 *tag++ = '\0';
791
792 newp = (char *)sw_ConvHTMLEntities2ISO(sw, (unsigned char *)p);
793
794 if ( ! currentmetanames )
795 currentmetanames++;
796 ftotalwords += indexstring(sw, newp, idx->filenum, structure, currentmetanames, metaID, &position);
797
798 /* Now let us look for a not escaped '>' */
799 for (endtag = tag;;)
800 if ((endtag = strchr(endtag, '>')))
801 {
802 if (*(endtag - 1) != '\\')
803 break;
804 else
805 endtag++;
806 }
807 else
808 break;
809
810
811 if (endtag)
812 {
813 *endtag++ = '\0';
814
815 if ((tag[0] == '!') && lstrstr(tag, "META") && (lstrstr(tag, "START") || lstrstr(tag, "END")))
816 {
817 /* Check for META TAG TYPE 1 */
818 structure |= IN_META;
819 if (lstrstr(tag, "START"))
820 {
821 char *parsed_tag;
822
823 if (
824 (metaNameEntry =
825 getHTMLMeta(indexf, tag, sw, NULL, &parsed_tag, fprop->real_path)))
826 {
827 /* realloc memory if needed */
828 if (currentmetanames == metaIDlen)
829 {
830 int *newbuf = (int *)Mem_ZoneAlloc(sw->Index->perDocTmpZone, metaIDlen * 2 * sizeof(int));
831 memcpy((char *)newbuf,(char *)metaID,metaIDlen * sizeof(int));
832 metaID = newbuf;
833 metaIDlen *= 2;
834 }
835
836 /* add metaname to array of current metanames */
837 metaID[currentmetanames] = metaNameEntry->metaID;
838
839 /* Bump position for all metanames unless metaname in dontbumppositionOnmetatags */
840 if (!isDontBumpMetaName(sw->dontbumpstarttagslist, metaNameEntry->metaName))
841 position++;
842
843 currentmetanames++;
844
845
846 p = endtag;
847
848 /* If it is also a property store it until a < is found */
849 if ((metaNameEntry = getPropNameByName(&indexf->header, parsed_tag)))
850 {
851 if ((endtag = strchr(p, '<')))
852 *endtag = '\0';
853
854
855 p = (char *)sw_ConvHTMLEntities2ISO(sw, (unsigned char *)p);
856
857 remove_newlines(p); /** why isn't this just done for the entire doc? */
858
859 if (!addDocProperty(&thisFileEntry->docProperties, metaNameEntry, (unsigned char *)p, strlen(p), 0))
860 progwarn("property '%s' not added for document '%s'\n", metaNameEntry->metaName, fprop->real_path);
861
862
863 if (endtag)
864 *endtag = '<';
865
866 continue;
867 }
868 }
869
870 }
871
872 else if (lstrstr(tag, "END"))
873 {
874 /* this will close the last metaname */
875 if (currentmetanames)
876 {
877 currentmetanames--;
878 if (!currentmetanames)
879 metaID[0] = 1;
880 }
881 }
882
883 p = endtag;
884 }
885
886 /* Check for META TAG TYPE 2 */
887 else if ((tag[0] != '!') && lstrstr(tag, "META") && (Name = lstrstr(tag, "NAME")) && (Content = lstrstr(tag, "CONTENT")))
888 {
889 ftotalwords += parseMetaData(sw, indexf, tag, idx->filenum, structure, Name, Content, thisFileEntry, &position, fprop->real_path);
890 p = endtag;
891 } /* Check for COMMENT */
892
893 else if ((tag[0] == '!') && sw->indexComments)
894 {
895 ftotalwords += parsecomment(sw, tag, idx->filenum, structure, 1, &position);
896 p = endtag;
897 } /* Default: Continue */
898
899 else
900 {
901 structure = getstructure(tag, structure);
902 p = endtag;
903 }
904 }
905 else
906 p = tag; /* tag not closed: continue */
907 }
908
909 else
910 { /* No more '<' */
911
912 newp = (char *)sw_ConvHTMLEntities2ISO(sw, (unsigned char *)p);
913
914 if ( ! currentmetanames )
915 currentmetanames++;
916 ftotalwords += indexstring(sw, newp, idx->filenum, structure, currentmetanames, metaID, &position);
917
918 p = NULL;
919 }
920 }
921 return ftotalwords;
922 }

  ViewVC Help
Powered by ViewVC 1.1.22