/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/src/fs.c
ViewVC logotype

Contents of /mitgcm.org/devel/buildweb/pkg/swish-e/src/fs.c

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1.1.1 - (show annotations) (download) (vendor branch)
Fri Sep 20 19:47:29 2002 UTC (22 years, 10 months ago) by adcroft
Branch: Import, MAIN
CVS Tags: baseline, HEAD
Changes since 1.1: +0 -0 lines
File MIME type: text/plain
Importing web-site building process.

1 /*
2 ** This program and library is free software; you can redistribute it and/or
3 ** modify it under the terms of the GNU (Library) General Public License
4 ** as published by the Free Software Foundation; either version 2
5 ** of the License, or any later version.
6 **
7 ** This program is distributed in the hope that it will be useful,
8 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
9 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 ** GNU (Library) General Public License for more details.
11 **
12 ** You should have received a copy of the GNU (Library) General Public License
13 ** along with this program; if not, write to the Free Software
14 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 **--------------------------------------------------------------------
16 **
17 ** change sprintf to snprintf to avoid corruption,
18 ** added safestrcpy() macro to avoid corruption from strcpy overflow,
19 ** and use MAXKEYLEN as string length vs. literal "34"
20 ** SRE 11/17/99
21 **
22 ** 2000-11 jruiz,rasc some redesign
23 ** 2001-04-07 rasc fixed FileRule pathname
24 **
25 */
26
27 #include "swish.h"
28 #include "mem.h"
29 #include "string.h"
30 #include "index.h"
31 #include "hash.h"
32 #include "file.h"
33 #include "list.h"
34 #include "fs.h"
35 #include "check.h"
36 #include "error.h"
37 #include "xml.h"
38 #include "txt.h"
39 #include "parse_conffile.h"
40 #include "swish_qsort.h"
41
/*
** Growable array of path names.  adddocentry() appends to it;
** printfiles() and printdirs() consume it and free it.
*/
typedef struct
{
    int     currentsize;    /* number of slots of filenames[] in use */
    int     maxsize;        /* number of slots allocated for filenames[] */
    char  **filenames;      /* the stored path strings */
} DOCENTRYARRAY;
49
50
51
/*
** Size of the "device/inode" hash-key buffer used by fs_already_indexed():
** two 64-bit values printed in hex (16 characters each), the '/' between
** them, and the terminating NUL.
*/
#define MAXKEYLEN 34
53
54
55 static int get_rules( char *name, StringList *sl, PATH_LIST *pathlist );
56 static int check_FileTests( char *path, PATH_LIST *test );
57 static void indexadir(SWISH *, char *);
58 static void indexafile(SWISH *, char *);
59 static void printfile(SWISH *, char *);
60 static void printfiles(SWISH *, DOCENTRYARRAY *);
61 static void printdirs(SWISH *, DOCENTRYARRAY *);
62 static DOCENTRYARRAY *adddocentry(DOCENTRYARRAY * e, char *filename);
63 static void split_path(char *path, char **directory, char **file);
64
65 /*
66 -- init structures for this module
67 */
68
69 void initModule_FS(SWISH * sw)
70 {
71 struct MOD_FS *fs;
72
73 fs = (struct MOD_FS *) emalloc(sizeof(struct MOD_FS));
74
75 memset(fs, 0, sizeof(struct MOD_FS));
76 sw->FS = fs;
77
78 }
79
80
81 /*
82 -- release all wired memory for this module
83 */
84
85 void freeModule_FS(SWISH * sw)
86 {
87 struct MOD_FS *fs = sw->FS;
88
89 /* Free fs parameters */
90
91 free_regex_list( &fs->filerules.pathname );
92 free_regex_list( &fs->filerules.dirname );
93 free_regex_list( &fs->filerules.filename );
94 free_regex_list( &fs->filerules.dircontains );
95 free_regex_list( &fs->filerules.title );
96
97 free_regex_list( &fs->filematch.pathname );
98 free_regex_list( &fs->filematch.dirname );
99 free_regex_list( &fs->filematch.filename );
100 free_regex_list( &fs->filematch.dircontains );
101 free_regex_list( &fs->filematch.title );
102
103
104 /* free module data */
105 efree(fs);
106 sw->FS = NULL;
107
108 return;
109 }
110
111
112 /*
113 -- Config Directives
114 -- Configuration directives for this Module
115 -- return: 0/1 = none/config applied
116 Aug 1, 2001 -- these probably should be pre-compiled regular expressions,
117 and their memory should be freed on exit. moseley
118 */
119
120
121
122 int configModule_FS(SWISH * sw, StringList * sl)
123 {
124 struct MOD_FS *fs = sw->FS;
125 char *w0 = sl->word[0];
126
127 if (strcasecmp(w0, "FileRules") == 0)
128 return get_rules( w0, sl, &fs->filerules );
129
130 if (strcasecmp(w0, "FileMatch") == 0)
131 return get_rules( w0, sl, &fs->filematch );
132
133
134 if (strcasecmp(w0, "FollowSymLinks") == 0)
135 {
136 fs->followsymlinks = getYesNoOrAbort(sl, 1, 1);
137 return 1;
138 }
139
140 return 0; /* not one of our parameters */
141
142 }
143
144 static int get_rules( char *name, StringList *sl, PATH_LIST *pathlist )
145 {
146 char *w1;
147 char *both;
148 int regex_pattern = 0;
149
150
151 if (sl->n < 4)
152 {
153 printf("err: Wrong number of parameters in %s\n", name);
154 return 0;
155 }
156
157
158 /* For "is" make sure it matches the entire pattern */
159 /* A bit ugly */
160
161 if ( strcasecmp(sl->word[2], "is") == 0 )
162 {
163 int i;
164 /* make patterns match the full string */
165 for ( i = 3; i < sl->n; i++ )
166 {
167 int len = strlen( sl->word[i] );
168 char *new;
169 char *old = sl->word[i];
170
171 if ( (strcasecmp( old, "not" ) == 0) && i < sl->n-1 )
172 continue;
173
174 new = emalloc( len + 3 );
175
176 if ( sl->word[i][0] != '^' )
177 {
178 strcpy( new, "^" );
179 strcat( new, sl->word[i] );
180 }
181 else
182 strcpy( new, sl->word[i] );
183
184 if ( sl->word[i][len-1] != '$' )
185 strcat( new, "$" );
186
187 sl->word[i] = new;
188 efree( old );
189 }
190
191 }
192
193 else if ( strcasecmp(sl->word[2], "regex") == 0 )
194 regex_pattern++;
195
196
197 else if ( !(strcasecmp(sl->word[2], "contains") == 0) )
198 {
199 printf("err: %s must be followed by [is|contains|regex]\n", name);
200 return 0;
201 }
202
203 w1 = sl->word[1];
204
205 both = emalloc( strlen( name ) + strlen( w1 ) + 2 );
206 strcpy( both, name );
207 strcat( both, " ");
208 strcat( both, w1 );
209
210
211
212 if ( strcasecmp(w1, "pathname") == 0 )
213 add_regex_patterns( both, &pathlist->pathname, &(sl->word)[3], regex_pattern );
214
215 else if ( strcasecmp(w1, "filename") == 0 )
216 add_regex_patterns( both, &pathlist->filename, &(sl->word)[3], regex_pattern );
217
218 else if ( strcasecmp(w1, "dirname") == 0 )
219 add_regex_patterns( both, &pathlist->dirname, &(sl->word)[3], regex_pattern );
220
221 else if ( strcasecmp(w1, "title") == 0 )
222 add_regex_patterns( both, &pathlist->title, &(sl->word)[3], regex_pattern );
223
224 else if ( strcasecmp(w1, "directory") == 0 )
225 add_regex_patterns( both, &pathlist->dircontains, &(sl->word)[3], regex_pattern );
226
227 else
228 {
229 printf("err: '%s' - invalid parameter '%s'\n", both, w1 );
230 return 0;
231 }
232
233
234 efree( both );
235 return 1;
236 }
237
238
239
240
241
242
243
244
245 /* Have we already indexed a file or directory?
246 ** This function is used to avoid multiple index entries
247 ** or endless looping due to symbolic links.
248 */
249
250 static int fs_already_indexed(SWISH * sw, char *path)
251 {
252 #ifndef NO_SYMBOLIC_FILE_LINKS
253 struct dev_ino *p;
254 struct stat buf;
255 char key[MAXKEYLEN]; /* Hash key -- allow for 64 bit inodes */
256 unsigned hashval;
257
258 if (stat(path, &buf))
259 return 0;
260
261 /* Create hash key: string contains device and inode. */
262 /* Avoid snprintf -> MAXKEYLEN is big enough for two longs
263 snprintf( key, MAXKEYLEN, "%lx/%lx", (unsigned long)buf.st_dev,
264 (unsigned long)buf.st_ino );
265 */
266 sprintf(key, "%lx/%lx", (unsigned long) buf.st_dev, (unsigned long) buf.st_ino);
267
268 hashval = bighash(key); /* Search hash for this file. */
269 for (p = sw->Index->inode_hash[hashval]; p != NULL; p = p->next)
270 if (p->dev == buf.st_dev && p->ino == buf.st_ino)
271 { /* We found it. */
272 if (sw->verbose >= 3)
273 printf("Skipping %s: %s\n", path, "Already indexed.");
274 return 1;
275 }
276
277 /* Not found, make new entry. */
278 p = (struct dev_ino *) Mem_ZoneAlloc(sw->Index->entryZone,sizeof(struct dev_ino));
279
280 p->dev = buf.st_dev;
281 p->ino = buf.st_ino;
282 p->next = sw->Index->inode_hash[hashval];
283 sw->Index->inode_hash[hashval] = p; /* Aug 1, 2001 -- this is not freed */
284 #endif
285
286 return 0;
287 }
288
289
290 /* Recursively goes into a directory and calls the word-indexing
291 ** functions for each file that's found.
292 */
293
294 static void indexadir(SWISH * sw, char *dir)
295 {
296 int allgoodfiles = 0;
297 DIR *dfd;
298
299 #ifdef NEXTSTEP
300 struct direct *dp;
301 #else
302 struct dirent *dp;
303 #endif
304 int pathbuflen;
305 char *pathname;
306 DOCENTRYARRAY *sortfilelist = NULL;
307 DOCENTRYARRAY *sortdirlist = NULL;
308 int dirlen = strlen( dir );
309 struct MOD_FS *fs = sw->FS;
310
311 if (fs_already_indexed(sw, dir))
312 return;
313
314 /* and another stat if not set to follow symlinks */
315 if (!fs->followsymlinks && islink(dir))
316 return;
317
318
319 /* logic is not well defined - here we only check dirname */
320 /* but that bypasses pathname checks -- but that's checked per-file */
321 /* This allows one to override File* directory checks */
322 /* but allows a pathname check to be limited to full paths */
323 /* This also means you can avoid indexing an entire directory tree with FileRules dirname, */
324 /* but using a FileRules pathname allows recursion into the directory */
325
326 /* Reject entire directory due to FileRules dirname */
327 if ( *dir && match_regex_list( dir, fs->filerules.dirname ) )
328 return;
329
330
331
332
333 /* Handle "FileRules directory" directive */
334 /* - Check all files within the directory before proceeding -- means reading the directory twice */
335 /* - All files are checked. */
336
337 if (fs->filematch.dircontains || fs->filerules.dircontains )
338 {
339 if ((dfd = opendir(dir)) == NULL)
340 {
341 if ( sw->verbose )
342 progwarnno("Failed to open dir '%s' :", dir );
343 return;
344 }
345
346 while ((dp = readdir(dfd)) != NULL)
347 {
348 if ( match_regex_list( dp->d_name, fs->filerules.dircontains ) )
349 {
350 closedir( dfd );
351 return; /* doesn't recurse */
352 }
353
354 if ( match_regex_list( dp->d_name, fs->filematch.dircontains ) )
355 {
356 allgoodfiles++;
357 break;
358 }
359
360 }
361 closedir(dfd);
362 }
363
364
365 /* Now, build list of files and directories */
366
367 pathbuflen = MAXFILELEN;
368 pathname = (char *) emalloc( pathbuflen + 1 );
369
370 if ((dfd = opendir(dir)) == NULL)
371 {
372 if ( sw->verbose )
373 progwarnno("Failed to open dir '%s' :", dir );
374 return;
375 }
376
377
378 if ( dirlen == 1 && *dir == '/' ) /* case of root dir */
379 dirlen = 0;
380
381 while ((dp = readdir(dfd)) != NULL)
382 {
383 int filelen = strlen( dp->d_name );
384
385 /* For security reasons, don't index dot files */
386 /* Check for hidden under Windows? */
387
388 if ((dp->d_name)[0] == '.')
389 continue;
390
391
392 /* Build full path to file */
393
394 /* reallocate filename buffer, if needed (dir + path + '/' ) */
395 if ( dirlen + filelen + 1 > pathbuflen )
396 {
397 pathbuflen = dirlen + filelen + 200;
398 pathname = (char *) erealloc(pathname, pathbuflen + 1);
399 }
400
401 if ( dirlen )
402 memcpy(pathname, dir, dirlen);
403
404 pathname[dirlen] = '/'; // Add path separator
405 memcpy(pathname + dirlen + 1, dp->d_name, filelen);
406 pathname[ dirlen + filelen + 1] = '\0';
407
408 /* Check if the path is a symlink */
409 if ( !fs->followsymlinks && islink( pathname ) )
410 continue;
411
412
413 if ( isdirectory(pathname) )
414 {
415 sortdirlist = (DOCENTRYARRAY *) adddocentry(sortdirlist, pathname);
416 }
417 else
418 {
419 if (fs_already_indexed(sw, pathname))
420 continue;
421
422 if ( allgoodfiles || check_FileTests( pathname, &fs->filematch ) )
423 {
424 sortfilelist = (DOCENTRYARRAY *) adddocentry(sortfilelist, pathname);
425 continue;
426 }
427
428
429 if (!isoksuffix(dp->d_name, sw->suffixlist))
430 continue;
431
432
433 /* Check FileRules for rejects */
434 if ( check_FileTests( pathname, &fs->filerules ) )
435 continue;
436
437 sortfilelist = (DOCENTRYARRAY *) adddocentry(sortfilelist, pathname);
438 }
439 }
440
441 efree(pathname);
442
443 closedir(dfd);
444
445 printfiles(sw, sortfilelist);
446 printdirs(sw, sortdirlist);
447 }
448
449 /* Calls the word-indexing function for a single file.
450 */
451
452 static void indexafile(SWISH * sw, char *path)
453 {
454 struct MOD_FS *fs = sw->FS;
455
456
457 if (!fs->followsymlinks && islink(path))
458 return;
459
460
461 /* This only means "IndexDir test.html test.html test.html" will only index test.html once */
462 if (fs_already_indexed(sw, path))
463 return;
464
465 /* Check for File|Pathmatch, and index if any match */
466 if ( check_FileTests( path, &fs->filematch ) )
467 {
468 printfile(sw, path);
469 return;
470 }
471
472 /* This is likely faster, so do it first */
473 if (!isoksuffix(path, sw->suffixlist))
474 return;
475
476 /* Check FileRules for rejects */
477 if ( check_FileTests( path, &fs->filerules ) )
478 return;
479
480
481 /* Passed all tests, so index */
482 printfile(sw, path);
483 }
484
485 /**********************************************************
486 * Process FileTests
487 *
488 * Returns 1 = something matched
489 *
490 **********************************************************/
491 static int check_FileTests( char *path, PATH_LIST *test )
492 {
493 char *dir;
494 char *file;
495
496
497 if ( match_regex_list( path, test->pathname ) )
498 return 1;
499
500 if ( !( test->dirname || test->filename ) )
501 return 0;
502
503 split_path( path, &dir, &file );
504
505 if ( *dir && match_regex_list( dir, test->dirname ) )
506 {
507 efree( dir );
508 efree( file );
509 return 1;
510 }
511
512 if ( *file && match_regex_list( file, test->filename ) )
513 {
514 efree( dir );
515 efree( file );
516 return 1;
517 }
518
519 efree( dir );
520 efree( file );
521 return 0;
522 }
523
/***************************************************
* split_path -- split a path at its last '/'
*
*   path       the path to split (not modified)
*   directory  receives a newly allocated copy of everything up to
*              and including the last '/', or "" if there is none
*   file       receives a newly allocated copy of everything after
*              the last '/', or of the whole path if there is none
*
* The caller frees both results with efree().
*
* May 2002, moved to fs.c. Why isn't basename library used for this?
***************************************************/

static void split_path(char *path, char **directory, char **file)
{
    char   *slash = strrchr( path, '/' );
    size_t  dirlen;
    char   *dircopy;

    if (slash == NULL)
    {
        *directory = (char *) estrdup((char *) "");
        *file = (char *) estrdup((char *) path);
        return;
    }

    /* Copy the prefix instead of temporarily writing a NUL into the
    ** caller's string */
    dirlen = (size_t) (slash - path) + 1;       /* keep the trailing '/' */
    dircopy = (char *) emalloc( dirlen + 1 );
    memcpy( dircopy, path, dirlen );
    dircopy[dirlen] = '\0';

    *directory = dircopy;
    *file = (char *) estrdup((char *) (slash + 1));
}
573
574
575
576
577 /* Indexes the words in the file
578 */
579
580 static void printfile(SWISH * sw, char *filename)
581 {
582 char *s;
583 FileProp *fprop;
584
585
586 if (filename)
587 {
588 if (sw->verbose >= 3)
589 {
590 /* Only display file name */
591 if ((s = (char *) strrchr(filename, '/')) == NULL)
592 printf(" %s", filename);
593 else
594 printf(" %s", s + 1);
595 fflush(stdout);
596 }
597
598
599 fprop = file_properties(filename, filename, sw);
600
601 do_index_file(sw, fprop);
602
603 free_file_properties(fprop);
604 }
605 }
606
/* 2001-08 Jose Ruiz */
/*
** compfilenames -- qsort-style comparison of two file names.
**
** s1 and s2 each point at a `char *` element of the array being
** sorted; the strings are ordered with strcmp(), as the original
** addsortentry did.  What about Win32?
*/
int compfilenames(const void *s1, const void *s2)
{
    const char *const *left  = (const char *const *) s1;
    const char *const *right = (const char *const *) s2;

    return strcmp(*left, *right);
}
619
620 /* Indexes the words in all the files in the array of files
621 ** The array is sorted alphabetically
622 */
623
624 static void printfiles(SWISH * sw, DOCENTRYARRAY * e)
625 {
626 int i;
627
628 if (e)
629 {
630 /* 2001-08 sorting of filenames moved here - Do we really
631 ** need to sort them? - Adjust it in config.h */
632 if(e->currentsize)
633 {
634 if(SORT_FILENAMES)
635 {
636 swish_qsort(e->filenames, e->currentsize, sizeof(char *), compfilenames);
637 }
638 }
639
640 for (i = 0; i < e->currentsize; i++)
641 {
642 printfile(sw, e->filenames[i]);
643 efree( e->filenames[i] );
644 }
645
646 /* free the array and filenames */
647 efree(e->filenames);
648 efree(e);
649 }
650 }
651
652 /* Prints out the directory names as things are getting indexed.
653 ** Calls indexadir() so directories in the array are indexed,
654 ** in alphabetical order...
655 */
656
657 void printdirs(SWISH * sw, DOCENTRYARRAY * e)
658 {
659 int i;
660
661 if (e)
662 {
663 /* 2001-08 sorting of dirs moved here - Do we really
664 ** need to sort them? - Adjust it in config.h */
665 if(e->currentsize)
666 {
667 if(SORT_FILENAMES)
668 {
669 swish_qsort(e->filenames, e->currentsize, sizeof(char *), compfilenames);
670 }
671 }
672
673 for (i = 0; i < e->currentsize; i++)
674 {
675 if (sw->verbose >= 3)
676 printf("\nIn dir \"%s\":\n", e->filenames[i]);
677 else if (sw->verbose >= 2)
678 printf("Checking dir \"%s\"...\n", e->filenames[i]);
679
680 indexadir(sw, e->filenames[i]);
681 efree(e->filenames[i]);
682 }
683 efree(e->filenames);
684 efree(e);
685 }
686 }
687
688 /* Stores file names in alphabetical order so they can be
689 ** indexed alphabetically. No big whoop.
690 */
691
692 static DOCENTRYARRAY *adddocentry(DOCENTRYARRAY * e, char *filename)
693 {
694 if (e == NULL)
695 {
696 e = (DOCENTRYARRAY *) emalloc(sizeof(DOCENTRYARRAY));
697 e->maxsize = VERYBIGHASHSIZE; /* Put what you like */
698 e->filenames = (char **) emalloc(e->maxsize * sizeof(char *));
699
700 e->currentsize = 1;
701 e->filenames[0] = (char *) estrdup(filename);
702 }
703 else
704 {
705 if ((e->currentsize + 1) == e->maxsize)
706 {
707 e->maxsize += 1000;
708 e->filenames = (char **) erealloc(e->filenames, e->maxsize * sizeof(char *));
709 }
710 e->filenames[e->currentsize++] = (char *) estrdup(filename);
711 }
712 return e;
713 }
714
715
716
717
718 /********************************************************/
719 /* "Public" functions */
720 /********************************************************/
721
722 void fs_indexpath(SWISH * sw, char *path)
723 {
724
725 normalize_path( path ); /* flip backslashes and remove trailing slash */
726
727
728 if (isdirectory(path))
729 {
730 if (sw->verbose >= 2)
731 printf("\nChecking dir \"%s\"...\n", path);
732
733 indexadir(sw, path);
734 }
735
736 else if (isfile(path))
737 {
738 if (sw->verbose >= 2)
739 printf("\nChecking file \"%s\"...\n", path);
740 indexafile(sw, path);
741 }
742 else
743 progwarnno("Invalid path '%s': ", path);
744 }
745
746
747
748
749 struct _indexing_data_source_def FileSystemIndexingDataSource = {
750 "File-System",
751 "fs",
752 fs_indexpath,
753 configModule_FS
754 };

  ViewVC Help
Powered by ViewVC 1.1.22