/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/src/fs.c
ViewVC logotype

Annotation of /mitgcm.org/devel/buildweb/pkg/swish-e/src/fs.c

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1 - (hide annotations) (download)
Fri Sep 20 19:47:29 2002 UTC (22 years, 10 months ago) by adcroft
Branch point for: Import, MAIN
File MIME type: text/plain
Initial revision

1 adcroft 1.1 /*
2     ** This program and library is free software; you can redistribute it and/or
3     ** modify it under the terms of the GNU (Library) General Public License
4     ** as published by the Free Software Foundation; either version 2
5     ** of the License, or any later version.
6     **
7     ** This program is distributed in the hope that it will be useful,
8     ** but WITHOUT ANY WARRANTY; without even the implied warranty of
9     ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10     ** GNU (Library) General Public License for more details.
11     **
12     ** You should have received a copy of the GNU (Library) General Public License
13     ** along with this program; if not, write to the Free Software
14     ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15     **--------------------------------------------------------------------
16     **
17     ** change sprintf to snprintf to avoid corruption,
18     ** added safestrcpy() macro to avoid corruption from strcpy overflow,
19     ** and use MAXKEYLEN as string length vs. literal "34"
20     ** SRE 11/17/99
21     **
22     ** 2000-11 jruiz,rasc some redesign
23     ** 2001-04-07 rasc fixed FileRule pathname
24     **
25     */
26    
27     #include "swish.h"
28     #include "mem.h"
29     #include "string.h"
30     #include "index.h"
31     #include "hash.h"
32     #include "file.h"
33     #include "list.h"
34     #include "fs.h"
35     #include "check.h"
36     #include "error.h"
37     #include "xml.h"
38     #include "txt.h"
39     #include "parse_conffile.h"
40     #include "swish_qsort.h"
41    
/* Growable list of path strings.  Entries are appended by adddocentry()
** and the whole list is released by printfiles() / printdirs(). */
typedef struct {
    int    currentsize;   /* number of entries stored in filenames[] */
    int    maxsize;       /* number of slots allocated for filenames[] */
    char **filenames;     /* heap copies (estrdup) of the collected paths */
} DOCENTRYARRAY;
49    
50    
51    
/* Size of the "device/inode" hash-key buffer built in fs_already_indexed():
** room for two unsigned longs in hex (64 bit inodes), the '/' and the NUL. */
#define MAXKEYLEN 34
53    
54    
55     static int get_rules( char *name, StringList *sl, PATH_LIST *pathlist );
56     static int check_FileTests( char *path, PATH_LIST *test );
57     static void indexadir(SWISH *, char *);
58     static void indexafile(SWISH *, char *);
59     static void printfile(SWISH *, char *);
60     static void printfiles(SWISH *, DOCENTRYARRAY *);
61     static void printdirs(SWISH *, DOCENTRYARRAY *);
62     static DOCENTRYARRAY *adddocentry(DOCENTRYARRAY * e, char *filename);
63     static void split_path(char *path, char **directory, char **file);
64    
65     /*
66     -- init structures for this module
67     */
68    
69     void initModule_FS(SWISH * sw)
70     {
71     struct MOD_FS *fs;
72    
73     fs = (struct MOD_FS *) emalloc(sizeof(struct MOD_FS));
74    
75     memset(fs, 0, sizeof(struct MOD_FS));
76     sw->FS = fs;
77    
78     }
79    
80    
81     /*
82     -- release all wired memory for this module
83     */
84    
85     void freeModule_FS(SWISH * sw)
86     {
87     struct MOD_FS *fs = sw->FS;
88    
89     /* Free fs parameters */
90    
91     free_regex_list( &fs->filerules.pathname );
92     free_regex_list( &fs->filerules.dirname );
93     free_regex_list( &fs->filerules.filename );
94     free_regex_list( &fs->filerules.dircontains );
95     free_regex_list( &fs->filerules.title );
96    
97     free_regex_list( &fs->filematch.pathname );
98     free_regex_list( &fs->filematch.dirname );
99     free_regex_list( &fs->filematch.filename );
100     free_regex_list( &fs->filematch.dircontains );
101     free_regex_list( &fs->filematch.title );
102    
103    
104     /* free module data */
105     efree(fs);
106     sw->FS = NULL;
107    
108     return;
109     }
110    
111    
112     /*
113     -- Config Directives
114     -- Configuration directives for this Module
115     -- return: 0/1 = none/config applied
116     Aug 1, 2001 -- these probably should be pre-compiled regular expressions,
117     and their memory should be freed on exit. moseley
118     */
119    
120    
121    
122     int configModule_FS(SWISH * sw, StringList * sl)
123     {
124     struct MOD_FS *fs = sw->FS;
125     char *w0 = sl->word[0];
126    
127     if (strcasecmp(w0, "FileRules") == 0)
128     return get_rules( w0, sl, &fs->filerules );
129    
130     if (strcasecmp(w0, "FileMatch") == 0)
131     return get_rules( w0, sl, &fs->filematch );
132    
133    
134     if (strcasecmp(w0, "FollowSymLinks") == 0)
135     {
136     fs->followsymlinks = getYesNoOrAbort(sl, 1, 1);
137     return 1;
138     }
139    
140     return 0; /* not one of our parameters */
141    
142     }
143    
144     static int get_rules( char *name, StringList *sl, PATH_LIST *pathlist )
145     {
146     char *w1;
147     char *both;
148     int regex_pattern = 0;
149    
150    
151     if (sl->n < 4)
152     {
153     printf("err: Wrong number of parameters in %s\n", name);
154     return 0;
155     }
156    
157    
158     /* For "is" make sure it matches the entire pattern */
159     /* A bit ugly */
160    
161     if ( strcasecmp(sl->word[2], "is") == 0 )
162     {
163     int i;
164     /* make patterns match the full string */
165     for ( i = 3; i < sl->n; i++ )
166     {
167     int len = strlen( sl->word[i] );
168     char *new;
169     char *old = sl->word[i];
170    
171     if ( (strcasecmp( old, "not" ) == 0) && i < sl->n-1 )
172     continue;
173    
174     new = emalloc( len + 3 );
175    
176     if ( sl->word[i][0] != '^' )
177     {
178     strcpy( new, "^" );
179     strcat( new, sl->word[i] );
180     }
181     else
182     strcpy( new, sl->word[i] );
183    
184     if ( sl->word[i][len-1] != '$' )
185     strcat( new, "$" );
186    
187     sl->word[i] = new;
188     efree( old );
189     }
190    
191     }
192    
193     else if ( strcasecmp(sl->word[2], "regex") == 0 )
194     regex_pattern++;
195    
196    
197     else if ( !(strcasecmp(sl->word[2], "contains") == 0) )
198     {
199     printf("err: %s must be followed by [is|contains|regex]\n", name);
200     return 0;
201     }
202    
203     w1 = sl->word[1];
204    
205     both = emalloc( strlen( name ) + strlen( w1 ) + 2 );
206     strcpy( both, name );
207     strcat( both, " ");
208     strcat( both, w1 );
209    
210    
211    
212     if ( strcasecmp(w1, "pathname") == 0 )
213     add_regex_patterns( both, &pathlist->pathname, &(sl->word)[3], regex_pattern );
214    
215     else if ( strcasecmp(w1, "filename") == 0 )
216     add_regex_patterns( both, &pathlist->filename, &(sl->word)[3], regex_pattern );
217    
218     else if ( strcasecmp(w1, "dirname") == 0 )
219     add_regex_patterns( both, &pathlist->dirname, &(sl->word)[3], regex_pattern );
220    
221     else if ( strcasecmp(w1, "title") == 0 )
222     add_regex_patterns( both, &pathlist->title, &(sl->word)[3], regex_pattern );
223    
224     else if ( strcasecmp(w1, "directory") == 0 )
225     add_regex_patterns( both, &pathlist->dircontains, &(sl->word)[3], regex_pattern );
226    
227     else
228     {
229     printf("err: '%s' - invalid parameter '%s'\n", both, w1 );
230     return 0;
231     }
232    
233    
234     efree( both );
235     return 1;
236     }
237    
238    
239    
240    
241    
242    
243    
244    
245     /* Have we already indexed a file or directory?
246     ** This function is used to avoid multiple index entries
247     ** or endless looping due to symbolic links.
248     */
249    
250     static int fs_already_indexed(SWISH * sw, char *path)
251     {
252     #ifndef NO_SYMBOLIC_FILE_LINKS
253     struct dev_ino *p;
254     struct stat buf;
255     char key[MAXKEYLEN]; /* Hash key -- allow for 64 bit inodes */
256     unsigned hashval;
257    
258     if (stat(path, &buf))
259     return 0;
260    
261     /* Create hash key: string contains device and inode. */
262     /* Avoid snprintf -> MAXKEYLEN is big enough for two longs
263     snprintf( key, MAXKEYLEN, "%lx/%lx", (unsigned long)buf.st_dev,
264     (unsigned long)buf.st_ino );
265     */
266     sprintf(key, "%lx/%lx", (unsigned long) buf.st_dev, (unsigned long) buf.st_ino);
267    
268     hashval = bighash(key); /* Search hash for this file. */
269     for (p = sw->Index->inode_hash[hashval]; p != NULL; p = p->next)
270     if (p->dev == buf.st_dev && p->ino == buf.st_ino)
271     { /* We found it. */
272     if (sw->verbose >= 3)
273     printf("Skipping %s: %s\n", path, "Already indexed.");
274     return 1;
275     }
276    
277     /* Not found, make new entry. */
278     p = (struct dev_ino *) Mem_ZoneAlloc(sw->Index->entryZone,sizeof(struct dev_ino));
279    
280     p->dev = buf.st_dev;
281     p->ino = buf.st_ino;
282     p->next = sw->Index->inode_hash[hashval];
283     sw->Index->inode_hash[hashval] = p; /* Aug 1, 2001 -- this is not freed */
284     #endif
285    
286     return 0;
287     }
288    
289    
290     /* Recursively goes into a directory and calls the word-indexing
291     ** functions for each file that's found.
292     */
293    
294     static void indexadir(SWISH * sw, char *dir)
295     {
296     int allgoodfiles = 0;
297     DIR *dfd;
298    
299     #ifdef NEXTSTEP
300     struct direct *dp;
301     #else
302     struct dirent *dp;
303     #endif
304     int pathbuflen;
305     char *pathname;
306     DOCENTRYARRAY *sortfilelist = NULL;
307     DOCENTRYARRAY *sortdirlist = NULL;
308     int dirlen = strlen( dir );
309     struct MOD_FS *fs = sw->FS;
310    
311     if (fs_already_indexed(sw, dir))
312     return;
313    
314     /* and another stat if not set to follow symlinks */
315     if (!fs->followsymlinks && islink(dir))
316     return;
317    
318    
319     /* logic is not well defined - here we only check dirname */
320     /* but that bypasses pathname checks -- but that's checked per-file */
321     /* This allows one to override File* directory checks */
322     /* but allows a pathname check to be limited to full paths */
323     /* This also means you can avoid indexing an entire directory tree with FileRules dirname, */
324     /* but using a FileRules pathname allows recursion into the directory */
325    
326     /* Reject entire directory due to FileRules dirname */
327     if ( *dir && match_regex_list( dir, fs->filerules.dirname ) )
328     return;
329    
330    
331    
332    
333     /* Handle "FileRules directory" directive */
334     /* - Check all files within the directory before proceeding -- means reading the directory twice */
335     /* - All files are checked. */
336    
337     if (fs->filematch.dircontains || fs->filerules.dircontains )
338     {
339     if ((dfd = opendir(dir)) == NULL)
340     {
341     if ( sw->verbose )
342     progwarnno("Failed to open dir '%s' :", dir );
343     return;
344     }
345    
346     while ((dp = readdir(dfd)) != NULL)
347     {
348     if ( match_regex_list( dp->d_name, fs->filerules.dircontains ) )
349     {
350     closedir( dfd );
351     return; /* doesn't recurse */
352     }
353    
354     if ( match_regex_list( dp->d_name, fs->filematch.dircontains ) )
355     {
356     allgoodfiles++;
357     break;
358     }
359    
360     }
361     closedir(dfd);
362     }
363    
364    
365     /* Now, build list of files and directories */
366    
367     pathbuflen = MAXFILELEN;
368     pathname = (char *) emalloc( pathbuflen + 1 );
369    
370     if ((dfd = opendir(dir)) == NULL)
371     {
372     if ( sw->verbose )
373     progwarnno("Failed to open dir '%s' :", dir );
374     return;
375     }
376    
377    
378     if ( dirlen == 1 && *dir == '/' ) /* case of root dir */
379     dirlen = 0;
380    
381     while ((dp = readdir(dfd)) != NULL)
382     {
383     int filelen = strlen( dp->d_name );
384    
385     /* For security reasons, don't index dot files */
386     /* Check for hidden under Windows? */
387    
388     if ((dp->d_name)[0] == '.')
389     continue;
390    
391    
392     /* Build full path to file */
393    
394     /* reallocate filename buffer, if needed (dir + path + '/' ) */
395     if ( dirlen + filelen + 1 > pathbuflen )
396     {
397     pathbuflen = dirlen + filelen + 200;
398     pathname = (char *) erealloc(pathname, pathbuflen + 1);
399     }
400    
401     if ( dirlen )
402     memcpy(pathname, dir, dirlen);
403    
404     pathname[dirlen] = '/'; // Add path separator
405     memcpy(pathname + dirlen + 1, dp->d_name, filelen);
406     pathname[ dirlen + filelen + 1] = '\0';
407    
408     /* Check if the path is a symlink */
409     if ( !fs->followsymlinks && islink( pathname ) )
410     continue;
411    
412    
413     if ( isdirectory(pathname) )
414     {
415     sortdirlist = (DOCENTRYARRAY *) adddocentry(sortdirlist, pathname);
416     }
417     else
418     {
419     if (fs_already_indexed(sw, pathname))
420     continue;
421    
422     if ( allgoodfiles || check_FileTests( pathname, &fs->filematch ) )
423     {
424     sortfilelist = (DOCENTRYARRAY *) adddocentry(sortfilelist, pathname);
425     continue;
426     }
427    
428    
429     if (!isoksuffix(dp->d_name, sw->suffixlist))
430     continue;
431    
432    
433     /* Check FileRules for rejects */
434     if ( check_FileTests( pathname, &fs->filerules ) )
435     continue;
436    
437     sortfilelist = (DOCENTRYARRAY *) adddocentry(sortfilelist, pathname);
438     }
439     }
440    
441     efree(pathname);
442    
443     closedir(dfd);
444    
445     printfiles(sw, sortfilelist);
446     printdirs(sw, sortdirlist);
447     }
448    
449     /* Calls the word-indexing function for a single file.
450     */
451    
452     static void indexafile(SWISH * sw, char *path)
453     {
454     struct MOD_FS *fs = sw->FS;
455    
456    
457     if (!fs->followsymlinks && islink(path))
458     return;
459    
460    
461     /* This only means "IndexDir test.html test.html test.html" will only index test.html once */
462     if (fs_already_indexed(sw, path))
463     return;
464    
465     /* Check for File|Pathmatch, and index if any match */
466     if ( check_FileTests( path, &fs->filematch ) )
467     {
468     printfile(sw, path);
469     return;
470     }
471    
472     /* This is likely faster, so do it first */
473     if (!isoksuffix(path, sw->suffixlist))
474     return;
475    
476     /* Check FileRules for rejects */
477     if ( check_FileTests( path, &fs->filerules ) )
478     return;
479    
480    
481     /* Passed all tests, so index */
482     printfile(sw, path);
483     }
484    
485     /**********************************************************
486     * Process FileTests
487     *
488     * Returns 1 = something matched
489     *
490     **********************************************************/
491     static int check_FileTests( char *path, PATH_LIST *test )
492     {
493     char *dir;
494     char *file;
495    
496    
497     if ( match_regex_list( path, test->pathname ) )
498     return 1;
499    
500     if ( !( test->dirname || test->filename ) )
501     return 0;
502    
503     split_path( path, &dir, &file );
504    
505     if ( *dir && match_regex_list( dir, test->dirname ) )
506     {
507     efree( dir );
508     efree( file );
509     return 1;
510     }
511    
512     if ( *file && match_regex_list( file, test->filename ) )
513     {
514     efree( dir );
515     efree( file );
516     return 1;
517     }
518    
519     efree( dir );
520     efree( file );
521     return 0;
522     }
523    
/***************************************************
* Split a path at its last '/'.
*
*   *directory  receives everything up to and including that '/'
*               ("" when the path has no '/')
*   *file       receives everything after it (the whole path when
*               there is no '/')
*
* Both results are fresh estrdup() copies.  The path is modified
* temporarily while copying and restored before returning.
*
* May 2002, moved to fs.c. Why isn't basename library used for this?
***************************************************/

static void split_path(char *path, char **directory, char **file)
{
    char *slash = strrchr( path, '/');
    char *rest;
    char  saved;

    if (slash == NULL)
    {
        *directory = (char *) estrdup((char *) "");
        *file = (char *) estrdup((char *) path);
        return;
    }

    /* Cut the string just behind the slash, copy, then undo the cut */
    rest = slash + 1;
    saved = *rest;

    *rest = '\0';
    *directory = (char *) estrdup((char *) path);
    *rest = saved;

    *file = (char *) estrdup((char *) rest);
}
573    
574    
575    
576    
577     /* Indexes the words in the file
578     */
579    
580     static void printfile(SWISH * sw, char *filename)
581     {
582     char *s;
583     FileProp *fprop;
584    
585    
586     if (filename)
587     {
588     if (sw->verbose >= 3)
589     {
590     /* Only display file name */
591     if ((s = (char *) strrchr(filename, '/')) == NULL)
592     printf(" %s", filename);
593     else
594     printf(" %s", s + 1);
595     fflush(stdout);
596     }
597    
598    
599     fprop = file_properties(filename, filename, sw);
600    
601     do_index_file(sw, fprop);
602    
603     free_file_properties(fprop);
604     }
605     }
606    
/* 2001-08 Jose Ruiz */
/* Sort callback for arrays of filenames: each argument points to a
** (char *) element and the two strings are ordered with strcmp, as the
** original addsortentry did.
** What about Win32?
*/
int compfilenames(const void *s1, const void *s2)
{
    const char *const *lhs = (const char *const *) s1;
    const char *const *rhs = (const char *const *) s2;

    return strcmp(*lhs, *rhs);
}
619    
620     /* Indexes the words in all the files in the array of files
621     ** The array is sorted alphabetically
622     */
623    
624     static void printfiles(SWISH * sw, DOCENTRYARRAY * e)
625     {
626     int i;
627    
628     if (e)
629     {
630     /* 2001-08 sorting of filenames moved here - Do we really
631     ** need to sort them? - Adjust it in config.h */
632     if(e->currentsize)
633     {
634     if(SORT_FILENAMES)
635     {
636     swish_qsort(e->filenames, e->currentsize, sizeof(char *), compfilenames);
637     }
638     }
639    
640     for (i = 0; i < e->currentsize; i++)
641     {
642     printfile(sw, e->filenames[i]);
643     efree( e->filenames[i] );
644     }
645    
646     /* free the array and filenames */
647     efree(e->filenames);
648     efree(e);
649     }
650     }
651    
652     /* Prints out the directory names as things are getting indexed.
653     ** Calls indexadir() so directories in the array are indexed,
654     ** in alphabetical order...
655     */
656    
657     void printdirs(SWISH * sw, DOCENTRYARRAY * e)
658     {
659     int i;
660    
661     if (e)
662     {
663     /* 2001-08 sorting of dirs moved here - Do we really
664     ** need to sort them? - Adjust it in config.h */
665     if(e->currentsize)
666     {
667     if(SORT_FILENAMES)
668     {
669     swish_qsort(e->filenames, e->currentsize, sizeof(char *), compfilenames);
670     }
671     }
672    
673     for (i = 0; i < e->currentsize; i++)
674     {
675     if (sw->verbose >= 3)
676     printf("\nIn dir \"%s\":\n", e->filenames[i]);
677     else if (sw->verbose >= 2)
678     printf("Checking dir \"%s\"...\n", e->filenames[i]);
679    
680     indexadir(sw, e->filenames[i]);
681     efree(e->filenames[i]);
682     }
683     efree(e->filenames);
684     efree(e);
685     }
686     }
687    
688     /* Stores file names in alphabetical order so they can be
689     ** indexed alphabetically. No big whoop.
690     */
691    
692     static DOCENTRYARRAY *adddocentry(DOCENTRYARRAY * e, char *filename)
693     {
694     if (e == NULL)
695     {
696     e = (DOCENTRYARRAY *) emalloc(sizeof(DOCENTRYARRAY));
697     e->maxsize = VERYBIGHASHSIZE; /* Put what you like */
698     e->filenames = (char **) emalloc(e->maxsize * sizeof(char *));
699    
700     e->currentsize = 1;
701     e->filenames[0] = (char *) estrdup(filename);
702     }
703     else
704     {
705     if ((e->currentsize + 1) == e->maxsize)
706     {
707     e->maxsize += 1000;
708     e->filenames = (char **) erealloc(e->filenames, e->maxsize * sizeof(char *));
709     }
710     e->filenames[e->currentsize++] = (char *) estrdup(filename);
711     }
712     return e;
713     }
714    
715    
716    
717    
718     /********************************************************/
719     /* "Public" functions */
720     /********************************************************/
721    
722     void fs_indexpath(SWISH * sw, char *path)
723     {
724    
725     normalize_path( path ); /* flip backslashes and remove trailing slash */
726    
727    
728     if (isdirectory(path))
729     {
730     if (sw->verbose >= 2)
731     printf("\nChecking dir \"%s\"...\n", path);
732    
733     indexadir(sw, path);
734     }
735    
736     else if (isfile(path))
737     {
738     if (sw->verbose >= 2)
739     printf("\nChecking file \"%s\"...\n", path);
740     indexafile(sw, path);
741     }
742     else
743     progwarnno("Invalid path '%s': ", path);
744     }
745    
746    
747    
748    
749     struct _indexing_data_source_def FileSystemIndexingDataSource = {
750     "File-System",
751     "fs",
752     fs_indexpath,
753     configModule_FS
754     };

  ViewVC Help
Powered by ViewVC 1.1.22