1 |
/* |
2 |
$Id: file.c,v 1.43 2002/07/09 16:14:21 whmoseley Exp $ |
3 |
** |
4 |
** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company |
5 |
** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94 |
6 |
** |
7 |
** This program and library is free software; you can redistribute it and/or |
8 |
** modify it under the terms of the GNU (Library) General Public License |
9 |
** as published by the Free Software Foundation; either version 2 |
10 |
** of the License, or any later version. |
11 |
** |
12 |
** This program is distributed in the hope that it will be useful, |
13 |
** but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 |
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 |
** GNU (Library) General Public License for more details. |
16 |
** |
17 |
** You should have received a copy of the GNU (Library) General Public License |
18 |
** along with this program; if not, write to the Free Software |
19 |
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
20 |
**------------------------------------------------------------- |
21 |
** Changed getdefaults to allow metaNames in the user |
22 |
** configuration file |
23 |
** G.Hill 4/16/97 ghill@library.berkeley.edu |
24 |
** |
25 |
** change sprintf to snprintf to avoid corruption, and use MAXSTRLEN from swish.h |
26 |
** added safestrcpy() macro to avoid corruption from strcpy overflow |
27 |
** SRE 11/17/99 |
28 |
** |
29 |
** added buffer size arg to grabStringValue - core dumping from overrun |
30 |
** fixed logical OR and other problems pointed out by "gcc -Wall" |
31 |
** SRE 2/22/00 |
32 |
** |
33 |
** counter modulo 128 had parens typo |
34 |
** SRE 2/23/00 |
35 |
** |
36 |
** read stopwords from file |
37 |
** Rainer Scherg (rasc) 2000-06-15 |
38 |
** |
39 |
** 2000-11-15 rasc |
40 |
** file_properties retrieves last mod date, filesize, and evals some swish |
41 |
** config flags for this file! |
42 |
** |
43 |
** 2001-02-12 rasc errormsg "print" changed... |
44 |
** 2001-03-16 rasc truncateDoc [read_stream] (if doc to large, truncate... ) |
45 |
** 2001-03-17 rasc fprop enhanced by "real_filename" |
46 |
** |
47 |
*/ |
48 |
|
49 |
#ifdef HAVE_CONFIG_H |
50 |
#include "acconfig.h" |
51 |
#endif |
52 |
|
53 |
#ifdef HAVE_STDLIB_H |
54 |
#include <stdlib.h> |
55 |
#endif |
56 |
#ifdef HAVE_UNISTD_H |
57 |
#include <unistd.h> |
58 |
#endif |
59 |
#include "swish.h" |
60 |
#include "mem.h" |
61 |
#include "string.h" |
62 |
#include "file.h" |
63 |
#include "error.h" |
64 |
#include "list.h" |
65 |
#include "hash.h" |
66 |
#include "check.h" |
67 |
#include "index.h" |
68 |
#include "filter.h" |
69 |
#include "metanames.h" |
70 |
|
71 |
|
72 |
/*
 * Convert every forward slash in `path` to a backslash, in place.
 * Cough, hack, cough - needed for programs that are run via the shell.
 * Only compiled when _WIN32 is defined.
 */
#ifdef _WIN32
void make_windows_path( char *path )
{
    char *cursor = path;

    while ( *cursor != '\0' )
    {
        if ( *cursor == '/' )
            *cursor = '\\';

        cursor++;
    }
}
#endif
83 |
|
84 |
/*
 * Normalize a path in place:
 *   - every backslash is flipped to a forward slash
 *   - trailing slashes are removed
 * The first character is never removed, so "/" (or a string made up
 * only of slashes) ends up as "/", and the empty string is left alone.
 */
void normalize_path(char *path)
{
    char *cursor = path;
    int   remaining;

    /* For windows users */
    while ( *cursor != '\0' )
    {
        if ( *cursor == '\\' )
            *cursor = '/';

        cursor++;
    }

    /* cursor now sits on the terminator, so this is the string length */
    remaining = (int) (cursor - path);

    for ( ; remaining > 1 && path[remaining - 1] == '/'; remaining-- )
        path[remaining - 1] = '\0';
}
103 |
|
104 |
|
105 |
|
106 |
/* Is a file a directory?
**
** Returns 1 when stat() succeeds and reports a directory,
** 0 otherwise (including when stat() itself fails).
*/

int isdirectory(char *path)
{
    struct stat info;
    int         is_dir = 0;

    if ( stat(path, &info) == 0 && (info.st_mode & S_IFMT) == S_IFDIR )
        is_dir = 1;

    return is_dir;
}
117 |
|
118 |
/* Is a file a regular file?
**
** Returns 1 when stat() succeeds and reports a regular file,
** 0 otherwise (including when stat() itself fails).
*/

int isfile(char *path)
{
    struct stat info;
    int         is_regular = 0;

    if ( stat(path, &info) == 0 && (info.st_mode & S_IFMT) == S_IFREG )
        is_regular = 1;

    return is_regular;
}
129 |
|
130 |
/* Is a file a link?
**
** Returns 1 when lstat() succeeds and reports a symbolic link,
** 0 otherwise (including when lstat() fails).  Always returns 0 when
** built with NO_SYMBOLIC_FILE_LINKS.
**
** The file type lives in the S_IFMT field of st_mode as an enumerated
** value, not as independent flag bits, so it has to be masked with
** S_IFMT before being compared -- the same test isdirectory() and
** isfile() use.  Masking with S_IFLNK itself would also match any other
** type code that happens to contain those bits.
*/

int islink(char *path)
{
#ifndef NO_SYMBOLIC_FILE_LINKS
    struct stat stbuf;

    /* lstat, not stat: we want the link itself, not what it points at */
    if (lstat(path, &stbuf))
        return 0;

    return ((stbuf.st_mode & S_IFMT) == S_IFLNK) ? 1 : 0;
#else
    (void) path;
    return 0;
#endif
}
145 |
|
146 |
/* Get the size, in bytes, of a file.
** Return -1 if there's a problem (stat() failed).
**
** NOTE(review): st_size is narrowed to the int return type, so sizes
** that do not fit in an int cannot be reported faithfully.
*/

int getsize(char *path)
{
    struct stat info;

    return ( stat(path, &info) == 0 ) ? info.st_size : -1;
}
158 |
|
159 |
|
160 |
|
161 |
FILE *openIndexFILEForWrite(char *filename) |
162 |
{ |
163 |
return fopen(filename, F_WRITE_BINARY); |
164 |
} |
165 |
|
166 |
FILE *openIndexFILEForRead(char *filename) |
167 |
{ |
168 |
return fopen(filename, F_READ_BINARY); |
169 |
} |
170 |
|
171 |
FILE *openIndexFILEForReadAndWrite(char *filename) |
172 |
{ |
173 |
return fopen(filename, F_READWRITE_BINARY); |
174 |
} |
175 |
|
176 |
/*
 * Open `filename` through openIndexFILEForWrite() and close it again
 * straight away.
 *
 * A failed open is reported with progerrno().  progerrno() is presumably
 * fatal (TODO confirm); the explicit return makes sure a NULL stream can
 * never reach fclose() should it ever come back.
 */
void CreateEmptyFile(char *filename)
{
    FILE *fp = openIndexFILEForWrite(filename);

    if (!fp)
    {
        progerrno("Couldn't write the file \"%s\": ", filename);
        return;
    }

    fclose(fp);
}
186 |
|
187 |
/* |
188 |
* Invoke the methods of the current Indexing Data Source |
189 |
*/ |
190 |
void indexpath(SWISH * sw, char *path) |
191 |
{ |
192 |
/* invoke routine to index a "path" */ |
193 |
(*IndexingDataSource->indexpath_fn) (sw, path); |
194 |
} |
195 |
|
196 |
|
197 |
/* |
198 |
-- read file into a buffer |
199 |
-- truncate file if necessary (truncateDocSize) |
200 |
-- return: buffer |
201 |
-- 2001-03-16 rasc truncateDoc |
202 |
*/ |
203 |
|
204 |
/* maybe some day this could be chunked reading? */ |
205 |
|
206 |
char *read_stream(SWISH *sw, char *name, FILE * fp, long filelen, long max_size) |
207 |
{ |
208 |
long c, |
209 |
offset; |
210 |
long bufferlen; |
211 |
unsigned char *buffer, *tmp = NULL; |
212 |
size_t bytes_read; |
213 |
|
214 |
|
215 |
if (filelen) |
216 |
{ |
217 |
|
218 |
/* truncate doc? */ |
219 |
if (max_size && (filelen > max_size)) |
220 |
{ |
221 |
filelen = max_size; |
222 |
} |
223 |
|
224 |
buffer = (unsigned char *)Mem_ZoneAlloc(sw->Index->perDocTmpZone, filelen + 1); |
225 |
*buffer = '\0'; |
226 |
bytes_read = fread(buffer, 1, filelen, fp); |
227 |
|
228 |
|
229 |
buffer[filelen] = '\0'; |
230 |
|
231 |
/* JFP - substitute null chars, VFC record may have null char in reclen word, try to discard them */ |
232 |
if ( strlen( (char *)buffer ) < bytes_read ) |
233 |
{ |
234 |
int i; |
235 |
progwarn("Substituted possible embedded null character(s) in file '%s'\n", name); |
236 |
for (i = 0; i < bytes_read; ++i) |
237 |
if (buffer[i] == '\0') buffer[i] = '\n'; |
238 |
} |
239 |
|
240 |
} |
241 |
else |
242 |
{ /* if we are reading from a popen call, filelen is 0 */ |
243 |
|
244 |
buffer = (unsigned char *)Mem_ZoneAlloc(sw->Index->perDocTmpZone,(bufferlen = RD_BUFFER_SIZE) + 1); |
245 |
*buffer = '\0'; |
246 |
for (offset = 0; (c = fread(buffer + offset, 1, RD_BUFFER_SIZE, fp)) == RD_BUFFER_SIZE; offset += RD_BUFFER_SIZE) |
247 |
{ |
248 |
/* truncate? break if to much read */ |
249 |
if (max_size && (bufferlen > max_size)) |
250 |
{ |
251 |
break; |
252 |
} |
253 |
tmp = (unsigned char *)Mem_ZoneAlloc(sw->Index->perDocTmpZone, bufferlen + RD_BUFFER_SIZE + 1); |
254 |
memcpy(tmp,buffer,bufferlen+1); |
255 |
buffer = tmp; |
256 |
bufferlen += RD_BUFFER_SIZE; |
257 |
} |
258 |
filelen = offset + c; |
259 |
|
260 |
if (max_size && (filelen > max_size)) |
261 |
{ |
262 |
filelen = max_size; |
263 |
} |
264 |
buffer[filelen] = '\0'; |
265 |
} |
266 |
return (char *) buffer; |
267 |
} |
268 |
|
269 |
/* Sept 25, 2001 - moseley |
270 |
* Flush the file -- for use with -S prog, when either Truncate is in use, or |
271 |
* the parser aborted for some reason (e.g. !isoktitle). |
272 |
*/ |
273 |
|
274 |
void flush_stream( FileProp *fprop ) |
275 |
{ |
276 |
char tmpbuf[4096]; |
277 |
int read; |
278 |
|
279 |
while ( fprop->bytes_read < fprop->fsize ) |
280 |
{ |
281 |
if ( ( fprop->fsize - fprop->bytes_read ) > 4096 ) |
282 |
{ |
283 |
if ( !(read = fread(tmpbuf, 1, 4096, fprop->fp))) |
284 |
break; |
285 |
|
286 |
fprop->bytes_read += read; |
287 |
} |
288 |
else |
289 |
{ |
290 |
read = fread(tmpbuf, 1, fprop->fsize - fprop->bytes_read, fprop->fp); |
291 |
break; |
292 |
} |
293 |
} |
294 |
} |
295 |
|
296 |
|
297 |
/* Mar 27, 2001 - moseley |
298 |
* Separate out the creation of the file properties |
299 |
* |
300 |
*/ |
301 |
|
302 |
FileProp *init_file_properties(SWISH * sw) |
303 |
{ |
304 |
FileProp *fprop; |
305 |
|
306 |
fprop = (FileProp *) emalloc(sizeof(FileProp)); |
307 |
/* emalloc checks fail and aborts... */ |
308 |
|
309 |
memset( fprop, 0, sizeof(FileProp) ); |
310 |
|
311 |
return fprop; |
312 |
} |
313 |
|
314 |
|
315 |
/* Mar 27, 2001 - moseley |
316 |
* Separate out the adjusting of file properties by config settings |
317 |
* 2001-04-09 rasc changed filters |
318 |
*/ |
319 |
|
320 |
void init_file_prop_settings(SWISH * sw, FileProp * fprop) |
321 |
{ |
322 |
|
323 |
/* Basename of document path => document filename */ |
324 |
fprop->real_filename = str_basename(fprop->real_path); |
325 |
|
326 |
|
327 |
/* -- get Doc Type as is in IndexContents or Defaultcontents |
328 |
-- doctypes by jruiz |
329 |
*/ |
330 |
|
331 |
/* Might already be set by a header in extpro.c */ |
332 |
if ( !fprop->doctype ) |
333 |
{ |
334 |
/* Get the type by file extension -- or return NODOCTYPE */ |
335 |
fprop->doctype = getdoctype(fprop->real_path, sw->indexcontents); |
336 |
|
337 |
/* If was not set by getdoctype() then assign it the default parser */ |
338 |
/* This could still be NODOCTYPE, or it might be something set by DefaultContents */ |
339 |
|
340 |
if (fprop->doctype == NODOCTYPE) |
341 |
fprop->doctype = sw->DefaultDocType; |
342 |
} |
343 |
|
344 |
|
345 |
/* -- index just the filename (or doc title tags)? |
346 |
-- this param was "wrongly" named indextitleonly */ |
347 |
|
348 |
fprop->index_no_content = (sw->nocontentslist != NULL) && isoksuffix(fprop->real_path, sw->nocontentslist); |
349 |
|
350 |
/* -- Any filter for this file type? |
351 |
-- NULL = No Filter, (char *) path to filter prog. |
352 |
*/ |
353 |
|
354 |
fprop->hasfilter = hasfilter(sw, fprop->real_path); |
355 |
|
356 |
fprop->stordesc = hasdescription(fprop->doctype, sw->storedescription); |
357 |
|
358 |
} |
359 |
|
360 |
|
361 |
|
362 |
/* |
363 |
-- file_properties |
364 |
-- Get/eval information about a file and return it. |
365 |
-- Some flags are calculated from swish configs for this "real_path" |
366 |
-- Structure has to be freed using free_file_properties |
367 |
-- 2000-11-15 rasc |
368 |
-- return: (FileProp *) |
369 |
-- A failed stat returns an empty (default) structure |
370 |
|
371 |
-- 2000-12 |
372 |
-- Added StoreDescription |
373 |
*/ |
374 |
|
375 |
FileProp *file_properties(char *real_path, char *work_file, SWISH * sw) |
376 |
{ |
377 |
FileProp *fprop; |
378 |
struct stat stbuf; |
379 |
|
380 |
/* create an initilized fprop structure */ |
381 |
|
382 |
fprop = init_file_properties(sw); |
383 |
|
384 |
|
385 |
/* Dup these, since the real_path may be reallocated by FileRules */ |
386 |
fprop->real_path = estrdup( real_path ); |
387 |
fprop->work_path = estrdup( work_file ? work_file : real_path ); |
388 |
fprop->orig_path = estrdup( real_path ); |
389 |
|
390 |
|
391 |
/* Stat the file */ |
392 |
/* This is really the wrong place for this, as it's really only useful for fs.c method */ |
393 |
/* for http.c it means the last mod date is the temp file date */ |
394 |
/* Probably this entire function isn't needed - moseley */ |
395 |
|
396 |
if (!stat(fprop->work_path, &stbuf)) |
397 |
{ |
398 |
fprop->fsize = (long) stbuf.st_size; |
399 |
fprop->mtime = stbuf.st_mtime; |
400 |
} |
401 |
|
402 |
|
403 |
/* Now set various fprop settings based mostly on file name */ |
404 |
|
405 |
init_file_prop_settings(sw, fprop); |
406 |
|
407 |
|
408 |
|
409 |
#ifdef DEBUG |
410 |
fprintf(stderr, "file_properties: path=%s, (workpath=%s), fsize=%ld, last_mod=%ld Doctype: %d Filter: %p\n", |
411 |
fprop->real_path, fprop->work_path, (long) fprop->fsize, (long) fprop->mtime, fprop->doctype, fprop->filterprog); |
412 |
#endif |
413 |
|
414 |
return fprop; |
415 |
} |
416 |
|
417 |
|
418 |
/* -- Free FileProp structure |
419 |
-- unless no alloc for strings simple free structure |
420 |
*/ |
421 |
|
422 |
void free_file_properties(FileProp * fprop) |
423 |
{ |
424 |
efree( fprop->real_path ); |
425 |
efree( fprop->work_path ); |
426 |
efree( fprop->orig_path ); |
427 |
efree(fprop); |
428 |
} |
429 |
|
430 |
|
431 |
static char *temp_file_template = "XXXXXX"; |
432 |
/*********************************************************************** |
433 |
* Create a temporary file |
434 |
* |
435 |
* Call With: |
436 |
* *SWISH = to get at the TmpDir config setting which I don't like |
437 |
* *prefix = chars to prepend to the file name |
438 |
* **file_name_buffer = where to store address of file name |
439 |
* unlink = if true, will unlink file |
440 |
* if not unlinked, then caller must free the name |
441 |
* Return: |
442 |
* *FILE |
443 |
* modified file_name_buffer |
444 |
* |
445 |
* Will create temp files in the directory specified by environment vars |
446 |
* TMPDIR and TMP, and by the config.h setting of TMPDIR in that order. |
447 |
* |
448 |
* Note: |
449 |
* It's expected that swish is not run suid, so |
450 |
* (getuid()==geteuid()) && (getgid()==getegid()) |
451 |
* if not checked. I'm not sure if that would choke on other platforms. |
452 |
* |
453 |
* |
454 |
* Source: |
455 |
* http://www.linuxdoc.org/HOWTO/Secure-Programs-HOWTO/avoid-race.html |
456 |
* |
457 |
* Questions: |
458 |
* Can non-unix OS unlink the file and continue to hold the fd? |
459 |
* |
460 |
***********************************************************************/ |
461 |
|
462 |
FILE *create_tempfile(SWISH *sw, const char *f_mode, char *prefix, char **file_name_buffer, int remove_file_name ) |
463 |
{ |
464 |
int temp_fd; |
465 |
mode_t old_mode; |
466 |
FILE *temp_file; |
467 |
char *file_name; |
468 |
int file_name_len; |
469 |
struct MOD_Index *idx = sw->Index; |
470 |
char *tmpdir = NULL; |
471 |
file_name_len = (prefix ? strlen(prefix) : 0) + strlen( temp_file_template ) + strlen( TEMP_FILE_PREFIX ); |
472 |
|
473 |
|
474 |
|
475 |
/* Perl is nice sometimes */ |
476 |
if ( !( tmpdir = getenv("TMPDIR")) ) |
477 |
if ( !(tmpdir = getenv("TMP")) ) |
478 |
if( !(tmpdir = getenv("TEMP")) ) |
479 |
tmpdir = idx->tmpdir; |
480 |
|
481 |
if ( tmpdir && !*tmpdir ) |
482 |
tmpdir = NULL; // just in case it's the empty string |
483 |
|
484 |
if ( tmpdir ) |
485 |
file_name_len += strlen( tmpdir ) + 1; // for path separator |
486 |
|
487 |
|
488 |
|
489 |
file_name = emalloc( file_name_len + 1 ); |
490 |
|
491 |
*file_name = '\0'; |
492 |
|
493 |
if ( tmpdir ) |
494 |
{ |
495 |
strcat( file_name, tmpdir ); |
496 |
normalize_path( file_name ); |
497 |
strcat( file_name, "/" ); |
498 |
} |
499 |
|
500 |
strcat( file_name, TEMP_FILE_PREFIX ); |
501 |
|
502 |
if ( prefix ) |
503 |
strcat( file_name, prefix ); |
504 |
|
505 |
strcat( file_name, temp_file_template ); |
506 |
|
507 |
old_mode = umask(077); /* Create file with restrictive permissions */ |
508 |
|
509 |
temp_fd = mkstemp( file_name ); |
510 |
|
511 |
(void) umask(old_mode); |
512 |
|
513 |
if (temp_fd == -1) |
514 |
progerrno("Couldn't open temporary file '%s': ", file_name ); |
515 |
|
516 |
if (!(temp_file = fdopen(temp_fd, f_mode))) |
517 |
progerrno("Couldn't create temporary file '%s' file descriptor: ", file_name); |
518 |
|
519 |
if ( remove_file_name ) |
520 |
{ |
521 |
if ( remove( file_name ) == -1 ) |
522 |
progerrno("Couldn't unlink temporary file '%s' :", file_name); |
523 |
|
524 |
efree( file_name ); |
525 |
} |
526 |
else |
527 |
*file_name_buffer = file_name; |
528 |
|
529 |
|
530 |
return temp_file; |
531 |
} |
532 |
|
533 |
|