1 |
/* |
2 |
$Id: swregex.c,v 1.1 2002/03/17 03:54:19 whmoseley Exp $ |
3 |
** |
4 |
** |
5 |
** This program and library is free software; you can redistribute it and/or |
6 |
** modify it under the terms of the GNU (Library) General Public License |
7 |
** as published by the Free Software Foundation; either version 2 |
8 |
** of the License, or any later version. |
9 |
** |
10 |
** This program is distributed in the hope that it will be useful, |
11 |
** but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 |
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 |
** GNU (Library) General Public License for more details. |
14 |
** |
15 |
** You should have received a copy of the GNU (Library) General Public License |
16 |
** along with this program; if not, write to the Free Software |
17 |
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
18 |
**--------------------------------------------------------- |
19 |
** |
20 |
** |
21 |
** March 16, 2002 - Bill Moseley: moved regex routines out of string.c |
22 |
** |
23 |
** This is a collection of routines for building and testing regular expressions |
24 |
** for use with swish-e. |
25 |
** |
26 |
*/ |
27 |
|
28 |
//#include <ctype.h> |
29 |
#include "swish.h" |
30 |
#include "mem.h" |
31 |
//#include "index.h" |
32 |
//#include "swish_qsort.h" |
33 |
#include "string.h" |
34 |
#include "error.h" |
35 |
#include "swregex.h" |
36 |
|
37 |
static char *regex_replace( char *str, regex_list *regex, int offset, int *matched ); |
38 |
|
39 |
|
40 |
/********************************************************************* |
41 |
* Adds a list of patterns to a reg_list. Calls progerr on failure. |
42 |
* Call With: |
43 |
* name = Descriptive name for errors - e.g. the name of the directive currently being processed |
44 |
* regex_list = pointer to the list of regular expressions |
45 |
* params = null-terminated list of pointers to strings |
46 |
* regex_pattern = flag to indicate that it's a delimited pattern (instead of just the pattern) |
47 |
* |
48 |
* Returns: |
49 |
* void |
50 |
* |
51 |
* ToDO: |
52 |
* Really should get passed in *SWISH so can set error string and return |
53 |
* |
54 |
* Notes: |
55 |
* An expression can be proceeded by the word "not" to negate the matching of the pattern. |
56 |
* |
57 |
* |
58 |
**********************************************************************/ |
59 |
void add_regex_patterns( char *name, regex_list **reg_list, char **params, int regex_pattern ) |
60 |
{ |
61 |
int negate; |
62 |
char *word; |
63 |
char *pos; |
64 |
char *ptr; |
65 |
int delimiter; |
66 |
int cflags; |
67 |
int global; |
68 |
|
69 |
|
70 |
while ( *params ) |
71 |
{ |
72 |
negate = 0; |
73 |
global = 0; |
74 |
cflags = REG_EXTENDED; |
75 |
|
76 |
|
77 |
if ( (strcasecmp( *params, "not" ) == 0) && *(params+1) ) |
78 |
{ |
79 |
negate = 1; |
80 |
params++; |
81 |
} |
82 |
|
83 |
/* Simple case of a string pattern */ |
84 |
if ( !regex_pattern ) |
85 |
{ |
86 |
add_regular_expression( reg_list, *params, NULL, cflags, global, negate ); |
87 |
params++; |
88 |
continue; |
89 |
} |
90 |
|
91 |
word = *params; |
92 |
delimiter = (int)*word; |
93 |
|
94 |
word++; /* past the first delimiter */ |
95 |
|
96 |
if ( !(pos = strchr( word, delimiter ))) |
97 |
progerr("%s regex: failed to find search pattern delimiter '%c' in pattern '%s'", name, (char)delimiter, *params ); |
98 |
|
99 |
*pos = '\0'; |
100 |
|
101 |
|
102 |
/* now check for flags */ |
103 |
for ( ptr = pos + 1; *ptr; ptr++ ) |
104 |
{ |
105 |
if ( *ptr == 'i' ) |
106 |
cflags |= REG_ICASE; |
107 |
else if ( *ptr == 'm' ) |
108 |
cflags |= REG_NEWLINE; |
109 |
else |
110 |
progerr("%s regexp %s: unknown flag '%c'", name, *params, *ptr ); |
111 |
} |
112 |
|
113 |
add_regular_expression( reg_list, word, NULL, cflags, global, negate ); |
114 |
|
115 |
*pos = delimiter; /* put it back */ |
116 |
params++; |
117 |
} |
118 |
} |
119 |
|
120 |
/********************************************************************* |
121 |
* Adds a single regex replacement pattern |
122 |
* |
123 |
* Call With: |
124 |
* name = Descriptive name for errors - e.g. the name of the directive currently being processed |
125 |
* regex_list = pointer to the list of regular expressions |
126 |
* word = delimited regex pattern |
127 |
* |
128 |
* Returns: |
129 |
* void |
130 |
* |
131 |
* |
132 |
* |
133 |
**********************************************************************/ |
134 |
|
135 |
void add_replace_expression( char *name, regex_list **reg_list, char *expression ) |
136 |
|
137 |
{ |
138 |
char *word = estrdup( expression ); |
139 |
char *save = word; |
140 |
int delimiter = (int)*word; |
141 |
char *pos; |
142 |
char *pattern = NULL; |
143 |
char *replace = NULL; |
144 |
int cflags = REG_EXTENDED; |
145 |
int global = 0; |
146 |
char *ptr; |
147 |
|
148 |
|
149 |
word++; /* past the first delimiter */ |
150 |
|
151 |
if ( !(pos = strchr( word, delimiter ))) |
152 |
progerr("%s regex: failed to find search pattern delimiter '%c' in pattern '%s'", name, (char)delimiter, word ); |
153 |
|
154 |
*pos = '\0'; |
155 |
pattern = estrdup(word); |
156 |
|
157 |
word = pos + 1; /* now at replace pattern */ |
158 |
|
159 |
if ( !(pos = strchr( word, delimiter ))) |
160 |
progerr("%s regex: failed to find replace pattern delimiter '%c' in pattern '%s'", name, (char)delimiter, word ); |
161 |
|
162 |
*pos = '\0'; |
163 |
replace = estrdup(word); |
164 |
|
165 |
|
166 |
/* now check for flags */ |
167 |
for ( ptr = pos + 1; *ptr; ptr++ ) |
168 |
{ |
169 |
if ( *ptr == 'i' ) |
170 |
cflags |= REG_ICASE; |
171 |
|
172 |
else if ( *ptr == 'm' ) |
173 |
cflags |= REG_NEWLINE; |
174 |
|
175 |
else if ( *ptr == 'g' ) |
176 |
global++; |
177 |
else |
178 |
progerr("%s regexp %s: unknown flag '%c'", name, expression, *ptr ); |
179 |
} |
180 |
|
181 |
add_regular_expression( reg_list, pattern, replace, cflags, global, 0 ); |
182 |
|
183 |
efree( pattern ); |
184 |
efree( replace ); |
185 |
efree( save ); |
186 |
} |
187 |
|
188 |
|
189 |
|
190 |
/********************************************************************* |
191 |
* Match regular expressions |
192 |
* Works on a list of expressions, and returns true if *ANY* match |
193 |
* |
194 |
* |
195 |
**********************************************************************/ |
196 |
int match_regex_list( char *str, regex_list *regex ) |
197 |
{ |
198 |
regmatch_t pmatch[1]; |
199 |
int matched; |
200 |
|
201 |
while ( regex ) |
202 |
{ |
203 |
matched = regex->negate |
204 |
? regexec(®ex->re, str, (size_t) 1, pmatch, 0) != 0 |
205 |
: regexec(®ex->re, str, (size_t) 1, pmatch, 0) == 0; |
206 |
|
207 |
if ( DEBUG_MASK & DEBUG_REGEX ) |
208 |
printf("match %s %c~ m[%s] : %s\n", str, (int)(regex->negate ? '!' : '='), regex->pattern, matched ? "matched" : "nope" ); |
209 |
|
210 |
if ( matched ) |
211 |
return 1; |
212 |
|
213 |
regex = regex->next; |
214 |
} |
215 |
|
216 |
return 0; |
217 |
} |
218 |
|
219 |
|
220 |
/********************************************************************* |
221 |
* Process all the regular expressions in a regex_list |
222 |
* |
223 |
* |
224 |
**********************************************************************/ |
225 |
char *process_regex_list( char *str, regex_list *regex, int *matched ) |
226 |
{ |
227 |
if ( DEBUG_MASK & DEBUG_REGEX && regex ) |
228 |
printf("\nOriginal String: '%s'\n", str ); |
229 |
|
230 |
while ( regex ) |
231 |
{ |
232 |
str = regex_replace( str, regex, 0, matched ); |
233 |
regex = regex->next; |
234 |
|
235 |
if ( DEBUG_MASK & DEBUG_REGEX ) |
236 |
printf(" Result String: '%s'\n", str ); |
237 |
|
238 |
} |
239 |
|
240 |
return str; |
241 |
} |
242 |
|
243 |
/********************************************************************* |
244 |
* Regular Expression Substitution |
245 |
* |
246 |
* Rewritten 7/31/2001 - general purpose regexp |
247 |
* |
248 |
* Pass in a string and a regex_list pointer |
249 |
* |
250 |
* Returns: |
251 |
* a string. Either the original, or a replacement string |
252 |
* Frees passed in string if return is different. |
253 |
* |
254 |
* Notes: |
255 |
* Clearly, there must be a library to do this already. For /g I'm |
256 |
* recursively calling this. |
257 |
* |
258 |
* |
259 |
**********************************************************************/ |
260 |
static char *regex_replace( char *str, regex_list *regex, int offset, int *matched ) |
261 |
{ |
262 |
regmatch_t pmatch[MAXPAR]; |
263 |
char *c; |
264 |
char *newstr; |
265 |
int escape = 0; |
266 |
int pos = 0; |
267 |
int j; |
268 |
int last_offset = 0; |
269 |
|
270 |
if ( DEBUG_MASK & DEBUG_REGEX ) |
271 |
printf("replace %s =~ m[%s][%s]: %s\n", str + offset, regex->pattern, regex->replace, |
272 |
regexec(®ex->re, str + offset, (size_t) MAXPAR, pmatch, 0) ? "No Match" : "Matched" ); |
273 |
|
274 |
/* Run regex - return original string if no match (might be nice to print error msg? */ |
275 |
if ( regexec(®ex->re, str + offset, (size_t) MAXPAR, pmatch, 0) ) |
276 |
return str; |
277 |
|
278 |
|
279 |
/* Flag that a pattern matched */ |
280 |
(*matched)++; |
281 |
|
282 |
|
283 |
/* allocate a string long enough */ |
284 |
newstr = (char *) emalloc( offset + strlen( str ) + regex->replace_length + (regex->replace_count * strlen( str )) + 1 ); |
285 |
|
286 |
/* Copy everything before string */ |
287 |
for ( j=0; j < offset; j++ ) |
288 |
newstr[pos++] = str[j]; |
289 |
|
290 |
|
291 |
/* Copy everything before the match */ |
292 |
if ( pmatch[0].rm_so > 0 ) |
293 |
for ( j = offset; j < pmatch[0].rm_so + offset; j++ ) |
294 |
newstr[pos++] = str[j]; |
295 |
|
296 |
|
297 |
/* ugly section */ |
298 |
for ( c = regex->replace; *c; c++ ) |
299 |
{ |
300 |
if ( escape ) |
301 |
{ |
302 |
newstr[pos++] = *c; |
303 |
last_offset = pos; |
304 |
escape = 0; |
305 |
continue; |
306 |
} |
307 |
|
308 |
if ( *c == '\\' && *(c+1) ) |
309 |
{ |
310 |
escape = 1; |
311 |
continue; |
312 |
} |
313 |
|
314 |
if ( '$' == *c && *(c+1) ) |
315 |
{ |
316 |
char *start = NULL; |
317 |
char *end = NULL; |
318 |
|
319 |
c++; |
320 |
|
321 |
/* chars before match */ |
322 |
if ( '`' == *c ) |
323 |
{ |
324 |
if ( pmatch[0].rm_so + offset > 0 ) |
325 |
{ |
326 |
start = str; |
327 |
end = str + pmatch[0].rm_so + offset; |
328 |
} |
329 |
} |
330 |
|
331 |
/* chars after match */ |
332 |
else if ( '\'' == *c ) |
333 |
{ |
334 |
start = str + pmatch[0].rm_eo + offset; |
335 |
end = str + strlen( str ); |
336 |
} |
337 |
|
338 |
else if ( *c >= '0' && *c <= '9' ) |
339 |
{ |
340 |
int i = (int)( *c ) - (int)'0'; |
341 |
|
342 |
if ( pmatch[i].rm_so != -1 ) |
343 |
{ |
344 |
start = str + pmatch[i].rm_so + offset; |
345 |
end = str + pmatch[i].rm_eo + offset; |
346 |
} |
347 |
} |
348 |
|
349 |
else /* just copy the pattern */ |
350 |
{ |
351 |
start = c - 1; |
352 |
end = c + 1; |
353 |
} |
354 |
|
355 |
if ( start ) |
356 |
for ( ; start < end; start++ ) |
357 |
newstr[pos++] = *start; |
358 |
} |
359 |
|
360 |
/* not a replace pattern, just copy the char */ |
361 |
else |
362 |
newstr[pos++] = *c; |
363 |
|
364 |
last_offset = pos; |
365 |
} |
366 |
|
367 |
newstr[pos] = '\0'; |
368 |
|
369 |
/* Append any pattern after the string */ |
370 |
strcat( newstr, str+pmatch[0].rm_eo + offset ); |
371 |
|
372 |
|
373 |
efree( str ); |
374 |
|
375 |
|
376 |
/* This allow /g processing to match repeatedly */ |
377 |
/* I'm sure there a way to mess this up and end up with a regex loop... */ |
378 |
|
379 |
if ( regex->global && last_offset < strlen( newstr ) ) |
380 |
newstr = regex_replace( newstr, regex, last_offset, matched ); |
381 |
|
382 |
return newstr; |
383 |
} |
384 |
|
385 |
/********************************************************* |
386 |
* Free a regular express list |
387 |
* |
388 |
*********************************************************/ |
389 |
|
390 |
void free_regex_list( regex_list **reg_list ) |
391 |
{ |
392 |
regex_list *list = *reg_list; |
393 |
regex_list *next; |
394 |
while ( list ) |
395 |
{ |
396 |
if ( list->replace ) |
397 |
efree( list->replace ); |
398 |
|
399 |
if ( list->pattern ) |
400 |
efree( list->pattern ); |
401 |
|
402 |
regfree(&list->re); |
403 |
|
404 |
next = list->next; |
405 |
efree( list ); |
406 |
list = next; |
407 |
} |
408 |
*reg_list = NULL; |
409 |
} |
410 |
|
411 |
/**************************************************************************** |
412 |
* Create or Add a regular expression to a list |
413 |
* pre-compiles expression to check for errors and for speed |
414 |
* |
415 |
* Pattern and replace string passed in are duplicated |
416 |
* |
417 |
* |
418 |
*****************************************************************************/ |
419 |
|
420 |
void add_regular_expression( regex_list **reg_list, char *pattern, char *replace, int cflags, int global, int negate ) |
421 |
{ |
422 |
regex_list *new_node = emalloc( sizeof( regex_list ) ); |
423 |
regex_list *last; |
424 |
char *c; |
425 |
int status; |
426 |
int escape = 0; |
427 |
|
428 |
if ( (status = regcomp( &new_node->re, pattern, cflags ))) |
429 |
progerr("Failed to complie regular expression '%s', pattern. Error: %d", pattern, status ); |
430 |
|
431 |
|
432 |
|
433 |
new_node->pattern = pattern ? estrdup(pattern) : estrdup(""); /* only used for -T debugging */ |
434 |
new_node->replace = replace ? estrdup(replace) : estrdup(""); |
435 |
new_node->negate = negate; |
436 |
|
437 |
new_node->global = global; /* repeat flag */ |
438 |
|
439 |
new_node->replace_length = strlen( new_node->replace ); |
440 |
|
441 |
new_node->replace_count = 0; |
442 |
for ( c = new_node->replace; *c; c++ ) |
443 |
{ |
444 |
if ( escape ) |
445 |
{ |
446 |
escape = 0; |
447 |
continue; |
448 |
} |
449 |
|
450 |
if ( *c == '\\' ) |
451 |
{ |
452 |
escape = 1; |
453 |
continue; |
454 |
} |
455 |
|
456 |
if ( *c == '$' && *(c+1) ) |
457 |
new_node->replace_count++; |
458 |
} |
459 |
|
460 |
|
461 |
new_node->next = NULL; |
462 |
|
463 |
|
464 |
if ( *reg_list == NULL ) |
465 |
*reg_list = new_node; |
466 |
else |
467 |
{ |
468 |
/* get end of list */ |
469 |
for ( last = *reg_list; last->next; last = last->next ); |
470 |
|
471 |
last->next = new_node; |
472 |
} |
473 |
|
474 |
} |
475 |
|