1 |
/* |
2 |
$Id: entities.c,v 1.16 2002/05/17 22:38:18 whmoseley Exp $ |
3 |
** |
4 |
** This program and library is free software; you can redistribute it and/or |
5 |
** modify it under the terms of the GNU (Library) General Public License |
6 |
** as published by the Free Software Foundation; either version 2 |
7 |
** of the License, or any later version. |
8 |
** |
9 |
** This program is distributed in the hope that it will be useful, |
10 |
** but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 |
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12 |
** GNU (Library) General Public License for more details. |
13 |
** |
14 |
** You should have received a copy of the GNU (Library) General Public License |
15 |
** along with this program; if not, write to the Free Software |
16 |
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
17 |
** |
18 |
** (c) Rainer.Scherg |
19 |
** |
20 |
** |
21 |
** HTML entity routines (encoding, etc.): |
22 |
** |
23 |
** internally we are working with int/wchar_t to support unicode-16 for future |
24 |
** enhancements of swish (rasc - Rainer Scherg). |
25 |
** |
26 |
** 2001-05-05 rasc |
27 |
** |
28 |
*/ |
29 |
|
30 |
|
31 |
|
32 |
|
33 |
#include <ctype.h>
#include <limits.h>
#include <stdlib.h>
#include "swish.h"
#include "mem.h"
#include "string.h"
#include "parse_conffile.h"
#include "config.h"
#include "entities.h"
40 |
|
41 |
|
42 |
/* |
43 |
** ---------------------------------------------- |
44 |
** |
45 |
** Private Module Data |
46 |
** |
47 |
** ---------------------------------------------- |
48 |
*/ |
49 |
|
50 |
/* Forward declarations of module-private helpers */

static int is_EOE(int c);       /* "is End Of Entity" predicate, defined at the end of this file */


/* Maximum number of characters scanned after the '&' while looking for the EOE */
#define MAX_ENTITY_LEN 16

/*
   -- Entity encoding/decoding structure
*/

/*
   -- End-of-entity test used by the decoder.
   -- The strictly W3C compliant form would be:  IS_EOE(a) == ((a) == ';')
   -- The tolerant routine is_EOE() is used instead.
*/
#define IS_EOE(ch) (is_EOE((int) (ch)))
65 |
|
66 |
|
67 |
/* One named character entity: the name as looked up by the decoder, and its character code */
typedef struct {
    char *name;                 /* entity name, e.g. "quot" */
    int code;                   /* character code the name stands for, e.g. 0x0022 */
} CEntity;
73 |
|
74 |
|
75 |
/* |
76 |
-- CEntity Quick Hash structure |
77 |
-- works like follow: Array of ASCII-7 start "positions" (1. char of entity name) |
78 |
-- each entry can have a chain of pointers |
79 |
-- e.g. "e; --> ['q']->ce(.name .code) |
80 |
-- ->next (chains all &q...;) |
81 |
-- lots of slots in the array will be empty because only [A-Z] and [a-z] |
82 |
-- is needed. But this cost hardly any memory, and is convenient... (rasc) |
83 |
-- The hash sequence list will be re-sequenced during(!) usage (dynamic re-chaining). |
84 |
-- This brings down compares to almost 1 strcmp on entity checks. |
85 |
-- |
86 |
-- Warning: don't change this (ce_hasharray,etc) unless you know how this really works! |
87 |
-- |
88 |
-- 2001-05-14 Rainer.Scherg@rexroth.de (rasc) |
89 |
-- |
90 |
*/ |
91 |
|
92 |
struct CEHE |
93 |
{ /* CharEntityHashEntry */ |
94 |
CEntity *ce; |
95 |
struct CEHE *next; |
96 |
}; |
97 |
|
98 |
static struct CEHE *ce_hasharray[128]; |
99 |
static int ce_hasharray_initialized = 0; |
100 |
|
101 |
|
102 |
/* |
103 |
-- the following table is retrieved from HTML4.x / SGML definitions |
104 |
-- of the W3C (did it automated 2001-05-05). |
105 |
-- http://www.w3.org/TR/html40/ |
106 |
-- http://www.w3.org/TR/1999/REC-html401-19991224/sgml/entities.html |
107 |
-- |
108 |
-- 2001-05-07 Rainer.Scherg |
109 |
*/ |
110 |
|
111 |
|
112 |
static CEntity entity_table[] = { |
113 |
{"quot", 0x0022}, /* quotation mark = APL quote, U+0022 ISOnum */ |
114 |
{"amp", 0x0026}, /* ampersand, U+0026 ISOnum */ |
115 |
{"apos", 0x0027}, /* single quote */ |
116 |
{"lt", 0x003C}, /* less-than sign, U+003C ISOnum */ |
117 |
{"gt", 0x003E}, /* greater-than sign, U+003E ISOnum */ |
118 |
|
119 |
/* |
120 |
* A bunch still in the 128-255 range |
121 |
* Replacing them depend really on the charset used. |
122 |
*/ |
123 |
{"nbsp", 0x00A0}, /* no-break space = non-breaking space, U+00A0 ISOnum */ |
124 |
{"iexcl", 0x00A1}, /* inverted exclamation mark, U+00A1 ISOnum */ |
125 |
{"cent", 0x00A2}, /* cent sign, U+00A2 ISOnum */ |
126 |
{"pound", 0x00A3}, /* pound sign, U+00A3 ISOnum */ |
127 |
{"curren", 0x00A4}, /* currency sign, U+00A4 ISOnum */ |
128 |
{"yen", 0x00A5}, /* yen sign = yuan sign, U+00A5 ISOnum */ |
129 |
{"brvbar", 0x00A6}, /* broken bar = broken vertical bar, U+00A6 ISOnum */ |
130 |
{"sect", 0x00A7}, /* section sign, U+00A7 ISOnum */ |
131 |
{"uml", 0x00A8}, /* diaeresis = spacing diaeresis, U+00A8 ISOdia */ |
132 |
{"copy", 0x00A9}, /* copyright sign, U+00A9 ISOnum */ |
133 |
{"ordf", 0x00AA}, /* feminine ordinal indicator, U+00AA ISOnum */ |
134 |
{"laquo", 0x00AB}, /* left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum */ |
135 |
{"not", 0x00AC}, /* not sign, U+00AC ISOnum */ |
136 |
{"shy", 0x00AD}, /* soft hyphen = discretionary hyphen, U+00AD ISOnum */ |
137 |
{"reg", 0x00AE}, /* registered sign = registered trade mark sign, U+00AE ISOnum */ |
138 |
{"macr", 0x00AF}, /* macron = spacing macron = overline = APL overbar, U+00AF ISOdia */ |
139 |
{"deg", 0x00B0}, /* degree sign, U+00B0 ISOnum */ |
140 |
{"plusmn", 0x00B1}, /* plus-minus sign = plus-or-minus sign, U+00B1 ISOnum */ |
141 |
{"sup2", 0x00B2}, /* superscript two = superscript digit two = squared, U+00B2 ISOnum */ |
142 |
{"sup3", 0x00B3}, /* superscript three = superscript digit three = cubed, U+00B3 ISOnum */ |
143 |
{"acute", 0x00B4}, /* acute accent = spacing acute, U+00B4 ISOdia */ |
144 |
{"micro", 0x00B5}, /* micro sign, U+00B5 ISOnum */ |
145 |
{"para", 0x00B6}, /* pilcrow sign = paragraph sign, U+00B6 ISOnum */ |
146 |
{"middot", 0x00B7}, /* middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum */ |
147 |
{"cedil", 0x00B8}, /* cedilla = spacing cedilla, U+00B8 ISOdia */ |
148 |
{"sup1", 0x00B9}, /* superscript one = superscript digit one, U+00B9 ISOnum */ |
149 |
{"ordm", 0x00BA}, /* masculine ordinal indicator, U+00BA ISOnum */ |
150 |
{"raquo", 0x00BB}, /* right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum */ |
151 |
{"frac14", 0x00BC}, /* vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum */ |
152 |
{"frac12", 0x00BD}, /* vulgar fraction one half = fraction one half, U+00BD ISOnum */ |
153 |
{"frac34", 0x00BE}, /* vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum */ |
154 |
{"iquest", 0x00BF}, /* inverted question mark = turned question mark, U+00BF ISOnum */ |
155 |
{"Agrave", 0x00C0}, /* latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1 */ |
156 |
{"Aacute", 0x00C1}, /* latin capital letter A with acute, U+00C1 ISOlat1 */ |
157 |
{"Acirc", 0x00C2}, /* latin capital letter A with circumflex, U+00C2 ISOlat1 */ |
158 |
{"Atilde", 0x00C3}, /* latin capital letter A with tilde, U+00C3 ISOlat1 */ |
159 |
{"Auml", 0x00C4}, /* latin capital letter A with diaeresis, U+00C4 ISOlat1 */ |
160 |
{"Aring", 0x00C5}, /* latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1 */ |
161 |
{"AElig", 0x00C6}, /* latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1 */ |
162 |
{"Ccedil", 0x00C7}, /* latin capital letter C with cedilla, U+00C7 ISOlat1 */ |
163 |
{"Egrave", 0x00C8}, /* latin capital letter E with grave, U+00C8 ISOlat1 */ |
164 |
{"Eacute", 0x00C9}, /* latin capital letter E with acute, U+00C9 ISOlat1 */ |
165 |
{"Ecirc", 0x00CA}, /* latin capital letter E with circumflex, U+00CA ISOlat1 */ |
166 |
{"Euml", 0x00CB}, /* latin capital letter E with diaeresis, U+00CB ISOlat1 */ |
167 |
{"Igrave", 0x00CC}, /* latin capital letter I with grave, U+00CC ISOlat1 */ |
168 |
{"Iacute", 0x00CD}, /* latin capital letter I with acute, U+00CD ISOlat1 */ |
169 |
{"Icirc", 0x00CE}, /* latin capital letter I with circumflex, U+00CE ISOlat1 */ |
170 |
{"Iuml", 0x00CF}, /* latin capital letter I with diaeresis, U+00CF ISOlat1 */ |
171 |
{"ETH", 0x00D0}, /* latin capital letter ETH, U+00D0 ISOlat1 */ |
172 |
{"Ntilde", 0x00D1}, /* latin capital letter N with tilde, U+00D1 ISOlat1 */ |
173 |
{"Ograve", 0x00D2}, /* latin capital letter O with grave, U+00D2 ISOlat1 */ |
174 |
{"Oacute", 0x00D3}, /* latin capital letter O with acute, U+00D3 ISOlat1 */ |
175 |
{"Ocirc", 0x00D4}, /* latin capital letter O with circumflex, U+00D4 ISOlat1 */ |
176 |
{"Otilde", 0x00D5}, /* latin capital letter O with tilde, U+00D5 ISOlat1 */ |
177 |
{"Ouml", 0x00D6}, /* latin capital letter O with diaeresis, U+00D6 ISOlat1 */ |
178 |
{"times", 0x00D7}, /* multiplication sign, U+00D7 ISOnum */ |
179 |
{"Oslash", 0x00D8}, /* latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1 */ |
180 |
{"Ugrave", 0x00D9}, /* latin capital letter U with grave, U+00D9 ISOlat1 */ |
181 |
{"Uacute", 0x00DA}, /* latin capital letter U with acute, U+00DA ISOlat1 */ |
182 |
{"Ucirc", 0x00DB}, /* latin capital letter U with circumflex, U+00DB ISOlat1 */ |
183 |
{"Uuml", 0x00DC}, /* latin capital letter U with diaeresis, U+00DC ISOlat1 */ |
184 |
{"Yacute", 0x00DD}, /* latin capital letter Y with acute, U+00DD ISOlat1 */ |
185 |
{"THORN", 0x00DE}, /* latin capital letter THORN, U+00DE ISOlat1 */ |
186 |
{"szlig", 0x00DF}, /* latin small letter sharp s = ess-zed, U+00DF ISOlat1 */ |
187 |
{"agrave", 0x00E0}, /* latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1 */ |
188 |
{"aacute", 0x00E1}, /* latin small letter a with acute, U+00E1 ISOlat1 */ |
189 |
{"acirc", 0x00E2}, /* latin small letter a with circumflex, U+00E2 ISOlat1 */ |
190 |
{"atilde", 0x00E3}, /* latin small letter a with tilde, U+00E3 ISOlat1 */ |
191 |
{"auml", 0x00E4}, /* latin small letter a with diaeresis, U+00E4 ISOlat1 */ |
192 |
{"aring", 0x00E5}, /* latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1 */ |
193 |
{"aelig", 0x00E6}, /* latin small letter ae = latin small ligature ae, U+00E6 ISOlat1 */ |
194 |
{"ccedil", 0x00E7}, /* latin small letter c with cedilla, U+00E7 ISOlat1 */ |
195 |
{"egrave", 0x00E8}, /* latin small letter e with grave, U+00E8 ISOlat1 */ |
196 |
{"eacute", 0x00E9}, /* latin small letter e with acute, U+00E9 ISOlat1 */ |
197 |
{"ecirc", 0x00EA}, /* latin small letter e with circumflex, U+00EA ISOlat1 */ |
198 |
{"euml", 0x00EB}, /* latin small letter e with diaeresis, U+00EB ISOlat1 */ |
199 |
{"igrave", 0x00EC}, /* latin small letter i with grave, U+00EC ISOlat1 */ |
200 |
{"iacute", 0x00ED}, /* latin small letter i with acute, U+00ED ISOlat1 */ |
201 |
{"icirc", 0x00EE}, /* latin small letter i with circumflex, U+00EE ISOlat1 */ |
202 |
{"iuml", 0x00EF}, /* latin small letter i with diaeresis, U+00EF ISOlat1 */ |
203 |
{"eth", 0x00F0}, /* latin small letter eth, U+00F0 ISOlat1 */ |
204 |
{"ntilde", 0x00F1}, /* latin small letter n with tilde, U+00F1 ISOlat1 */ |
205 |
{"ograve", 0x00F2}, /* latin small letter o with grave, U+00F2 ISOlat1 */ |
206 |
{"oacute", 0x00F3}, /* latin small letter o with acute, U+00F3 ISOlat1 */ |
207 |
{"ocirc", 0x00F4}, /* latin small letter o with circumflex, U+00F4 ISOlat1 */ |
208 |
{"otilde", 0x00F5}, /* latin small letter o with tilde, U+00F5 ISOlat1 */ |
209 |
{"ouml", 0x00F6}, /* latin small letter o with diaeresis, U+00F6 ISOlat1 */ |
210 |
{"divide", 0x00F7}, /* division sign, U+00F7 ISOnum */ |
211 |
{"oslash", 0x00F8}, /* latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1 */ |
212 |
{"ugrave", 0x00F9}, /* latin small letter u with grave, U+00F9 ISOlat1 */ |
213 |
{"uacute", 0x00FA}, /* latin small letter u with acute, U+00FA ISOlat1 */ |
214 |
{"ucirc", 0x00FB}, /* latin small letter u with circumflex, U+00FB ISOlat1 */ |
215 |
{"uuml", 0x00FC}, /* latin small letter u with diaeresis, U+00FC ISOlat1 */ |
216 |
{"yacute", 0x00FD}, /* latin small letter y with acute, U+00FD ISOlat1 */ |
217 |
{"thorn", 0x00FE}, /* latin small letter thorn with, U+00FE ISOlat1 */ |
218 |
{"yuml", 0x00FF}, /* latin small letter y with diaeresis, U+00FF ISOlat1 */ |
219 |
|
220 |
{"OElig", 0x0152}, /* latin capital ligature OE, U+0152 ISOlat2 */ |
221 |
{"oelig", 0x0153}, /* latin small ligature oe, U+0153 ISOlat2 */ |
222 |
{"Scaron", 0x0160}, /* latin capital letter S with caron, U+0160 ISOlat2 */ |
223 |
{"scaron", 0x0161}, /* latin small letter s with caron, U+0161 ISOlat2 */ |
224 |
{"Yuml", 0x0178}, /* latin capital letter Y with diaeresis, U+0178 ISOlat2 */ |
225 |
|
226 |
/* |
227 |
* Anything below should really be kept as entities references |
228 |
*/ |
229 |
|
230 |
/* |
231 |
-- Latin Extended-B |
232 |
*/ |
233 |
{"fnof", 0x0192}, /* latin small f with hook = function = florin, U+0192 ISOtech */ |
234 |
|
235 |
{"circ", 0x02C6}, /* modifier letter circumflex accent, U+02C6 ISOpub */ |
236 |
{"tilde", 0x02DC}, /* small tilde, U+02DC ISOdia */ |
237 |
|
238 |
/* |
239 |
-- Greek symbols |
240 |
*/ |
241 |
{"Alpha", 0x0391}, /* greek capital letter alpha, U+0391 */ |
242 |
{"Beta", 0x0392}, /* greek capital letter beta, U+0392 */ |
243 |
{"Gamma", 0x0393}, /* greek capital letter gamma, U+0393 ISOgrk3 */ |
244 |
{"Delta", 0x0394}, /* greek capital letter delta, U+0394 ISOgrk3 */ |
245 |
{"Epsilon", 0x0395}, /* greek capital letter epsilon, U+0395 */ |
246 |
{"Zeta", 0x0396}, /* greek capital letter zeta, U+0396 */ |
247 |
{"Eta", 0x0397}, /* greek capital letter eta, U+0397 */ |
248 |
{"Theta", 0x0398}, /* greek capital letter theta, U+0398 ISOgrk3 */ |
249 |
{"Iota", 0x0399}, /* greek capital letter iota, U+0399 */ |
250 |
{"Kappa", 0x039A}, /* greek capital letter kappa, U+039A */ |
251 |
{"Lambda", 0x039B}, /* greek capital letter lambda, U+039B ISOgrk3 */ |
252 |
{"Mu", 0x039C}, /* greek capital letter mu, U+039C */ |
253 |
{"Nu", 0x039D}, /* greek capital letter nu, U+039D */ |
254 |
{"Xi", 0x039E}, /* greek capital letter xi, U+039E ISOgrk3 */ |
255 |
{"Omicron", 0x039F}, /* greek capital letter omicron, U+039F */ |
256 |
{"Pi", 0x03A0}, /* greek capital letter pi, U+03A0 ISOgrk3 */ |
257 |
{"Rho", 0x03A1}, /* greek capital letter rho, U+03A1 */ |
258 |
/* -- there is no Sigmaf, and no U+03A2 character either */ |
259 |
{"Sigma", 0x03A3}, /* greek capital letter sigma, U+03A3 ISOgrk3 */ |
260 |
{"Tau", 0x03A4}, /* greek capital letter tau, U+03A4 */ |
261 |
{"Upsilon", 0x03A5}, /* greek capital letter upsilon, U+03A5 ISOgrk3 */ |
262 |
{"Phi", 0x03A6}, /* greek capital letter phi, U+03A6 ISOgrk3 */ |
263 |
{"Chi", 0x03A7}, /* greek capital letter chi, U+03A7 */ |
264 |
{"Psi", 0x03A8}, /* greek capital letter psi, U+03A8 ISOgrk3 */ |
265 |
{"Omega", 0x03A9}, /* greek capital letter omega, U+03A9 ISOgrk3 */ |
266 |
|
267 |
{"alpha", 0x03B1}, /* greek small letter alpha, U+03B1 ISOgrk3 */ |
268 |
{"beta", 0x03B2}, /* greek small letter beta, U+03B2 ISOgrk3 */ |
269 |
{"gamma", 0x03B3}, /* greek small letter gamma, U+03B3 ISOgrk3 */ |
270 |
{"delta", 0x03B4}, /* greek small letter delta, U+03B4 ISOgrk3 */ |
271 |
{"epsilon", 0x03B5}, /* greek small letter epsilon, U+03B5 ISOgrk3 */ |
272 |
{"zeta", 0x03B6}, /* greek small letter zeta, U+03B6 ISOgrk3 */ |
273 |
{"eta", 0x03B7}, /* greek small letter eta, U+03B7 ISOgrk3 */ |
274 |
{"theta", 0x03B8}, /* greek small letter theta, U+03B8 ISOgrk3 */ |
275 |
{"iota", 0x03B9}, /* greek small letter iota, U+03B9 ISOgrk3 */ |
276 |
{"kappa", 0x03BA}, /* greek small letter kappa, U+03BA ISOgrk3 */ |
277 |
{"lambda", 0x03BB}, /* greek small letter lambda, U+03BB ISOgrk3 */ |
278 |
{"mu", 0x03BC}, /* greek small letter mu, U+03BC ISOgrk3 */ |
279 |
{"nu", 0x03BD}, /* greek small letter nu, U+03BD ISOgrk3 */ |
280 |
{"xi", 0x03BE}, /* greek small letter xi, U+03BE ISOgrk3 */ |
281 |
{"omicron", 0x03BF}, /* greek small letter omicron, U+03BF NEW */ |
282 |
{"pi", 0x03C0}, /* greek small letter pi, U+03C0 ISOgrk3 */ |
283 |
{"rho", 0x03C1}, /* greek small letter rho, U+03C1 ISOgrk3 */ |
284 |
{"sigmaf", 0x03C2}, /* greek small letter final sigma, U+03C2 ISOgrk3 */ |
285 |
{"sigma", 0x03C3}, /* greek small letter sigma, U+03C3 ISOgrk3 */ |
286 |
{"tau", 0x03C4}, /* greek small letter tau, U+03C4 ISOgrk3 */ |
287 |
{"upsilon", 0x03C5}, /* greek small letter upsilon, U+03C5 ISOgrk3 */ |
288 |
{"phi", 0x03C6}, /* greek small letter phi, U+03C6 ISOgrk3 */ |
289 |
{"chi", 0x03C7}, /* greek small letter chi, U+03C7 ISOgrk3 */ |
290 |
{"psi", 0x03C8}, /* greek small letter psi, U+03C8 ISOgrk3 */ |
291 |
{"omega", 0x03C9}, /* greek small letter omega, U+03C9 ISOgrk3 */ |
292 |
{"thetasym", 0x03D1}, /* greek small letter theta symbol, U+03D1 NEW */ |
293 |
{"upsih", 0x03D2}, /* greek upsilon with hook symbol, U+03D2 NEW */ |
294 |
{"piv", 0x03D6}, /* greek pi symbol, U+03D6 ISOgrk3 */ |
295 |
|
296 |
{"ensp", 0x2002}, /* en space, U+2002 ISOpub */ |
297 |
{"emsp", 0x2003}, /* em space, U+2003 ISOpub */ |
298 |
{"thinsp", 0x2009}, /* thin space, U+2009 ISOpub */ |
299 |
{"zwnj", 0x200C}, /* zero width non-joiner, U+200C NEW RFC 2070 */ |
300 |
{"zwj", 0x200D}, /* zero width joiner, U+200D NEW RFC 2070 */ |
301 |
{"lrm", 0x200E}, /* left-to-right mark, U+200E NEW RFC 2070 */ |
302 |
{"rlm", 0x200F}, /* right-to-left mark, U+200F NEW RFC 2070 */ |
303 |
{"ndash", 0x2013}, /* en dash, U+2013 ISOpub */ |
304 |
{"mdash", 0x2014}, /* em dash, U+2014 ISOpub */ |
305 |
{"lsquo", 0x2018}, /* left single quotation mark, U+2018 ISOnum */ |
306 |
{"rsquo", 0x2019}, /* right single quotation mark, U+2019 ISOnum */ |
307 |
{"sbquo", 0x201A}, /* single low-9 quotation mark, U+201A NEW */ |
308 |
{"ldquo", 0x201C}, /* left double quotation mark, U+201C ISOnum */ |
309 |
{"rdquo", 0x201D}, /* right double quotation mark, U+201D ISOnum */ |
310 |
{"bdquo", 0x201E}, /* double low-9 quotation mark, U+201E NEW */ |
311 |
{"dagger", 0x2020}, /* dagger, U+2020 ISOpub */ |
312 |
{"Dagger", 0x2021}, /* double dagger, U+2021 ISOpub */ |
313 |
|
314 |
{"bull", 0x2022}, /* bullet = black small circle, U+2022 ISOpub */ |
315 |
{"hellip", 0x2026}, /* horizontal ellipsis = three dot leader, U+2026 ISOpub */ |
316 |
|
317 |
{"permil", 0x2030}, /* per mille sign, U+2030 ISOtech */ |
318 |
|
319 |
{"prime", 0x2032}, /* prime = minutes = feet, U+2032 ISOtech */ |
320 |
{"Prime", 0x2033}, /* double prime = seconds = inches, U+2033 ISOtech */ |
321 |
|
322 |
{"lsaquo", 0x2039}, /* single left-pointing angle quotation mark, U+2039 ISO proposed */ |
323 |
{"rsaquo", 0x203A}, /* single right-pointing angle quotation mark, U+203A ISO proposed */ |
324 |
|
325 |
{"oline", 0x203E}, /* overline = spacing overscore, U+203E NEW */ |
326 |
{"frasl", 0x2044}, /* fraction slash, U+2044 NEW */ |
327 |
|
328 |
{"euro", 0x20AC}, /* euro sign, U+20AC NEW */ |
329 |
|
330 |
/* -- Letterlike Symbols */ |
331 |
{"image", 0x2111}, /* blackletter capital I = imaginary part, U+2111 ISOamso */ |
332 |
{"weierp", 0x2118}, /* script capital P = power set = Weierstrass p, U+2118 ISOamso */ |
333 |
{"real", 0x211C}, /* blackletter capital R = real part symbol, U+211C ISOamso */ |
334 |
{"trade", 0x2122}, /* trade mark sign, U+2122 ISOnum */ |
335 |
|
336 |
/* -- alef symbol is NOT the same as hebrew letter alef, U+05D0 */ |
337 |
{"alefsym", 0x2135}, /* alef symbol = first transfinite cardinal, U+2135 NEW */ |
338 |
|
339 |
/* -- Arrow Symbols */ |
340 |
{"larr", 0x2190}, /* leftwards arrow, U+2190 ISOnum */ |
341 |
{"uarr", 0x2191}, /* upwards arrow, U+2191 ISOnum */ |
342 |
{"rarr", 0x2192}, /* rightwards arrow, U+2192 ISOnum */ |
343 |
{"darr", 0x2193}, /* downwards arrow, U+2193 ISOnum */ |
344 |
{"harr", 0x2194}, /* left right arrow, U+2194 ISOamsa */ |
345 |
{"crarr", 0x21B5}, /* downwards arrow with corner leftwards = carriage return, U+21B5 NEW */ |
346 |
{"lArr", 0x21D0}, /* leftwards double arrow, U+21D0 ISOtech */ |
347 |
{"uArr", 0x21D1}, /* upwards double arrow, U+21D1 ISOamsa */ |
348 |
{"rArr", 0x21D2}, /* rightwards double arrow, U+21D2 ISOtech */ |
349 |
{"dArr", 0x21D3}, /* downwards double arrow, U+21D3 ISOamsa */ |
350 |
{"hArr", 0x21D4}, /* left right double arrow, U+21D4 ISOamsa */ |
351 |
|
352 |
/* -- Mathematical Operators */ |
353 |
{"forall", 0x2200}, /* for all, U+2200 ISOtech */ |
354 |
{"part", 0x2202}, /* partial differential, U+2202 ISOtech */ |
355 |
{"exist", 0x2203}, /* there exists, U+2203 ISOtech */ |
356 |
{"empty", 0x2205}, /* empty set = null set = diameter, U+2205 ISOamso */ |
357 |
{"nabla", 0x2207}, /* nabla = backward difference, U+2207 ISOtech */ |
358 |
{"isin", 0x2208}, /* element of, U+2208 ISOtech */ |
359 |
{"notin", 0x2209}, /* not an element of, U+2209 ISOtech */ |
360 |
{"ni", 0x220B}, /* contains as member, U+220B ISOtech */ |
361 |
{"prod", 0x220F}, /* n-ary product = product sign, U+220F ISOamsb */ |
362 |
{"sum", 0x2211}, /* n-ary sumation, U+2211 ISOamsb */ |
363 |
{"minus", 0x2212}, /* minus sign, U+2212 ISOtech */ |
364 |
{"lowast", 0x2217}, /* asterisk operator, U+2217 ISOtech */ |
365 |
{"radic", 0x221A}, /* square root = radical sign, U+221A ISOtech */ |
366 |
{"prop", 0x221D}, /* proportional to, U+221D ISOtech */ |
367 |
{"infin", 0x221E}, /* infinity, U+221E ISOtech */ |
368 |
{"ang", 0x2220}, /* angle, U+2220 ISOamso */ |
369 |
{"and", 0x2227}, /* logical and = wedge, U+2227 ISOtech */ |
370 |
{"or", 0x2228}, /* logical or = vee, U+2228 ISOtech */ |
371 |
{"cap", 0x2229}, /* intersection = cap, U+2229 ISOtech */ |
372 |
{"cup", 0x222A}, /* union = cup, U+222A ISOtech */ |
373 |
{"int", 0x222B}, /* integral, U+222B ISOtech */ |
374 |
{"there4", 0x2234}, /* therefore, U+2234 ISOtech */ |
375 |
{"sim", 0x223C}, /* tilde operator = varies with = similar to, U+223C ISOtech */ |
376 |
{"cong", 0x2245}, /* approximately equal to, U+2245 ISOtech */ |
377 |
{"asymp", 0x2248}, /* almost equal to = asymptotic to, U+2248 ISOamsr */ |
378 |
{"ne", 0x2260}, /* not equal to, U+2260 ISOtech */ |
379 |
{"equiv", 0x2261}, /* identical to, U+2261 ISOtech */ |
380 |
{"le", 0x2264}, /* less-than or equal to, U+2264 ISOtech */ |
381 |
{"ge", 0x2265}, /* greater-than or equal to, U+2265 ISOtech */ |
382 |
{"sub", 0x2282}, /* subset of, U+2282 ISOtech */ |
383 |
{"sup", 0x2283}, /* superset of, U+2283 ISOtech */ |
384 |
{"nsub", 0x2284}, /* not a subset of, U+2284 ISOamsn */ |
385 |
{"sube", 0x2286}, /* subset of or equal to, U+2286 ISOtech */ |
386 |
{"supe", 0x2287}, /* superset of or equal to, U+2287 ISOtech */ |
387 |
{"oplus", 0x2295}, /* circled plus = direct sum, U+2295 ISOamsb */ |
388 |
{"otimes", 0x2297}, /* circled times = vector product, U+2297 ISOamsb */ |
389 |
{"perp", 0x22A5}, /* up tack = orthogonal to = perpendicular, U+22A5 ISOtech */ |
390 |
{"sdot", 0x22C5}, /* dot operator, U+22C5 ISOamsb */ |
391 |
{"lceil", 0x2308}, /* left ceiling = apl upstile, U+2308 ISOamsc */ |
392 |
{"rceil", 0x2309}, /* right ceiling, U+2309 ISOamsc */ |
393 |
{"lfloor", 0x230A}, /* left floor = apl downstile, U+230A ISOamsc */ |
394 |
{"rfloor", 0x230B}, /* right floor, U+230B ISOamsc */ |
395 |
{"lang", 0x2329}, /* left-pointing angle bracket = bra, U+2329 ISOtech */ |
396 |
{"rang", 0x232A}, /* right-pointing angle bracket = ket, U+232A ISOtech */ |
397 |
{"loz", 0x25CA}, /* lozenge, U+25CA ISOpub */ |
398 |
|
399 |
/* -- Miscellaneous Symbols */ |
400 |
{"spades", 0x2660}, /* black spade suit, U+2660 ISOpub */ |
401 |
{"clubs", 0x2663}, /* black club suit = shamrock, U+2663 ISOpub */ |
402 |
{"hearts", 0x2665}, /* black heart suit = valentine, U+2665 ISOpub */ |
403 |
{"diams", 0x2666}, /* black diamond suit, U+2666 ISOpub */ |
404 |
|
405 |
}; |
406 |
|
407 |
|
408 |
/* |
409 |
** ---------------------------------------------- |
410 |
** |
411 |
** Module management code starts here |
412 |
** |
413 |
** ---------------------------------------------- |
414 |
*/ |
415 |
|
416 |
/* |
417 |
-- init structures for Entities |
418 |
*/ |
419 |
|
420 |
void initModule_Entities(SWISH * sw) |
421 |
{ |
422 |
struct MOD_Entities *md; |
423 |
|
424 |
|
425 |
md = (struct MOD_Entities *) emalloc(sizeof(struct MOD_Entities)); |
426 |
|
427 |
sw->Entities = md; |
428 |
|
429 |
md->convertEntities = CONVERTHTMLENTITIES; |
430 |
|
431 |
/* |
432 |
-- init entity hash |
433 |
-- this is module local only |
434 |
*/ |
435 |
|
436 |
if ( !ce_hasharray_initialized++ ) |
437 |
{ |
438 |
int i, |
439 |
tab_len; |
440 |
CEntity *ce_p; |
441 |
struct CEHE **hash_pp, |
442 |
*tmp_p; |
443 |
|
444 |
/* empty positions */ |
445 |
for (i = 0; i < sizeof(ce_hasharray) / sizeof(ce_hasharray[0]); i++) |
446 |
ce_hasharray[i] = (struct CEHE *) NULL; |
447 |
|
448 |
|
449 |
/* |
450 |
-- fill entity table into hash |
451 |
-- process from end to start of entity_table, because most used |
452 |
-- entities are at the beginning (iso) |
453 |
-- this is due to "insert in hash sequence" behavior in hashtab (performance!) |
454 |
-- The improvement is minimal, because the hash table re-chains during usage. |
455 |
*/ |
456 |
|
457 |
tab_len = sizeof(entity_table) / sizeof(entity_table[0]); |
458 |
|
459 |
for (i = tab_len - 1; i >= 0; i--) |
460 |
{ |
461 |
ce_p = &entity_table[i]; |
462 |
hash_pp = &ce_hasharray[(int) *(ce_p->name) & 0x7F]; |
463 |
/* insert entity-ptr at start of ptr sequence in hash */ |
464 |
tmp_p = *hash_pp; |
465 |
*hash_pp = (struct CEHE *) emalloc(sizeof(struct CEHE)); |
466 |
(*hash_pp)->ce = ce_p; |
467 |
(*hash_pp)->next = tmp_p; |
468 |
} |
469 |
|
470 |
} /* end init hash block */ |
471 |
|
472 |
} |
473 |
|
474 |
|
475 |
|
476 |
/* |
477 |
-- release structures for Entities |
478 |
-- release all wired memory |
479 |
*/ |
480 |
|
481 |
void freeModule_Entities(SWISH * sw) |
482 |
{ |
483 |
|
484 |
/* free module data structure */ |
485 |
|
486 |
efree(sw->Entities); |
487 |
sw->Entities = NULL; |
488 |
|
489 |
|
490 |
/* |
491 |
-- free local entity hash table |
492 |
*/ |
493 |
{ |
494 |
int i; |
495 |
struct CEHE *hash_p, |
496 |
*tmp_p; |
497 |
/* free ptr "chains" in array */ |
498 |
for (i = 0; i < sizeof(ce_hasharray) / sizeof(ce_hasharray[0]); i++) |
499 |
{ |
500 |
hash_p = ce_hasharray[i]; |
501 |
while (hash_p) |
502 |
{ |
503 |
tmp_p = hash_p->next; |
504 |
efree(hash_p); |
505 |
hash_p = tmp_p; |
506 |
} |
507 |
ce_hasharray[i] = (struct CEHE *) NULL; |
508 |
} |
509 |
|
510 |
} /* end free hash block */ |
511 |
|
512 |
} |
513 |
|
514 |
|
515 |
/* |
516 |
** ---------------------------------------------- |
517 |
** |
518 |
** Module config code starts here |
519 |
** |
520 |
** ---------------------------------------------- |
521 |
*/ |
522 |
|
523 |
|
524 |
/* |
525 |
-- Config Directives |
526 |
-- Configuration directives for this Module |
527 |
-- return: 0/1 = none/config applied |
528 |
*/ |
529 |
|
530 |
int configModule_Entities(SWISH * sw, StringList * sl) |
531 |
{ |
532 |
struct MOD_Entities *md = sw->Entities; |
533 |
char *w0; |
534 |
int retval; |
535 |
|
536 |
|
537 |
w0 = sl->word[0]; |
538 |
retval = 1; |
539 |
|
540 |
|
541 |
if (strcasecmp(w0, "ConvertHTMLEntities") == 0) |
542 |
{ |
543 |
md->convertEntities = getYesNoOrAbort(sl, 1, 1); |
544 |
} |
545 |
else |
546 |
{ |
547 |
retval = 0; /* not a Entities directive */ |
548 |
} |
549 |
|
550 |
|
551 |
return retval; |
552 |
} |
553 |
|
554 |
|
555 |
|
556 |
|
557 |
|
558 |
|
559 |
/* |
560 |
** ---------------------------------------------- |
561 |
** |
562 |
** Module code starts here |
563 |
** |
564 |
** ---------------------------------------------- |
565 |
*/ |
566 |
|
567 |
|
568 |
/* |
569 |
-- convert a string containing HTML/XML entities |
570 |
-- conversion is done on the string itsself. |
571 |
-- conversion is only done, if config directive is set to "YES" |
572 |
-- return ptr to converted string. |
573 |
*/ |
574 |
|
575 |
unsigned char *sw_ConvHTMLEntities2ISO(SWISH * sw, unsigned char *s) |
576 |
{ |
577 |
return (sw->Entities->convertEntities) ? strConvHTMLEntities2ISO(s) : s; |
578 |
} |
579 |
|
580 |
|
581 |
|
582 |
/*
   -- Convert a string containing HTML/XML entities.
   -- The conversion is done in place on the string itself: every entity
   -- is decoded with charEntityDecode() and replaced by a single byte.
   -- Only codes in the range 1..255 are written. Entities that decode to
   -- zero, to a negative value or to a code above 255 (UNICODE) are
   -- dropped from the string.
   -- A NULL buf is returned unchanged.
   -- return: ptr to converted string (buf).
*/

unsigned char *strConvHTMLEntities2ISO(unsigned char *buf)
{
    unsigned char *src;         /* read position */
    unsigned char *dst;         /* write position */
    unsigned char *next;        /* first char behind the decoded entity */
    int     code;

    if (!buf)
        return buf;

    src = dst = buf;

    while (*src)
    {
        /* ordinary character: copy it through */
        if (*src != '&')
        {
            *dst++ = *src++;
            continue;
        }

        /* entity start found, identify and decode */
        code = charEntityDecode(src, &next);

        /*
           -- Keep single byte codes only. The explicit "> 0" also rejects
           -- negative codes, which would otherwise pass the "< 256" test
           -- and be stored as an arbitrary byte.
         */
        if (code > 0 && code < 256)
            *dst++ = (unsigned char) code;

        src = next;
    }
    *dst = '\0';

    return buf;
}
620 |
|
621 |
|
622 |
|
623 |
/* |
624 |
-- decode entity string to character code: |
625 |
-- &#dec; &#xhex; &#Xhex; &named; |
626 |
-- Decoding is hash optimized with dynamic re-chaining for |
627 |
-- performance improvement... |
628 |
-- return: entity character (decoded) |
629 |
-- position "end" (if != NULL) past "entity" or behind ret. char |
630 |
-- on illegal entities, just return the char... |
631 |
*/ |
632 |
|
633 |
int charEntityDecode(unsigned char *s, unsigned char **end) |
634 |
{ |
635 |
unsigned char *s1, |
636 |
*t; |
637 |
unsigned char *e_end; |
638 |
unsigned char s_cmp[MAX_ENTITY_LEN + 1]; |
639 |
int len; |
640 |
int code; |
641 |
|
642 |
|
643 |
/* |
644 |
-- no entity ctrl start char?, err: return char |
645 |
*/ |
646 |
if (*s != '&') |
647 |
{ |
648 |
if (end) |
649 |
*end = s + 1; |
650 |
return (int) *s; |
651 |
} |
652 |
|
653 |
|
654 |
|
655 |
/* ok, seems valid entity starting char */ |
656 |
code = 0; |
657 |
e_end = NULL; |
658 |
|
659 |
if (*(s + 1) == '#') |
660 |
{ /* numeric entity "&#" */ |
661 |
|
662 |
s += 2; /* after "&#" */ |
663 |
switch (*s) |
664 |
{ |
665 |
case 'x': |
666 |
case 'X': |
667 |
++s; /* skip x */ |
668 |
code = (int) strtoul((char *)s, (char **) &e_end, (int) 16); |
669 |
break; |
670 |
default: |
671 |
code = (int) strtoul((char *)s, (char **) &e_end, (int) 10); |
672 |
break; |
673 |
} |
674 |
|
675 |
} |
676 |
else |
677 |
{ |
678 |
|
679 |
/* |
680 |
-- ok, seems to be a named entity, find terminating char |
681 |
-- t = NULL if not found... |
682 |
-- if no char found: return '&' (illegal entity) |
683 |
*/ |
684 |
|
685 |
len = 0; |
686 |
t = NULL; |
687 |
s1 = s; |
688 |
while (len < MAX_ENTITY_LEN) |
689 |
{ |
690 |
s_cmp[len] = *(++s1); |
691 |
if (IS_EOE(*s1)) |
692 |
{ |
693 |
t = s1; /* End of named entity */ |
694 |
break; |
695 |
} |
696 |
if (!*s1) |
697 |
break; /* maybe this is also checked by is_EOE! */ |
698 |
len++; |
699 |
} |
700 |
s_cmp[len] = '\0'; |
701 |
|
702 |
/* |
703 |
-- hash search block |
704 |
-- case sensitiv search (hashvalue = 1 entity name char) |
705 |
-- (& 0x7F to prevent hashtable mem coredumps by illegal chars) |
706 |
-- improve performance, by rechaining found elements |
707 |
*/ |
708 |
|
709 |
if (t) |
710 |
{ |
711 |
struct CEHE *hash_p; |
712 |
struct CEHE **hash_pp, |
713 |
*last_p; |
714 |
|
715 |
hash_pp = &ce_hasharray[*(s + 1) & 0x7F]; |
716 |
last_p = NULL; |
717 |
hash_p = *hash_pp; |
718 |
while (hash_p) |
719 |
{ |
720 |
if (!strcmp( (char *)hash_p->ce->name, (char *)s_cmp)) |
721 |
{ |
722 |
code = hash_p->ce->code; |
723 |
if (last_p) |
724 |
{ /* rechain hash sequence list (last found = first) */ |
725 |
last_p->next = hash_p->next; /* take elem out of seq */ |
726 |
hash_p->next = *hash_pp; /* old 1. = 2. */ |
727 |
*hash_pp = hash_p; /* found = 1st */ |
728 |
} |
729 |
e_end = t; /* found -> set end */ |
730 |
break; |
731 |
} |
732 |
last_p = hash_p; |
733 |
hash_p = hash_p->next; |
734 |
} |
735 |
|
736 |
} |
737 |
} /* end if */ |
738 |
|
739 |
|
740 |
if (!e_end) |
741 |
{ |
742 |
code = *s; |
743 |
e_end = s + 1; |
744 |
} |
745 |
else |
746 |
{ |
747 |
if (*e_end == ';') |
748 |
e_end++; /* W3C EndOfEntity */ |
749 |
} |
750 |
|
751 |
|
752 |
if (end) |
753 |
*end = e_end; |
754 |
return code; |
755 |
} |
756 |
|
757 |
|
758 |
/*
   -- Check whether a char is the end of an HTML entity.
   -- The behavior can be W3C pedantic (only ';') or tolerant. The pedantic
   -- variant is mapped via the IS_EOE macro to avoid a function call;
   -- this routine is the tolerant one.
   -- return: 1 if c ends an entity (not printable, punctuation or
   --         white space), 0 otherwise
*/

static int is_EOE(int c)
{
    /* be tolerant ! */
    if (!isprint(c))
        return 1;
    if (ispunct(c))
        return 1;
    if (isspace(c))
        return 1;

    return 0;
}