/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/src/entities.c
ViewVC logotype

Contents of /mitgcm.org/devel/buildweb/pkg/swish-e/src/entities.c

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1.1.1 - (show annotations) (download) (vendor branch)
Fri Sep 20 19:47:29 2002 UTC (22 years, 10 months ago) by adcroft
Branch: Import, MAIN
CVS Tags: baseline, HEAD
Changes since 1.1: +0 -0 lines
File MIME type: text/plain
Error occurred while calculating annotation data.
Importing web-site building process.

1 /*
2 $Id: entities.c,v 1.16 2002/05/17 22:38:18 whmoseley Exp $
3 **
4 ** This program and library is free software; you can redistribute it and/or
5 ** modify it under the terms of the GNU (Library) General Public License
6 ** as published by the Free Software Foundation; either version 2
7 ** of the License, or any later version.
8 **
9 ** This program is distributed in the hope that it will be useful,
10 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
11 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 ** GNU (Library) General Public License for more details.
13 **
14 ** You should have received a copy of the GNU (Library) General Public License
15 ** along with this program; if not, write to the Free Software
16 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 **
18 ** (c) Rainer.Scherg
19 **
20 **
21 ** HTML entity routines (encoding, etc.):
22 **
23 ** internally we are working with int/wchar_t to support unicode-16 for future
24 ** enhancements of swish (rasc - Rainer Scherg).
25 **
26 ** 2001-05-05 rasc
27 **
28 */
29
30
31
32
33 #include <stdlib.h>
34 #include "swish.h"
35 #include "mem.h"
36 #include "string.h"
37 #include "parse_conffile.h"
38 #include "config.h"
39 #include "entities.h"
40
41
42 /*
43 ** ----------------------------------------------
44 **
45 ** Private Module Data
46 **
47 ** ----------------------------------------------
48 */
49
/* Forward declarations */

static int is_EOE(int c);       /* "is end of entity" test, defined later in this file */



/* Upper bound on the number of chars scanned after '&' while looking for the end of a named entity */
#define MAX_ENTITY_LEN 16
58
/*
 -- Entity encoding/decoding structure
 */

/* The strictly W3C compliant test would be:  #define IS_EOE(a) ((a)==';') */
#define IS_EOE(a) (is_EOE((int) (a)))   /* tolerant routine */


/* One named character entity: its name (as used between '&' and ';') and its character code */
typedef struct
{
    char *name;
    int   code;
} CEntity;
73
74
75 /*
76 -- CEntity Quick Hash structure
77 -- works like follow: Array of ASCII-7 start "positions" (1. char of entity name)
78 -- each entry can have a chain of pointers
79 -- e.g. &quote; --> ['q']->ce(.name .code)
80 -- ->next (chains all &q...;)
81 -- lots of slots in the array will be empty because only [A-Z] and [a-z]
82 -- is needed. But this cost hardly any memory, and is convenient... (rasc)
83 -- The hash sequence list will be re-sequenced during(!) usage (dynamic re-chaining).
84 -- This brings down compares to almost 1 strcmp on entity checks.
85 --
86 -- Warning: don't change this (ce_hasharray,etc) unless you know how this really works!
87 --
88 -- 2001-05-14 Rainer.Scherg@rexroth.de (rasc)
89 --
90 */
91
92 struct CEHE
93 { /* CharEntityHashEntry */
94 CEntity *ce;
95 struct CEHE *next;
96 };
97
/* chain heads, indexed by (first char of entity name) & 0x7F */
static struct CEHE *ce_hasharray[128];

/* incremented by every initModule_Entities() call; the hash is built when it was 0 */
static int ce_hasharray_initialized = 0;
100
101
102 /*
103 -- the following table was generated automatically (2001-05-05) from the
104 -- HTML4.x / SGML entity definitions of the W3C.
105 -- http://www.w3.org/TR/html40/
106 -- http://www.w3.org/TR/1999/REC-html401-19991224/sgml/entities.html
107 --
108 -- 2001-05-07 Rainer.Scherg
109 */
110
111
112 static CEntity entity_table[] = { /* entity name (case sensitive) -> character code */
113 {"quot", 0x0022}, /* quotation mark = APL quote, U+0022 ISOnum */
114 {"amp", 0x0026}, /* ampersand, U+0026 ISOnum */
115 {"apos", 0x0027}, /* single quote */
116 {"lt", 0x003C}, /* less-than sign, U+003C ISOnum */
117 {"gt", 0x003E}, /* greater-than sign, U+003E ISOnum */
118
119 /*
120 * A bunch still in the 128-255 range
121 * Replacing them really depends on the charset used.
122 */
123 {"nbsp", 0x00A0}, /* no-break space = non-breaking space, U+00A0 ISOnum */
124 {"iexcl", 0x00A1}, /* inverted exclamation mark, U+00A1 ISOnum */
125 {"cent", 0x00A2}, /* cent sign, U+00A2 ISOnum */
126 {"pound", 0x00A3}, /* pound sign, U+00A3 ISOnum */
127 {"curren", 0x00A4}, /* currency sign, U+00A4 ISOnum */
128 {"yen", 0x00A5}, /* yen sign = yuan sign, U+00A5 ISOnum */
129 {"brvbar", 0x00A6}, /* broken bar = broken vertical bar, U+00A6 ISOnum */
130 {"sect", 0x00A7}, /* section sign, U+00A7 ISOnum */
131 {"uml", 0x00A8}, /* diaeresis = spacing diaeresis, U+00A8 ISOdia */
132 {"copy", 0x00A9}, /* copyright sign, U+00A9 ISOnum */
133 {"ordf", 0x00AA}, /* feminine ordinal indicator, U+00AA ISOnum */
134 {"laquo", 0x00AB}, /* left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum */
135 {"not", 0x00AC}, /* not sign, U+00AC ISOnum */
136 {"shy", 0x00AD}, /* soft hyphen = discretionary hyphen, U+00AD ISOnum */
137 {"reg", 0x00AE}, /* registered sign = registered trade mark sign, U+00AE ISOnum */
138 {"macr", 0x00AF}, /* macron = spacing macron = overline = APL overbar, U+00AF ISOdia */
139 {"deg", 0x00B0}, /* degree sign, U+00B0 ISOnum */
140 {"plusmn", 0x00B1}, /* plus-minus sign = plus-or-minus sign, U+00B1 ISOnum */
141 {"sup2", 0x00B2}, /* superscript two = superscript digit two = squared, U+00B2 ISOnum */
142 {"sup3", 0x00B3}, /* superscript three = superscript digit three = cubed, U+00B3 ISOnum */
143 {"acute", 0x00B4}, /* acute accent = spacing acute, U+00B4 ISOdia */
144 {"micro", 0x00B5}, /* micro sign, U+00B5 ISOnum */
145 {"para", 0x00B6}, /* pilcrow sign = paragraph sign, U+00B6 ISOnum */
146 {"middot", 0x00B7}, /* middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum */
147 {"cedil", 0x00B8}, /* cedilla = spacing cedilla, U+00B8 ISOdia */
148 {"sup1", 0x00B9}, /* superscript one = superscript digit one, U+00B9 ISOnum */
149 {"ordm", 0x00BA}, /* masculine ordinal indicator, U+00BA ISOnum */
150 {"raquo", 0x00BB}, /* right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum */
151 {"frac14", 0x00BC}, /* vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum */
152 {"frac12", 0x00BD}, /* vulgar fraction one half = fraction one half, U+00BD ISOnum */
153 {"frac34", 0x00BE}, /* vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum */
154 {"iquest", 0x00BF}, /* inverted question mark = turned question mark, U+00BF ISOnum */
155 {"Agrave", 0x00C0}, /* latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1 */
156 {"Aacute", 0x00C1}, /* latin capital letter A with acute, U+00C1 ISOlat1 */
157 {"Acirc", 0x00C2}, /* latin capital letter A with circumflex, U+00C2 ISOlat1 */
158 {"Atilde", 0x00C3}, /* latin capital letter A with tilde, U+00C3 ISOlat1 */
159 {"Auml", 0x00C4}, /* latin capital letter A with diaeresis, U+00C4 ISOlat1 */
160 {"Aring", 0x00C5}, /* latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1 */
161 {"AElig", 0x00C6}, /* latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1 */
162 {"Ccedil", 0x00C7}, /* latin capital letter C with cedilla, U+00C7 ISOlat1 */
163 {"Egrave", 0x00C8}, /* latin capital letter E with grave, U+00C8 ISOlat1 */
164 {"Eacute", 0x00C9}, /* latin capital letter E with acute, U+00C9 ISOlat1 */
165 {"Ecirc", 0x00CA}, /* latin capital letter E with circumflex, U+00CA ISOlat1 */
166 {"Euml", 0x00CB}, /* latin capital letter E with diaeresis, U+00CB ISOlat1 */
167 {"Igrave", 0x00CC}, /* latin capital letter I with grave, U+00CC ISOlat1 */
168 {"Iacute", 0x00CD}, /* latin capital letter I with acute, U+00CD ISOlat1 */
169 {"Icirc", 0x00CE}, /* latin capital letter I with circumflex, U+00CE ISOlat1 */
170 {"Iuml", 0x00CF}, /* latin capital letter I with diaeresis, U+00CF ISOlat1 */
171 {"ETH", 0x00D0}, /* latin capital letter ETH, U+00D0 ISOlat1 */
172 {"Ntilde", 0x00D1}, /* latin capital letter N with tilde, U+00D1 ISOlat1 */
173 {"Ograve", 0x00D2}, /* latin capital letter O with grave, U+00D2 ISOlat1 */
174 {"Oacute", 0x00D3}, /* latin capital letter O with acute, U+00D3 ISOlat1 */
175 {"Ocirc", 0x00D4}, /* latin capital letter O with circumflex, U+00D4 ISOlat1 */
176 {"Otilde", 0x00D5}, /* latin capital letter O with tilde, U+00D5 ISOlat1 */
177 {"Ouml", 0x00D6}, /* latin capital letter O with diaeresis, U+00D6 ISOlat1 */
178 {"times", 0x00D7}, /* multiplication sign, U+00D7 ISOnum */
179 {"Oslash", 0x00D8}, /* latin capital letter O with stroke = latin capital letter O slash, U+00D8 ISOlat1 */
180 {"Ugrave", 0x00D9}, /* latin capital letter U with grave, U+00D9 ISOlat1 */
181 {"Uacute", 0x00DA}, /* latin capital letter U with acute, U+00DA ISOlat1 */
182 {"Ucirc", 0x00DB}, /* latin capital letter U with circumflex, U+00DB ISOlat1 */
183 {"Uuml", 0x00DC}, /* latin capital letter U with diaeresis, U+00DC ISOlat1 */
184 {"Yacute", 0x00DD}, /* latin capital letter Y with acute, U+00DD ISOlat1 */
185 {"THORN", 0x00DE}, /* latin capital letter THORN, U+00DE ISOlat1 */
186 {"szlig", 0x00DF}, /* latin small letter sharp s = ess-zed, U+00DF ISOlat1 */
187 {"agrave", 0x00E0}, /* latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1 */
188 {"aacute", 0x00E1}, /* latin small letter a with acute, U+00E1 ISOlat1 */
189 {"acirc", 0x00E2}, /* latin small letter a with circumflex, U+00E2 ISOlat1 */
190 {"atilde", 0x00E3}, /* latin small letter a with tilde, U+00E3 ISOlat1 */
191 {"auml", 0x00E4}, /* latin small letter a with diaeresis, U+00E4 ISOlat1 */
192 {"aring", 0x00E5}, /* latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1 */
193 {"aelig", 0x00E6}, /* latin small letter ae = latin small ligature ae, U+00E6 ISOlat1 */
194 {"ccedil", 0x00E7}, /* latin small letter c with cedilla, U+00E7 ISOlat1 */
195 {"egrave", 0x00E8}, /* latin small letter e with grave, U+00E8 ISOlat1 */
196 {"eacute", 0x00E9}, /* latin small letter e with acute, U+00E9 ISOlat1 */
197 {"ecirc", 0x00EA}, /* latin small letter e with circumflex, U+00EA ISOlat1 */
198 {"euml", 0x00EB}, /* latin small letter e with diaeresis, U+00EB ISOlat1 */
199 {"igrave", 0x00EC}, /* latin small letter i with grave, U+00EC ISOlat1 */
200 {"iacute", 0x00ED}, /* latin small letter i with acute, U+00ED ISOlat1 */
201 {"icirc", 0x00EE}, /* latin small letter i with circumflex, U+00EE ISOlat1 */
202 {"iuml", 0x00EF}, /* latin small letter i with diaeresis, U+00EF ISOlat1 */
203 {"eth", 0x00F0}, /* latin small letter eth, U+00F0 ISOlat1 */
204 {"ntilde", 0x00F1}, /* latin small letter n with tilde, U+00F1 ISOlat1 */
205 {"ograve", 0x00F2}, /* latin small letter o with grave, U+00F2 ISOlat1 */
206 {"oacute", 0x00F3}, /* latin small letter o with acute, U+00F3 ISOlat1 */
207 {"ocirc", 0x00F4}, /* latin small letter o with circumflex, U+00F4 ISOlat1 */
208 {"otilde", 0x00F5}, /* latin small letter o with tilde, U+00F5 ISOlat1 */
209 {"ouml", 0x00F6}, /* latin small letter o with diaeresis, U+00F6 ISOlat1 */
210 {"divide", 0x00F7}, /* division sign, U+00F7 ISOnum */
211 {"oslash", 0x00F8}, /* latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1 */
212 {"ugrave", 0x00F9}, /* latin small letter u with grave, U+00F9 ISOlat1 */
213 {"uacute", 0x00FA}, /* latin small letter u with acute, U+00FA ISOlat1 */
214 {"ucirc", 0x00FB}, /* latin small letter u with circumflex, U+00FB ISOlat1 */
215 {"uuml", 0x00FC}, /* latin small letter u with diaeresis, U+00FC ISOlat1 */
216 {"yacute", 0x00FD}, /* latin small letter y with acute, U+00FD ISOlat1 */
217 {"thorn", 0x00FE}, /* latin small letter thorn, U+00FE ISOlat1 */
218 {"yuml", 0x00FF}, /* latin small letter y with diaeresis, U+00FF ISOlat1 */
219
220 {"OElig", 0x0152}, /* latin capital ligature OE, U+0152 ISOlat2 */
221 {"oelig", 0x0153}, /* latin small ligature oe, U+0153 ISOlat2 */
222 {"Scaron", 0x0160}, /* latin capital letter S with caron, U+0160 ISOlat2 */
223 {"scaron", 0x0161}, /* latin small letter s with caron, U+0161 ISOlat2 */
224 {"Yuml", 0x0178}, /* latin capital letter Y with diaeresis, U+0178 ISOlat2 */
225
226 /*
227 * Anything below should really be kept as entity references
228 */
229
230 /*
231 -- Latin Extended-B
232 */
233 {"fnof", 0x0192}, /* latin small f with hook = function = florin, U+0192 ISOtech */
234
235 {"circ", 0x02C6}, /* modifier letter circumflex accent, U+02C6 ISOpub */
236 {"tilde", 0x02DC}, /* small tilde, U+02DC ISOdia */
237
238 /*
239 -- Greek symbols
240 */
241 {"Alpha", 0x0391}, /* greek capital letter alpha, U+0391 */
242 {"Beta", 0x0392}, /* greek capital letter beta, U+0392 */
243 {"Gamma", 0x0393}, /* greek capital letter gamma, U+0393 ISOgrk3 */
244 {"Delta", 0x0394}, /* greek capital letter delta, U+0394 ISOgrk3 */
245 {"Epsilon", 0x0395}, /* greek capital letter epsilon, U+0395 */
246 {"Zeta", 0x0396}, /* greek capital letter zeta, U+0396 */
247 {"Eta", 0x0397}, /* greek capital letter eta, U+0397 */
248 {"Theta", 0x0398}, /* greek capital letter theta, U+0398 ISOgrk3 */
249 {"Iota", 0x0399}, /* greek capital letter iota, U+0399 */
250 {"Kappa", 0x039A}, /* greek capital letter kappa, U+039A */
251 {"Lambda", 0x039B}, /* greek capital letter lambda, U+039B ISOgrk3 */
252 {"Mu", 0x039C}, /* greek capital letter mu, U+039C */
253 {"Nu", 0x039D}, /* greek capital letter nu, U+039D */
254 {"Xi", 0x039E}, /* greek capital letter xi, U+039E ISOgrk3 */
255 {"Omicron", 0x039F}, /* greek capital letter omicron, U+039F */
256 {"Pi", 0x03A0}, /* greek capital letter pi, U+03A0 ISOgrk3 */
257 {"Rho", 0x03A1}, /* greek capital letter rho, U+03A1 */
258 /* -- there is no Sigmaf, and no U+03A2 character either */
259 {"Sigma", 0x03A3}, /* greek capital letter sigma, U+03A3 ISOgrk3 */
260 {"Tau", 0x03A4}, /* greek capital letter tau, U+03A4 */
261 {"Upsilon", 0x03A5}, /* greek capital letter upsilon, U+03A5 ISOgrk3 */
262 {"Phi", 0x03A6}, /* greek capital letter phi, U+03A6 ISOgrk3 */
263 {"Chi", 0x03A7}, /* greek capital letter chi, U+03A7 */
264 {"Psi", 0x03A8}, /* greek capital letter psi, U+03A8 ISOgrk3 */
265 {"Omega", 0x03A9}, /* greek capital letter omega, U+03A9 ISOgrk3 */
266
267 {"alpha", 0x03B1}, /* greek small letter alpha, U+03B1 ISOgrk3 */
268 {"beta", 0x03B2}, /* greek small letter beta, U+03B2 ISOgrk3 */
269 {"gamma", 0x03B3}, /* greek small letter gamma, U+03B3 ISOgrk3 */
270 {"delta", 0x03B4}, /* greek small letter delta, U+03B4 ISOgrk3 */
271 {"epsilon", 0x03B5}, /* greek small letter epsilon, U+03B5 ISOgrk3 */
272 {"zeta", 0x03B6}, /* greek small letter zeta, U+03B6 ISOgrk3 */
273 {"eta", 0x03B7}, /* greek small letter eta, U+03B7 ISOgrk3 */
274 {"theta", 0x03B8}, /* greek small letter theta, U+03B8 ISOgrk3 */
275 {"iota", 0x03B9}, /* greek small letter iota, U+03B9 ISOgrk3 */
276 {"kappa", 0x03BA}, /* greek small letter kappa, U+03BA ISOgrk3 */
277 {"lambda", 0x03BB}, /* greek small letter lambda, U+03BB ISOgrk3 */
278 {"mu", 0x03BC}, /* greek small letter mu, U+03BC ISOgrk3 */
279 {"nu", 0x03BD}, /* greek small letter nu, U+03BD ISOgrk3 */
280 {"xi", 0x03BE}, /* greek small letter xi, U+03BE ISOgrk3 */
281 {"omicron", 0x03BF}, /* greek small letter omicron, U+03BF NEW */
282 {"pi", 0x03C0}, /* greek small letter pi, U+03C0 ISOgrk3 */
283 {"rho", 0x03C1}, /* greek small letter rho, U+03C1 ISOgrk3 */
284 {"sigmaf", 0x03C2}, /* greek small letter final sigma, U+03C2 ISOgrk3 */
285 {"sigma", 0x03C3}, /* greek small letter sigma, U+03C3 ISOgrk3 */
286 {"tau", 0x03C4}, /* greek small letter tau, U+03C4 ISOgrk3 */
287 {"upsilon", 0x03C5}, /* greek small letter upsilon, U+03C5 ISOgrk3 */
288 {"phi", 0x03C6}, /* greek small letter phi, U+03C6 ISOgrk3 */
289 {"chi", 0x03C7}, /* greek small letter chi, U+03C7 ISOgrk3 */
290 {"psi", 0x03C8}, /* greek small letter psi, U+03C8 ISOgrk3 */
291 {"omega", 0x03C9}, /* greek small letter omega, U+03C9 ISOgrk3 */
292 {"thetasym", 0x03D1}, /* greek small letter theta symbol, U+03D1 NEW */
293 {"upsih", 0x03D2}, /* greek upsilon with hook symbol, U+03D2 NEW */
294 {"piv", 0x03D6}, /* greek pi symbol, U+03D6 ISOgrk3 */
295
296 {"ensp", 0x2002}, /* en space, U+2002 ISOpub */
297 {"emsp", 0x2003}, /* em space, U+2003 ISOpub */
298 {"thinsp", 0x2009}, /* thin space, U+2009 ISOpub */
299 {"zwnj", 0x200C}, /* zero width non-joiner, U+200C NEW RFC 2070 */
300 {"zwj", 0x200D}, /* zero width joiner, U+200D NEW RFC 2070 */
301 {"lrm", 0x200E}, /* left-to-right mark, U+200E NEW RFC 2070 */
302 {"rlm", 0x200F}, /* right-to-left mark, U+200F NEW RFC 2070 */
303 {"ndash", 0x2013}, /* en dash, U+2013 ISOpub */
304 {"mdash", 0x2014}, /* em dash, U+2014 ISOpub */
305 {"lsquo", 0x2018}, /* left single quotation mark, U+2018 ISOnum */
306 {"rsquo", 0x2019}, /* right single quotation mark, U+2019 ISOnum */
307 {"sbquo", 0x201A}, /* single low-9 quotation mark, U+201A NEW */
308 {"ldquo", 0x201C}, /* left double quotation mark, U+201C ISOnum */
309 {"rdquo", 0x201D}, /* right double quotation mark, U+201D ISOnum */
310 {"bdquo", 0x201E}, /* double low-9 quotation mark, U+201E NEW */
311 {"dagger", 0x2020}, /* dagger, U+2020 ISOpub */
312 {"Dagger", 0x2021}, /* double dagger, U+2021 ISOpub */
313
314 {"bull", 0x2022}, /* bullet = black small circle, U+2022 ISOpub */
315 {"hellip", 0x2026}, /* horizontal ellipsis = three dot leader, U+2026 ISOpub */
316
317 {"permil", 0x2030}, /* per mille sign, U+2030 ISOtech */
318
319 {"prime", 0x2032}, /* prime = minutes = feet, U+2032 ISOtech */
320 {"Prime", 0x2033}, /* double prime = seconds = inches, U+2033 ISOtech */
321
322 {"lsaquo", 0x2039}, /* single left-pointing angle quotation mark, U+2039 ISO proposed */
323 {"rsaquo", 0x203A}, /* single right-pointing angle quotation mark, U+203A ISO proposed */
324
325 {"oline", 0x203E}, /* overline = spacing overscore, U+203E NEW */
326 {"frasl", 0x2044}, /* fraction slash, U+2044 NEW */
327
328 {"euro", 0x20AC}, /* euro sign, U+20AC NEW */
329
330 /* -- Letterlike Symbols */
331 {"image", 0x2111}, /* blackletter capital I = imaginary part, U+2111 ISOamso */
332 {"weierp", 0x2118}, /* script capital P = power set = Weierstrass p, U+2118 ISOamso */
333 {"real", 0x211C}, /* blackletter capital R = real part symbol, U+211C ISOamso */
334 {"trade", 0x2122}, /* trade mark sign, U+2122 ISOnum */
335
336 /* -- alef symbol is NOT the same as hebrew letter alef, U+05D0 */
337 {"alefsym", 0x2135}, /* alef symbol = first transfinite cardinal, U+2135 NEW */
338
339 /* -- Arrow Symbols */
340 {"larr", 0x2190}, /* leftwards arrow, U+2190 ISOnum */
341 {"uarr", 0x2191}, /* upwards arrow, U+2191 ISOnum */
342 {"rarr", 0x2192}, /* rightwards arrow, U+2192 ISOnum */
343 {"darr", 0x2193}, /* downwards arrow, U+2193 ISOnum */
344 {"harr", 0x2194}, /* left right arrow, U+2194 ISOamsa */
345 {"crarr", 0x21B5}, /* downwards arrow with corner leftwards = carriage return, U+21B5 NEW */
346 {"lArr", 0x21D0}, /* leftwards double arrow, U+21D0 ISOtech */
347 {"uArr", 0x21D1}, /* upwards double arrow, U+21D1 ISOamsa */
348 {"rArr", 0x21D2}, /* rightwards double arrow, U+21D2 ISOtech */
349 {"dArr", 0x21D3}, /* downwards double arrow, U+21D3 ISOamsa */
350 {"hArr", 0x21D4}, /* left right double arrow, U+21D4 ISOamsa */
351
352 /* -- Mathematical Operators */
353 {"forall", 0x2200}, /* for all, U+2200 ISOtech */
354 {"part", 0x2202}, /* partial differential, U+2202 ISOtech */
355 {"exist", 0x2203}, /* there exists, U+2203 ISOtech */
356 {"empty", 0x2205}, /* empty set = null set = diameter, U+2205 ISOamso */
357 {"nabla", 0x2207}, /* nabla = backward difference, U+2207 ISOtech */
358 {"isin", 0x2208}, /* element of, U+2208 ISOtech */
359 {"notin", 0x2209}, /* not an element of, U+2209 ISOtech */
360 {"ni", 0x220B}, /* contains as member, U+220B ISOtech */
361 {"prod", 0x220F}, /* n-ary product = product sign, U+220F ISOamsb */
362 {"sum", 0x2211}, /* n-ary summation, U+2211 ISOamsb */
363 {"minus", 0x2212}, /* minus sign, U+2212 ISOtech */
364 {"lowast", 0x2217}, /* asterisk operator, U+2217 ISOtech */
365 {"radic", 0x221A}, /* square root = radical sign, U+221A ISOtech */
366 {"prop", 0x221D}, /* proportional to, U+221D ISOtech */
367 {"infin", 0x221E}, /* infinity, U+221E ISOtech */
368 {"ang", 0x2220}, /* angle, U+2220 ISOamso */
369 {"and", 0x2227}, /* logical and = wedge, U+2227 ISOtech */
370 {"or", 0x2228}, /* logical or = vee, U+2228 ISOtech */
371 {"cap", 0x2229}, /* intersection = cap, U+2229 ISOtech */
372 {"cup", 0x222A}, /* union = cup, U+222A ISOtech */
373 {"int", 0x222B}, /* integral, U+222B ISOtech */
374 {"there4", 0x2234}, /* therefore, U+2234 ISOtech */
375 {"sim", 0x223C}, /* tilde operator = varies with = similar to, U+223C ISOtech */
376 {"cong", 0x2245}, /* approximately equal to, U+2245 ISOtech */
377 {"asymp", 0x2248}, /* almost equal to = asymptotic to, U+2248 ISOamsr */
378 {"ne", 0x2260}, /* not equal to, U+2260 ISOtech */
379 {"equiv", 0x2261}, /* identical to, U+2261 ISOtech */
380 {"le", 0x2264}, /* less-than or equal to, U+2264 ISOtech */
381 {"ge", 0x2265}, /* greater-than or equal to, U+2265 ISOtech */
382 {"sub", 0x2282}, /* subset of, U+2282 ISOtech */
383 {"sup", 0x2283}, /* superset of, U+2283 ISOtech */
384 {"nsub", 0x2284}, /* not a subset of, U+2284 ISOamsn */
385 {"sube", 0x2286}, /* subset of or equal to, U+2286 ISOtech */
386 {"supe", 0x2287}, /* superset of or equal to, U+2287 ISOtech */
387 {"oplus", 0x2295}, /* circled plus = direct sum, U+2295 ISOamsb */
388 {"otimes", 0x2297}, /* circled times = vector product, U+2297 ISOamsb */
389 {"perp", 0x22A5}, /* up tack = orthogonal to = perpendicular, U+22A5 ISOtech */
390 {"sdot", 0x22C5}, /* dot operator, U+22C5 ISOamsb */
391 {"lceil", 0x2308}, /* left ceiling = apl upstile, U+2308 ISOamsc */
392 {"rceil", 0x2309}, /* right ceiling, U+2309 ISOamsc */
393 {"lfloor", 0x230A}, /* left floor = apl downstile, U+230A ISOamsc */
394 {"rfloor", 0x230B}, /* right floor, U+230B ISOamsc */
395 {"lang", 0x2329}, /* left-pointing angle bracket = bra, U+2329 ISOtech */
396 {"rang", 0x232A}, /* right-pointing angle bracket = ket, U+232A ISOtech */
397 {"loz", 0x25CA}, /* lozenge, U+25CA ISOpub */
398
399 /* -- Miscellaneous Symbols */
400 {"spades", 0x2660}, /* black spade suit, U+2660 ISOpub */
401 {"clubs", 0x2663}, /* black club suit = shamrock, U+2663 ISOpub */
402 {"hearts", 0x2665}, /* black heart suit = valentine, U+2665 ISOpub */
403 {"diams", 0x2666}, /* black diamond suit, U+2666 ISOpub */
404
405 };
406
407
408 /*
409 ** ----------------------------------------------
410 **
411 ** Module management code starts here
412 **
413 ** ----------------------------------------------
414 */
415
416 /*
417 -- init structures for Entities
418 */
419
420 void initModule_Entities(SWISH * sw)
421 {
422 struct MOD_Entities *md;
423
424
425 md = (struct MOD_Entities *) emalloc(sizeof(struct MOD_Entities));
426
427 sw->Entities = md;
428
429 md->convertEntities = CONVERTHTMLENTITIES;
430
431 /*
432 -- init entity hash
433 -- this is module local only
434 */
435
436 if ( !ce_hasharray_initialized++ )
437 {
438 int i,
439 tab_len;
440 CEntity *ce_p;
441 struct CEHE **hash_pp,
442 *tmp_p;
443
444 /* empty positions */
445 for (i = 0; i < sizeof(ce_hasharray) / sizeof(ce_hasharray[0]); i++)
446 ce_hasharray[i] = (struct CEHE *) NULL;
447
448
449 /*
450 -- fill entity table into hash
451 -- process from end to start of entity_table, because most used
452 -- entities are at the beginning (iso)
453 -- this is due to "insert in hash sequence" behavior in hashtab (performance!)
454 -- The improvement is minimal, because the hash table re-chains during usage.
455 */
456
457 tab_len = sizeof(entity_table) / sizeof(entity_table[0]);
458
459 for (i = tab_len - 1; i >= 0; i--)
460 {
461 ce_p = &entity_table[i];
462 hash_pp = &ce_hasharray[(int) *(ce_p->name) & 0x7F];
463 /* insert entity-ptr at start of ptr sequence in hash */
464 tmp_p = *hash_pp;
465 *hash_pp = (struct CEHE *) emalloc(sizeof(struct CEHE));
466 (*hash_pp)->ce = ce_p;
467 (*hash_pp)->next = tmp_p;
468 }
469
470 } /* end init hash block */
471
472 }
473
474
475
476 /*
477 -- release structures for Entities
478 -- release all wired memory
479 */
480
481 void freeModule_Entities(SWISH * sw)
482 {
483
484 /* free module data structure */
485
486 efree(sw->Entities);
487 sw->Entities = NULL;
488
489
490 /*
491 -- free local entity hash table
492 */
493 {
494 int i;
495 struct CEHE *hash_p,
496 *tmp_p;
497 /* free ptr "chains" in array */
498 for (i = 0; i < sizeof(ce_hasharray) / sizeof(ce_hasharray[0]); i++)
499 {
500 hash_p = ce_hasharray[i];
501 while (hash_p)
502 {
503 tmp_p = hash_p->next;
504 efree(hash_p);
505 hash_p = tmp_p;
506 }
507 ce_hasharray[i] = (struct CEHE *) NULL;
508 }
509
510 } /* end free hash block */
511
512 }
513
514
515 /*
516 ** ----------------------------------------------
517 **
518 ** Module config code starts here
519 **
520 ** ----------------------------------------------
521 */
522
523
524 /*
525 -- Config Directives
526 -- Configuration directives for this Module
527 -- return: 0/1 = none/config applied
528 */
529
530 int configModule_Entities(SWISH * sw, StringList * sl)
531 {
532 struct MOD_Entities *md = sw->Entities;
533 char *w0;
534 int retval;
535
536
537 w0 = sl->word[0];
538 retval = 1;
539
540
541 if (strcasecmp(w0, "ConvertHTMLEntities") == 0)
542 {
543 md->convertEntities = getYesNoOrAbort(sl, 1, 1);
544 }
545 else
546 {
547 retval = 0; /* not a Entities directive */
548 }
549
550
551 return retval;
552 }
553
554
555
556
557
558
559 /*
560 ** ----------------------------------------------
561 **
562 ** Module code starts here
563 **
564 ** ----------------------------------------------
565 */
566
567
568 /*
569 -- convert a string containing HTML/XML entities
570 -- conversion is done on the string itsself.
571 -- conversion is only done, if config directive is set to "YES"
572 -- return ptr to converted string.
573 */
574
575 unsigned char *sw_ConvHTMLEntities2ISO(SWISH * sw, unsigned char *s)
576 {
577 return (sw->Entities->convertEntities) ? strConvHTMLEntities2ISO(s) : s;
578 }
579
580
581
/*
 -- convert a string containing HTML/XML entities
 -- The conversion is done on the string itself: every entity consumes
 -- at least one char and yields at most one, so the result never grows.
 -- Entities decoding to 0, to a code above 255 (UNICODE) or to a
 -- negative (out of range) code are dropped from the string.
 -- A NULL buf is returned unchanged.
 -- return: ptr to converted string (== buf)
 */

unsigned char *strConvHTMLEntities2ISO(unsigned char *buf)
{
    unsigned char *src;
    unsigned char *dst;


    if (!buf)
        return buf;

    src = dst = buf;

    while (*src)
    {
        unsigned char *next;
        int     code;

        /* plain char: copy it down and go on */
        if (*src != '&')
        {
            *dst++ = *src++;
            continue;
        }

        /* entity start found, identify and decode */
        code = charEntityDecode(src, &next);

        /*
           -- keep only codes that fit into one ISO byte.
           -- "code > 0" (and not just "code != 0") keeps a negative
           -- code from being truncated into a bogus byte.
         */
        if (code > 0 && code < 256)
            *dst++ = (unsigned char) code;

        src = next;
    }
    *dst = '\0';

    return buf;
}
620
621
622
623 /*
624 -- decode entity string to character code:
625 -- &#dec; &#xhex; &#Xhex; &named;
626 -- Decoding is hash optimized with dynamic re-chaining for
627 -- performance improvement...
628 -- return: entity character (decoded)
629 -- position "end" (if != NULL) past "entity" or behind ret. char
630 -- on illegal entities, just return the char...
631 */
632
633 int charEntityDecode(unsigned char *s, unsigned char **end)
634 {
635 unsigned char *s1,
636 *t;
637 unsigned char *e_end;
638 unsigned char s_cmp[MAX_ENTITY_LEN + 1];
639 int len;
640 int code;
641
642
643 /*
644 -- no entity ctrl start char?, err: return char
645 */
646 if (*s != '&')
647 {
648 if (end)
649 *end = s + 1;
650 return (int) *s;
651 }
652
653
654
655 /* ok, seems valid entity starting char */
656 code = 0;
657 e_end = NULL;
658
659 if (*(s + 1) == '#')
660 { /* numeric entity "&#" */
661
662 s += 2; /* after "&#" */
663 switch (*s)
664 {
665 case 'x':
666 case 'X':
667 ++s; /* skip x */
668 code = (int) strtoul((char *)s, (char **) &e_end, (int) 16);
669 break;
670 default:
671 code = (int) strtoul((char *)s, (char **) &e_end, (int) 10);
672 break;
673 }
674
675 }
676 else
677 {
678
679 /*
680 -- ok, seems to be a named entity, find terminating char
681 -- t = NULL if not found...
682 -- if no char found: return '&' (illegal entity)
683 */
684
685 len = 0;
686 t = NULL;
687 s1 = s;
688 while (len < MAX_ENTITY_LEN)
689 {
690 s_cmp[len] = *(++s1);
691 if (IS_EOE(*s1))
692 {
693 t = s1; /* End of named entity */
694 break;
695 }
696 if (!*s1)
697 break; /* maybe this is also checked by is_EOE! */
698 len++;
699 }
700 s_cmp[len] = '\0';
701
702 /*
703 -- hash search block
704 -- case sensitiv search (hashvalue = 1 entity name char)
705 -- (& 0x7F to prevent hashtable mem coredumps by illegal chars)
706 -- improve performance, by rechaining found elements
707 */
708
709 if (t)
710 {
711 struct CEHE *hash_p;
712 struct CEHE **hash_pp,
713 *last_p;
714
715 hash_pp = &ce_hasharray[*(s + 1) & 0x7F];
716 last_p = NULL;
717 hash_p = *hash_pp;
718 while (hash_p)
719 {
720 if (!strcmp( (char *)hash_p->ce->name, (char *)s_cmp))
721 {
722 code = hash_p->ce->code;
723 if (last_p)
724 { /* rechain hash sequence list (last found = first) */
725 last_p->next = hash_p->next; /* take elem out of seq */
726 hash_p->next = *hash_pp; /* old 1. = 2. */
727 *hash_pp = hash_p; /* found = 1st */
728 }
729 e_end = t; /* found -> set end */
730 break;
731 }
732 last_p = hash_p;
733 hash_p = hash_p->next;
734 }
735
736 }
737 } /* end if */
738
739
740 if (!e_end)
741 {
742 code = *s;
743 e_end = s + 1;
744 }
745 else
746 {
747 if (*e_end == ';')
748 e_end++; /* W3C EndOfEntity */
749 }
750
751
752 if (end)
753 *end = e_end;
754 return code;
755 }
756
757
/*
 -- check if a char ends a html entity (tolerant variant)
 -- Every char that is not printable, or is punctuation, or is white
 -- space counts as end of entity.
 -- (the IS_EOE macro exists so that the strict ==';' behavior
 --  could be used without a function call)
 -- return: 1 = end of entity, 0 = char still belongs to the entity
 */

static int is_EOE(int c)
{
    /* be tolerant ! */
    if (!isprint(c))
        return 1;
    if (ispunct(c))
        return 1;
    if (isspace(c))
        return 1;
    return 0;
}

  ViewVC Help
Powered by ViewVC 1.1.22