1 |
adcroft |
1.1 |
/* |
2 |
|
|
$Id: entities.c,v 1.16 2002/05/17 22:38:18 whmoseley Exp $ |
3 |
|
|
** |
4 |
|
|
** This program and library is free software; you can redistribute it and/or |
5 |
|
|
** modify it under the terms of the GNU (Library) General Public License |
6 |
|
|
** as published by the Free Software Foundation; either version 2 |
7 |
|
|
** of the License, or any later version. |
8 |
|
|
** |
9 |
|
|
** This program is distributed in the hope that it will be useful, |
10 |
|
|
** but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 |
|
|
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12 |
|
|
** GNU (Library) General Public License for more details. |
13 |
|
|
** |
14 |
|
|
** You should have received a copy of the GNU (Library) General Public License |
15 |
|
|
** along with this program; if not, write to the Free Software |
16 |
|
|
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
17 |
|
|
** |
18 |
|
|
** (c) Rainer.Scherg |
19 |
|
|
** |
20 |
|
|
** |
21 |
|
|
** HTML entity routines (encoding, etc.): |
22 |
|
|
** |
23 |
|
|
** internally we are working with int/wchar_t to support unicode-16 for future |
24 |
|
|
** enhancements of swish (rasc - Rainer Scherg). |
25 |
|
|
** |
26 |
|
|
** 2001-05-05 rasc |
27 |
|
|
** |
28 |
|
|
*/ |
29 |
|
|
|
30 |
|
|
|
31 |
|
|
|
32 |
|
|
|
33 |
|
|
#include <stdlib.h> |
34 |
|
|
#include "swish.h" |
35 |
|
|
#include "mem.h" |
36 |
|
|
#include "string.h" |
37 |
|
|
#include "parse_conffile.h" |
38 |
|
|
#include "config.h" |
39 |
|
|
#include "entities.h" |
40 |
|
|
|
41 |
|
|
|
42 |
|
|
/* |
43 |
|
|
** ---------------------------------------------- |
44 |
|
|
** |
45 |
|
|
** Private Module Data |
46 |
|
|
** |
47 |
|
|
** ---------------------------------------------- |
48 |
|
|
*/ |
49 |
|
|
|
50 |
|
|
/* Forward declarations of module-private helpers */

static int is_EOE(int c);       /* "is End Of Entity" predicate */


/*
 * Upper bound on the length of a named entity: the terminating
 * character has to show up within this many characters after the '&'.
 */
#define MAX_ENTITY_LEN 16


/*
 * -- Entity encoding/decoding definitions
 *
 * End-of-entity test.  A strictly W3C compliant check would be
 *     #define IS_EOE(a)  ((a)==';')
 * The variant in use forwards to is_EOE() and is deliberately tolerant.
 */
#define IS_EOE(a)  (is_EOE((int)(a)))
65 |
|
|
|
66 |
|
|
|
67 |
|
|
/* One character entity: its name (as written between '&' and ';') and code. */
typedef struct
{
    char *name;                 /* entity name, e.g. "amp" */
    int   code;                 /* character code, e.g. 0x0026 */
} CEntity;
73 |
|
|
|
74 |
|
|
|
75 |
|
|
/* |
76 |
|
|
-- CEntity Quick Hash structure |
77 |
|
|
-- works like follow: Array of ASCII-7 start "positions" (1. char of entity name) |
78 |
|
|
-- each entry can have a chain of pointers |
79 |
|
|
-- e.g. "e; --> ['q']->ce(.name .code) |
80 |
|
|
-- ->next (chains all &q...;) |
81 |
|
|
-- lots of slots in the array will be empty because only [A-Z] and [a-z] |
82 |
|
|
-- is needed. But this cost hardly any memory, and is convenient... (rasc) |
83 |
|
|
-- The hash sequence list will be re-sequenced during(!) usage (dynamic re-chaining). |
84 |
|
|
-- This brings down compares to almost 1 strcmp on entity checks. |
85 |
|
|
-- |
86 |
|
|
-- Warning: don't change this (ce_hasharray,etc) unless you know how this really works! |
87 |
|
|
-- |
88 |
|
|
-- 2001-05-14 Rainer.Scherg@rexroth.de (rasc) |
89 |
|
|
-- |
90 |
|
|
*/ |
91 |
|
|
|
92 |
|
|
struct CEHE |
93 |
|
|
{ /* CharEntityHashEntry */ |
94 |
|
|
CEntity *ce; |
95 |
|
|
struct CEHE *next; |
96 |
|
|
}; |
97 |
|
|
|
98 |
|
|
static struct CEHE *ce_hasharray[128]; |
99 |
|
|
static int ce_hasharray_initialized = 0; |
100 |
|
|
|
101 |
|
|
|
102 |
|
|
/* |
103 |
|
|
-- the following table is retrieved from HTML4.x / SGML definitions |
104 |
|
|
-- of the W3C (did it automated 2001-05-05). |
105 |
|
|
-- http://www.w3.org/TR/html40/ |
106 |
|
|
-- http://www.w3.org/TR/1999/REC-html401-19991224/sgml/entities.html |
107 |
|
|
-- |
108 |
|
|
-- 2001-05-07 Rainer.Scherg |
109 |
|
|
*/ |
110 |
|
|
|
111 |
|
|
|
112 |
|
|
static CEntity entity_table[] = { |
113 |
|
|
{"quot", 0x0022}, /* quotation mark = APL quote, U+0022 ISOnum */ |
114 |
|
|
{"amp", 0x0026}, /* ampersand, U+0026 ISOnum */ |
115 |
|
|
{"apos", 0x0027}, /* single quote */ |
116 |
|
|
{"lt", 0x003C}, /* less-than sign, U+003C ISOnum */ |
117 |
|
|
{"gt", 0x003E}, /* greater-than sign, U+003E ISOnum */ |
118 |
|
|
|
119 |
|
|
/* |
120 |
|
|
* A bunch still in the 128-255 range |
121 |
|
|
* Replacing them depend really on the charset used. |
122 |
|
|
*/ |
123 |
|
|
{"nbsp", 0x00A0}, /* no-break space = non-breaking space, U+00A0 ISOnum */ |
124 |
|
|
{"iexcl", 0x00A1}, /* inverted exclamation mark, U+00A1 ISOnum */ |
125 |
|
|
{"cent", 0x00A2}, /* cent sign, U+00A2 ISOnum */ |
126 |
|
|
{"pound", 0x00A3}, /* pound sign, U+00A3 ISOnum */ |
127 |
|
|
{"curren", 0x00A4}, /* currency sign, U+00A4 ISOnum */ |
128 |
|
|
{"yen", 0x00A5}, /* yen sign = yuan sign, U+00A5 ISOnum */ |
129 |
|
|
{"brvbar", 0x00A6}, /* broken bar = broken vertical bar, U+00A6 ISOnum */ |
130 |
|
|
{"sect", 0x00A7}, /* section sign, U+00A7 ISOnum */ |
131 |
|
|
{"uml", 0x00A8}, /* diaeresis = spacing diaeresis, U+00A8 ISOdia */ |
132 |
|
|
{"copy", 0x00A9}, /* copyright sign, U+00A9 ISOnum */ |
133 |
|
|
{"ordf", 0x00AA}, /* feminine ordinal indicator, U+00AA ISOnum */ |
134 |
|
|
{"laquo", 0x00AB}, /* left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum */ |
135 |
|
|
{"not", 0x00AC}, /* not sign, U+00AC ISOnum */ |
136 |
|
|
{"shy", 0x00AD}, /* soft hyphen = discretionary hyphen, U+00AD ISOnum */ |
137 |
|
|
{"reg", 0x00AE}, /* registered sign = registered trade mark sign, U+00AE ISOnum */ |
138 |
|
|
{"macr", 0x00AF}, /* macron = spacing macron = overline = APL overbar, U+00AF ISOdia */ |
139 |
|
|
{"deg", 0x00B0}, /* degree sign, U+00B0 ISOnum */ |
140 |
|
|
{"plusmn", 0x00B1}, /* plus-minus sign = plus-or-minus sign, U+00B1 ISOnum */ |
141 |
|
|
{"sup2", 0x00B2}, /* superscript two = superscript digit two = squared, U+00B2 ISOnum */ |
142 |
|
|
{"sup3", 0x00B3}, /* superscript three = superscript digit three = cubed, U+00B3 ISOnum */ |
143 |
|
|
{"acute", 0x00B4}, /* acute accent = spacing acute, U+00B4 ISOdia */ |
144 |
|
|
{"micro", 0x00B5}, /* micro sign, U+00B5 ISOnum */ |
145 |
|
|
{"para", 0x00B6}, /* pilcrow sign = paragraph sign, U+00B6 ISOnum */ |
146 |
|
|
{"middot", 0x00B7}, /* middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum */ |
147 |
|
|
{"cedil", 0x00B8}, /* cedilla = spacing cedilla, U+00B8 ISOdia */ |
148 |
|
|
{"sup1", 0x00B9}, /* superscript one = superscript digit one, U+00B9 ISOnum */ |
149 |
|
|
{"ordm", 0x00BA}, /* masculine ordinal indicator, U+00BA ISOnum */ |
150 |
|
|
{"raquo", 0x00BB}, /* right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum */ |
151 |
|
|
{"frac14", 0x00BC}, /* vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum */ |
152 |
|
|
{"frac12", 0x00BD}, /* vulgar fraction one half = fraction one half, U+00BD ISOnum */ |
153 |
|
|
{"frac34", 0x00BE}, /* vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum */ |
154 |
|
|
{"iquest", 0x00BF}, /* inverted question mark = turned question mark, U+00BF ISOnum */ |
155 |
|
|
{"Agrave", 0x00C0}, /* latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1 */ |
156 |
|
|
{"Aacute", 0x00C1}, /* latin capital letter A with acute, U+00C1 ISOlat1 */ |
157 |
|
|
{"Acirc", 0x00C2}, /* latin capital letter A with circumflex, U+00C2 ISOlat1 */ |
158 |
|
|
{"Atilde", 0x00C3}, /* latin capital letter A with tilde, U+00C3 ISOlat1 */ |
159 |
|
|
{"Auml", 0x00C4}, /* latin capital letter A with diaeresis, U+00C4 ISOlat1 */ |
160 |
|
|
{"Aring", 0x00C5}, /* latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1 */ |
161 |
|
|
{"AElig", 0x00C6}, /* latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1 */ |
162 |
|
|
{"Ccedil", 0x00C7}, /* latin capital letter C with cedilla, U+00C7 ISOlat1 */ |
163 |
|
|
{"Egrave", 0x00C8}, /* latin capital letter E with grave, U+00C8 ISOlat1 */ |
164 |
|
|
{"Eacute", 0x00C9}, /* latin capital letter E with acute, U+00C9 ISOlat1 */ |
165 |
|
|
{"Ecirc", 0x00CA}, /* latin capital letter E with circumflex, U+00CA ISOlat1 */ |
166 |
|
|
{"Euml", 0x00CB}, /* latin capital letter E with diaeresis, U+00CB ISOlat1 */ |
167 |
|
|
{"Igrave", 0x00CC}, /* latin capital letter I with grave, U+00CC ISOlat1 */ |
168 |
|
|
{"Iacute", 0x00CD}, /* latin capital letter I with acute, U+00CD ISOlat1 */ |
169 |
|
|
{"Icirc", 0x00CE}, /* latin capital letter I with circumflex, U+00CE ISOlat1 */ |
170 |
|
|
{"Iuml", 0x00CF}, /* latin capital letter I with diaeresis, U+00CF ISOlat1 */ |
171 |
|
|
{"ETH", 0x00D0}, /* latin capital letter ETH, U+00D0 ISOlat1 */ |
172 |
|
|
{"Ntilde", 0x00D1}, /* latin capital letter N with tilde, U+00D1 ISOlat1 */ |
173 |
|
|
{"Ograve", 0x00D2}, /* latin capital letter O with grave, U+00D2 ISOlat1 */ |
174 |
|
|
{"Oacute", 0x00D3}, /* latin capital letter O with acute, U+00D3 ISOlat1 */ |
175 |
|
|
{"Ocirc", 0x00D4}, /* latin capital letter O with circumflex, U+00D4 ISOlat1 */ |
176 |
|
|
{"Otilde", 0x00D5}, /* latin capital letter O with tilde, U+00D5 ISOlat1 */ |
177 |
|
|
{"Ouml", 0x00D6}, /* latin capital letter O with diaeresis, U+00D6 ISOlat1 */ |
178 |
|
|
{"times", 0x00D7}, /* multiplication sign, U+00D7 ISOnum */ |
179 |
|
|
{"Oslash", 0x00D8}, /* latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1 */ |
180 |
|
|
{"Ugrave", 0x00D9}, /* latin capital letter U with grave, U+00D9 ISOlat1 */ |
181 |
|
|
{"Uacute", 0x00DA}, /* latin capital letter U with acute, U+00DA ISOlat1 */ |
182 |
|
|
{"Ucirc", 0x00DB}, /* latin capital letter U with circumflex, U+00DB ISOlat1 */ |
183 |
|
|
{"Uuml", 0x00DC}, /* latin capital letter U with diaeresis, U+00DC ISOlat1 */ |
184 |
|
|
{"Yacute", 0x00DD}, /* latin capital letter Y with acute, U+00DD ISOlat1 */ |
185 |
|
|
{"THORN", 0x00DE}, /* latin capital letter THORN, U+00DE ISOlat1 */ |
186 |
|
|
{"szlig", 0x00DF}, /* latin small letter sharp s = ess-zed, U+00DF ISOlat1 */ |
187 |
|
|
{"agrave", 0x00E0}, /* latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1 */ |
188 |
|
|
{"aacute", 0x00E1}, /* latin small letter a with acute, U+00E1 ISOlat1 */ |
189 |
|
|
{"acirc", 0x00E2}, /* latin small letter a with circumflex, U+00E2 ISOlat1 */ |
190 |
|
|
{"atilde", 0x00E3}, /* latin small letter a with tilde, U+00E3 ISOlat1 */ |
191 |
|
|
{"auml", 0x00E4}, /* latin small letter a with diaeresis, U+00E4 ISOlat1 */ |
192 |
|
|
{"aring", 0x00E5}, /* latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1 */ |
193 |
|
|
{"aelig", 0x00E6}, /* latin small letter ae = latin small ligature ae, U+00E6 ISOlat1 */ |
194 |
|
|
{"ccedil", 0x00E7}, /* latin small letter c with cedilla, U+00E7 ISOlat1 */ |
195 |
|
|
{"egrave", 0x00E8}, /* latin small letter e with grave, U+00E8 ISOlat1 */ |
196 |
|
|
{"eacute", 0x00E9}, /* latin small letter e with acute, U+00E9 ISOlat1 */ |
197 |
|
|
{"ecirc", 0x00EA}, /* latin small letter e with circumflex, U+00EA ISOlat1 */ |
198 |
|
|
{"euml", 0x00EB}, /* latin small letter e with diaeresis, U+00EB ISOlat1 */ |
199 |
|
|
{"igrave", 0x00EC}, /* latin small letter i with grave, U+00EC ISOlat1 */ |
200 |
|
|
{"iacute", 0x00ED}, /* latin small letter i with acute, U+00ED ISOlat1 */ |
201 |
|
|
{"icirc", 0x00EE}, /* latin small letter i with circumflex, U+00EE ISOlat1 */ |
202 |
|
|
{"iuml", 0x00EF}, /* latin small letter i with diaeresis, U+00EF ISOlat1 */ |
203 |
|
|
{"eth", 0x00F0}, /* latin small letter eth, U+00F0 ISOlat1 */ |
204 |
|
|
{"ntilde", 0x00F1}, /* latin small letter n with tilde, U+00F1 ISOlat1 */ |
205 |
|
|
{"ograve", 0x00F2}, /* latin small letter o with grave, U+00F2 ISOlat1 */ |
206 |
|
|
{"oacute", 0x00F3}, /* latin small letter o with acute, U+00F3 ISOlat1 */ |
207 |
|
|
{"ocirc", 0x00F4}, /* latin small letter o with circumflex, U+00F4 ISOlat1 */ |
208 |
|
|
{"otilde", 0x00F5}, /* latin small letter o with tilde, U+00F5 ISOlat1 */ |
209 |
|
|
{"ouml", 0x00F6}, /* latin small letter o with diaeresis, U+00F6 ISOlat1 */ |
210 |
|
|
{"divide", 0x00F7}, /* division sign, U+00F7 ISOnum */ |
211 |
|
|
{"oslash", 0x00F8}, /* latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1 */ |
212 |
|
|
{"ugrave", 0x00F9}, /* latin small letter u with grave, U+00F9 ISOlat1 */ |
213 |
|
|
{"uacute", 0x00FA}, /* latin small letter u with acute, U+00FA ISOlat1 */ |
214 |
|
|
{"ucirc", 0x00FB}, /* latin small letter u with circumflex, U+00FB ISOlat1 */ |
215 |
|
|
{"uuml", 0x00FC}, /* latin small letter u with diaeresis, U+00FC ISOlat1 */ |
216 |
|
|
{"yacute", 0x00FD}, /* latin small letter y with acute, U+00FD ISOlat1 */ |
217 |
|
|
{"thorn", 0x00FE}, /* latin small letter thorn with, U+00FE ISOlat1 */ |
218 |
|
|
{"yuml", 0x00FF}, /* latin small letter y with diaeresis, U+00FF ISOlat1 */ |
219 |
|
|
|
220 |
|
|
{"OElig", 0x0152}, /* latin capital ligature OE, U+0152 ISOlat2 */ |
221 |
|
|
{"oelig", 0x0153}, /* latin small ligature oe, U+0153 ISOlat2 */ |
222 |
|
|
{"Scaron", 0x0160}, /* latin capital letter S with caron, U+0160 ISOlat2 */ |
223 |
|
|
{"scaron", 0x0161}, /* latin small letter s with caron, U+0161 ISOlat2 */ |
224 |
|
|
{"Yuml", 0x0178}, /* latin capital letter Y with diaeresis, U+0178 ISOlat2 */ |
225 |
|
|
|
226 |
|
|
/* |
227 |
|
|
* Anything below should really be kept as entities references |
228 |
|
|
*/ |
229 |
|
|
|
230 |
|
|
/* |
231 |
|
|
-- Latin Extended-B |
232 |
|
|
*/ |
233 |
|
|
{"fnof", 0x0192}, /* latin small f with hook = function = florin, U+0192 ISOtech */ |
234 |
|
|
|
235 |
|
|
{"circ", 0x02C6}, /* modifier letter circumflex accent, U+02C6 ISOpub */ |
236 |
|
|
{"tilde", 0x02DC}, /* small tilde, U+02DC ISOdia */ |
237 |
|
|
|
238 |
|
|
/* |
239 |
|
|
-- Greek symbols |
240 |
|
|
*/ |
241 |
|
|
{"Alpha", 0x0391}, /* greek capital letter alpha, U+0391 */ |
242 |
|
|
{"Beta", 0x0392}, /* greek capital letter beta, U+0392 */ |
243 |
|
|
{"Gamma", 0x0393}, /* greek capital letter gamma, U+0393 ISOgrk3 */ |
244 |
|
|
{"Delta", 0x0394}, /* greek capital letter delta, U+0394 ISOgrk3 */ |
245 |
|
|
{"Epsilon", 0x0395}, /* greek capital letter epsilon, U+0395 */ |
246 |
|
|
{"Zeta", 0x0396}, /* greek capital letter zeta, U+0396 */ |
247 |
|
|
{"Eta", 0x0397}, /* greek capital letter eta, U+0397 */ |
248 |
|
|
{"Theta", 0x0398}, /* greek capital letter theta, U+0398 ISOgrk3 */ |
249 |
|
|
{"Iota", 0x0399}, /* greek capital letter iota, U+0399 */ |
250 |
|
|
{"Kappa", 0x039A}, /* greek capital letter kappa, U+039A */ |
251 |
|
|
{"Lambda", 0x039B}, /* greek capital letter lambda, U+039B ISOgrk3 */ |
252 |
|
|
{"Mu", 0x039C}, /* greek capital letter mu, U+039C */ |
253 |
|
|
{"Nu", 0x039D}, /* greek capital letter nu, U+039D */ |
254 |
|
|
{"Xi", 0x039E}, /* greek capital letter xi, U+039E ISOgrk3 */ |
255 |
|
|
{"Omicron", 0x039F}, /* greek capital letter omicron, U+039F */ |
256 |
|
|
{"Pi", 0x03A0}, /* greek capital letter pi, U+03A0 ISOgrk3 */ |
257 |
|
|
{"Rho", 0x03A1}, /* greek capital letter rho, U+03A1 */ |
258 |
|
|
/* -- there is no Sigmaf, and no U+03A2 character either */ |
259 |
|
|
{"Sigma", 0x03A3}, /* greek capital letter sigma, U+03A3 ISOgrk3 */ |
260 |
|
|
{"Tau", 0x03A4}, /* greek capital letter tau, U+03A4 */ |
261 |
|
|
{"Upsilon", 0x03A5}, /* greek capital letter upsilon, U+03A5 ISOgrk3 */ |
262 |
|
|
{"Phi", 0x03A6}, /* greek capital letter phi, U+03A6 ISOgrk3 */ |
263 |
|
|
{"Chi", 0x03A7}, /* greek capital letter chi, U+03A7 */ |
264 |
|
|
{"Psi", 0x03A8}, /* greek capital letter psi, U+03A8 ISOgrk3 */ |
265 |
|
|
{"Omega", 0x03A9}, /* greek capital letter omega, U+03A9 ISOgrk3 */ |
266 |
|
|
|
267 |
|
|
{"alpha", 0x03B1}, /* greek small letter alpha, U+03B1 ISOgrk3 */ |
268 |
|
|
{"beta", 0x03B2}, /* greek small letter beta, U+03B2 ISOgrk3 */ |
269 |
|
|
{"gamma", 0x03B3}, /* greek small letter gamma, U+03B3 ISOgrk3 */ |
270 |
|
|
{"delta", 0x03B4}, /* greek small letter delta, U+03B4 ISOgrk3 */ |
271 |
|
|
{"epsilon", 0x03B5}, /* greek small letter epsilon, U+03B5 ISOgrk3 */ |
272 |
|
|
{"zeta", 0x03B6}, /* greek small letter zeta, U+03B6 ISOgrk3 */ |
273 |
|
|
{"eta", 0x03B7}, /* greek small letter eta, U+03B7 ISOgrk3 */ |
274 |
|
|
{"theta", 0x03B8}, /* greek small letter theta, U+03B8 ISOgrk3 */ |
275 |
|
|
{"iota", 0x03B9}, /* greek small letter iota, U+03B9 ISOgrk3 */ |
276 |
|
|
{"kappa", 0x03BA}, /* greek small letter kappa, U+03BA ISOgrk3 */ |
277 |
|
|
{"lambda", 0x03BB}, /* greek small letter lambda, U+03BB ISOgrk3 */ |
278 |
|
|
{"mu", 0x03BC}, /* greek small letter mu, U+03BC ISOgrk3 */ |
279 |
|
|
{"nu", 0x03BD}, /* greek small letter nu, U+03BD ISOgrk3 */ |
280 |
|
|
{"xi", 0x03BE}, /* greek small letter xi, U+03BE ISOgrk3 */ |
281 |
|
|
{"omicron", 0x03BF}, /* greek small letter omicron, U+03BF NEW */ |
282 |
|
|
{"pi", 0x03C0}, /* greek small letter pi, U+03C0 ISOgrk3 */ |
283 |
|
|
{"rho", 0x03C1}, /* greek small letter rho, U+03C1 ISOgrk3 */ |
284 |
|
|
{"sigmaf", 0x03C2}, /* greek small letter final sigma, U+03C2 ISOgrk3 */ |
285 |
|
|
{"sigma", 0x03C3}, /* greek small letter sigma, U+03C3 ISOgrk3 */ |
286 |
|
|
{"tau", 0x03C4}, /* greek small letter tau, U+03C4 ISOgrk3 */ |
287 |
|
|
{"upsilon", 0x03C5}, /* greek small letter upsilon, U+03C5 ISOgrk3 */ |
288 |
|
|
{"phi", 0x03C6}, /* greek small letter phi, U+03C6 ISOgrk3 */ |
289 |
|
|
{"chi", 0x03C7}, /* greek small letter chi, U+03C7 ISOgrk3 */ |
290 |
|
|
{"psi", 0x03C8}, /* greek small letter psi, U+03C8 ISOgrk3 */ |
291 |
|
|
{"omega", 0x03C9}, /* greek small letter omega, U+03C9 ISOgrk3 */ |
292 |
|
|
{"thetasym", 0x03D1}, /* greek small letter theta symbol, U+03D1 NEW */ |
293 |
|
|
{"upsih", 0x03D2}, /* greek upsilon with hook symbol, U+03D2 NEW */ |
294 |
|
|
{"piv", 0x03D6}, /* greek pi symbol, U+03D6 ISOgrk3 */ |
295 |
|
|
|
296 |
|
|
{"ensp", 0x2002}, /* en space, U+2002 ISOpub */ |
297 |
|
|
{"emsp", 0x2003}, /* em space, U+2003 ISOpub */ |
298 |
|
|
{"thinsp", 0x2009}, /* thin space, U+2009 ISOpub */ |
299 |
|
|
{"zwnj", 0x200C}, /* zero width non-joiner, U+200C NEW RFC 2070 */ |
300 |
|
|
{"zwj", 0x200D}, /* zero width joiner, U+200D NEW RFC 2070 */ |
301 |
|
|
{"lrm", 0x200E}, /* left-to-right mark, U+200E NEW RFC 2070 */ |
302 |
|
|
{"rlm", 0x200F}, /* right-to-left mark, U+200F NEW RFC 2070 */ |
303 |
|
|
{"ndash", 0x2013}, /* en dash, U+2013 ISOpub */ |
304 |
|
|
{"mdash", 0x2014}, /* em dash, U+2014 ISOpub */ |
305 |
|
|
{"lsquo", 0x2018}, /* left single quotation mark, U+2018 ISOnum */ |
306 |
|
|
{"rsquo", 0x2019}, /* right single quotation mark, U+2019 ISOnum */ |
307 |
|
|
{"sbquo", 0x201A}, /* single low-9 quotation mark, U+201A NEW */ |
308 |
|
|
{"ldquo", 0x201C}, /* left double quotation mark, U+201C ISOnum */ |
309 |
|
|
{"rdquo", 0x201D}, /* right double quotation mark, U+201D ISOnum */ |
310 |
|
|
{"bdquo", 0x201E}, /* double low-9 quotation mark, U+201E NEW */ |
311 |
|
|
{"dagger", 0x2020}, /* dagger, U+2020 ISOpub */ |
312 |
|
|
{"Dagger", 0x2021}, /* double dagger, U+2021 ISOpub */ |
313 |
|
|
|
314 |
|
|
{"bull", 0x2022}, /* bullet = black small circle, U+2022 ISOpub */ |
315 |
|
|
{"hellip", 0x2026}, /* horizontal ellipsis = three dot leader, U+2026 ISOpub */ |
316 |
|
|
|
317 |
|
|
{"permil", 0x2030}, /* per mille sign, U+2030 ISOtech */ |
318 |
|
|
|
319 |
|
|
{"prime", 0x2032}, /* prime = minutes = feet, U+2032 ISOtech */ |
320 |
|
|
{"Prime", 0x2033}, /* double prime = seconds = inches, U+2033 ISOtech */ |
321 |
|
|
|
322 |
|
|
{"lsaquo", 0x2039}, /* single left-pointing angle quotation mark, U+2039 ISO proposed */ |
323 |
|
|
{"rsaquo", 0x203A}, /* single right-pointing angle quotation mark, U+203A ISO proposed */ |
324 |
|
|
|
325 |
|
|
{"oline", 0x203E}, /* overline = spacing overscore, U+203E NEW */ |
326 |
|
|
{"frasl", 0x2044}, /* fraction slash, U+2044 NEW */ |
327 |
|
|
|
328 |
|
|
{"euro", 0x20AC}, /* euro sign, U+20AC NEW */ |
329 |
|
|
|
330 |
|
|
/* -- Letterlike Symbols */ |
331 |
|
|
{"image", 0x2111}, /* blackletter capital I = imaginary part, U+2111 ISOamso */ |
332 |
|
|
{"weierp", 0x2118}, /* script capital P = power set = Weierstrass p, U+2118 ISOamso */ |
333 |
|
|
{"real", 0x211C}, /* blackletter capital R = real part symbol, U+211C ISOamso */ |
334 |
|
|
{"trade", 0x2122}, /* trade mark sign, U+2122 ISOnum */ |
335 |
|
|
|
336 |
|
|
/* -- alef symbol is NOT the same as hebrew letter alef, U+05D0 */ |
337 |
|
|
{"alefsym", 0x2135}, /* alef symbol = first transfinite cardinal, U+2135 NEW */ |
338 |
|
|
|
339 |
|
|
/* -- Arrow Symbols */ |
340 |
|
|
{"larr", 0x2190}, /* leftwards arrow, U+2190 ISOnum */ |
341 |
|
|
{"uarr", 0x2191}, /* upwards arrow, U+2191 ISOnum */ |
342 |
|
|
{"rarr", 0x2192}, /* rightwards arrow, U+2192 ISOnum */ |
343 |
|
|
{"darr", 0x2193}, /* downwards arrow, U+2193 ISOnum */ |
344 |
|
|
{"harr", 0x2194}, /* left right arrow, U+2194 ISOamsa */ |
345 |
|
|
{"crarr", 0x21B5}, /* downwards arrow with corner leftwards = carriage return, U+21B5 NEW */ |
346 |
|
|
{"lArr", 0x21D0}, /* leftwards double arrow, U+21D0 ISOtech */ |
347 |
|
|
{"uArr", 0x21D1}, /* upwards double arrow, U+21D1 ISOamsa */ |
348 |
|
|
{"rArr", 0x21D2}, /* rightwards double arrow, U+21D2 ISOtech */ |
349 |
|
|
{"dArr", 0x21D3}, /* downwards double arrow, U+21D3 ISOamsa */ |
350 |
|
|
{"hArr", 0x21D4}, /* left right double arrow, U+21D4 ISOamsa */ |
351 |
|
|
|
352 |
|
|
/* -- Mathematical Operators */ |
353 |
|
|
{"forall", 0x2200}, /* for all, U+2200 ISOtech */ |
354 |
|
|
{"part", 0x2202}, /* partial differential, U+2202 ISOtech */ |
355 |
|
|
{"exist", 0x2203}, /* there exists, U+2203 ISOtech */ |
356 |
|
|
{"empty", 0x2205}, /* empty set = null set = diameter, U+2205 ISOamso */ |
357 |
|
|
{"nabla", 0x2207}, /* nabla = backward difference, U+2207 ISOtech */ |
358 |
|
|
{"isin", 0x2208}, /* element of, U+2208 ISOtech */ |
359 |
|
|
{"notin", 0x2209}, /* not an element of, U+2209 ISOtech */ |
360 |
|
|
{"ni", 0x220B}, /* contains as member, U+220B ISOtech */ |
361 |
|
|
{"prod", 0x220F}, /* n-ary product = product sign, U+220F ISOamsb */ |
362 |
|
|
{"sum", 0x2211}, /* n-ary sumation, U+2211 ISOamsb */ |
363 |
|
|
{"minus", 0x2212}, /* minus sign, U+2212 ISOtech */ |
364 |
|
|
{"lowast", 0x2217}, /* asterisk operator, U+2217 ISOtech */ |
365 |
|
|
{"radic", 0x221A}, /* square root = radical sign, U+221A ISOtech */ |
366 |
|
|
{"prop", 0x221D}, /* proportional to, U+221D ISOtech */ |
367 |
|
|
{"infin", 0x221E}, /* infinity, U+221E ISOtech */ |
368 |
|
|
{"ang", 0x2220}, /* angle, U+2220 ISOamso */ |
369 |
|
|
{"and", 0x2227}, /* logical and = wedge, U+2227 ISOtech */ |
370 |
|
|
{"or", 0x2228}, /* logical or = vee, U+2228 ISOtech */ |
371 |
|
|
{"cap", 0x2229}, /* intersection = cap, U+2229 ISOtech */ |
372 |
|
|
{"cup", 0x222A}, /* union = cup, U+222A ISOtech */ |
373 |
|
|
{"int", 0x222B}, /* integral, U+222B ISOtech */ |
374 |
|
|
{"there4", 0x2234}, /* therefore, U+2234 ISOtech */ |
375 |
|
|
{"sim", 0x223C}, /* tilde operator = varies with = similar to, U+223C ISOtech */ |
376 |
|
|
{"cong", 0x2245}, /* approximately equal to, U+2245 ISOtech */ |
377 |
|
|
{"asymp", 0x2248}, /* almost equal to = asymptotic to, U+2248 ISOamsr */ |
378 |
|
|
{"ne", 0x2260}, /* not equal to, U+2260 ISOtech */ |
379 |
|
|
{"equiv", 0x2261}, /* identical to, U+2261 ISOtech */ |
380 |
|
|
{"le", 0x2264}, /* less-than or equal to, U+2264 ISOtech */ |
381 |
|
|
{"ge", 0x2265}, /* greater-than or equal to, U+2265 ISOtech */ |
382 |
|
|
{"sub", 0x2282}, /* subset of, U+2282 ISOtech */ |
383 |
|
|
{"sup", 0x2283}, /* superset of, U+2283 ISOtech */ |
384 |
|
|
{"nsub", 0x2284}, /* not a subset of, U+2284 ISOamsn */ |
385 |
|
|
{"sube", 0x2286}, /* subset of or equal to, U+2286 ISOtech */ |
386 |
|
|
{"supe", 0x2287}, /* superset of or equal to, U+2287 ISOtech */ |
387 |
|
|
{"oplus", 0x2295}, /* circled plus = direct sum, U+2295 ISOamsb */ |
388 |
|
|
{"otimes", 0x2297}, /* circled times = vector product, U+2297 ISOamsb */ |
389 |
|
|
{"perp", 0x22A5}, /* up tack = orthogonal to = perpendicular, U+22A5 ISOtech */ |
390 |
|
|
{"sdot", 0x22C5}, /* dot operator, U+22C5 ISOamsb */ |
391 |
|
|
{"lceil", 0x2308}, /* left ceiling = apl upstile, U+2308 ISOamsc */ |
392 |
|
|
{"rceil", 0x2309}, /* right ceiling, U+2309 ISOamsc */ |
393 |
|
|
{"lfloor", 0x230A}, /* left floor = apl downstile, U+230A ISOamsc */ |
394 |
|
|
{"rfloor", 0x230B}, /* right floor, U+230B ISOamsc */ |
395 |
|
|
{"lang", 0x2329}, /* left-pointing angle bracket = bra, U+2329 ISOtech */ |
396 |
|
|
{"rang", 0x232A}, /* right-pointing angle bracket = ket, U+232A ISOtech */ |
397 |
|
|
{"loz", 0x25CA}, /* lozenge, U+25CA ISOpub */ |
398 |
|
|
|
399 |
|
|
/* -- Miscellaneous Symbols */ |
400 |
|
|
{"spades", 0x2660}, /* black spade suit, U+2660 ISOpub */ |
401 |
|
|
{"clubs", 0x2663}, /* black club suit = shamrock, U+2663 ISOpub */ |
402 |
|
|
{"hearts", 0x2665}, /* black heart suit = valentine, U+2665 ISOpub */ |
403 |
|
|
{"diams", 0x2666}, /* black diamond suit, U+2666 ISOpub */ |
404 |
|
|
|
405 |
|
|
}; |
406 |
|
|
|
407 |
|
|
|
408 |
|
|
/* |
409 |
|
|
** ---------------------------------------------- |
410 |
|
|
** |
411 |
|
|
** Module management code starts here |
412 |
|
|
** |
413 |
|
|
** ---------------------------------------------- |
414 |
|
|
*/ |
415 |
|
|
|
416 |
|
|
/* |
417 |
|
|
-- init structures for Entities |
418 |
|
|
*/ |
419 |
|
|
|
420 |
|
|
void initModule_Entities(SWISH * sw) |
421 |
|
|
{ |
422 |
|
|
struct MOD_Entities *md; |
423 |
|
|
|
424 |
|
|
|
425 |
|
|
md = (struct MOD_Entities *) emalloc(sizeof(struct MOD_Entities)); |
426 |
|
|
|
427 |
|
|
sw->Entities = md; |
428 |
|
|
|
429 |
|
|
md->convertEntities = CONVERTHTMLENTITIES; |
430 |
|
|
|
431 |
|
|
/* |
432 |
|
|
-- init entity hash |
433 |
|
|
-- this is module local only |
434 |
|
|
*/ |
435 |
|
|
|
436 |
|
|
if ( !ce_hasharray_initialized++ ) |
437 |
|
|
{ |
438 |
|
|
int i, |
439 |
|
|
tab_len; |
440 |
|
|
CEntity *ce_p; |
441 |
|
|
struct CEHE **hash_pp, |
442 |
|
|
*tmp_p; |
443 |
|
|
|
444 |
|
|
/* empty positions */ |
445 |
|
|
for (i = 0; i < sizeof(ce_hasharray) / sizeof(ce_hasharray[0]); i++) |
446 |
|
|
ce_hasharray[i] = (struct CEHE *) NULL; |
447 |
|
|
|
448 |
|
|
|
449 |
|
|
/* |
450 |
|
|
-- fill entity table into hash |
451 |
|
|
-- process from end to start of entity_table, because most used |
452 |
|
|
-- entities are at the beginning (iso) |
453 |
|
|
-- this is due to "insert in hash sequence" behavior in hashtab (performance!) |
454 |
|
|
-- The improvement is minimal, because the hash table re-chains during usage. |
455 |
|
|
*/ |
456 |
|
|
|
457 |
|
|
tab_len = sizeof(entity_table) / sizeof(entity_table[0]); |
458 |
|
|
|
459 |
|
|
for (i = tab_len - 1; i >= 0; i--) |
460 |
|
|
{ |
461 |
|
|
ce_p = &entity_table[i]; |
462 |
|
|
hash_pp = &ce_hasharray[(int) *(ce_p->name) & 0x7F]; |
463 |
|
|
/* insert entity-ptr at start of ptr sequence in hash */ |
464 |
|
|
tmp_p = *hash_pp; |
465 |
|
|
*hash_pp = (struct CEHE *) emalloc(sizeof(struct CEHE)); |
466 |
|
|
(*hash_pp)->ce = ce_p; |
467 |
|
|
(*hash_pp)->next = tmp_p; |
468 |
|
|
} |
469 |
|
|
|
470 |
|
|
} /* end init hash block */ |
471 |
|
|
|
472 |
|
|
} |
473 |
|
|
|
474 |
|
|
|
475 |
|
|
|
476 |
|
|
/* |
477 |
|
|
-- release structures for Entities |
478 |
|
|
-- release all wired memory |
479 |
|
|
*/ |
480 |
|
|
|
481 |
|
|
void freeModule_Entities(SWISH * sw) |
482 |
|
|
{ |
483 |
|
|
|
484 |
|
|
/* free module data structure */ |
485 |
|
|
|
486 |
|
|
efree(sw->Entities); |
487 |
|
|
sw->Entities = NULL; |
488 |
|
|
|
489 |
|
|
|
490 |
|
|
/* |
491 |
|
|
-- free local entity hash table |
492 |
|
|
*/ |
493 |
|
|
{ |
494 |
|
|
int i; |
495 |
|
|
struct CEHE *hash_p, |
496 |
|
|
*tmp_p; |
497 |
|
|
/* free ptr "chains" in array */ |
498 |
|
|
for (i = 0; i < sizeof(ce_hasharray) / sizeof(ce_hasharray[0]); i++) |
499 |
|
|
{ |
500 |
|
|
hash_p = ce_hasharray[i]; |
501 |
|
|
while (hash_p) |
502 |
|
|
{ |
503 |
|
|
tmp_p = hash_p->next; |
504 |
|
|
efree(hash_p); |
505 |
|
|
hash_p = tmp_p; |
506 |
|
|
} |
507 |
|
|
ce_hasharray[i] = (struct CEHE *) NULL; |
508 |
|
|
} |
509 |
|
|
|
510 |
|
|
} /* end free hash block */ |
511 |
|
|
|
512 |
|
|
} |
513 |
|
|
|
514 |
|
|
|
515 |
|
|
/* |
516 |
|
|
** ---------------------------------------------- |
517 |
|
|
** |
518 |
|
|
** Module config code starts here |
519 |
|
|
** |
520 |
|
|
** ---------------------------------------------- |
521 |
|
|
*/ |
522 |
|
|
|
523 |
|
|
|
524 |
|
|
/* |
525 |
|
|
-- Config Directives |
526 |
|
|
-- Configuration directives for this Module |
527 |
|
|
-- return: 0/1 = none/config applied |
528 |
|
|
*/ |
529 |
|
|
|
530 |
|
|
int configModule_Entities(SWISH * sw, StringList * sl) |
531 |
|
|
{ |
532 |
|
|
struct MOD_Entities *md = sw->Entities; |
533 |
|
|
char *w0; |
534 |
|
|
int retval; |
535 |
|
|
|
536 |
|
|
|
537 |
|
|
w0 = sl->word[0]; |
538 |
|
|
retval = 1; |
539 |
|
|
|
540 |
|
|
|
541 |
|
|
if (strcasecmp(w0, "ConvertHTMLEntities") == 0) |
542 |
|
|
{ |
543 |
|
|
md->convertEntities = getYesNoOrAbort(sl, 1, 1); |
544 |
|
|
} |
545 |
|
|
else |
546 |
|
|
{ |
547 |
|
|
retval = 0; /* not a Entities directive */ |
548 |
|
|
} |
549 |
|
|
|
550 |
|
|
|
551 |
|
|
return retval; |
552 |
|
|
} |
553 |
|
|
|
554 |
|
|
|
555 |
|
|
|
556 |
|
|
|
557 |
|
|
|
558 |
|
|
|
559 |
|
|
/* |
560 |
|
|
** ---------------------------------------------- |
561 |
|
|
** |
562 |
|
|
** Module code starts here |
563 |
|
|
** |
564 |
|
|
** ---------------------------------------------- |
565 |
|
|
*/ |
566 |
|
|
|
567 |
|
|
|
568 |
|
|
/* |
569 |
|
|
-- convert a string containing HTML/XML entities |
570 |
|
|
-- conversion is done on the string itsself. |
571 |
|
|
-- conversion is only done, if config directive is set to "YES" |
572 |
|
|
-- return ptr to converted string. |
573 |
|
|
*/ |
574 |
|
|
|
575 |
|
|
unsigned char *sw_ConvHTMLEntities2ISO(SWISH * sw, unsigned char *s) |
576 |
|
|
{ |
577 |
|
|
return (sw->Entities->convertEntities) ? strConvHTMLEntities2ISO(s) : s; |
578 |
|
|
} |
579 |
|
|
|
580 |
|
|
|
581 |
|
|
|
582 |
|
|
/*
 * -- convert a string containing HTML/XML entities
 *
 * The conversion is done on the string itself: every entity is replaced
 * by the single character it stands for.  Entities that decode to zero
 * or to a code outside 1..255 (e.g. UNICODE entities) are dropped.
 * A '&' that does not start a valid entity is kept as it is, because
 * charEntityDecode() hands it back as a plain character.
 *
 * buf may be NULL, in which case NULL is returned.
 *
 * return: ptr to the converted string (== buf)
 */

unsigned char *strConvHTMLEntities2ISO(unsigned char *buf)
{
    unsigned char *src;         /* read position */
    unsigned char *dst;         /* write position, never ahead of src */

    if (!buf)
        return buf;

    src = dst = buf;

    while (*src)
    {
        unsigned char *next;
        int     code;

        /* ordinary character: copy and go on */
        if (*src != '&')
        {
            *dst++ = *src++;
            continue;
        }

        /*
         * Entity start: identify and decode.  The range check also
         * rejects negative codes, which would otherwise be written as
         * their truncated low byte.
         */
        code = charEntityDecode(src, &next);
        if (code > 0 && code < 256)
            *dst++ = (unsigned char) code;
        src = next;
    }
    *dst = '\0';

    return buf;
}
620 |
|
|
|
621 |
|
|
|
622 |
|
|
|
623 |
|
|
/* |
624 |
|
|
-- decode entity string to character code: |
625 |
|
|
-- &#dec; &#xhex; &#Xhex; &named; |
626 |
|
|
-- Decoding is hash optimized with dynamic re-chaining for |
627 |
|
|
-- performance improvement... |
628 |
|
|
-- return: entity character (decoded) |
629 |
|
|
-- position "end" (if != NULL) past "entity" or behind ret. char |
630 |
|
|
-- on illegal entities, just return the char... |
631 |
|
|
*/ |
632 |
|
|
|
633 |
|
|
int charEntityDecode(unsigned char *s, unsigned char **end) |
634 |
|
|
{ |
635 |
|
|
unsigned char *s1, |
636 |
|
|
*t; |
637 |
|
|
unsigned char *e_end; |
638 |
|
|
unsigned char s_cmp[MAX_ENTITY_LEN + 1]; |
639 |
|
|
int len; |
640 |
|
|
int code; |
641 |
|
|
|
642 |
|
|
|
643 |
|
|
/* |
644 |
|
|
-- no entity ctrl start char?, err: return char |
645 |
|
|
*/ |
646 |
|
|
if (*s != '&') |
647 |
|
|
{ |
648 |
|
|
if (end) |
649 |
|
|
*end = s + 1; |
650 |
|
|
return (int) *s; |
651 |
|
|
} |
652 |
|
|
|
653 |
|
|
|
654 |
|
|
|
655 |
|
|
/* ok, seems valid entity starting char */ |
656 |
|
|
code = 0; |
657 |
|
|
e_end = NULL; |
658 |
|
|
|
659 |
|
|
if (*(s + 1) == '#') |
660 |
|
|
{ /* numeric entity "&#" */ |
661 |
|
|
|
662 |
|
|
s += 2; /* after "&#" */ |
663 |
|
|
switch (*s) |
664 |
|
|
{ |
665 |
|
|
case 'x': |
666 |
|
|
case 'X': |
667 |
|
|
++s; /* skip x */ |
668 |
|
|
code = (int) strtoul((char *)s, (char **) &e_end, (int) 16); |
669 |
|
|
break; |
670 |
|
|
default: |
671 |
|
|
code = (int) strtoul((char *)s, (char **) &e_end, (int) 10); |
672 |
|
|
break; |
673 |
|
|
} |
674 |
|
|
|
675 |
|
|
} |
676 |
|
|
else |
677 |
|
|
{ |
678 |
|
|
|
679 |
|
|
/* |
680 |
|
|
-- ok, seems to be a named entity, find terminating char |
681 |
|
|
-- t = NULL if not found... |
682 |
|
|
-- if no char found: return '&' (illegal entity) |
683 |
|
|
*/ |
684 |
|
|
|
685 |
|
|
len = 0; |
686 |
|
|
t = NULL; |
687 |
|
|
s1 = s; |
688 |
|
|
while (len < MAX_ENTITY_LEN) |
689 |
|
|
{ |
690 |
|
|
s_cmp[len] = *(++s1); |
691 |
|
|
if (IS_EOE(*s1)) |
692 |
|
|
{ |
693 |
|
|
t = s1; /* End of named entity */ |
694 |
|
|
break; |
695 |
|
|
} |
696 |
|
|
if (!*s1) |
697 |
|
|
break; /* maybe this is also checked by is_EOE! */ |
698 |
|
|
len++; |
699 |
|
|
} |
700 |
|
|
s_cmp[len] = '\0'; |
701 |
|
|
|
702 |
|
|
/* |
703 |
|
|
-- hash search block |
704 |
|
|
-- case sensitiv search (hashvalue = 1 entity name char) |
705 |
|
|
-- (& 0x7F to prevent hashtable mem coredumps by illegal chars) |
706 |
|
|
-- improve performance, by rechaining found elements |
707 |
|
|
*/ |
708 |
|
|
|
709 |
|
|
if (t) |
710 |
|
|
{ |
711 |
|
|
struct CEHE *hash_p; |
712 |
|
|
struct CEHE **hash_pp, |
713 |
|
|
*last_p; |
714 |
|
|
|
715 |
|
|
hash_pp = &ce_hasharray[*(s + 1) & 0x7F]; |
716 |
|
|
last_p = NULL; |
717 |
|
|
hash_p = *hash_pp; |
718 |
|
|
while (hash_p) |
719 |
|
|
{ |
720 |
|
|
if (!strcmp( (char *)hash_p->ce->name, (char *)s_cmp)) |
721 |
|
|
{ |
722 |
|
|
code = hash_p->ce->code; |
723 |
|
|
if (last_p) |
724 |
|
|
{ /* rechain hash sequence list (last found = first) */ |
725 |
|
|
last_p->next = hash_p->next; /* take elem out of seq */ |
726 |
|
|
hash_p->next = *hash_pp; /* old 1. = 2. */ |
727 |
|
|
*hash_pp = hash_p; /* found = 1st */ |
728 |
|
|
} |
729 |
|
|
e_end = t; /* found -> set end */ |
730 |
|
|
break; |
731 |
|
|
} |
732 |
|
|
last_p = hash_p; |
733 |
|
|
hash_p = hash_p->next; |
734 |
|
|
} |
735 |
|
|
|
736 |
|
|
} |
737 |
|
|
} /* end if */ |
738 |
|
|
|
739 |
|
|
|
740 |
|
|
if (!e_end) |
741 |
|
|
{ |
742 |
|
|
code = *s; |
743 |
|
|
e_end = s + 1; |
744 |
|
|
} |
745 |
|
|
else |
746 |
|
|
{ |
747 |
|
|
if (*e_end == ';') |
748 |
|
|
e_end++; /* W3C EndOfEntity */ |
749 |
|
|
} |
750 |
|
|
|
751 |
|
|
|
752 |
|
|
if (end) |
753 |
|
|
*end = e_end; |
754 |
|
|
return code; |
755 |
|
|
} |
756 |
|
|
|
757 |
|
|
|
758 |
|
|
/*
 * -- check if a char is the end of a html entity.
 *
 * The behavior can be W3C pedantic (only ';') or tolerant; the choice
 * is made by the IS_EOE macro, so that the strict ==';' variant needs
 * no function call.  This is the tolerant variant: every character that
 * is not printable, or is punctuation, or is white space ends an entity.
 *
 * return: 1 if c ends an entity, 0 otherwise
 */

static int is_EOE(int c)
{
    /* be tolerant: only plain printable, non-blank, non-punctuation chars go on */
    if (isprint(c) && !ispunct(c) && !isspace(c))
        return 0;

    return 1;
}