/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/src/entities.c
ViewVC logotype

Annotation of /mitgcm.org/devel/buildweb/pkg/swish-e/src/entities.c

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1 - (hide annotations) (download)
Fri Sep 20 19:47:29 2002 UTC (22 years, 10 months ago) by adcroft
Branch point for: Import, MAIN
File MIME type: text/plain
Initial revision

1 adcroft 1.1 /*
2     $Id: entities.c,v 1.16 2002/05/17 22:38:18 whmoseley Exp $
3     **
4     ** This program and library is free software; you can redistribute it and/or
5     ** modify it under the terms of the GNU (Library) General Public License
6     ** as published by the Free Software Foundation; either version 2
7     ** of the License, or any later version.
8     **
9     ** This program is distributed in the hope that it will be useful,
10     ** but WITHOUT ANY WARRANTY; without even the implied warranty of
11     ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12     ** GNU (Library) General Public License for more details.
13     **
14     ** You should have received a copy of the GNU (Library) General Public License
15     ** along with this program; if not, write to the Free Software
16     ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17     **
18     ** (c) Rainer.Scherg
19     **
20     **
21     ** HTML entity routines (encoding, etc.):
22     **
23     ** internally we are working with int/wchar_t to support unicode-16 for future
24     ** enhancements of swish (rasc - Rainer Scherg).
25     **
26     ** 2001-05-05 rasc
27     **
28     */
29    
30    
31    
32    
#include <ctype.h>
#include <stdlib.h>
#include "swish.h"
#include "mem.h"
#include "string.h"
#include "parse_conffile.h"
#include "config.h"
#include "entities.h"
40    
41    
42     /*
43     ** ----------------------------------------------
44     **
45     ** Private Module Data
46     **
47     ** ----------------------------------------------
48     */
49    
/* Prototypes */

/* Tolerant end-of-entity test; the definition is at the end of this file. */
static int is_EOE(int c);       /* is_EndOfEntity */




/*
 * Upper bound on the number of characters charEntityDecode() scans after
 * the '&' of a named entity while looking for the end-of-entity character.
 */
#define MAX_ENTITY_LEN 16

/*
   -- Entity encoding/decoding structure
 */

/* Strict alternative, kept for reference: only ';' ends an entity. */
/* #define IS_EOE(a) ((a)==';') -- be W3C compliant */
#define IS_EOE(a) (is_EOE((int)(a)))    /* tolerant routine */
65    
66    
/* One entry of entity_table: a named character entity and its code. */
typedef struct
{
    char   *name;               /* entity name without '&' and ';', e.g. "amp" */
    int     code;               /* character code the entity decodes to */
}
CEntity;
73    
74    
75     /*
76     -- CEntity Quick Hash structure
77     -- works as follows: array of ASCII-7 start "positions" (1st char of entity name),
78     -- each entry can have a chain of pointers
79     -- e.g. &quot; --> ['q']->ce(.name .code)
80     -- ->next (chains all &q...;)
81     -- lots of slots in the array will be empty because only [A-Z] and [a-z]
82     -- are needed. But this costs hardly any memory, and is convenient... (rasc)
83     -- The hash sequence list will be re-sequenced during(!) usage (dynamic re-chaining).
84     -- This brings down compares to almost 1 strcmp on entity checks.
85     --
86     -- Warning: don't change this (ce_hasharray,etc) unless you know how this really works!
87     --
88     -- 2001-05-14 Rainer.Scherg@rexroth.de (rasc)
89     --
90     */
91    
/* One link in a lookup chain; each link points at an entity_table entry. */
struct CEHE
{                               /* CharEntityHashEntry */
    CEntity *ce;                /* entity this link refers to */
    struct CEHE *next;          /* next link in the same chain, NULL at the end */
};

/* Chain heads, indexed by the first character of the entity name & 0x7F. */
static struct CEHE *ce_hasharray[128];
/* Incremented by every initModule_Entities() call; the chains are built
   only by the call that finds it at zero. */
static int ce_hasharray_initialized = 0;
100    
101    
102     /*
103     -- the following table is retrieved from HTML4.x / SGML definitions
104     -- of the W3C (did it automated 2001-05-05).
105     -- http://www.w3.org/TR/html40/
106     -- http://www.w3.org/TR/1999/REC-html401-19991224/sgml/entities.html
107     --
108     -- 2001-05-07 Rainer.Scherg
109     */
110    
111    
112     static CEntity entity_table[] = {
113     {"quot", 0x0022}, /* quotation mark = APL quote, U+0022 ISOnum */
114     {"amp", 0x0026}, /* ampersand, U+0026 ISOnum */
115     {"apos", 0x0027}, /* single quote */
116     {"lt", 0x003C}, /* less-than sign, U+003C ISOnum */
117     {"gt", 0x003E}, /* greater-than sign, U+003E ISOnum */
118    
119     /*
120     * A bunch still in the 128-255 range
121     * Replacing them depend really on the charset used.
122     */
123     {"nbsp", 0x00A0}, /* no-break space = non-breaking space, U+00A0 ISOnum */
124     {"iexcl", 0x00A1}, /* inverted exclamation mark, U+00A1 ISOnum */
125     {"cent", 0x00A2}, /* cent sign, U+00A2 ISOnum */
126     {"pound", 0x00A3}, /* pound sign, U+00A3 ISOnum */
127     {"curren", 0x00A4}, /* currency sign, U+00A4 ISOnum */
128     {"yen", 0x00A5}, /* yen sign = yuan sign, U+00A5 ISOnum */
129     {"brvbar", 0x00A6}, /* broken bar = broken vertical bar, U+00A6 ISOnum */
130     {"sect", 0x00A7}, /* section sign, U+00A7 ISOnum */
131     {"uml", 0x00A8}, /* diaeresis = spacing diaeresis, U+00A8 ISOdia */
132     {"copy", 0x00A9}, /* copyright sign, U+00A9 ISOnum */
133     {"ordf", 0x00AA}, /* feminine ordinal indicator, U+00AA ISOnum */
134     {"laquo", 0x00AB}, /* left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum */
135     {"not", 0x00AC}, /* not sign, U+00AC ISOnum */
136     {"shy", 0x00AD}, /* soft hyphen = discretionary hyphen, U+00AD ISOnum */
137     {"reg", 0x00AE}, /* registered sign = registered trade mark sign, U+00AE ISOnum */
138     {"macr", 0x00AF}, /* macron = spacing macron = overline = APL overbar, U+00AF ISOdia */
139     {"deg", 0x00B0}, /* degree sign, U+00B0 ISOnum */
140     {"plusmn", 0x00B1}, /* plus-minus sign = plus-or-minus sign, U+00B1 ISOnum */
141     {"sup2", 0x00B2}, /* superscript two = superscript digit two = squared, U+00B2 ISOnum */
142     {"sup3", 0x00B3}, /* superscript three = superscript digit three = cubed, U+00B3 ISOnum */
143     {"acute", 0x00B4}, /* acute accent = spacing acute, U+00B4 ISOdia */
144     {"micro", 0x00B5}, /* micro sign, U+00B5 ISOnum */
145     {"para", 0x00B6}, /* pilcrow sign = paragraph sign, U+00B6 ISOnum */
146     {"middot", 0x00B7}, /* middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum */
147     {"cedil", 0x00B8}, /* cedilla = spacing cedilla, U+00B8 ISOdia */
148     {"sup1", 0x00B9}, /* superscript one = superscript digit one, U+00B9 ISOnum */
149     {"ordm", 0x00BA}, /* masculine ordinal indicator, U+00BA ISOnum */
150     {"raquo", 0x00BB}, /* right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum */
151     {"frac14", 0x00BC}, /* vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum */
152     {"frac12", 0x00BD}, /* vulgar fraction one half = fraction one half, U+00BD ISOnum */
153     {"frac34", 0x00BE}, /* vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum */
154     {"iquest", 0x00BF}, /* inverted question mark = turned question mark, U+00BF ISOnum */
155     {"Agrave", 0x00C0}, /* latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1 */
156     {"Aacute", 0x00C1}, /* latin capital letter A with acute, U+00C1 ISOlat1 */
157     {"Acirc", 0x00C2}, /* latin capital letter A with circumflex, U+00C2 ISOlat1 */
158     {"Atilde", 0x00C3}, /* latin capital letter A with tilde, U+00C3 ISOlat1 */
159     {"Auml", 0x00C4}, /* latin capital letter A with diaeresis, U+00C4 ISOlat1 */
160     {"Aring", 0x00C5}, /* latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1 */
161     {"AElig", 0x00C6}, /* latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1 */
162     {"Ccedil", 0x00C7}, /* latin capital letter C with cedilla, U+00C7 ISOlat1 */
163     {"Egrave", 0x00C8}, /* latin capital letter E with grave, U+00C8 ISOlat1 */
164     {"Eacute", 0x00C9}, /* latin capital letter E with acute, U+00C9 ISOlat1 */
165     {"Ecirc", 0x00CA}, /* latin capital letter E with circumflex, U+00CA ISOlat1 */
166     {"Euml", 0x00CB}, /* latin capital letter E with diaeresis, U+00CB ISOlat1 */
167     {"Igrave", 0x00CC}, /* latin capital letter I with grave, U+00CC ISOlat1 */
168     {"Iacute", 0x00CD}, /* latin capital letter I with acute, U+00CD ISOlat1 */
169     {"Icirc", 0x00CE}, /* latin capital letter I with circumflex, U+00CE ISOlat1 */
170     {"Iuml", 0x00CF}, /* latin capital letter I with diaeresis, U+00CF ISOlat1 */
171     {"ETH", 0x00D0}, /* latin capital letter ETH, U+00D0 ISOlat1 */
172     {"Ntilde", 0x00D1}, /* latin capital letter N with tilde, U+00D1 ISOlat1 */
173     {"Ograve", 0x00D2}, /* latin capital letter O with grave, U+00D2 ISOlat1 */
174     {"Oacute", 0x00D3}, /* latin capital letter O with acute, U+00D3 ISOlat1 */
175     {"Ocirc", 0x00D4}, /* latin capital letter O with circumflex, U+00D4 ISOlat1 */
176     {"Otilde", 0x00D5}, /* latin capital letter O with tilde, U+00D5 ISOlat1 */
177     {"Ouml", 0x00D6}, /* latin capital letter O with diaeresis, U+00D6 ISOlat1 */
178     {"times", 0x00D7}, /* multiplication sign, U+00D7 ISOnum */
179     {"Oslash", 0x00D8}, /* latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1 */
180     {"Ugrave", 0x00D9}, /* latin capital letter U with grave, U+00D9 ISOlat1 */
181     {"Uacute", 0x00DA}, /* latin capital letter U with acute, U+00DA ISOlat1 */
182     {"Ucirc", 0x00DB}, /* latin capital letter U with circumflex, U+00DB ISOlat1 */
183     {"Uuml", 0x00DC}, /* latin capital letter U with diaeresis, U+00DC ISOlat1 */
184     {"Yacute", 0x00DD}, /* latin capital letter Y with acute, U+00DD ISOlat1 */
185     {"THORN", 0x00DE}, /* latin capital letter THORN, U+00DE ISOlat1 */
186     {"szlig", 0x00DF}, /* latin small letter sharp s = ess-zed, U+00DF ISOlat1 */
187     {"agrave", 0x00E0}, /* latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1 */
188     {"aacute", 0x00E1}, /* latin small letter a with acute, U+00E1 ISOlat1 */
189     {"acirc", 0x00E2}, /* latin small letter a with circumflex, U+00E2 ISOlat1 */
190     {"atilde", 0x00E3}, /* latin small letter a with tilde, U+00E3 ISOlat1 */
191     {"auml", 0x00E4}, /* latin small letter a with diaeresis, U+00E4 ISOlat1 */
192     {"aring", 0x00E5}, /* latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1 */
193     {"aelig", 0x00E6}, /* latin small letter ae = latin small ligature ae, U+00E6 ISOlat1 */
194     {"ccedil", 0x00E7}, /* latin small letter c with cedilla, U+00E7 ISOlat1 */
195     {"egrave", 0x00E8}, /* latin small letter e with grave, U+00E8 ISOlat1 */
196     {"eacute", 0x00E9}, /* latin small letter e with acute, U+00E9 ISOlat1 */
197     {"ecirc", 0x00EA}, /* latin small letter e with circumflex, U+00EA ISOlat1 */
198     {"euml", 0x00EB}, /* latin small letter e with diaeresis, U+00EB ISOlat1 */
199     {"igrave", 0x00EC}, /* latin small letter i with grave, U+00EC ISOlat1 */
200     {"iacute", 0x00ED}, /* latin small letter i with acute, U+00ED ISOlat1 */
201     {"icirc", 0x00EE}, /* latin small letter i with circumflex, U+00EE ISOlat1 */
202     {"iuml", 0x00EF}, /* latin small letter i with diaeresis, U+00EF ISOlat1 */
203     {"eth", 0x00F0}, /* latin small letter eth, U+00F0 ISOlat1 */
204     {"ntilde", 0x00F1}, /* latin small letter n with tilde, U+00F1 ISOlat1 */
205     {"ograve", 0x00F2}, /* latin small letter o with grave, U+00F2 ISOlat1 */
206     {"oacute", 0x00F3}, /* latin small letter o with acute, U+00F3 ISOlat1 */
207     {"ocirc", 0x00F4}, /* latin small letter o with circumflex, U+00F4 ISOlat1 */
208     {"otilde", 0x00F5}, /* latin small letter o with tilde, U+00F5 ISOlat1 */
209     {"ouml", 0x00F6}, /* latin small letter o with diaeresis, U+00F6 ISOlat1 */
210     {"divide", 0x00F7}, /* division sign, U+00F7 ISOnum */
211     {"oslash", 0x00F8}, /* latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1 */
212     {"ugrave", 0x00F9}, /* latin small letter u with grave, U+00F9 ISOlat1 */
213     {"uacute", 0x00FA}, /* latin small letter u with acute, U+00FA ISOlat1 */
214     {"ucirc", 0x00FB}, /* latin small letter u with circumflex, U+00FB ISOlat1 */
215     {"uuml", 0x00FC}, /* latin small letter u with diaeresis, U+00FC ISOlat1 */
216     {"yacute", 0x00FD}, /* latin small letter y with acute, U+00FD ISOlat1 */
217     {"thorn", 0x00FE}, /* latin small letter thorn, U+00FE ISOlat1 */
218     {"yuml", 0x00FF}, /* latin small letter y with diaeresis, U+00FF ISOlat1 */
219    
220     {"OElig", 0x0152}, /* latin capital ligature OE, U+0152 ISOlat2 */
221     {"oelig", 0x0153}, /* latin small ligature oe, U+0153 ISOlat2 */
222     {"Scaron", 0x0160}, /* latin capital letter S with caron, U+0160 ISOlat2 */
223     {"scaron", 0x0161}, /* latin small letter s with caron, U+0161 ISOlat2 */
224     {"Yuml", 0x0178}, /* latin capital letter Y with diaeresis, U+0178 ISOlat2 */
225    
226     /*
227     * Anything below should really be kept as entities references
228     */
229    
230     /*
231     -- Latin Extended-B
232     */
233     {"fnof", 0x0192}, /* latin small f with hook = function = florin, U+0192 ISOtech */
234    
235     {"circ", 0x02C6}, /* modifier letter circumflex accent, U+02C6 ISOpub */
236     {"tilde", 0x02DC}, /* small tilde, U+02DC ISOdia */
237    
238     /*
239     -- Greek symbols
240     */
241     {"Alpha", 0x0391}, /* greek capital letter alpha, U+0391 */
242     {"Beta", 0x0392}, /* greek capital letter beta, U+0392 */
243     {"Gamma", 0x0393}, /* greek capital letter gamma, U+0393 ISOgrk3 */
244     {"Delta", 0x0394}, /* greek capital letter delta, U+0394 ISOgrk3 */
245     {"Epsilon", 0x0395}, /* greek capital letter epsilon, U+0395 */
246     {"Zeta", 0x0396}, /* greek capital letter zeta, U+0396 */
247     {"Eta", 0x0397}, /* greek capital letter eta, U+0397 */
248     {"Theta", 0x0398}, /* greek capital letter theta, U+0398 ISOgrk3 */
249     {"Iota", 0x0399}, /* greek capital letter iota, U+0399 */
250     {"Kappa", 0x039A}, /* greek capital letter kappa, U+039A */
251     {"Lambda", 0x039B}, /* greek capital letter lambda, U+039B ISOgrk3 */
252     {"Mu", 0x039C}, /* greek capital letter mu, U+039C */
253     {"Nu", 0x039D}, /* greek capital letter nu, U+039D */
254     {"Xi", 0x039E}, /* greek capital letter xi, U+039E ISOgrk3 */
255     {"Omicron", 0x039F}, /* greek capital letter omicron, U+039F */
256     {"Pi", 0x03A0}, /* greek capital letter pi, U+03A0 ISOgrk3 */
257     {"Rho", 0x03A1}, /* greek capital letter rho, U+03A1 */
258     /* -- there is no Sigmaf, and no U+03A2 character either */
259     {"Sigma", 0x03A3}, /* greek capital letter sigma, U+03A3 ISOgrk3 */
260     {"Tau", 0x03A4}, /* greek capital letter tau, U+03A4 */
261     {"Upsilon", 0x03A5}, /* greek capital letter upsilon, U+03A5 ISOgrk3 */
262     {"Phi", 0x03A6}, /* greek capital letter phi, U+03A6 ISOgrk3 */
263     {"Chi", 0x03A7}, /* greek capital letter chi, U+03A7 */
264     {"Psi", 0x03A8}, /* greek capital letter psi, U+03A8 ISOgrk3 */
265     {"Omega", 0x03A9}, /* greek capital letter omega, U+03A9 ISOgrk3 */
266    
267     {"alpha", 0x03B1}, /* greek small letter alpha, U+03B1 ISOgrk3 */
268     {"beta", 0x03B2}, /* greek small letter beta, U+03B2 ISOgrk3 */
269     {"gamma", 0x03B3}, /* greek small letter gamma, U+03B3 ISOgrk3 */
270     {"delta", 0x03B4}, /* greek small letter delta, U+03B4 ISOgrk3 */
271     {"epsilon", 0x03B5}, /* greek small letter epsilon, U+03B5 ISOgrk3 */
272     {"zeta", 0x03B6}, /* greek small letter zeta, U+03B6 ISOgrk3 */
273     {"eta", 0x03B7}, /* greek small letter eta, U+03B7 ISOgrk3 */
274     {"theta", 0x03B8}, /* greek small letter theta, U+03B8 ISOgrk3 */
275     {"iota", 0x03B9}, /* greek small letter iota, U+03B9 ISOgrk3 */
276     {"kappa", 0x03BA}, /* greek small letter kappa, U+03BA ISOgrk3 */
277     {"lambda", 0x03BB}, /* greek small letter lambda, U+03BB ISOgrk3 */
278     {"mu", 0x03BC}, /* greek small letter mu, U+03BC ISOgrk3 */
279     {"nu", 0x03BD}, /* greek small letter nu, U+03BD ISOgrk3 */
280     {"xi", 0x03BE}, /* greek small letter xi, U+03BE ISOgrk3 */
281     {"omicron", 0x03BF}, /* greek small letter omicron, U+03BF NEW */
282     {"pi", 0x03C0}, /* greek small letter pi, U+03C0 ISOgrk3 */
283     {"rho", 0x03C1}, /* greek small letter rho, U+03C1 ISOgrk3 */
284     {"sigmaf", 0x03C2}, /* greek small letter final sigma, U+03C2 ISOgrk3 */
285     {"sigma", 0x03C3}, /* greek small letter sigma, U+03C3 ISOgrk3 */
286     {"tau", 0x03C4}, /* greek small letter tau, U+03C4 ISOgrk3 */
287     {"upsilon", 0x03C5}, /* greek small letter upsilon, U+03C5 ISOgrk3 */
288     {"phi", 0x03C6}, /* greek small letter phi, U+03C6 ISOgrk3 */
289     {"chi", 0x03C7}, /* greek small letter chi, U+03C7 ISOgrk3 */
290     {"psi", 0x03C8}, /* greek small letter psi, U+03C8 ISOgrk3 */
291     {"omega", 0x03C9}, /* greek small letter omega, U+03C9 ISOgrk3 */
292     {"thetasym", 0x03D1}, /* greek small letter theta symbol, U+03D1 NEW */
293     {"upsih", 0x03D2}, /* greek upsilon with hook symbol, U+03D2 NEW */
294     {"piv", 0x03D6}, /* greek pi symbol, U+03D6 ISOgrk3 */
295    
296     {"ensp", 0x2002}, /* en space, U+2002 ISOpub */
297     {"emsp", 0x2003}, /* em space, U+2003 ISOpub */
298     {"thinsp", 0x2009}, /* thin space, U+2009 ISOpub */
299     {"zwnj", 0x200C}, /* zero width non-joiner, U+200C NEW RFC 2070 */
300     {"zwj", 0x200D}, /* zero width joiner, U+200D NEW RFC 2070 */
301     {"lrm", 0x200E}, /* left-to-right mark, U+200E NEW RFC 2070 */
302     {"rlm", 0x200F}, /* right-to-left mark, U+200F NEW RFC 2070 */
303     {"ndash", 0x2013}, /* en dash, U+2013 ISOpub */
304     {"mdash", 0x2014}, /* em dash, U+2014 ISOpub */
305     {"lsquo", 0x2018}, /* left single quotation mark, U+2018 ISOnum */
306     {"rsquo", 0x2019}, /* right single quotation mark, U+2019 ISOnum */
307     {"sbquo", 0x201A}, /* single low-9 quotation mark, U+201A NEW */
308     {"ldquo", 0x201C}, /* left double quotation mark, U+201C ISOnum */
309     {"rdquo", 0x201D}, /* right double quotation mark, U+201D ISOnum */
310     {"bdquo", 0x201E}, /* double low-9 quotation mark, U+201E NEW */
311     {"dagger", 0x2020}, /* dagger, U+2020 ISOpub */
312     {"Dagger", 0x2021}, /* double dagger, U+2021 ISOpub */
313    
314     {"bull", 0x2022}, /* bullet = black small circle, U+2022 ISOpub */
315     {"hellip", 0x2026}, /* horizontal ellipsis = three dot leader, U+2026 ISOpub */
316    
317     {"permil", 0x2030}, /* per mille sign, U+2030 ISOtech */
318    
319     {"prime", 0x2032}, /* prime = minutes = feet, U+2032 ISOtech */
320     {"Prime", 0x2033}, /* double prime = seconds = inches, U+2033 ISOtech */
321    
322     {"lsaquo", 0x2039}, /* single left-pointing angle quotation mark, U+2039 ISO proposed */
323     {"rsaquo", 0x203A}, /* single right-pointing angle quotation mark, U+203A ISO proposed */
324    
325     {"oline", 0x203E}, /* overline = spacing overscore, U+203E NEW */
326     {"frasl", 0x2044}, /* fraction slash, U+2044 NEW */
327    
328     {"euro", 0x20AC}, /* euro sign, U+20AC NEW */
329    
330     /* -- Letterlike Symbols */
331     {"image", 0x2111}, /* blackletter capital I = imaginary part, U+2111 ISOamso */
332     {"weierp", 0x2118}, /* script capital P = power set = Weierstrass p, U+2118 ISOamso */
333     {"real", 0x211C}, /* blackletter capital R = real part symbol, U+211C ISOamso */
334     {"trade", 0x2122}, /* trade mark sign, U+2122 ISOnum */
335    
336     /* -- alef symbol is NOT the same as hebrew letter alef, U+05D0 */
337     {"alefsym", 0x2135}, /* alef symbol = first transfinite cardinal, U+2135 NEW */
338    
339     /* -- Arrow Symbols */
340     {"larr", 0x2190}, /* leftwards arrow, U+2190 ISOnum */
341     {"uarr", 0x2191}, /* upwards arrow, U+2191 ISOnum */
342     {"rarr", 0x2192}, /* rightwards arrow, U+2192 ISOnum */
343     {"darr", 0x2193}, /* downwards arrow, U+2193 ISOnum */
344     {"harr", 0x2194}, /* left right arrow, U+2194 ISOamsa */
345     {"crarr", 0x21B5}, /* downwards arrow with corner leftwards = carriage return, U+21B5 NEW */
346     {"lArr", 0x21D0}, /* leftwards double arrow, U+21D0 ISOtech */
347     {"uArr", 0x21D1}, /* upwards double arrow, U+21D1 ISOamsa */
348     {"rArr", 0x21D2}, /* rightwards double arrow, U+21D2 ISOtech */
349     {"dArr", 0x21D3}, /* downwards double arrow, U+21D3 ISOamsa */
350     {"hArr", 0x21D4}, /* left right double arrow, U+21D4 ISOamsa */
351    
352     /* -- Mathematical Operators */
353     {"forall", 0x2200}, /* for all, U+2200 ISOtech */
354     {"part", 0x2202}, /* partial differential, U+2202 ISOtech */
355     {"exist", 0x2203}, /* there exists, U+2203 ISOtech */
356     {"empty", 0x2205}, /* empty set = null set = diameter, U+2205 ISOamso */
357     {"nabla", 0x2207}, /* nabla = backward difference, U+2207 ISOtech */
358     {"isin", 0x2208}, /* element of, U+2208 ISOtech */
359     {"notin", 0x2209}, /* not an element of, U+2209 ISOtech */
360     {"ni", 0x220B}, /* contains as member, U+220B ISOtech */
361     {"prod", 0x220F}, /* n-ary product = product sign, U+220F ISOamsb */
362     {"sum", 0x2211}, /* n-ary summation, U+2211 ISOamsb */
363     {"minus", 0x2212}, /* minus sign, U+2212 ISOtech */
364     {"lowast", 0x2217}, /* asterisk operator, U+2217 ISOtech */
365     {"radic", 0x221A}, /* square root = radical sign, U+221A ISOtech */
366     {"prop", 0x221D}, /* proportional to, U+221D ISOtech */
367     {"infin", 0x221E}, /* infinity, U+221E ISOtech */
368     {"ang", 0x2220}, /* angle, U+2220 ISOamso */
369     {"and", 0x2227}, /* logical and = wedge, U+2227 ISOtech */
370     {"or", 0x2228}, /* logical or = vee, U+2228 ISOtech */
371     {"cap", 0x2229}, /* intersection = cap, U+2229 ISOtech */
372     {"cup", 0x222A}, /* union = cup, U+222A ISOtech */
373     {"int", 0x222B}, /* integral, U+222B ISOtech */
374     {"there4", 0x2234}, /* therefore, U+2234 ISOtech */
375     {"sim", 0x223C}, /* tilde operator = varies with = similar to, U+223C ISOtech */
376     {"cong", 0x2245}, /* approximately equal to, U+2245 ISOtech */
377     {"asymp", 0x2248}, /* almost equal to = asymptotic to, U+2248 ISOamsr */
378     {"ne", 0x2260}, /* not equal to, U+2260 ISOtech */
379     {"equiv", 0x2261}, /* identical to, U+2261 ISOtech */
380     {"le", 0x2264}, /* less-than or equal to, U+2264 ISOtech */
381     {"ge", 0x2265}, /* greater-than or equal to, U+2265 ISOtech */
382     {"sub", 0x2282}, /* subset of, U+2282 ISOtech */
383     {"sup", 0x2283}, /* superset of, U+2283 ISOtech */
384     {"nsub", 0x2284}, /* not a subset of, U+2284 ISOamsn */
385     {"sube", 0x2286}, /* subset of or equal to, U+2286 ISOtech */
386     {"supe", 0x2287}, /* superset of or equal to, U+2287 ISOtech */
387     {"oplus", 0x2295}, /* circled plus = direct sum, U+2295 ISOamsb */
388     {"otimes", 0x2297}, /* circled times = vector product, U+2297 ISOamsb */
389     {"perp", 0x22A5}, /* up tack = orthogonal to = perpendicular, U+22A5 ISOtech */
390     {"sdot", 0x22C5}, /* dot operator, U+22C5 ISOamsb */
391     {"lceil", 0x2308}, /* left ceiling = apl upstile, U+2308 ISOamsc */
392     {"rceil", 0x2309}, /* right ceiling, U+2309 ISOamsc */
393     {"lfloor", 0x230A}, /* left floor = apl downstile, U+230A ISOamsc */
394     {"rfloor", 0x230B}, /* right floor, U+230B ISOamsc */
395     {"lang", 0x2329}, /* left-pointing angle bracket = bra, U+2329 ISOtech */
396     {"rang", 0x232A}, /* right-pointing angle bracket = ket, U+232A ISOtech */
397     {"loz", 0x25CA}, /* lozenge, U+25CA ISOpub */
398    
399     /* -- Miscellaneous Symbols */
400     {"spades", 0x2660}, /* black spade suit, U+2660 ISOpub */
401     {"clubs", 0x2663}, /* black club suit = shamrock, U+2663 ISOpub */
402     {"hearts", 0x2665}, /* black heart suit = valentine, U+2665 ISOpub */
403     {"diams", 0x2666}, /* black diamond suit, U+2666 ISOpub */
404    
405     };
406    
407    
408     /*
409     ** ----------------------------------------------
410     **
411     ** Module management code starts here
412     **
413     ** ----------------------------------------------
414     */
415    
416     /*
417     -- init structures for Entities
418     */
419    
420     void initModule_Entities(SWISH * sw)
421     {
422     struct MOD_Entities *md;
423    
424    
425     md = (struct MOD_Entities *) emalloc(sizeof(struct MOD_Entities));
426    
427     sw->Entities = md;
428    
429     md->convertEntities = CONVERTHTMLENTITIES;
430    
431     /*
432     -- init entity hash
433     -- this is module local only
434     */
435    
436     if ( !ce_hasharray_initialized++ )
437     {
438     int i,
439     tab_len;
440     CEntity *ce_p;
441     struct CEHE **hash_pp,
442     *tmp_p;
443    
444     /* empty positions */
445     for (i = 0; i < sizeof(ce_hasharray) / sizeof(ce_hasharray[0]); i++)
446     ce_hasharray[i] = (struct CEHE *) NULL;
447    
448    
449     /*
450     -- fill entity table into hash
451     -- process from end to start of entity_table, because most used
452     -- entities are at the beginning (iso)
453     -- this is due to "insert in hash sequence" behavior in hashtab (performance!)
454     -- The improvement is minimal, because the hash table re-chains during usage.
455     */
456    
457     tab_len = sizeof(entity_table) / sizeof(entity_table[0]);
458    
459     for (i = tab_len - 1; i >= 0; i--)
460     {
461     ce_p = &entity_table[i];
462     hash_pp = &ce_hasharray[(int) *(ce_p->name) & 0x7F];
463     /* insert entity-ptr at start of ptr sequence in hash */
464     tmp_p = *hash_pp;
465     *hash_pp = (struct CEHE *) emalloc(sizeof(struct CEHE));
466     (*hash_pp)->ce = ce_p;
467     (*hash_pp)->next = tmp_p;
468     }
469    
470     } /* end init hash block */
471    
472     }
473    
474    
475    
476     /*
477     -- release structures for Entities
478     -- release all wired memory
479     */
480    
481     void freeModule_Entities(SWISH * sw)
482     {
483    
484     /* free module data structure */
485    
486     efree(sw->Entities);
487     sw->Entities = NULL;
488    
489    
490     /*
491     -- free local entity hash table
492     */
493     {
494     int i;
495     struct CEHE *hash_p,
496     *tmp_p;
497     /* free ptr "chains" in array */
498     for (i = 0; i < sizeof(ce_hasharray) / sizeof(ce_hasharray[0]); i++)
499     {
500     hash_p = ce_hasharray[i];
501     while (hash_p)
502     {
503     tmp_p = hash_p->next;
504     efree(hash_p);
505     hash_p = tmp_p;
506     }
507     ce_hasharray[i] = (struct CEHE *) NULL;
508     }
509    
510     } /* end free hash block */
511    
512     }
513    
514    
515     /*
516     ** ----------------------------------------------
517     **
518     ** Module config code starts here
519     **
520     ** ----------------------------------------------
521     */
522    
523    
524     /*
525     -- Config Directives
526     -- Configuration directives for this Module
527     -- return: 0/1 = none/config applied
528     */
529    
530     int configModule_Entities(SWISH * sw, StringList * sl)
531     {
532     struct MOD_Entities *md = sw->Entities;
533     char *w0;
534     int retval;
535    
536    
537     w0 = sl->word[0];
538     retval = 1;
539    
540    
541     if (strcasecmp(w0, "ConvertHTMLEntities") == 0)
542     {
543     md->convertEntities = getYesNoOrAbort(sl, 1, 1);
544     }
545     else
546     {
547     retval = 0; /* not a Entities directive */
548     }
549    
550    
551     return retval;
552     }
553    
554    
555    
556    
557    
558    
559     /*
560     ** ----------------------------------------------
561     **
562     ** Module code starts here
563     **
564     ** ----------------------------------------------
565     */
566    
567    
568     /*
569     -- convert a string containing HTML/XML entities
570     -- conversion is done on the string itsself.
571     -- conversion is only done, if config directive is set to "YES"
572     -- return ptr to converted string.
573     */
574    
575     unsigned char *sw_ConvHTMLEntities2ISO(SWISH * sw, unsigned char *s)
576     {
577     return (sw->Entities->convertEntities) ? strConvHTMLEntities2ISO(s) : s;
578     }
579    
580    
581    
/*
   -- Convert the HTML/XML entities in a string to single-byte characters.
   -- The conversion is done in place; the result is never longer than the
   -- input because every entity shrinks to at most one byte.
   -- Entities decoding to 0, to a value above 255, or to a negative value
   -- (as charEntityDecode may report for overflowing numeric references)
   -- are dropped from the output.
   -- return: ptr to the converted string, i.e. "buf"
 */

unsigned char *strConvHTMLEntities2ISO(unsigned char *buf)
{
    unsigned char *src = buf;   /* read position */
    unsigned char *dst = buf;   /* write position, never ahead of src */
    unsigned char *next;
    int     code;


    while (*src)
    {
        /* ordinary character: copy through */
        if (*src != '&')
        {
            *dst++ = *src++;
            continue;
        }

        /* possible entity: decode it and continue behind it */
        code = charEntityDecode(src, &next);

        /* keep only codes that fit one byte; "code > 0" also rejects
           negative values, which a plain non-zero test would let through
           and store as a bogus byte */
        if (code > 0 && code < 256)
            *dst++ = (unsigned char) code;

        src = next;
    }
    *dst = '\0';

    return buf;
}
620    
621    
622    
623     /*
624     -- decode entity string to character code:
625     -- &#dec; &#xhex; &#Xhex; &named;
626     -- Decoding is hash optimized with dynamic re-chaining for
627     -- performance improvement...
628     -- return: entity character (decoded)
629     -- position "end" (if != NULL) past "entity" or behind ret. char
630     -- on illegal entities, just return the char...
631     */
632    
633     int charEntityDecode(unsigned char *s, unsigned char **end)
634     {
635     unsigned char *s1,
636     *t;
637     unsigned char *e_end;
638     unsigned char s_cmp[MAX_ENTITY_LEN + 1];
639     int len;
640     int code;
641    
642    
643     /*
644     -- no entity ctrl start char?, err: return char
645     */
646     if (*s != '&')
647     {
648     if (end)
649     *end = s + 1;
650     return (int) *s;
651     }
652    
653    
654    
655     /* ok, seems valid entity starting char */
656     code = 0;
657     e_end = NULL;
658    
659     if (*(s + 1) == '#')
660     { /* numeric entity "&#" */
661    
662     s += 2; /* after "&#" */
663     switch (*s)
664     {
665     case 'x':
666     case 'X':
667     ++s; /* skip x */
668     code = (int) strtoul((char *)s, (char **) &e_end, (int) 16);
669     break;
670     default:
671     code = (int) strtoul((char *)s, (char **) &e_end, (int) 10);
672     break;
673     }
674    
675     }
676     else
677     {
678    
679     /*
680     -- ok, seems to be a named entity, find terminating char
681     -- t = NULL if not found...
682     -- if no char found: return '&' (illegal entity)
683     */
684    
685     len = 0;
686     t = NULL;
687     s1 = s;
688     while (len < MAX_ENTITY_LEN)
689     {
690     s_cmp[len] = *(++s1);
691     if (IS_EOE(*s1))
692     {
693     t = s1; /* End of named entity */
694     break;
695     }
696     if (!*s1)
697     break; /* maybe this is also checked by is_EOE! */
698     len++;
699     }
700     s_cmp[len] = '\0';
701    
702     /*
703     -- hash search block
704     -- case sensitiv search (hashvalue = 1 entity name char)
705     -- (& 0x7F to prevent hashtable mem coredumps by illegal chars)
706     -- improve performance, by rechaining found elements
707     */
708    
709     if (t)
710     {
711     struct CEHE *hash_p;
712     struct CEHE **hash_pp,
713     *last_p;
714    
715     hash_pp = &ce_hasharray[*(s + 1) & 0x7F];
716     last_p = NULL;
717     hash_p = *hash_pp;
718     while (hash_p)
719     {
720     if (!strcmp( (char *)hash_p->ce->name, (char *)s_cmp))
721     {
722     code = hash_p->ce->code;
723     if (last_p)
724     { /* rechain hash sequence list (last found = first) */
725     last_p->next = hash_p->next; /* take elem out of seq */
726     hash_p->next = *hash_pp; /* old 1. = 2. */
727     *hash_pp = hash_p; /* found = 1st */
728     }
729     e_end = t; /* found -> set end */
730     break;
731     }
732     last_p = hash_p;
733     hash_p = hash_p->next;
734     }
735    
736     }
737     } /* end if */
738    
739    
740     if (!e_end)
741     {
742     code = *s;
743     e_end = s + 1;
744     }
745     else
746     {
747     if (*e_end == ';')
748     e_end++; /* W3C EndOfEntity */
749     }
750    
751    
752     if (end)
753     *end = e_end;
754     return code;
755     }
756    
757    
/*
   -- Tolerant end-of-entity test used by the IS_EOE macro.
   -- A character ends an entity name if it is not printable, is
   -- punctuation, or is white space; letters and digits do not.
   -- return: 1 if "c" ends an entity, 0 otherwise
 */

static int is_EOE(int c)
{
    /* be tolerant ! */
    if (!isprint(c))
        return 1;
    if (ispunct(c))
        return 1;
    return isspace(c) ? 1 : 0;
}

  ViewVC Help
Powered by ViewVC 1.1.22