2 Lightweight and fast xml parser.
4 Part of the swftools package.
6 Copyright (c) 2010 Matthias Kramm <kramm@quiss.org>
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
29 group: 0=data 1=whitespace 2='"' 3='<' 4='>' 5='&' 6=';' 7='?' 8='/' 9='=' 10='!' 11='[' 12=']' 13=EOF
/* Character-class table: maps every input byte to the column of new_state
 * that xml_parse uses for it (the lookup is group[(unsigned char)byte]).
 * Classes actually present in the initializer below:
 *   0 = any other byte, 1 = tab / LF / CR / space, 2 = '"', 3 = '<', 4 = '>',
 *   5 = '&', 6 = ';', 7 = '?', 8 = '/', 9 = '=', 10 = '!', 11 = '[', 12 = ']',
 *   13 = byte 0x00.
 * Note that '-' (0x2d) and the single quote (0x27) fall into class 0. */
32 static int group[256] =
34 // 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
36 13, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
37 // 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
39 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
40 // 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
41 // ! " # $ % & ' ( ) * + , - . /
42 1,10, 2, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 8,
43 // 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
44 // 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
45 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 3, 9, 4, 7,
46 // 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f
47 // @ A B C D E F G H I J K L M N O
48 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
49 // 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f
50 // P Q R S T U V W X Y Z [ \ ] ^ _
51 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,11, 0,12, 0, 0,
52 // 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f
53 // ` a b c d e f g h i j k l m n o
54 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
55 // 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f
56 // p q r s t u v w x y z { | } ~
57 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
58 // 80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f
60 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
61 // 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f
63 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
64 // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af
66 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
67 // b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf
69 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
70 // c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 ca cb cc cd ce cf
72 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
73 // d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 da db dc dd de df
75 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
76 // e0 e1 e2 e3 e4 e5 e6 e7 e8 e9 ea eb ec ed ee ef
78 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
79 // f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 fa fb fc fd fe ff
81 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* Human-readable parse error messages. xml_parse prints
 * errors[(-state)&0x3f] when the state machine lands on an error code.
 * The E<n> tags below label which code each message belongs to; the
 * definitions of the E<n> constants are not visible in this excerpt, so the
 * exact index mapping should be confirmed against them. */
84 static const char*errors[]=
88 /*E1*/"xml file must start with <?",
90 /*E2*/"<,; or & not allowed inside tags",
94 /*E4*/"internal error",
96 /*E5*/"attribute definition without =\"",
/* Transition table of the parser: new_state[state][class], where class is
 * the value group[] yields for the current input byte. Non-negative entries
 * are the next state; negative entries are action codes that xml_parse
 * handles through its `case` labels (-63, -1, -6, -3, ...); E<n> entries are
 * error codes. The trailing comment on each row shows where in the input the
 * state sits ('.' marks the current position).
 *
 * NOTE(review): the column header below lists 15 labels, but rows 0-19 carry
 * 14 initializers each, with -63 in column 13 -- the class group[] assigns to
 * byte 0x00. The '-' label therefore has no column of its own ('-' is class 0
 * in group[]).
 * NOTE(review): rows 20-24 carry only 12 initializers, so their -63 sits in
 * column 11 (the '[' class) and their column 13 is zero-filled. No entry in
 * the visible rows transitions to states 20-24 -- presumably unused
 * placeholders; confirm before relying on them. */
99 static int new_state[][16]=
100 { /* dt ws " < > & ; ? / = ! [ ] - EOB*/
101 /* 0 */{ E1, 0,E1, 1,E1,E1,E1,E1,E1,E1,E3,E1,E1,-63}, // .<
102 /* 1 */{ E1,E1,E1,E1,E1,E1,E1, 9,E1,E1,E3,E1,E1,-63}, // <.?
103 /* 2 */{ -3, 2,E3,E2,E2,E2,E2,E2,12,E2,16,E2,E2,-63}, // <.
104 /* 3 */{ E3,E3,E3,E3,-1,E3,E3,E3,E3,E1,E3,E3,E3,-63}, // < /.>
105 /* 4 */{ E3,E3,E3,E3,-2,E3,E3,E3,E3,E1,E1,E3,E3,-63}, // < .>
106 /* 5 */{ 5, 5, 5,-4, 5, 5, 5, 5, 5, 5, 5,E3,E3,-63}, // da.ta
107 /* 6 */{ 6,-7,E3,E2,-6,E2,E2,E3,-9,E3,E3,E3,E3,-63}, // <na.me
108 /* 7 */{ -8, 7,E3,E2,-2,E2,E2, 7, 3,E3,E3,E3,E3,-63}, // <name .
109 /* 8 */{ 8,-12,E3,E2,E3,E2,E2,E3,E3,-10,E3,E3,E3,-63}, // att.r
110 /* 9 */{ 9, 7,E3,E3,E3,E3,E3,E3,E3,E3,E3,E3,E3,-63}, // <?x.ml
111 /* 10 */{ E5,10,-11,E5,E5,E5,E5,E5,E5,E5,E3,E3,E3,-63},// attr=."
112 /* 11 */{ 11,11,-5 ,11,11,11,11,11,11,11,E3,E3,E3,-63},// attr="va.l
113 /* 12 */{ -13,12,E3,E3,E3,E3,E3,E3,E3,E3,E3,E3,E3,-63}, // </ . >
114 /* 13 */{ 13,-14,E3,E3,-16,E3,E3,E3,E3,E3,E3,E3,E3,-63},// </ na.me>
115 /* 14 */{ E3,14,E3,E3,-15,E3,E3,E3,E3,E3,E3,E3,E3,-63},// </ name. >
116 /* 15 */{ E3,15,E3,E2,E3,E3,E3,E3,E3,10,E3,E3,E3,-63}, // attr .=
117 /* 16 */{ E3,E3,E3,E2,E3,E3,E3,E3,E3,E3,E3,17,E3,-63}, // <!.[CDATA[ ]]>
118 /* 17 */{ 17,E3,E3,E3,E3,E3,E3,E3,E3,E3,E3,18,E3,-63}, // <![C.DATA[ ]]>
119 /* 18 */{ 18,18,18,18,18,18,18,18,18,18,18,18,19,-63}, // <![CDATA[ . ]]>
120 /* 19 */{ 18,18,18,18,-20,18,18,18,18,18,18,18,19,-63}, // <![CDATA[ ].]>
121 /* 20 */{0,0,0,0,0,0,0,0,0,0,0,-63},
122 /* 21 */{0,0,0,0,0,0,0,0,0,0,0,-63},
123 /* 22 */{0,0,0,0,0,0,0,0,0,0,0,-63},
124 /* 23 */{0,0,0,0,0,0,0,0,0,0,0,-63},
125 /* 24 */{0,0,0,0,0,0,0,0,0,0,0,-63},
/* Node of a singly linked stack of tag names. xml_parse stores
 * tagname.result in ->name when it pushes a node and walks ->prev while
 * freeing the remaining nodes. */
128 typedef struct _tag_stack {
// link to the node below this one on the stack
130 struct _tag_stack*prev;
/* Incremental string accumulator; xml_parse keeps one each for the tag name,
 * attribute name, attribute value and character data. The helpers in this
 * file access the members start, current, len and result (member
 * declarations are not visible in this excerpt). */
133 typedef struct _stringstate {
/* Begin capturing a token: remember &buffer[pos] as the first byte of the
 * span that a later stringstate_save / stringstate_finish will copy. */
140 static void stringstate_start(stringstate_t*s, char*buffer, int pos)
143 s->start = &buffer[pos];
/* Copy the pending span [s->start, &buffer[pos]) onto the end of the heap
 * string s->current and keep it NUL-terminated. The first copy goes through
 * malloc, later ones through realloc; the condition selecting between the
 * two, and the update of s->len, are on lines not visible in this excerpt. */
147 static void stringstate_save(stringstate_t*s, char*buffer, int pos)
// number of bytes between the recorded start and the current position
151 int add = &buffer[pos] - s->start;
// NOTE(review): malloc result is passed straight to memcpy without a NULL check
154 s->current = malloc(add+1);
155 memcpy(s->current, s->start, add);
// NOTE(review): realloc result overwrites s->current unchecked; on failure the
// old block would be leaked and the memcpy below would write through NULL
157 s->current = realloc(s->current, s->len + add + 1);
158 memcpy(s->current+s->len, s->start, add);
// terminate so the accumulated text is usable as a C string
161 s->current[s->len] = 0;
/* End the current token: flush the span up to &buffer[pos] with
 * stringstate_save, then publish the accumulated string as s->result. */
164 static void stringstate_finish(stringstate_t*s, char*buffer, int pos)
166 stringstate_save(s, buffer, pos);
167 s->result = s->current;
/* Release both heap strings owned by *s and reset every member to zero.
 * NOTE(review): stringstate_finish assigns result = current; unless it also
 * resets current on a line not visible here, the two frees below would hit
 * the same pointer -- confirm. */
171 static void stringstate_clear(stringstate_t*s)
173 if(s->result) free(s->result);
174 if(s->current) free(s->current);
175 memset(s, 0, sizeof(stringstate_t));
/* Reverse the singly linked attribute list starting at attr. xml_parse
 * builds the list by prepending each new attribute, and assigns this
 * function's return value back to its list head before handing the list to
 * the start_tag callback. The loop body and the return statement are not
 * visible in this excerpt. */
177 static xmlattribute_t*attributes_reverse(xmlattribute_t*attr)
179 xmlattribute_t*prev = 0;
// grab the successor before the current node is relinked
181 xmlattribute_t*next = attr->next;
/* Walk an attribute list and free the name and value strings of each node.
 * The loop header and the release of the node itself are presumably on lines
 * not visible in this excerpt. */
189 static void attributes_free(xmlattribute_t*attributes)
// remember the successor first; the node's strings are released right after
192 xmlattribute_t*next = attributes->next;
// name/value are cast to void* before being passed to free()
193 free((void*)attributes->name);
194 free((void*)attributes->value);
/* Parse XML pulled from `reader` and report it to `out`.
 *
 * Input is fetched through reader->read() in chunks of up to 4096 bytes. Each
 * byte is classified with group[] and fed through the new_state[] transition
 * table; negative table entries select one of the `case` actions below, which
 * collect tag names, attribute names/values and character data in
 * stringstate_t accumulators and invoke out->start_tag, out->data and
 * out->end_tag.
 *
 * The loop and switch headers, the declarations of buffer, pos, first, st and
 * a, and the return statements are on lines not visible in this excerpt, so
 * the meaning of the return value cannot be documented from here. */
200 int xml_parse(reader_t*reader, xmlconsumer_t*out)
// `old` keeps the state before the latest transition, for error messages
203 int old = 0, state = 0;
// linked stack of tag names; nodes are pushed in case -6 and freed at the end
205 tag_stack_t*stack = 0;
// one accumulator per kind of token that is being collected
207 stringstate_t tagname = {0,0,0,0};
208 stringstate_t attr_name = {0,0,0,0};
209 stringstate_t attr_value = {0,0,0,0};
210 stringstate_t data = {0,0,0,0};
// attributes of the tag being parsed, built by prepending (see case -5)
211 xmlattribute_t*attributes = 0;
// fetch the next chunk of input (at most 4096 bytes)
214 int num = reader->read(reader, buffer, 4096);
219 /*printf("%c, state %d->%d\n",
220 buffer[pos], state, new_state[state][group[buffer[pos]]]);*/
// classify the byte, take the transition and advance; the cast keeps bytes
// >= 0x80 from producing a negative index into group[]
224 state = new_state[old=state][group[(unsigned char)buffer[pos++]]];
230 case -63: // end of buffer
232 // we could backtrace, but the spec says this is indeed illegal
233 fprintf(stderr, "error: xml contains \\0 chars\n");
240 case -1: // self closing tag
// the attribute list was built back to front; restore document order
241 attributes = attributes_reverse(attributes);
// a self-closing tag is reported as a start immediately followed by an end
242 out->start_tag(out, tagname.result, attributes);
243 out->end_tag(out, tagname.result);
244 stringstate_clear(&tagname);
245 attributes_free(attributes);attributes = 0;
// character data may begin right after the '>'
246 stringstate_start(&data, buffer, pos);
249 case -6: // after <tagname, at >
// pos was already advanced past the '>', hence pos-1
250 stringstate_finish(&tagname, buffer, pos-1);
// NOTE(review): malloc result is dereferenced on the next line without a NULL check
253 st = malloc(sizeof(tag_stack_t));
254 st->name = tagname.result;
257 attributes = attributes_reverse(attributes);
// `first` suppresses the callback; its definition is not visible in this excerpt
258 if(!first) out->start_tag(out, tagname.result, attributes);
259 attributes_free(attributes);attributes = 0;
260 stringstate_start(&data, buffer, pos);
263 case -3: case -13: // after <, start of tag name
// the byte just consumed is the first character of the name, hence pos-1
265 stringstate_start(&tagname, buffer, pos-1);
// -3 continues in state 6 (<na.me), -13 in state 13 (</ na.me>)
266 state = state==-3?6:13;
268 case -14: // after </, end of tag name, begin of white space
269 stringstate_finish(&tagname, buffer, pos-1);
272 case -16: // after </, at >, end of tag name
273 stringstate_finish(&tagname, buffer, pos-1);
275 case -15: // after </, at >
276 out->end_tag(out, tagname.result);
277 stringstate_clear(&tagname);
278 stringstate_start(&data, buffer, pos);
281 case -4: // end of data
282 stringstate_finish(&data, buffer, pos-1);
283 if(!first) out->data(out, data.result, data.len);
284 stringstate_clear(&data);
287 case -7: // after <, at whitespace, end of tag name
288 stringstate_finish(&tagname, buffer, pos-1);
291 case -8: // inside tag, start of attribute name
292 stringstate_start(&attr_name, buffer, pos-1);
296 stringstate_finish(&tagname, buffer, pos-1);
299 case -12: // end of attribute name, at ws
300 stringstate_finish(&attr_name, buffer, pos-1);
303 case -10: // end of attribute name, at =
304 stringstate_finish(&attr_name, buffer, pos-1);
307 case -11: // start of attribute value
// the value starts after the opening quote, so capture from pos, not pos-1
308 stringstate_start(&attr_value, buffer, pos);
311 case -5: // end of attribute value
312 stringstate_finish(&attr_value, buffer, pos-1);
// NOTE(review): malloc result is dereferenced on the next line without a NULL check
313 a = malloc(sizeof(xmlattribute_t));
// the node takes over both strings; zeroing .result keeps a later
// stringstate_clear from freeing them through that member
314 a->name = attr_name.result;attr_name.result=0;
315 a->value = attr_value.result;attr_value.result=0;
// prepend -- this is why the list is reversed before being handed out
316 a->next = attributes;
// report the error text together with the previous state and offending byte
325 fprintf(stderr, "%s (state %d, char '%c')\n", errors[(-state)&0x3f], old, buffer[pos-1]);
328 fprintf(stderr, "internal error: no action %d\n", state);
// stash the part of every accumulator's pending span that lies in this
// buffer -- presumably because the next read overwrites it; confirm against
// the enclosing loop
334 stringstate_save(&tagname, buffer, pos);
335 stringstate_save(&attr_name, buffer, pos);
336 stringstate_save(&attr_value, buffer, pos);
337 stringstate_save(&data, buffer, pos);
340 /* note: any of these except data *has* to be empty for a well formed xml */
341 stringstate_clear(&tagname);
342 stringstate_clear(&attr_name);
343 stringstate_clear(&attr_value);
344 stringstate_clear(&data);
// release the tag-name stack, one node at a time
347 tag_stack_t*next = stack->prev;
348 free((void*)stack->name);
/* Sample start_tag callback (first member of the xmlconsumer_t initializer
 * further down): prints every attribute of the tag as ` name="value"` to
 * stdout. Any output of the tag name itself is on lines not visible in this
 * excerpt. */
356 void my_start_tag(xmlconsumer_t*c, char*name, xmlattribute_t*attr)
// walk the attribute list in the order it was handed over
359 for(;attr;attr=attr->next) {
360 printf(" %s=\"%s\"", attr->name, attr->value);
/* Sample data callback (second member of the xmlconsumer_t initializer
 * further down); receives a run of character data and its length. The body
 * is not visible in this excerpt. */
364 void my_data(xmlconsumer_t*c, char*data, int len)
/* Sample end_tag callback (third member of the xmlconsumer_t initializer
 * further down): prints the closing tag `</name>` to stdout. */
368 void my_end_tag(xmlconsumer_t*c, char*name)
370 printf("</%s>", name);
374 xmlconsumer_t c = {my_start_tag, my_data, my_end_tag, 0};
377 reader_init_filereader2(&r, "test.xml");