1 | function tree = xml_parser(xmlstr) |
---|
2 | % XML (eXtensible Markup Language) Processor |
---|
3 | % FORMAT tree = xml_parser(xmlstr) |
---|
4 | % |
---|
5 | % xmlstr - XML string to parse |
---|
6 | % tree - tree structure corresponding to the XML file |
---|
7 | %_______________________________________________________________________ |
---|
8 | % |
---|
9 | % xml_parser.m is an XML 1.0 (http://www.w3.org/TR/REC-xml) parser |
---|
10 | % written in Matlab. It aims to be fully conforming. It is currently not |
---|
11 | % a validating XML processor. |
---|
12 | % |
---|
13 | % A description of the tree structure provided in output is detailed in |
---|
14 | % the header of this m-file. |
---|
15 | %_______________________________________________________________________ |
---|
16 | % @(#)xml_parser.m Guillaume Flandin 2002/04/04 |
---|
17 | |
---|
18 | % XML Processor for MATLAB (The Mathworks, Inc.). |
---|
19 | % Copyright (C) 2002-2003 Guillaume Flandin <Guillaume@artefact.tk> |
---|
20 | % |
---|
21 | % This program is free software; you can redistribute it and/or |
---|
22 | % modify it under the terms of the GNU General Public License |
---|
23 | % as published by the Free Software Foundation; either version 2 |
---|
24 | % of the License, or any later version. |
---|
25 | % |
---|
26 | % This program is distributed in the hope that it will be useful, |
---|
27 | % but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
28 | % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
29 | % GNU General Public License for more details. |
---|
30 | % |
---|
31 | % You should have received a copy of the GNU General Public License |
---|
32 | % along with this program; if not, write to the Free Software |
---|
33 | % Foundation Inc, 59 Temple Pl. - Suite 330, Boston, MA 02111-1307, USA. |
---|
34 | %----------------------------------------------------------------------- |
---|
35 | |
---|
36 | % Suggestions for improvement and fixes are always welcome, although no |
---|
37 | % guarantee is made whether and when they will be implemented. |
---|
38 | % Send requests to <Guillaume@artefact.tk> |
---|
39 | % Check also the latest developments on the following webpage: |
---|
40 | % <http://www.artefact.tk/software/matlab/xml/> |
---|
41 | %----------------------------------------------------------------------- |
---|
42 | |
---|
43 | % The implementation of this XML parser is much inspired from a |
---|
44 | % Javascript parser available at <http://www.jeremie.com/> |
---|
45 | |
---|
46 | % A mex-file xml_findstr.c is also required, to overcome some |
---|
47 | % limitations of the built-in findstr Matlab function. |
---|
48 | % Compile it on your architecture using 'mex -O xml_findstr.c' command |
---|
49 | % if the compiled version for your system is not provided. |
---|
50 | % If this function behaves badly (crash or wrong results), comment the |
---|
51 | % line '#define __HACK_MXCHAR__' in xml_findstr.c and compile it again. |
---|
52 | %----------------------------------------------------------------------- |
---|
53 | |
---|
54 | % Structure of the output tree: |
---|
55 | % There are 5 types of nodes in an XML file: element, chardata, cdata, |
---|
56 | % pi and comment. |
---|
57 | % Each of them contains an UID (Unique Identifier): an integer between |
---|
58 | % 1 and the number of nodes of the XML file. |
---|
59 | % |
---|
60 | % element (a tag <name key="value"> [contents] </name>) |
---|
61 | % |_ type: 'element' |
---|
62 | % |_ name: string |
---|
63 | % |_ attributes: cell array of structs with fields 'key' and 'val', or [] |
---|
64 | % |_ contents: double array of uid's or [] if empty |
---|
65 | % |_ parent: uid of the parent ([] if root) |
---|
66 | % |_ uid: double |
---|
67 | % |
---|
68 | % chardata (a character array) |
---|
69 | % |_ type: 'chardata' |
---|
70 | % |_ value: string |
---|
71 | % |_ parent: uid of the parent |
---|
72 | % |_ uid: double |
---|
73 | % |
---|
74 | % cdata (a literal string <![CDATA[value]]>) |
---|
75 | % |_ type: 'cdata' |
---|
76 | % |_ value: string |
---|
77 | % |_ parent: uid of the parent |
---|
78 | % |_ uid: double |
---|
79 | % |
---|
80 | % pi (a processing instruction <?target value ?>) |
---|
81 | % |_ type: 'pi' |
---|
82 | % |_ target: string (may be empty) |
---|
83 | % |_ value: string |
---|
84 | % |_ parent: uid of the parent |
---|
85 | % |_ uid: double |
---|
86 | % |
---|
87 | % comment (a comment <!-- value -->) |
---|
88 | % |_ type: 'comment' |
---|
89 | % |_ value: string |
---|
90 | % |_ parent: uid of the parent |
---|
91 | % |_ uid: double |
---|
92 | % |
---|
93 | %----------------------------------------------------------------------- |
---|
94 | |
---|
95 | % TODO/BUG/FEATURES: |
---|
96 | % - [compile] only a warning if TagStart is empty ? |
---|
97 | % - [attribution] should look for " and ' rather than only " |
---|
98 | % - [main] with normalize as a preprocessing, CDATA are modified |
---|
99 | % - [prolog] look for a DOCTYPE in the whole string even if it occurs |
---|
100 | % only in a far CDATA tag, bug even if the doctype is inside a comment |
---|
101 | % - [tag_element] erode should replace normalize here |
---|
102 | % - remove globals? uppercase globals rather persistent (clear mfile)? |
---|
103 | % - xml_findstr is indeed xml_strfind according to Mathworks vocabulary |
---|
104 | % - problem with entities: do we need to convert them here? (&eacute;) |
---|
105 | %----------------------------------------------------------------------- |
---|
106 | |
---|
%- Parser state shared with the subfunctions: the text being parsed, the
%- running node counter (last uid handed out) and the nodes built so far
global xmlstring Xparse_count xtree;

%- Validate the call: exactly one non-empty character vector is expected
error(nargchk(1,1,nargin));
if isempty(xmlstr)
    error('[XML] Not enough parameters.')
end
if ~isstr(xmlstr) | sum(size(xmlstr)>1)>1
    error('[XML] Input must be a string.')
end

%- No node has been created yet (the counter doubles as the uid source)
Xparse_count = 0;

%- Drop the prolog first, then collapse every run of white space
stripped  = prolog(xmlstr);
xmlstring = normalize(stripped);

%- Start from an empty tree and a top-level fragment: parsing begins at
%- the first character and there is no enclosing element (parent 0)
xtree       = {};
root        = fragment;
root.str    = 1;
root.parent = 0;

%- Walk the whole string; the nodes are collected in the global xtree
root = compile(root);

%- Hand the collected nodes back to the caller
tree = xtree;

%- Remove global variables from the workspace
clear global xmlstring Xparse_count xtree;
---|
138 | |
---|
139 | %======================================================================= |
---|
140 | % SUBFUNCTIONS |
---|
141 | |
---|
142 | %----------------------------------------------------------------------- |
---|
function frag = compile(frag)
% Parse the global XML string from frag.str until the fragment is finished.
%
% frag - fragment structure with fields
%          .str    index in xmlstring where parsing (re)starts
%          .parent uid of the element whose content is being parsed
%                  (0 for the top level)
%          .end    name of that element, used to recognise its end tag
%                  ('' for the top level)
%
% Every node met on the way is appended to the global xtree. The function
% returns either when the end tag of the fragment has been consumed
% (frag.str then points just after it) or when the string is exhausted.
% An error is raised when text remains but no further '<' can be found.
global xmlstring xtree Xparse_count;

while 1,
    %- Stop when at most one character is left.
    %  NOTE(review): the second test compares a two-character slice with
    %  ' ', so it looks like it can never be true; kept unchanged.
    if length(xmlstring)<=frag.str | ...
       (frag.str == length(xmlstring)-1 & strcmp(xmlstring(frag.str:end),' '))
        return
    end
    TagStart = xml_findstr(xmlstring,'<',frag.str,1);
    if isempty(TagStart)
        %- Trailing text without any tag: not supported.
        %  (The statements that used to follow this call could never run
        %  because error() always throws here; they have been removed.)
        error(sprintf(['[XML] Unknown data at the end of the XML file.\n' ...
        ' Please send me your XML file at Guillaume@artefact.tk']));
    elseif TagStart > frag.str
        if strcmp(xmlstring(frag.str:TagStart-1),' ')
            %- A single white space before a tag (ignore)
            frag.str = TagStart;
        else
            %- Character data
            %  (chardata increments Xparse_count, which then indexes the
            %  slot of the new node)
            xtree{Xparse_count} = chardata;
            xtree{Xparse_count}.value = erode(entity(xmlstring(frag.str:TagStart-1)));
            xtree{Xparse_count}.parent = frag.parent;
            xtree{frag.parent}.contents = [xtree{frag.parent}.contents Xparse_count];
            frag.str = TagStart;
        end
    else
        %- frag.str is on a '<': dispatch on what follows it
        if strcmp(xmlstring(frag.str+1),'?')
            %- Processing instruction
            frag = tag_pi(frag);
        elseif length(xmlstring)-frag.str>4 & strcmp(xmlstring(frag.str+1:frag.str+3),'!--')
            %- Comment
            frag = tag_comment(frag);
        elseif length(xmlstring)-frag.str>9 & strcmp(xmlstring(frag.str+1:frag.str+8),'![CDATA[')
            %- Literal data
            frag = tag_cdata(frag);
        else
            %- A tag element (empty (<.../>) or not), or the end tag of
            %  the fragment being parsed
            if ~isempty(frag.end)
                endmk = ['/' frag.end '>'];
            else
                endmk = '/>';
            end
            %- Clamp the slice to the string: close to the end of a
            %  truncated document the unclamped range would index past
            %  the last character and abort with an indexing error.
            last = min(frag.str+length(frag.end)+2, length(xmlstring));
            candidate = xmlstring(frag.str+1:last);
            if strcmp(candidate,endmk) | strcmp(strip(candidate),endmk)
                %- End tag found: step over '<' + endmk and leave
                frag.str = frag.str + length(frag.end)+3;
                return
            else
                frag = tag_element(frag);
            end
        end
    end
end
---|
204 | |
---|
205 | %----------------------------------------------------------------------- |
---|
function frag = tag_element(frag)
% Parse the element whose start tag begins at frag.str.
%
% An 'element' node is appended to the global xtree (name, attributes,
% link to its parent when frag.parent is non-zero). For a non-empty
% element the content is parsed recursively with compile. On return
% frag.str points after the element. Raises an error when the opening
% '<' has no matching '>'.
global xmlstring xtree Xparse_count;

gt = xml_findstr(xmlstring,'>',frag.str,1);
if isempty(gt)
    error('[XML] Tag < opened but not closed.');
end

%- '<name ... />' is an empty element: the tag text then stops before '/'
selfclosing = strcmp(xmlstring(gt-1:gt),'/>');
if selfclosing
    tagstop = gt - 2;
else
    tagstop = gt - 1;
end

%- Split the tag text at its first blank into name and attribute string
starttag = normalize(xmlstring(frag.str+1:tagstop));
sp = xml_findstr(starttag,' ',1,1);
if isempty(sp)
    name    = starttag;
    attribs = '';
else
    name    = starttag(1:sp-1);
    attribs = starttag(sp+1:end);
end

%- Create the node; the constructor increments Xparse_count, whose new
%- value is the uid (and the slot in xtree) of this element
node = element;
uid  = Xparse_count;
node.name  = strip(name);
xtree{uid} = node;

%- Hook the node below its parent, if any
if frag.parent
    xtree{uid}.parent = frag.parent;
    xtree{frag.parent}.contents = [xtree{frag.parent}.contents uid];
end

if ~isempty(attribs)
    xtree{uid}.attributes = attribution(attribs);
end

if selfclosing
    %- Nothing inside: continue right after '/>'
    frag.str = gt + 1;
else
    %- Parse the content up to the matching end tag
    inner        = fragment;
    inner.str    = gt + 1;
    inner.end    = name;
    inner.parent = uid;
    inner        = compile(inner);
    frag.str     = inner.str;
end
---|
245 | |
---|
246 | %----------------------------------------------------------------------- |
---|
function frag = tag_pi(frag)
% Parse the processing instruction (<?target value ?>) starting at frag.str.
%
% frag - fragment whose .str indexes the '<' of '<?' in the global
%        xmlstring and whose .parent is the uid of the enclosing element
%        (0 at top level)
%
% A 'pi' node is appended to the global xtree and linked to its parent
% when there is one; frag.str is then moved just past the closing '?>'.
% When no '?>' exists a warning is issued and frag.str is moved past the
% end of xmlstring, so that the calling loop in compile terminates
% instead of meeting the same '<?' again and again.
global xmlstring xtree Xparse_count;

stop = xml_findstr(xmlstring,'?>',frag.str,1);
if isempty(stop)
    warning('[XML] Tag <? opened but not closed.')
    %- Nothing sensible can be parsed after an unterminated tag
    frag.str = length(xmlstring) + 1;
    return
end

sp = xml_findstr(xmlstring,' ',frag.str,1);

%- The constructor increments Xparse_count: its new value is the uid
node = pri;
uid  = Xparse_count;

%- The instruction has no separate target when there is no blank at all,
%- when the first blank lies after '?>', or when it directly follows '<?'.
%- (An empty search result used to fall into the other branch and yield
%- an empty value and target.)
notarget = isempty(sp);
if ~notarget
    notarget = sp > stop | sp == frag.str+2;
end

if notarget
    node.value  = erode(xmlstring(frag.str+2:stop-1));
else
    node.value  = erode(xmlstring(sp+1:stop-1));
    node.target = erode(xmlstring(frag.str+2:sp));
end
xtree{uid} = node;

if frag.parent
    xtree{frag.parent}.contents = [xtree{frag.parent}.contents uid];
    xtree{uid}.parent = frag.parent;
end

%- Continue after '?>'
frag.str = stop + 2;
---|
267 | |
---|
268 | %----------------------------------------------------------------------- |
---|
function frag = tag_comment(frag)
% Parse the comment (<!-- value -->) starting at frag.str.
%
% frag - fragment whose .str indexes the '<' of '<!--' in the global
%        xmlstring and whose .parent is the uid of the enclosing element
%        (0 at top level)
%
% A 'comment' node is appended to the global xtree and linked to its
% parent when there is one; frag.str is then moved just past '-->'.
% When no '-->' exists a warning is issued and frag.str is moved past the
% end of xmlstring, so that the calling loop in compile terminates
% instead of meeting the same '<!--' again and again.
global xmlstring xtree Xparse_count;

stop = xml_findstr(xmlstring,'-->',frag.str,1);
if isempty(stop)
    warning('[XML] Tag <!-- opened but not closed.')
    %- Nothing sensible can be parsed after an unterminated tag
    frag.str = length(xmlstring) + 1;
    return
end

%- The constructor increments Xparse_count: its new value is the uid
node = comment;
uid  = Xparse_count;
%- The text starts after the four characters of '<!--'
node.value = erode(xmlstring(frag.str+4:stop-1));
xtree{uid} = node;

if frag.parent
    xtree{frag.parent}.contents = [xtree{frag.parent}.contents uid];
    xtree{uid}.parent = frag.parent;
end

%- Continue after '-->'
frag.str = stop + 3;
---|
283 | |
---|
284 | %----------------------------------------------------------------------- |
---|
function frag = tag_cdata(frag)
% Parse the CDATA section (<![CDATA[value]]>) starting at frag.str.
%
% frag - fragment whose .str indexes the '<' of '<![CDATA[' in the global
%        xmlstring and whose .parent is the uid of the enclosing element
%        (0 at top level)
%
% A 'cdata' node holding the text between the markers (taken as is, with
% no entity replacement and no trimming) is appended to the global xtree
% and linked to its parent when there is one; frag.str is then moved just
% past ']]>'.
% When no ']]>' exists a warning is issued and frag.str is moved past the
% end of xmlstring, so that the calling loop in compile terminates
% instead of meeting the same '<![CDATA[' again and again.
global xmlstring xtree Xparse_count;

stop = xml_findstr(xmlstring,']]>',frag.str,1);
if isempty(stop)
    warning('[XML] Tag <![CDATA[ opened but not closed.')
    %- Nothing sensible can be parsed after an unterminated tag
    frag.str = length(xmlstring) + 1;
    return
end

%- The constructor increments Xparse_count: its new value is the uid
node = cdata;
uid  = Xparse_count;
%- The text starts after the nine characters of '<![CDATA['
node.value = xmlstring(frag.str+9:stop-1);
xtree{uid} = node;

if frag.parent
    xtree{frag.parent}.contents = [xtree{frag.parent}.contents uid];
    xtree{uid}.parent = frag.parent;
end

%- Continue after ']]>'
frag.str = stop + 3;
---|
299 | |
---|
300 | %----------------------------------------------------------------------- |
---|
function all = attribution(str)
% Split an attribute string into key/value pairs.
%
% str - text of a start tag after the element name,
%       e.g. 'id="3" lang=''en'''
% all - cell array of structures with fields
%         .key  attribute name, white space removed
%         .val  attribute value with the predefined entities replaced
%       (empty cell array when no 'key=value' pair is found)
%
% Values may be delimited by double or by single quotes; the delimiter of
% a value is whichever quote character comes first in the remaining text,
% so the other kind of quote may appear inside the value.
% A value whose opening or closing quote is missing is stored as '' and
% ends the scan.

%- Initialize attributes
nbattr = 0;
all = cell(nbattr);
%- Look for 'key="value"' and 'key=''value''' substrings
while 1,
    if isempty(str), return; end
    eq = xml_findstr(str,'=',1,1);
    if isempty(eq), return; end

    %- Opening delimiter: the first quote of either kind
    dq = xml_findstr(str,'"',1,1);
    sq = xml_findstr(str,'''',1,1);
    if isempty(dq)
        id = sq;  delim = '''';
    elseif isempty(sq)
        id = dq;  delim = '"';
    elseif dq < sq
        id = dq;  delim = '"';
    else
        id = sq;  delim = '''';
    end

    %- Closing delimiter: next quote of the same kind (only searched for
    %  when something is left after the opening one)
    nextid = [];
    if ~isempty(id)
        if id < length(str)
            nextid = xml_findstr(str,delim,id+1,1);
        end
    end

    nbattr = nbattr + 1;
    all{nbattr}.key = strip(str(1:(eq-1)));
    if isempty(nextid)
        %- Unquoted or unterminated value: nothing reliable to extract
        all{nbattr}.val = '';
        return
    end
    all{nbattr}.val = entity(str((id+1):(nextid-1)));
    %- Go on with what follows the closing quote
    str = str((nextid+1):end);
end
---|
316 | |
---|
317 | %----------------------------------------------------------------------- |
---|
function elm = element
% Create a blank 'element' node and give it the next free uid.
% Side effect: the global node counter Xparse_count is incremented.
global Xparse_count;
Xparse_count = Xparse_count + 1;
elm.type       = 'element';
elm.name       = '';
elm.attributes = [];
elm.contents   = [];
elm.parent     = [];
elm.uid        = Xparse_count;
---|
322 | |
---|
323 | %----------------------------------------------------------------------- |
---|
function cdat = chardata
% Create a blank 'chardata' node and give it the next free uid.
% Side effect: the global node counter Xparse_count is incremented.
global Xparse_count;
Xparse_count = Xparse_count + 1;
cdat.type   = 'chardata';
cdat.value  = '';
cdat.parent = [];
cdat.uid    = Xparse_count;
---|
328 | |
---|
329 | %----------------------------------------------------------------------- |
---|
function cdat = cdata
% Create a blank 'cdata' node and give it the next free uid.
% Side effect: the global node counter Xparse_count is incremented.
global Xparse_count;
Xparse_count = Xparse_count + 1;
cdat.type   = 'cdata';
cdat.value  = '';
cdat.parent = [];
cdat.uid    = Xparse_count;
---|
334 | |
---|
335 | %----------------------------------------------------------------------- |
---|
function proce = pri
% Create a blank 'pi' (processing instruction) node and give it the next
% free uid.
% Side effect: the global node counter Xparse_count is incremented.
global Xparse_count;
Xparse_count = Xparse_count + 1;
proce.type   = 'pi';
proce.value  = '';
proce.target = '';
proce.parent = [];
proce.uid    = Xparse_count;
---|
340 | |
---|
341 | %----------------------------------------------------------------------- |
---|
function commt = comment
% Create a blank 'comment' node and give it the next free uid.
% Side effect: the global node counter Xparse_count is incremented.
global Xparse_count;
Xparse_count = Xparse_count + 1;
commt.type   = 'comment';
commt.value  = '';
commt.parent = [];
commt.uid    = Xparse_count;
---|
346 | |
---|
347 | %----------------------------------------------------------------------- |
---|
function frg = fragment
% Create a blank fragment descriptor: all three fields (.str, .parent and
% .end) start out as empty strings and are filled in by the caller.
frg.str    = '';
frg.parent = '';
frg.end    = '';
---|
350 | |
---|
351 | %----------------------------------------------------------------------- |
---|
function str = prolog(str)
% Remove the prolog (XML declaration and document type declaration) from
% the beginning of an XML string.
%
% str - XML document (input) / same text starting after the prolog (output)
%
% Raises an error when the string contains no '<' at all; issues a warning
% when the declaration or the DOCTYPE is opened but not closed.
%
% The DOCTYPE is only looked for in the prolog itself, that is before any
% element: white space, comments and processing instructions that precede
% it are stepped over. A '<!DOCTYPE' that merely occurs later in the
% document (inside a comment or a CDATA section, for instance) is left
% alone instead of being mistaken for the document type declaration.

%- Initialize beginning index of elements tree
b = 1;
n = length(str);

%- Initial tag
first = xml_findstr(str,'<',1,1);
if isempty(first)
    error('[XML] No tag found.')
end

%- Header (<?xml version="1.0" ... ?>)
%  (the three-character test is skipped when '<' is too close to the end
%  of the string, where the slice would be out of range)
hasheader = 0;
if first+2 <= n
    hasheader = strcmp(lower(str(first:first+2)),'<?x');
end
if hasheader
    stop = xml_findstr(str,'?>',1,1);
    if ~isempty(stop)
        b = stop + 2;
    else
        warning('[XML] Header tag incomplete.')
    end
end

%- Doctype (<!DOCTYPE type ... [ declarations ]>)
%  Scan forward from the end of the header (or from the first '<' when
%  there is no header) and stop at the first thing that is neither white
%  space, a comment nor a processing instruction.
start = [];
pos = max(b,first);
while pos <= n
    if isspace(str(pos))
        pos = pos + 1;
    elseif strncmp(str(pos:end),'<!DOCTYPE',9)
        start = pos;
        break
    elseif strncmp(str(pos:end),'<!--',4)
        stop = xml_findstr(str,'-->',pos,1);
        if isempty(stop), break; end
        pos = stop + 3;
    elseif strncmp(str(pos:end),'<?',2)
        stop = xml_findstr(str,'?>',pos,1);
        if isempty(stop), break; end
        pos = stop + 2;
    else
        break
    end
end

if ~isempty(start)
    % length('<!DOCTYPE') = 9
    stop = xml_findstr(str,'>',start+9,1);
    if ~isempty(stop)
        b = stop + 1;
        %- An internal subset '[ ... ]' opened before that '>' pushes the
        %  end of the declaration to the closing ']>'
        dp = xml_findstr(str,'[',start+9,1);
        if (~isempty(dp) & dp < b)
            k = xml_findstr(str,']>',start+9,1);
            if ~isempty(k)
                b = k + 2;
            else
                warning('[XML] Tag [ in DOCTYPE opened but not closed.')
            end
        end
    else
        warning('[XML] Tag DOCTYPE opened but not closed.')
    end
end

%- Skip prolog from the xml string
str = str(b:end);
---|
390 | |
---|
391 | %----------------------------------------------------------------------- |
---|
function str = strip(str)
% Delete every white-space character (as reported by isspace) from str.
str(isspace(str)) = '';
---|
396 | |
---|
397 | %----------------------------------------------------------------------- |
---|
function str = normalize(str)
% Turn every white-space character (space, newline, carriage return,
% tab, ...) into a blank and squeeze each run of blanks down to one.
ws = find(isspace(str) == 1);
str(ws) = ' ';
if isempty(ws)
    return
end
%- Distance from each blank to the next one (0 for the last): a distance
%- of 1 means the blank is directly followed by another blank, so it is
%- redundant and can be dropped
gap = [ws(2:end) ws(end)] - ws;
str(ws(find(gap == 1))) = [];
---|
409 | |
---|
410 | %----------------------------------------------------------------------- |
---|
function str = entity(str)
% Replace the five predefined XML entity references by the characters
% they stand for:
%   &lt; -> <    &gt; -> >    &quot; -> "    &apos; -> '    &amp; -> &
%
% str - text that may contain entity references (input) / decoded text
%       (output)
%
% '&amp;' is handled last so that an escaped ampersand followed by an
% entity name (e.g. '&amp;lt;') decodes to the literal text '&lt;' and
% is not decoded a second time.
str = strrep(str,'&lt;','<');
str = strrep(str,'&gt;','>');
str = strrep(str,'&quot;','"');
str = strrep(str,'&apos;','''');
str = strrep(str,'&amp;','&');
---|
417 | |
---|
418 | %----------------------------------------------------------------------- |
---|
function str = erode(str)
% Remove at most one blank from the beginning and at most one blank from
% the end of str.

%- Leading blank
if ~isempty(str)
    if str(1) == ' '
        str(1) = '';
    end
end
%- Trailing blank (str may have become empty in the meantime)
if ~isempty(str)
    if str(end) == ' '
        str(end) = '';
    end
end
---|