function tree = xml_parser(xmlstr)
% XML (eXtensible Markup Language) Processor
% FORMAT tree = xml_parser(xmlstr)
%
% xmlstr  - XML string to parse (non-empty character vector)
% tree    - tree structure corresponding to the XML file: a cell array of
%           node structures, where the uid of a node is its index in the
%           cell array
%__________________________________________________________________________
%
% xml_parser.m is an XML 1.0 (http://www.w3.org/TR/REC-xml) parser.
% It aims to be fully conforming. It is currently not a validating
% XML processor.
%
% A description of the tree structure provided in output is detailed in
% the header of this m-file.
%__________________________________________________________________________
% Copyright (C) 2002-2015  http://www.artefact.tk/

% Guillaume Flandin
% $Id: xml_parser.m 6480 2015-06-13 01:08:30Z guillaume $

% XML Processor for GNU Octave and MATLAB (The Mathworks, Inc.)
% Copyright (C) 2002-2015 Guillaume Flandin <Guillaume@artefact.tk>
%
% This program is free software; you can redistribute it and/or
% modify it under the terms of the GNU General Public License
% as published by the Free Software Foundation; either version 2
% of the License, or any later version.
%
% This program is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
% GNU General Public License for more details.
%
% You should have received a copy of the GNU General Public License
% along with this program; if not, write to the Free Software
% Foundation Inc, 59 Temple Pl. - Suite 330, Boston, MA 02111-1307, USA.
%--------------------------------------------------------------------------

% Suggestions for improvement and fixes are always welcome, although no
% guarantee is made whether and when they will be implemented.
% Send requests to <Guillaume@artefact.tk>
% Check also the latest developments on the following webpage:
%           <http://www.artefact.tk/software/matlab/xml/>
%--------------------------------------------------------------------------

% The implementation of this XML parser is much inspired from a
% Javascript parser that used to be available at <http://www.jeremie.com/>

% A C-MEX file xml_findstr.c is also required, to encompass some
% limitations of the built-in FINDSTR function.
% Compile it on your architecture using 'mex -O xml_findstr.c' command
% if the compiled version for your system is not provided.
% If this function does not behave as expected, comment the line
% '#define __HACK_MXCHAR__' in xml_findstr.c and compile it again.
%--------------------------------------------------------------------------

% Structure of the output tree:
% There are 5 types of nodes in an XML file: element, chardata, cdata,
% pi and comment.
% Each of them contains an UID (Unique Identifier): an integer between
% 1 and the number of nodes of the XML file.
%
%    element (a tag <name key="value"> [contents] </name>
%       |_ type:       'element'
%       |_ name:       string
%       |_ attributes: cell array of struct 'key' and 'val' or []
%       |_ contents:   double array of uid's or [] if empty
%       |_ parent:     uid of the parent ([] if root)
%       |_ uid:        double
%
%    chardata (a character array)
%       |_ type:   'chardata'
%       |_ value:  string
%       |_ parent: uid of the parent
%       |_ uid:    double
%
%    cdata (a literal string <![CDATA[value]]>)
%       |_ type:   'cdata'
%       |_ value:  string
%       |_ parent: uid of the parent
%       |_ uid:    double
%
%    pi (a processing instruction <?target value ?>)
%       |_ type:   'pi'
%       |_ target: string (may be empty)
%       |_ value:  string
%       |_ parent: uid of the parent
%       |_ uid:    double
%
%    comment (a comment <!-- value -->)
%       |_ type:   'comment'
%       |_ value:  string
%       |_ parent: uid of the parent
%       |_ uid:    double
%
%--------------------------------------------------------------------------

% TODO/BUG/FEATURES:
%  - [compile] only a warning if TagStart is empty ?
%  - [attribution] should look for " and ' rather than only "
%  - [main] with normalize as a preprocessing, CDATA are modified
%  - [prolog] look for a DOCTYPE in the whole string even if it occurs
%    only in a far CDATA tag, bug even if the doctype is inside a comment
%  - [tag_element] erode should replace normalize here
%  - remove globals? uppercase globals rather persistent (clear mfile)?
%  - xml_findstr is indeed xml_strfind according to Mathworks vocabulary
%  - problem with entities: do we need to convert them here? (e.g. &eacute;)
%--------------------------------------------------------------------------

%- XML string to parse and number of tags read
global xmlstring Xparse_count xtree;

%- Check input arguments
%  (nargin is tested first so that a call without argument reports the
%  message below instead of failing on the evaluation of xmlstr)
if nargin < 1 || isempty(xmlstr)
    error('[XML] Not enough parameters.')
elseif ~ischar(xmlstr) || sum(size(xmlstr)>1)>1
    error('[XML] Input must be a string.')
end

try
    %- Initialize number of tags (<=> uid)
    Xparse_count = 0;

    %- Remove prolog and white space characters from the XML string
    xmlstring = normalize(prolog(xmlstr));

    %- Initialize the XML tree
    xtree = {};
    tree = fragment;
    tree.str = 1;
    tree.parent = 0;

    %- Parse the XML string
    tree = compile(tree);

    %- Return the XML tree
    tree = xtree;
catch err
    %- Do not leave the parser state (which includes a copy of the whole
    %  XML string) in the global workspace when parsing fails
    clear global xmlstring Xparse_count xtree;
    rethrow(err);
end

%- Remove global variables from the workspace
clear global xmlstring Xparse_count xtree;
|
---|
141 |
|
---|
142 | %==========================================================================
|
---|
143 | % SUBFUNCTIONS
|
---|
144 |
|
---|
145 | %--------------------------------------------------------------------------
|
---|
function frag = compile(frag)
% Parse the content of a fragment of the global XML string and append the
% nodes that are found to the global tree.
%
% frag - fragment structure with fields:
%          str    - index in xmlstring of the next character to read
%          parent - uid of the enclosing element (0 at the root level)
%          end    - name of the enclosing element ('' at the root level)
%
% Returns the fragment with frag.str moved after the closing tag of the
% enclosing element (or at/after the end of the string).
% Raises an error if character data follows the last tag of the string.
global xmlstring xtree Xparse_count;

while 1,
    %- Stop when the read position has reached the end of the string.
    %  (The former second clause compared a two-character substring with
    %  ' ' and therefore could never be true; it has been removed.)
    if length(xmlstring) <= frag.str
        return
    end
    TagStart = xml_findstr(xmlstring,'<',frag.str,1);
    if isempty(TagStart)
        %- Character data after the last tag
        error('[XML] Unknown data at the end of the XML file.');
    elseif TagStart > frag.str
        if strcmp(xmlstring(frag.str:TagStart-1),' ')
            %- A single white space before a tag (ignore)
            frag.str = TagStart;
        else
            %- Character data
            Xparse_count = Xparse_count + 1;
            xtree{Xparse_count} = chardata;
            xtree{Xparse_count}.value = erode(entity(xmlstring(frag.str:TagStart-1)));
            xtree{Xparse_count}.parent = frag.parent;
            xtree{frag.parent}.contents = [xtree{frag.parent}.contents Xparse_count];
            frag.str = TagStart;
        end
    elseif strcmp(xmlstring(frag.str+1),'?')
        %- Processing instruction
        frag = tag_pi(frag);
    elseif length(xmlstring)-frag.str>4 && strcmp(xmlstring(frag.str+1:frag.str+3),'!--')
        %- Comment
        frag = tag_comment(frag);
    elseif length(xmlstring)-frag.str>9 && strcmp(xmlstring(frag.str+1:frag.str+8),'![CDATA[')
        %- Literal data
        frag = tag_cdata(frag);
    else
        %- A tag element (empty (<.../>) or not)
        if ~isempty(frag.end)
            endmk = ['/' frag.end '>'];
        else
            endmk = '/>';
        end
        %- Compare the whole tag (up to its '>') with the expected end
        %  marker, ignoring white space, so that '</name >' is recognised
        %  and no index beyond the end of the string is ever read.
        tagclose = xml_findstr(xmlstring,'>',frag.str,1);
        if ~isempty(tagclose) && ...
                strcmp(strip(xmlstring(frag.str+1:tagclose)),endmk)
            frag.str = tagclose + 1;
            return
        else
            frag = tag_element(frag);
        end
    end
end
|
---|
208 |
|
---|
209 | %--------------------------------------------------------------------------
|
---|
function frag = tag_element(frag)
% Read the element whose start tag begins at frag.str, store it in the
% global tree and, unless it is an empty-element tag (<.../>), parse its
% contents recursively.
%
% frag - fragment structure (fields str, parent, end) positioned on '<'
%
% Returns the fragment with frag.str moved after the element.
% Raises an error if no '>' follows the opening '<'.
global xmlstring xtree Xparse_count;

tagend = xml_findstr(xmlstring,'>',frag.str,1);
if isempty(tagend)
    error('[XML] Tag < opened but not closed.');
end

%- Empty-element tag: leave the trailing '/' out of the start tag
isemptytag = strcmp(xmlstring(tagend-1:tagend),'/>');
if isemptytag
    tagend = tagend - 1;
end

%- Split the start tag into the element name and the attributes string
starttag = normalize(xmlstring(frag.str+1:tagend-1));
sp = xml_findstr(starttag,' ',1,1);
if isempty(sp)
    name    = starttag;
    attribs = '';
else
    name    = starttag(1:sp-1);
    attribs = starttag(sp+1:end);
end

%- Store the new element node and link it to its parent (if any)
Xparse_count = Xparse_count + 1;
uid = Xparse_count;
xtree{uid} = element;
xtree{uid}.name = strip(name);
if frag.parent
    xtree{uid}.parent = frag.parent;
    xtree{frag.parent}.contents = [xtree{frag.parent}.contents uid];
end
if ~isempty(attribs)
    xtree{uid}.attributes = attribution(attribs);
end

if isemptytag
    %- Skip the two characters of '/>'
    frag.str = tagend + 2;
else
    %- Parse the contents until the matching closing tag
    contents = fragment;
    contents.str = tagend + 1;
    contents.end = name;
    contents.parent = uid;
    contents = compile(contents);
    frag.str = contents.str;
end
|
---|
250 |
|
---|
251 | %--------------------------------------------------------------------------
|
---|
function frag = tag_pi(frag)
% Read the processing instruction (<?target value ?>) starting at frag.str
% and store it in the global tree.
%
% frag - fragment structure (fields str, parent, end) positioned on '<?'
%
% Returns the fragment with frag.str moved after the closing '?>'. If the
% instruction is never closed, a warning is issued and frag.str is moved
% past the end of the string so that the caller stops parsing instead of
% looping forever on the same position.
global xmlstring xtree Xparse_count;
piend = xml_findstr(xmlstring,'?>',frag.str,1);
if isempty(piend)
    warning('[XML] Tag <? opened but not closed.')
    frag.str = length(xmlstring) + 1;
else
    nextspace = xml_findstr(xmlstring,' ',frag.str,1);
    Xparse_count = Xparse_count + 1;
    xtree{Xparse_count} = pri;
    %- No white space inside the instruction (nextspace may be empty when
    %  the remaining string contains no space at all), or a white space
    %  right after '<?': everything is stored as the value
    if isempty(nextspace) || nextspace > piend || nextspace == frag.str+2
        xtree{Xparse_count}.value = erode(xmlstring(frag.str+2:piend-1));
    else
        xtree{Xparse_count}.value = erode(xmlstring(nextspace+1:piend-1));
        xtree{Xparse_count}.target = erode(xmlstring(frag.str+2:nextspace));
    end
    if frag.parent
        xtree{frag.parent}.contents = [xtree{frag.parent}.contents Xparse_count];
        xtree{Xparse_count}.parent = frag.parent;
    end
    frag.str = piend+2;
end
|
---|
273 |
|
---|
274 | %--------------------------------------------------------------------------
|
---|
function frag = tag_comment(frag)
% Read the comment (<!-- value -->) starting at frag.str and store it in
% the global tree.
%
% frag - fragment structure (fields str, parent, end) positioned on '<!--'
%
% Returns the fragment with frag.str moved after the closing '-->'. If the
% comment is never closed, a warning is issued and frag.str is moved past
% the end of the string so that the caller stops parsing instead of
% looping forever on the same position.
global xmlstring xtree Xparse_count;
cend = xml_findstr(xmlstring,'-->',frag.str,1);
if isempty(cend)
    warning('[XML] Tag <!-- opened but not closed.')
    frag.str = length(xmlstring) + 1;
else
    Xparse_count = Xparse_count + 1;
    xtree{Xparse_count} = comment;
    %- '<!--' is 4 characters long
    xtree{Xparse_count}.value = erode(xmlstring(frag.str+4:cend-1));
    if frag.parent
        xtree{frag.parent}.contents = [xtree{frag.parent}.contents Xparse_count];
        xtree{Xparse_count}.parent = frag.parent;
    end
    frag.str = cend+3;
end
|
---|
290 |
|
---|
291 | %--------------------------------------------------------------------------
|
---|
function frag = tag_cdata(frag)
% Read the CDATA section (<![CDATA[value]]>) starting at frag.str and store
% its content, unmodified, in the global tree.
%
% frag - fragment structure (fields str, parent, end) positioned on
%        '<![CDATA['
%
% Returns the fragment with frag.str moved after the closing ']]>'. If the
% section is never closed, a warning is issued and frag.str is moved past
% the end of the string so that the caller stops parsing instead of
% looping forever on the same position.
global xmlstring xtree Xparse_count;
cend = xml_findstr(xmlstring,']]>',frag.str,1);
if isempty(cend)
    warning('[XML] Tag <![CDATA[ opened but not closed.')
    frag.str = length(xmlstring) + 1;
else
    Xparse_count = Xparse_count + 1;
    xtree{Xparse_count} = cdata;
    %- '<![CDATA[' is 9 characters long
    xtree{Xparse_count}.value = xmlstring(frag.str+9:cend-1);
    if frag.parent
        xtree{frag.parent}.contents = [xtree{frag.parent}.contents Xparse_count];
        xtree{Xparse_count}.parent = frag.parent;
    end
    frag.str = cend+3;
end
|
---|
307 |
|
---|
308 | %--------------------------------------------------------------------------
|
---|
function attrs = attribution(str)
% Split the attributes string of a start tag into key/value pairs.
%
% str   - attributes string, e.g. 'a="1" b=''2'''
%
% attrs - cell array of structures with fields 'key' (attribute name with
%         all white space removed) and 'val' (attribute value with the
%         predefined entities decoded); empty if no 'key=value' pair is
%         found
%
% Raises an error if an attribute value is not quoted or if its opening
% quote is never closed.

%- Initialize attributes
nbattr = 0;
attrs = cell(nbattr);
%- Look for 'key="value"' or 'key=''value''' substrings
while 1,
    if isempty(str), return; end
    eqpos = xml_findstr(str,'=',1,1);
    if isempty(eqpos), return; end
    %- Opening quote: whichever of " and ' comes first
    qopen = min([xml_findstr(str,'"',1,1),xml_findstr(str,'''',1,1)]);
    if isempty(qopen)
        error('[XML] Attribute value is not quoted.');
    end
    %- Closing quote: must be of the same kind as the opening one, so that
    %  the other kind of quote may appear inside the value (a="it's")
    if qopen < length(str)
        qclose = xml_findstr(str,str(qopen),qopen+1,1);
    else
        qclose = [];
    end
    if isempty(qclose)
        error('[XML] Attribute value opened but not closed.');
    end
    nbattr = nbattr + 1;
    attrs{nbattr}.key = strip(str(1:(eqpos-1)));
    attrs{nbattr}.val = entity(str((qopen+1):(qclose-1)));
    str = str((qclose+1):end);
end
|
---|
324 |
|
---|
325 | %--------------------------------------------------------------------------
|
---|
function elm = element
% Create a blank 'element' node whose uid is the current value of the
% global node counter Xparse_count.
global Xparse_count;
elm.type       = 'element';
elm.name       = '';
elm.attributes = [];
elm.contents   = [];
elm.parent     = [];
elm.uid        = Xparse_count;
|
---|
329 |
|
---|
330 | %--------------------------------------------------------------------------
|
---|
function cdat = chardata
% Create a blank 'chardata' node whose uid is the current value of the
% global node counter Xparse_count.
global Xparse_count;
cdat.type   = 'chardata';
cdat.value  = '';
cdat.parent = [];
cdat.uid    = Xparse_count;
|
---|
334 |
|
---|
335 | %--------------------------------------------------------------------------
|
---|
function cdat = cdata
% Create a blank 'cdata' node whose uid is the current value of the
% global node counter Xparse_count.
global Xparse_count;
cdat.type   = 'cdata';
cdat.value  = '';
cdat.parent = [];
cdat.uid    = Xparse_count;
|
---|
339 |
|
---|
340 | %--------------------------------------------------------------------------
|
---|
function proce = pri
% Create a blank 'pi' (processing instruction) node whose uid is the
% current value of the global node counter Xparse_count.
global Xparse_count;
proce.type   = 'pi';
proce.value  = '';
proce.target = '';
proce.parent = [];
proce.uid    = Xparse_count;
|
---|
344 |
|
---|
345 | %--------------------------------------------------------------------------
|
---|
function commt = comment
% Create a blank 'comment' node whose uid is the current value of the
% global node counter Xparse_count.
global Xparse_count;
commt.type   = 'comment';
commt.value  = '';
commt.parent = [];
commt.uid    = Xparse_count;
|
---|
349 |
|
---|
350 | %--------------------------------------------------------------------------
|
---|
function frg = fragment
% Create a blank fragment structure: the three fields 'str', 'parent' and
% 'end' are initialised with empty strings.
frg.str    = '';
frg.parent = '';
frg.end    = '';
|
---|
353 |
|
---|
354 | %--------------------------------------------------------------------------
|
---|
function str = prolog(str)
% Remove the prolog (XML header and DOCTYPE declaration) from an XML string.
%
% str - XML string
%
% Returns the string starting right after the prolog (unchanged if there
% is no prolog). Raises an error if the string contains no '<' at all and
% issues a warning if the header or the DOCTYPE is not closed.

%- Initialize beginning index of elements tree
b = 1;
%- Initial tag
start = xml_findstr(str,'<',1,1);
if isempty(start)
    error('[XML] No tag found.')
end
%- Header (<?xml version="1.0" ... ?>)
%  (the substring is clipped to the end of the string so that a very short
%  input does not trigger an out-of-range index)
if strcmpi(str(start:min(start+2,length(str))),'<?x')
    hend = xml_findstr(str,'?>',start,1);
    if ~isempty(hend)
        b = hend + 2;
    else
        warning('[XML] Header tag incomplete.')
    end
end
%- Doctype (<!DOCTYPE type ... [ declarations ]>)
%  Only a declaration located in the prolog is considered: a '<!DOCTYPE'
%  string inside a comment, a CDATA section or after the first element is
%  part of the document and must not be removed.
start = prolog_doctype(str,b); % length('<!DOCTYPE') = 9
if ~isempty(start)
    dend = xml_findstr(str,'>',start+9,1);
    if ~isempty(dend)
        b = dend + 1;
        dp = xml_findstr(str,'[',start+9,1);
        if (~isempty(dp) && dp < b)
            %- Internal subset: the declaration ends at ']>'
            k = xml_findstr(str,']>',start+9,1);
            if ~isempty(k)
                b = k + 2;
            else
                warning('[XML] Tag [ in DOCTYPE opened but not closed.')
            end
        end
    else
        warning('[XML] Tag DOCTYPE opened but not closed.')
    end
end
%- Skip prolog from the xml string
str = str(b:end);

%--------------------------------------------------------------------------
function start = prolog_doctype(str,pos)
% Return the index of the '<!DOCTYPE' declaration found when scanning STR
% from index POS, skipping only comments and processing instructions;
% return [] as soon as any other tag is met or if nothing is found.
start = [];
n = length(str);
while pos <= n
    lt = xml_findstr(str,'<',pos,1);
    if isempty(lt), return; end
    head = str(lt:min(lt+8,n));
    if strncmp(head,'<!DOCTYPE',9)
        start = lt;
        return
    elseif strncmp(head,'<!--',4)
        mk = '-->';
    elseif strncmp(head,'<?',2)
        mk = '?>';
    else
        %- First element (or any other markup) reached: end of the prolog
        return
    end
    mkpos = xml_findstr(str,mk,lt,1);
    if isempty(mkpos), return; end
    pos = mkpos + length(mk);
end
|
---|
393 |
|
---|
394 | %--------------------------------------------------------------------------
|
---|
function str = strip(str)
% Remove every white space character (as reported by ISSPACE) from STR.
blank = isspace(str);
str(blank) = [];
|
---|
397 |
|
---|
398 | %--------------------------------------------------------------------------
|
---|
function str = normalize(str)
% Replace every white space character (space, newline, carriage return,
% tab, ...) of STR by a space and collapse each run of consecutive white
% space characters into a single space.
ws = find(isspace(str));
if isempty(ws)
    return
end
str(ws) = ' ';
% A white space directly followed by another one is dropped, so that only
% the last character of each run is kept
followed = diff(ws) == 1;
str(ws(followed)) = [];
|
---|
409 |
|
---|
410 | %--------------------------------------------------------------------------
|
---|
function str = entity(str)
% Decode the five predefined XML entities (lt, gt, quot, apos and amp) of
% STR into the characters <, >, ", ' and &.
%
% The ampersand entity is decoded last so that an escaped ampersand
% followed by e.g. 'lt;' yields the literal text and not a '<'.
%
% NOTE: the entity names are assembled from two pieces on purpose. A
% previous copy of this file had its entity patterns decoded by an HTML
% tool, which turned every replacement into a no-op (and the apostrophe
% one into a syntax error); written this way the patterns survive such a
% round trip.
amp = '&';
str = strrep(str,[amp 'lt;'],'<');
str = strrep(str,[amp 'gt;'],'>');
str = strrep(str,[amp 'quot;'],'"');
str = strrep(str,[amp 'apos;'],'''');
str = strrep(str,[amp 'amp;'],'&');
|
---|
417 |
|
---|
418 | %--------------------------------------------------------------------------
|
---|
function str = erode(str)
% Remove at most one leading and at most one trailing space from STR.
if isempty(str)
    return
end
if str(1) == ' '
    str(1) = [];
end
% The string may have become empty after the first removal
if ~isempty(str) && str(end) == ' '
    str(end) = [];
end
|
---|