function tree = xml_parser(xmlstr)
% XML (eXtensible Markup Language) Processor
% FORMAT tree = xml_parser(xmlstr)
%
% xmlstr  - XML string to parse
% tree    - cell array of node structures corresponding to the XML file
%           (tree{uid} is the node whose unique identifier is uid)
%__________________________________________________________________________
%
% xml_parser.m is an XML 1.0 (http://www.w3.org/TR/REC-xml) parser.
% It aims to be fully conforming. It is currently not a validating
% XML processor.
%
% A description of the tree structure provided in output is detailed in
% the header of this m-file.
%__________________________________________________________________________
% Copyright (C) 2002-2015  http://www.artefact.tk/

% Guillaume Flandin
% $Id: xml_parser.m 6480 2015-06-13 01:08:30Z guillaume $

% XML Processor for GNU Octave and MATLAB (The Mathworks, Inc.)
% Copyright (C) 2002-2015 Guillaume Flandin <Guillaume@artefact.tk>
%
% This program is free software; you can redistribute it and/or
% modify it under the terms of the GNU General Public License
% as published by the Free Software Foundation; either version 2
% of the License, or any later version.
%
% This program is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
% GNU General Public License for more details.
%
% You should have received a copy of the GNU General Public License
% along with this program; if not, write to the Free Software
% Foundation Inc, 59 Temple Pl. - Suite 330, Boston, MA 02111-1307, USA.
%--------------------------------------------------------------------------

% Suggestions for improvement and fixes are always welcome, although no
% guarantee is made whether and when they will be implemented.
% Send requests to <Guillaume@artefact.tk>
% Check also the latest developments on the following webpage:
%           <http://www.artefact.tk/software/matlab/xml/>
%--------------------------------------------------------------------------

% The implementation of this XML parser is much inspired from a
% Javascript parser that used to be available at <http://www.jeremie.com/>

% A C-MEX file xml_findstr.c is also required, to encompass some
% limitations of the built-in FINDSTR function.
% Compile it on your architecture using 'mex -O xml_findstr.c' command
% if the compiled version for your system is not provided.
% If this function does not behave as expected, comment the line
% '#define __HACK_MXCHAR__' in xml_findstr.c and compile it again.
%--------------------------------------------------------------------------

% Structure of the output tree:
% There are 5 types of nodes in an XML file: element, chardata, cdata,
% pi and comment.
% Each of them contains an UID (Unique Identifier): an integer between
% 1 and the number of nodes of the XML file.
%
%    element (a tag <name key="value"> [contents] </name>)
%       |_ type:       'element'
%       |_ name:       string
%       |_ attributes: cell array of struct 'key' and 'val' or []
%       |_ contents:   double array of uid's or [] if empty
%       |_ parent:     uid of the parent ([] if root)
%       |_ uid:        double
%
%    chardata (a character array)
%       |_ type:   'chardata'
%       |_ value:  string
%       |_ parent: uid of the parent
%       |_ uid:    double
%
%    cdata (a literal string <![CDATA[value]]>)
%       |_ type:   'cdata'
%       |_ value:  string
%       |_ parent: uid of the parent
%       |_ uid:    double
%
%    pi (a processing instruction <?target value ?>)
%       |_ type:   'pi'
%       |_ target: string (may be empty)
%       |_ value:  string
%       |_ parent: uid of the parent
%       |_ uid:    double
%
%    comment (a comment <!-- value -->)
%       |_ type:   'comment'
%       |_ value:  string
%       |_ parent: uid of the parent
%       |_ uid:    double
%
%--------------------------------------------------------------------------

% TODO/BUG/FEATURES:
%  - [compile] only a warning if TagStart is empty ?
%  - [attribution] should look for " and ' rather than only "
%  - [main] with normalize as a preprocessing, CDATA are modified
%  - [prolog] look for a DOCTYPE in the whole string even if it occurs
%    only in a far CDATA tag, bug even if the doctype is inside a comment
%  - [tag_element] erode should replace normalize here
%  - remove globals? uppercase globals rather persistent (clear mfile)?
%  - xml_findstr is indeed xml_strfind according to Mathworks vocabulary
%  - problem with entities: do we need to convert them here? (&eacute;)
%--------------------------------------------------------------------------

%- XML string to parse and number of tags read
global xmlstring Xparse_count xtree;

%- Check input arguments
%  (nargin is tested first so that a call without argument reports the
%  intended message instead of an undefined variable error)
if nargin < 1 || isempty(xmlstr)
    error('[XML] Not enough parameters.')
elseif ~ischar(xmlstr) || sum(size(xmlstr)>1)>1
    error('[XML] Input must be a string.')
end

try
    %- Initialize number of tags (<=> uid)
    Xparse_count = 0;

    %- Remove prolog and white space characters from the XML string
    xmlstring = normalize(prolog(xmlstr));

    %- Initialize the XML tree and the top-level fragment
    %  (parent = 0 marks the document level: nodes found there have no parent)
    xtree = {};
    root = fragment;
    root.str = 1;
    root.parent = 0;

    %- Parse the XML string (nodes are accumulated in the global xtree)
    compile(root);

    %- Return the XML tree
    tree = xtree;
catch err
    %- Do not leave the parser state in the global workspace on failure
    clear global xmlstring Xparse_count xtree;
    rethrow(err);
end

%- Remove global variables from the workspace
clear global xmlstring Xparse_count xtree;
| 141 |
|
---|
| 142 | %==========================================================================
|
---|
| 143 | % SUBFUNCTIONS
|
---|
| 144 |
|
---|
| 145 | %--------------------------------------------------------------------------
|
---|
function frag = compile(frag)
% Parse the global XML string from position frag.str onwards.
%
% frag - fragment structure with fields:
%          str    - index in xmlstring where parsing (re)starts
%          parent - uid of the enclosing element (0 at document level)
%          end    - name of the enclosing element ('' at document level)
%
% Nodes are appended to the global cell array xtree. The function returns
% when the end of the string is reached or when the end tag of the
% enclosing element has been consumed; frag.str then points just after it.
global xmlstring xtree Xparse_count;

while true
    %- Nothing left to parse
    if length(xmlstring) <= frag.str
        return
    end
    TagStart = xml_findstr(xmlstring,'<',frag.str,1);
    if isempty(TagStart)
        %- Remaining text that is not followed by any tag
        error('[XML] Unknown data at the end of the XML file.');
    elseif TagStart > frag.str
        %- Text before the next tag (a single white space is ignored)
        if ~strcmp(xmlstring(frag.str:TagStart-1),' ')
            %- Character data
            Xparse_count = Xparse_count + 1;
            xtree{Xparse_count} = chardata;
            xtree{Xparse_count}.value = erode(entity(xmlstring(frag.str:TagStart-1)));
            %- At document level (parent = 0) there is no node to attach to
            if frag.parent
                xtree{Xparse_count}.parent = frag.parent;
                xtree{frag.parent}.contents = [xtree{frag.parent}.contents Xparse_count];
            end
        end
        frag.str = TagStart;
    elseif strcmp(xmlstring(frag.str+1),'?')
        %- Processing instruction
        frag = tag_pi(frag);
    elseif length(xmlstring)-frag.str>4 && strcmp(xmlstring(frag.str+1:frag.str+3),'!--')
        %- Comment
        frag = tag_comment(frag);
    elseif length(xmlstring)-frag.str>9 && strcmp(xmlstring(frag.str+1:frag.str+8),'![CDATA[')
        %- Literal data
        frag = tag_cdata(frag);
    else
        %- Either the end tag of the enclosing element or a new tag
        %  element (empty (<.../>) or not)
        if ~isempty(frag.end)
            endmk = ['/' frag.end '>'];
        else
            endmk = '/>';
        end
        %- Clamp the slice so a truncated document cannot index past the
        %  end of the string
        last = min(frag.str+length(frag.end)+2, length(xmlstring));
        if strcmp(xmlstring(frag.str+1:last),endmk)
            frag.str = frag.str + length(frag.end)+3;
            return
        else
            frag = tag_element(frag);
        end
    end
end
| 208 |
|
---|
| 209 | %--------------------------------------------------------------------------
|
---|
function frag = tag_element(frag)
% Parse the element whose '<' is located at frag.str.
%
% A new 'element' node is stored in the global xtree and, when the tag is
% not an empty one (<.../>), its contents are parsed recursively with
% compile. On return frag.str points after the element.
global xmlstring xtree Xparse_count;

gt = xml_findstr(xmlstring,'>',frag.str,1);
if isempty(gt)
    error('[XML] Tag < opened but not closed.');
end

%- For an empty tag (<.../>), step back so that gt points at the '/'
selfclosing = strcmp(xmlstring(gt-1:gt),'/>');
if selfclosing
    gt = gt - 1;
end

%- Split the start tag into the element name and the attribute string
starttag = normalize(xmlstring(frag.str+1:gt-1));
sp = xml_findstr(starttag,' ',1,1);
if isempty(sp)
    name    = starttag;
    attribs = '';
else
    name    = starttag(1:sp-1);
    attribs = starttag(sp+1:end);
end

%- Build the node and store it under a new uid
Xparse_count = Xparse_count + 1;
uid  = Xparse_count;
node = element;
node.name = strip(name);
if frag.parent
    node.parent = frag.parent;
    xtree{frag.parent}.contents = [xtree{frag.parent}.contents uid];
end
if ~isempty(attribs)
    node.attributes = attribution(attribs);
end
xtree{uid} = node;

if selfclosing
    %- Skip the two characters of '/>'
    frag.str = gt + 2;
else
    %- Parse the contents up to the matching end tag
    inner = fragment;
    inner.str    = gt + 1;
    inner.end    = name;
    inner.parent = uid;
    inner = compile(inner);
    frag.str = inner.str;
end
| 250 |
|
---|
| 251 | %--------------------------------------------------------------------------
|
---|
function frag = tag_pi(frag)
% Parse the processing instruction (<?target value ?>) starting at frag.str.
%
% A new 'pi' node is stored in the global xtree and frag.str is moved after
% the closing '?>'. If the instruction is never closed, a warning is issued
% and frag.str is moved past the end of the XML string.
global xmlstring xtree Xparse_count;

piend = xml_findstr(xmlstring,'?>',frag.str,1);
if isempty(piend)
    warning('[XML] Tag <? opened but not closed.')
    %- Skip the remaining text: leaving frag.str unchanged would make the
    %  calling loop in compile process this same tag forever
    frag.str = length(xmlstring) + 1;
else
    nextspace = xml_findstr(xmlstring,' ',frag.str,1);
    Xparse_count = Xparse_count + 1;
    xtree{Xparse_count} = pri;
    %- No target when there is no white space inside the instruction
    %  (nextspace may be empty if no space remains in the string) or when
    %  the white space immediately follows '<?'
    if isempty(nextspace) || nextspace > piend || nextspace == frag.str+2
        xtree{Xparse_count}.value = erode(xmlstring(frag.str+2:piend-1));
    else
        xtree{Xparse_count}.value = erode(xmlstring(nextspace+1:piend-1));
        xtree{Xparse_count}.target = erode(xmlstring(frag.str+2:nextspace));
    end
    if frag.parent
        xtree{frag.parent}.contents = [xtree{frag.parent}.contents Xparse_count];
        xtree{Xparse_count}.parent = frag.parent;
    end
    frag.str = piend+2;
end
| 273 |
|
---|
| 274 | %--------------------------------------------------------------------------
|
---|
function frag = tag_comment(frag)
% Parse the comment (<!-- value -->) starting at frag.str.
%
% A new 'comment' node is stored in the global xtree and frag.str is moved
% after the closing '-->'. If the comment is never closed, a warning is
% issued and frag.str is moved past the end of the XML string.
global xmlstring xtree Xparse_count;

cend = xml_findstr(xmlstring,'-->',frag.str,1);
if isempty(cend)
    warning('[XML] Tag <!-- opened but not closed.')
    %- Skip the remaining text: leaving frag.str unchanged would make the
    %  calling loop in compile process this same tag forever
    frag.str = length(xmlstring) + 1;
else
    Xparse_count = Xparse_count + 1;
    xtree{Xparse_count} = comment;
    %- The text starts after the 4 characters of '<!--'
    xtree{Xparse_count}.value = erode(xmlstring(frag.str+4:cend-1));
    if frag.parent
        xtree{frag.parent}.contents = [xtree{frag.parent}.contents Xparse_count];
        xtree{Xparse_count}.parent = frag.parent;
    end
    frag.str = cend+3;
end
| 290 |
|
---|
| 291 | %--------------------------------------------------------------------------
|
---|
function frag = tag_cdata(frag)
% Parse the CDATA section (<![CDATA[value]]>) starting at frag.str.
%
% A new 'cdata' node is stored in the global xtree and frag.str is moved
% after the closing ']]>'. The value is copied as is (no entity decoding,
% no trimming). If the section is never closed, a warning is issued and
% frag.str is moved past the end of the XML string.
global xmlstring xtree Xparse_count;

cend = xml_findstr(xmlstring,']]>',frag.str,1);
if isempty(cend)
    warning('[XML] Tag <![CDATA[ opened but not closed.')
    %- Skip the remaining text: leaving frag.str unchanged would make the
    %  calling loop in compile process this same tag forever
    frag.str = length(xmlstring) + 1;
else
    Xparse_count = Xparse_count + 1;
    xtree{Xparse_count} = cdata;
    %- The text starts after the 9 characters of '<![CDATA['
    xtree{Xparse_count}.value = xmlstring(frag.str+9:cend-1);
    if frag.parent
        xtree{frag.parent}.contents = [xtree{frag.parent}.contents Xparse_count];
        xtree{Xparse_count}.parent = frag.parent;
    end
    frag.str = cend+3;
end
| 307 |
|
---|
| 308 | %--------------------------------------------------------------------------
|
---|
function attrs = attribution(str)
% Parse the attributes of a start tag.
%
% str   - attribute string: key="value" or key='value' pairs
% attrs - cell array of structures with fields 'key' (white space removed)
%         and 'val' (predefined entities decoded); empty if no attribute
%
% An error is raised when a value is not enclosed in matching quotes.

%- Initialize attributes
nbattr = 0;
attrs  = cell(0);
%- Look for 'key="value"' substrings
while true
    eq = xml_findstr(str,'=',1,1);
    if isempty(str) || isempty(eq), return; end
    %- The value starts at the first quote character, single or double
    qopen = min([xml_findstr(str,'"',1,1), xml_findstr(str,'''',1,1)]);
    if isempty(qopen) || qopen == length(str)
        error('[XML] Attribute value is not properly quoted.');
    end
    %- It ends at the next occurrence of the SAME quote character, so that
    %  the other kind of quote may appear inside the value ("it's")
    qclose = xml_findstr(str,str(qopen),qopen+1,1);
    if isempty(qclose)
        error('[XML] Attribute value is not properly quoted.');
    end
    nbattr = nbattr + 1;
    attrs{nbattr}.key = strip(str(1:(eq-1)));
    attrs{nbattr}.val = entity(str((qopen+1):(qclose-1)));
    %- Continue with what follows the closing quote
    str = str((qclose+1):end);
end
| 324 |
|
---|
| 325 | %--------------------------------------------------------------------------
|
---|
function elm = element
% Return a blank 'element' node whose uid is the current value of the
% global node counter Xparse_count.
global Xparse_count;
elm = struct( ...
    'type',       'element', ...
    'name',       '', ...
    'attributes', [], ...
    'contents',   [], ...
    'parent',     [], ...
    'uid',        Xparse_count);
| 329 |
|
---|
| 330 | %--------------------------------------------------------------------------
|
---|
function cdat = chardata
% Return a blank 'chardata' node whose uid is the current value of the
% global node counter Xparse_count.
global Xparse_count;
cdat = struct( ...
    'type',   'chardata', ...
    'value',  '', ...
    'parent', [], ...
    'uid',    Xparse_count);
| 334 |
|
---|
| 335 | %--------------------------------------------------------------------------
|
---|
function cdat = cdata
% Return a blank 'cdata' node whose uid is the current value of the
% global node counter Xparse_count.
global Xparse_count;
cdat = struct( ...
    'type',   'cdata', ...
    'value',  '', ...
    'parent', [], ...
    'uid',    Xparse_count);
| 339 |
|
---|
| 340 | %--------------------------------------------------------------------------
|
---|
function proce = pri
% Return a blank 'pi' (processing instruction) node whose uid is the
% current value of the global node counter Xparse_count.
global Xparse_count;
proce = struct( ...
    'type',   'pi', ...
    'value',  '', ...
    'target', '', ...
    'parent', [], ...
    'uid',    Xparse_count);
| 344 |
|
---|
| 345 | %--------------------------------------------------------------------------
|
---|
function commt = comment
% Return a blank 'comment' node whose uid is the current value of the
% global node counter Xparse_count.
global Xparse_count;
commt = struct( ...
    'type',   'comment', ...
    'value',  '', ...
    'parent', [], ...
    'uid',    Xparse_count);
| 349 |
|
---|
| 350 | %--------------------------------------------------------------------------
|
---|
function frg = fragment
% Return a blank fragment structure: the parsing state with fields
% 'str', 'parent' and 'end', all initialised to the empty string.
frg = struct( ...
    'str',    '', ...
    'parent', '', ...
    'end',    '');
| 353 |
|
---|
| 354 | %--------------------------------------------------------------------------
|
---|
function str = prolog(str)
% Remove the prolog (XML declaration and DOCTYPE) from an XML string.
%
% str - on input, the whole XML string; on output, the part that follows
%       the header (<?x... ?>) and the document type declaration
%       (<!DOCTYPE ... [ declarations ]>) when they are present
%
% An error is raised if the string contains no '<' at all; incomplete
% header or DOCTYPE tags only produce a warning.

%- Initialize beginning index of elements tree
b = 1;
%- Initial tag
start = xml_findstr(str,'<',1,1);
if isempty(start)
    error('[XML] No tag found.')
end
%- Header (<?xml version="1.0" ... ?>)
%  strncmpi is false when fewer than 3 characters remain, so very short
%  inputs such as '<a' do not index past the end of the string
if strncmpi(str(start:end),'<?x',3)
    hend = xml_findstr(str,'?>',1,1);
    if ~isempty(hend)
        b = hend + 2;
    else
        warning('[XML] Header tag incomplete.')
    end
end
%- Doctype (<!DOCTYPE type ... [ declarations ]>)
start = xml_findstr(str,'<!DOCTYPE',b,1);  % length('<!DOCTYPE') = 9
if ~isempty(start)
    dend = xml_findstr(str,'>',start+9,1);
    if ~isempty(dend)
        b = dend + 1;
        %- A '[' before that '>' opens an internal subset, in which case
        %  the declaration really ends at the following ']>'
        dp = xml_findstr(str,'[',start+9,1);
        if (~isempty(dp) && dp < b)
            k = xml_findstr(str,']>',start+9,1);
            if ~isempty(k)
                b = k + 2;
            else
                warning('[XML] Tag [ in DOCTYPE opened but not closed.')
            end
        end
    else
        warning('[XML] Tag DOCTYPE opened but not closed.')
    end
end
%- Skip prolog from the xml string
str = str(b:end);
| 393 |
|
---|
| 394 | %--------------------------------------------------------------------------
|
---|
function str = strip(str)
% Remove every white space character (as reported by isspace) from str.
keep = ~isspace(str);
str  = str(keep);
| 397 |
|
---|
| 398 | %--------------------------------------------------------------------------
|
---|
function str = normalize(str)
% Replace every white space character (space, newline, carriage return,
% tab, ...) by a space and collapse each run of white space characters
% into a single space.
ws = find(isspace(str));
str(ws) = ' ';
if ~isempty(ws)
    %- A white space is dropped when the next character is a white space
    %  too, i.e. only the last one of each run is kept
    followed = [diff(ws) == 1, false];
    str(ws(followed)) = [];
end
| 409 |
|
---|
| 410 | %--------------------------------------------------------------------------
|
---|
function str = entity(str)
% Replace the five predefined XML entity references by the character they
% stand for: &lt; &gt; &quot; &apos; and &amp;.
%
% &amp; is decoded last so that an escaped reference such as '&amp;lt;'
% yields the text '&lt;' and is not decoded a second time into '<'.
str = strrep(str,'&lt;','<');
str = strrep(str,'&gt;','>');
str = strrep(str,'&quot;','"');
str = strrep(str,'&apos;','''');
str = strrep(str,'&amp;','&');
| 417 |
|
---|
| 418 | %--------------------------------------------------------------------------
|
---|
function str = erode(str)
% Remove at most one leading and at most one trailing space from str.
if ~isempty(str) && str(1) == ' '
    str = str(2:end);
end
if ~isempty(str) && str(end) == ' '
    str = str(1:end-1);
end