function tree = xml_parser(xmlstr)
% XML (eXtensible Markup Language) Processor
% FORMAT tree = xml_parser(xmlstr)
%
% xmlstr  - XML string to parse (character vector, row or column)
% tree    - tree structure corresponding to the XML file
%_______________________________________________________________________
%
% xml_parser.m is an XML 1.0 (http://www.w3.org/TR/REC-xml) parser
% written in Matlab. It aims to be fully conforming. It is currently not
% a validating XML processor.
%
% A description of the tree structure provided in output is detailed in
% the header of this m-file.
%_______________________________________________________________________
% @(#)xml_parser.m               Guillaume Flandin            2002/04/04

% XML Processor for MATLAB (The Mathworks, Inc.).
% Copyright (C) 2002-2003 Guillaume Flandin <Guillaume@artefact.tk>
%
% This program is free software; you can redistribute it and/or
% modify it under the terms of the GNU General Public License
% as published by the Free Software Foundation; either version 2
% of the License, or any later version.
%
% This program is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
% GNU General Public License for more details.
%
% You should have received a copy of the GNU General Public License
% along with this program; if not, write to the Free Software
% Foundation Inc, 59 Temple Pl. - Suite 330, Boston, MA 02111-1307, USA.
%-----------------------------------------------------------------------

% Suggestions for improvement and fixes are always welcome, although no
% guarantee is made whether and when they will be implemented.
% Send requests to <Guillaume@artefact.tk>
% Check also the latest developments on the following webpage:
%           <http://www.artefact.tk/software/matlab/xml/>
%-----------------------------------------------------------------------

% The implementation of this XML parser is much inspired from a
% Javascript parser available at <http://www.jeremie.com/>

% A mex-file xml_findstr.c is also required, to work around some
% limitations of the built-in findstr Matlab function.
% Compile it on your architecture using 'mex -O xml_findstr.c' command
% if the compiled version for your system is not provided.
% If this function behaves badly (crash or wrong results), comment the
% line '#define __HACK_MXCHAR__' in xml_findstr.c and compile it again.
%-----------------------------------------------------------------------

% Structure of the output tree:
% There are 5 types of nodes in an XML file: element, chardata, cdata,
% pi and comment.
% Each of them contains an UID (Unique Identifier): an integer between
% 1 and the number of nodes of the XML file.
%
%    element (a tag <name key="value"> [contents] </name>)
%       |_ type:       'element'
%       |_ name:       string
%       |_ attributes: cell array of struct 'key' and 'val' or []
%       |_ contents:   double array of uid's or [] if empty
%       |_ parent:     uid of the parent ([] if root)
%       |_ uid:        double
%
%    chardata (a character array)
%       |_ type:   'chardata'
%       |_ value:  string
%       |_ parent: uid of the parent
%       |_ uid:    double
%
%    cdata (a literal string <![CDATA[value]]>)
%       |_ type:   'cdata'
%       |_ value:  string
%       |_ parent: uid of the parent
%       |_ uid:    double
%
%    pi (a processing instruction <?target value ?>)
%       |_ type:   'pi'
%       |_ target: string (may be empty)
%       |_ value:  string
%       |_ parent: uid of the parent
%       |_ uid:    double
%
%    comment (a comment <!-- value -->)
%       |_ type:   'comment'
%       |_ value:  string
%       |_ parent: uid of the parent
%       |_ uid:    double
%
%-----------------------------------------------------------------------

% TODO/BUG/FEATURES:
%  - [compile] only a warning if TagStart is empty ?
%  - [attribution] should look for " and ' rather than only "
%  - [main] with normalize as a preprocessing, CDATA are modified
%  - [prolog] look for a DOCTYPE in the whole string even if it occurs
%    only in a far CDATA tag, bug even if the doctype is inside a comment
%  - [tag_element] erode should replace normalize here
%  - remove globals? uppercase globals rather persistent (clear mfile)?
%  - xml_findstr is indeed xml_strfind according to Mathworks vocabulary
%  - problem with entities: do we need to convert them here? (&eacute;)
%-----------------------------------------------------------------------

%- XML string to parse, number of tags read and tree under construction.
%  These globals are the state shared with all the subfunctions below.
global xmlstring Xparse_count xtree;

%- Check input arguments
error(nargchk(1,1,nargin));
if isempty(xmlstr)
    error('[XML] Not enough parameters.')
elseif ~ischar(xmlstr) | sum(size(xmlstr)>1)>1
    error('[XML] Input must be a string.')
end

%- The check above accepts a column vector, but the subfunctions slice
%  the string and compare the slices with row literals (and normalize
%  concatenates indices horizontally): always work on a row vector.
xmlstr = xmlstr(:).';

%- Initialize number of tags (<=> uid)
Xparse_count = 0;

%- Remove prolog and white space characters from the XML string
xmlstring = normalize(prolog(xmlstr));

%- Initialize the XML tree
xtree = {};
tree = fragment;
tree.str = 1;       % parsing starts at the first character
tree.parent = 0;    % 0 means 'no parent' (root level)

%- Parse the XML string (nodes are accumulated in the global xtree)
tree = compile(tree);

%- Return the XML tree
tree = xtree;

%- Remove global variables from the workspace
clear global xmlstring Xparse_count xtree;
---|
| 138 | |
---|
| 139 | %======================================================================= |
---|
| 140 | % SUBFUNCTIONS |
---|
| 141 | |
---|
| 142 | %----------------------------------------------------------------------- |
---|
function frag = compile(frag)
% Parse the global XML string from index frag.str onwards, storing every
% node found in the global tree, until the end tag matching frag.end has
% been consumed or the string is exhausted.
%
% frag - fragment structure with fields
%          str:    index in xmlstring where parsing (re)starts
%          parent: uid of the enclosing element (0 at root level)
%          end:    name of the enclosing element ('' at root level)
%        On return, frag.str is the index where the caller must resume.
%
% Raises an error if character data is found after the last tag.
    global xmlstring xtree Xparse_count;

    while 1,
        %- Stop once the last character has been reached. (The former
        %  extra test comparing a two-character slice with ' ' could
        %  never be true and has been dropped.)
        if length(xmlstring) <= frag.str
            return
        end
        TagStart = xml_findstr(xmlstring,'<',frag.str,1);
        if isempty(TagStart)
            %- Character data with no tag after it: not supported
            error(sprintf(['[XML] Unknown data at the end of the XML file.\n' ...
                           ' Please send me your XML file at Guillaume@artefact.tk']));
        elseif TagStart > frag.str
            if strcmp(xmlstring(frag.str:TagStart-1),' ')
                %- A single white space before a tag (ignore)
                frag.str = TagStart;
            else
                %- Character data
                %  (chardata increments Xparse_count, which is then used
                %  as the index of the new node)
                xtree{Xparse_count} = chardata;
                xtree{Xparse_count}.value = erode(entity(xmlstring(frag.str:TagStart-1)));
                xtree{Xparse_count}.parent = frag.parent;
                xtree{frag.parent}.contents = [xtree{frag.parent}.contents Xparse_count];
                frag.str = TagStart;
            end
        else
            if strcmp(xmlstring(frag.str+1),'?')
                %- Processing instruction
                frag = tag_pi(frag);
            else
                if length(xmlstring)-frag.str>4 & strcmp(xmlstring(frag.str+1:frag.str+3),'!--')
                    %- Comment
                    frag = tag_comment(frag);
                else
                    if length(xmlstring)-frag.str>9 & strcmp(xmlstring(frag.str+1:frag.str+8),'![CDATA[')
                        %- Literal data
                        frag = tag_cdata(frag);
                    else
                        %- A tag element (empty (<.../>) or not)
                        if ~isempty(frag.end)
                            endmk = ['/' frag.end '>'];
                        else
                            endmk = '/>';
                        end
                        %- End tag of the enclosing element? Compare the
                        %  whole tag text up to its '>' with white space
                        %  removed, so that '</name >' is recognised too.
                        %  Slicing up to the real '>' also avoids indexing
                        %  past the end of a truncated document.
                        TagEnd = xml_findstr(xmlstring,'>',frag.str,1);
                        if ~isempty(TagEnd) & ...
                           strcmp(strip(xmlstring(frag.str+1:TagEnd)),endmk)
                            frag.str = TagEnd + 1;
                            return
                        else
                            %- tag_element reports a missing '>' itself
                            frag = tag_element(frag);
                        end
                    end
                end
            end
        end
    end
---|
| 204 | |
---|
| 205 | %----------------------------------------------------------------------- |
---|
function frag = tag_element(frag)
% Parse the element whose '<' is at index frag.str of the global XML
% string, store it (and, recursively, its contents) in the global tree
% and return frag with frag.str moved past the element.
%
% Raises an error if the start tag has no closing '>'.
    global xmlstring xtree Xparse_count;

    tagEnd = xml_findstr(xmlstring,'>',frag.str,1);
    if isempty(tagEnd)
        error('[XML] Tag < opened but not closed.');
    end

    %- Empty element (<.../>): the tag text stops before the '/'
    isEmptyTag = strcmp(xmlstring(tagEnd-1:tagEnd),'/>');
    if isEmptyTag
        tagEnd = tagEnd - 1;
    end

    %- Split the tag text into a name and an attribute string
    starttag = normalize(xmlstring(frag.str+1:tagEnd-1));
    firstBlank = xml_findstr(starttag,' ',1,1);
    if isempty(firstBlank)
        name    = starttag;
        attribs = '';
    else
        name    = starttag(1:firstBlank-1);
        attribs = starttag(firstBlank+1:end);
    end

    %- Create the node (element allocates the next uid)
    node = element;
    uid  = Xparse_count;
    node.name = strip(name);
    if frag.parent
        node.parent = frag.parent;
        xtree{frag.parent}.contents = [xtree{frag.parent}.contents uid];
    end
    if ~isempty(attribs)
        node.attributes = attribution(attribs);
    end
    %- The node must be in the tree before its children are parsed,
    %  since they append their uid to its contents
    xtree{uid} = node;

    if isEmptyTag
        %- Skip the two characters '/>'
        frag.str = tagEnd + 2;
    else
        %- Parse the contents up to the matching end tag
        inner = fragment;
        inner.str    = tagEnd + 1;
        inner.end    = name;
        inner.parent = uid;
        inner = compile(inner);
        frag.str = inner.str;
    end
---|
| 245 | |
---|
| 246 | %----------------------------------------------------------------------- |
---|
function frag = tag_pi(frag)
% Parse the processing instruction (<?target value ?>) whose '<' is at
% index frag.str of the global XML string, store it in the global tree
% and return frag with frag.str moved past the closing '?>'.
%
% If the instruction is never closed, a warning is issued and frag.str is
% moved past the end of the string so that the caller stops parsing.
    global xmlstring xtree Xparse_count;

    piEnd = xml_findstr(xmlstring,'?>',frag.str,1);
    if isempty(piEnd)
        warning('[XML] Tag <? opened but not closed.')
        %- Nothing sensible can follow: skip the remainder. Leaving
        %  frag.str unchanged made compile call this function forever.
        frag.str = length(xmlstring) + 1;
    else
        nextspace = xml_findstr(xmlstring,' ',frag.str,1);
        xtree{Xparse_count} = pri;   % pri allocates the next uid
        %- No target when there is no white space inside the
        %  instruction, or when it directly follows '<?'
        noTarget = isempty(nextspace);
        if ~noTarget
            noTarget = nextspace > piEnd | nextspace == frag.str+2;
        end
        if noTarget
            xtree{Xparse_count}.value = erode(xmlstring(frag.str+2:piEnd-1));
        else
            xtree{Xparse_count}.value = erode(xmlstring(nextspace+1:piEnd-1));
            xtree{Xparse_count}.target = erode(xmlstring(frag.str+2:nextspace));
        end
        if frag.parent
            xtree{frag.parent}.contents = [xtree{frag.parent}.contents Xparse_count];
            xtree{Xparse_count}.parent = frag.parent;
        end
        frag.str = piEnd + 2;
    end
---|
| 267 | |
---|
| 268 | %----------------------------------------------------------------------- |
---|
function frag = tag_comment(frag)
% Parse the comment (<!-- value -->) whose '<' is at index frag.str of
% the global XML string, store it in the global tree and return frag
% with frag.str moved past the closing '-->'.
%
% If the comment is never closed, a warning is issued and frag.str is
% moved past the end of the string so that the caller stops parsing.
    global xmlstring xtree Xparse_count;

    commentEnd = xml_findstr(xmlstring,'-->',frag.str,1);
    if isempty(commentEnd)
        warning('[XML] Tag <!-- opened but not closed.')
        %- Skip the remainder: leaving frag.str unchanged made compile
        %  call this function forever
        frag.str = length(xmlstring) + 1;
    else
        xtree{Xparse_count} = comment;   % comment allocates the next uid
        %- The text starts after the 4 characters of '<!--'
        xtree{Xparse_count}.value = erode(xmlstring(frag.str+4:commentEnd-1));
        if frag.parent
            xtree{frag.parent}.contents = [xtree{frag.parent}.contents Xparse_count];
            xtree{Xparse_count}.parent = frag.parent;
        end
        frag.str = commentEnd + 3;
    end
---|
| 283 | |
---|
| 284 | %----------------------------------------------------------------------- |
---|
function frag = tag_cdata(frag)
% Parse the CDATA section (<![CDATA[value]]>) whose '<' is at index
% frag.str of the global XML string, store it in the global tree and
% return frag with frag.str moved past the closing ']]>'.
%
% If the section is never closed, a warning is issued and frag.str is
% moved past the end of the string so that the caller stops parsing.
    global xmlstring xtree Xparse_count;

    cdataEnd = xml_findstr(xmlstring,']]>',frag.str,1);
    if isempty(cdataEnd)
        warning('[XML] Tag <![CDATA[ opened but not closed.')
        %- Skip the remainder: leaving frag.str unchanged made compile
        %  call this function forever
        frag.str = length(xmlstring) + 1;
    else
        xtree{Xparse_count} = cdata;   % cdata allocates the next uid
        %- The text starts after the 9 characters of '<![CDATA[' and is
        %  stored as is (no entity decoding, no trimming)
        xtree{Xparse_count}.value = xmlstring(frag.str+9:cdataEnd-1);
        if frag.parent
            xtree{frag.parent}.contents = [xtree{frag.parent}.contents Xparse_count];
            xtree{Xparse_count}.parent = frag.parent;
        end
        frag.str = cdataEnd + 3;
    end
---|
| 299 | |
---|
| 300 | %----------------------------------------------------------------------- |
---|
function all = attribution(str)
% Split the attribute string of a start tag into its key/value pairs.
%
% str - attribute string, e.g. 'a="1" b=''2'''
% all - cell array of structures with fields
%         key: attribute name, white space removed
%         val: attribute value, predefined entities decoded
%       (empty cell array if no 'key=' is found)
%
% Values may be delimited by double or single quotes. An attribute whose
% value is not quoted, or whose closing quote is missing, is stored with
% an empty value and ends the scan.

    %- Initialize attributes
    nbattr = 0;
    all = cell(nbattr);
    %- Look for 'key="value"' and 'key=''value''' substrings
    while 1,
        if isempty(str), return; end
        eq = xml_findstr(str,'=',1,1);
        if isempty(eq), return; end
        %- Opening delimiter: the first quote of either kind after '='
        dq = xml_findstr(str,'"',eq,1);
        sq = xml_findstr(str,'''',eq,1);
        if isempty(sq)
            id = dq;
        elseif isempty(dq)
            id = sq;
        else
            id = min(dq,sq);
        end
        %- Closing delimiter: next occurrence of the same quote character
        if isempty(id)
            nextid = [];
        else
            nextid = xml_findstr(str,str(id),id+1,1);
        end
        nbattr = nbattr + 1;
        all{nbattr}.key = strip(str(1:(eq-1)));
        if isempty(nextid)
            all{nbattr}.val = '';
            return
        end
        all{nbattr}.val = entity(str((id+1):(nextid-1)));
        %- Carry on with what follows the closing quote
        str = str((nextid+1):end);
    end
---|
| 316 | |
---|
| 317 | %----------------------------------------------------------------------- |
---|
function elm = element
% Allocate the next uid (global counter Xparse_count) and return a blank
% node of type 'element' carrying it.
    global Xparse_count;
    uid = Xparse_count + 1;
    Xparse_count = uid;
    elm.type       = 'element';
    elm.name       = '';
    elm.attributes = [];
    elm.contents   = [];
    elm.parent     = [];
    elm.uid        = uid;
---|
| 322 | |
---|
| 323 | %----------------------------------------------------------------------- |
---|
function cdat = chardata
% Allocate the next uid (global counter Xparse_count) and return a blank
% node of type 'chardata' carrying it.
    global Xparse_count;
    uid = Xparse_count + 1;
    Xparse_count = uid;
    cdat.type   = 'chardata';
    cdat.value  = '';
    cdat.parent = [];
    cdat.uid    = uid;
---|
| 328 | |
---|
| 329 | %----------------------------------------------------------------------- |
---|
function cdat = cdata
% Allocate the next uid (global counter Xparse_count) and return a blank
% node of type 'cdata' carrying it.
    global Xparse_count;
    uid = Xparse_count + 1;
    Xparse_count = uid;
    cdat.type   = 'cdata';
    cdat.value  = '';
    cdat.parent = [];
    cdat.uid    = uid;
---|
| 334 | |
---|
| 335 | %----------------------------------------------------------------------- |
---|
function proce = pri
% Allocate the next uid (global counter Xparse_count) and return a blank
% node of type 'pi' (processing instruction) carrying it.
    global Xparse_count;
    uid = Xparse_count + 1;
    Xparse_count = uid;
    proce.type   = 'pi';
    proce.value  = '';
    proce.target = '';
    proce.parent = [];
    proce.uid    = uid;
---|
| 340 | |
---|
| 341 | %----------------------------------------------------------------------- |
---|
function commt = comment
% Allocate the next uid (global counter Xparse_count) and return a blank
% node of type 'comment' carrying it.
    global Xparse_count;
    uid = Xparse_count + 1;
    Xparse_count = uid;
    commt.type   = 'comment';
    commt.value  = '';
    commt.parent = [];
    commt.uid    = uid;
---|
| 346 | |
---|
| 347 | %----------------------------------------------------------------------- |
---|
function frg = fragment
% Return a blank parsing fragment: a scalar structure whose fields
% 'str', 'parent' and 'end' are all empty strings.
    fields = {'str'; 'parent'; 'end'};
    blanks = {''; ''; ''};
    frg = cell2struct(blanks, fields, 1);
---|
| 350 | |
---|
| 351 | %----------------------------------------------------------------------- |
---|
function str = prolog(str)
% Remove the prolog (XML declaration and DOCTYPE declaration) from the
% beginning of an XML string.
%
% str - XML string; on return, the same string starting right after the
%       prolog (unchanged if no prolog is found)
%
% Raises an error if the string contains no '<' at all. Incomplete
% declarations only produce a warning.
%
% Known limitation: '<!DOCTYPE' is searched for in the whole remaining
% string, not only ahead of the first element.

    %- Initialize beginning index of elements tree
    b = 1;
    %- Initial tag
    start = xml_findstr(str,'<',1,1);
    if isempty(start)
        error('[XML] No tag found.')
    end
    %- Header (<?xml version="1.0" ... ?>)
    %  The slice is clamped to the string length so that an input with
    %  fewer than two characters after the first '<' is not indexed out
    %  of range (a shorter slice simply does not match).
    lead = str(start:min(start+2,length(str)));
    if strcmp(lower(lead),'<?x')
        headerEnd = xml_findstr(str,'?>',1,1);
        if ~isempty(headerEnd)
            b = headerEnd + 2;
        else
            warning('[XML] Header tag incomplete.')
        end
    end
    %- Doctype (<!DOCTYPE type ... [ declarations ]>)
    start = xml_findstr(str,'<!DOCTYPE',b,1); % length('<!DOCTYPE') = 9
    if ~isempty(start)
        doctypeEnd = xml_findstr(str,'>',start+9,1);
        if ~isempty(doctypeEnd)
            b = doctypeEnd + 1;
            %- A '[' before that '>' opens an internal subset: the
            %  declaration then really ends at the first ']>'
            dp = xml_findstr(str,'[',start+9,1);
            if (~isempty(dp) & dp < b)
                k = xml_findstr(str,']>',start+9,1);
                if ~isempty(k)
                    b = k + 2;
                else
                    warning('[XML] Tag [ in DOCTYPE opened but not closed.')
                end
            end
        else
            warning('[XML] Tag DOCTYPE opened but not closed.')
        end
    end
    %- Skip prolog from the xml string
    str = str(b:end);
---|
| 390 | |
---|
| 391 | %----------------------------------------------------------------------- |
---|
function str = strip(str)
% Remove every white space character (as defined by isspace) from str,
% wherever it occurs in the string.
    str(isspace(str)) = '';
---|
| 396 | |
---|
| 397 | %----------------------------------------------------------------------- |
---|
function str = normalize(str)
% Turn every white space character (space, newline, carriage return,
% tab, ...) of str into a plain space and collapse each run of
% consecutive white space characters into a single space.
    blanks = find(isspace(str));
    str(blanks) = ' ';
    if ~isempty(blanks)
        %- A blank is redundant when the next blank immediately follows
        %  it (the last one is paired with itself and is always kept)
        successor = [blanks(2:end) blanks(end)];
        redundant = (successor - blanks) == 1;
        str(blanks(redundant)) = [];
    end
---|
| 409 | |
---|
| 410 | %----------------------------------------------------------------------- |
---|
function str = entity(str)
% Replace the five predefined XML entity references found in str by the
% character they stand for.
%
% '&amp;' is decoded last, so that the '&' it produces cannot be taken
% for the start of another reference ('&amp;lt;' gives '&lt;', not '<').
    str = strrep(str,'&lt;','<');
    str = strrep(str,'&gt;','>');
    str = strrep(str,'&quot;','"');
    str = strrep(str,'&apos;','''');
    str = strrep(str,'&amp;','&');
---|
| 417 | |
---|
| 418 | %----------------------------------------------------------------------- |
---|
function str = erode(str)
% Remove at most one leading and at most one trailing space from str.
    %- Leading space
    if ~isempty(str)
        if str(1) == ' '
            str(1) = '';
        end
    end
    %- Trailing space (str may have become empty above)
    if ~isempty(str)
        if str(end) == ' '
            str(end) = '';
        end
    end
---|