function tree = xml_parser(filename)
% XML (eXtensible Markup Language) Processor
% FORMAT tree = xml_parser(filename)
%
% filename - XML file to parse (character row vector)
% tree     - tree structure corresponding to the XML file
%            (cell array of node structures, indexed by uid)
%_______________________________________________________________________
%
% xml_parser.m is an XML 1.0 (http://www.w3.org/TR/REC-xml) parser
% written in Matlab. It aims to be fully conforming. It is currently not
% a validating XML processor.
% (based on a Javascript parser available at http://www.jeremie.com)
%
% A description of the tree structure provided in output is detailed in
% the header of this m-file.
%_______________________________________________________________________
% @(#)xml_parser.m Guillaume Flandin 2002/04/04

% XML Processor for MATLAB (The Mathworks, Inc.).
% Copyright (C) 2002 Guillaume Flandin
%
% This program is free software; you can redistribute it and/or
% modify it under the terms of the GNU General Public License
% as published by the Free Software Foundation; either version 2
% of the License, or any later version.
%
% This program is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
% GNU General Public License for more details.
%
% You should have received a copy of the GNU General Public License
% along with this program; if not, write to the Free Software
% Foundation Inc, 59 Temple Pl. - Suite 330, Boston, MA 02111-1307, USA.
%-----------------------------------------------------------------------

% Please feel free to email the author any comment/suggestion/bug report
% to improve this XML processor in Matlab.
% Email: Guillaume.Flandin@sophia.inria.fr
% Check also the latest developments on the following webpage:
% http://www-sop.inria.fr/epidaure/personnel/flandin/xml/
%-----------------------------------------------------------------------

% A mex-file xml_findstr.c is also required, to overcome some
% limitations of the built-in findstr Matlab function.
% Compile it on your architecture using 'mex -O xml_findstr.c' command
% if the compiled version for your system is not provided.
% If this function behaves badly (crash or wrong results), comment the
% line '#define __HACK_MXCHAR__' in xml_findstr.c and compile it again.
%-----------------------------------------------------------------------

% Structure of the output tree:
% There are 5 types of nodes in an XML file: element, chardata, cdata,
% pi and comment.
% Each of them contains an UID (Unique Identifier): an integer between
% 1 and the number of nodes of the XML file.
%
% element (a tag <name key="value"> [contents] </name>)
% |_ type: 'element'
% |_ name: string
% |_ attributes: cell array of struct with fields 'key' and 'val', or []
% |_ contents: double array of uid's or [] if empty
% |_ parent: uid of the parent ([] if root)
% |_ uid: double
%
% chardata (a character array)
% |_ type: 'chardata'
% |_ value: string
% |_ parent: uid of the parent
% |_ uid: double
%
% cdata (a literal string <![CDATA[value]]>)
% |_ type: 'cdata'
% |_ value: string
% |_ parent: uid of the parent
% |_ uid: double
%
% pi (a processing instruction <?target value ?>)
% |_ type: 'pi'
% |_ target: string (may be empty)
% |_ value: string
% |_ parent: uid of the parent
% |_ uid: double
%
% comment (a comment <!-- value -->)
% |_ type: 'comment'
% |_ value: string
% |_ parent: uid of the parent
% |_ uid: double
%
%-----------------------------------------------------------------------

% TODO/BUG/FEATURES:
% - [compile] only a warning if TagStart is empty
% - [attribution] should look for " and ' rather than only "
% - [main] with normalize as a preprocessing, CDATA are modified
% - [prolog] look for a DOCTYPE in the whole string even if it occurs
% only in a far CDATA tag (for example)...
% - [tag_element] erode should replace normalize here
% - remove globals? uppercase globals rather persistent (clear mfile)?
% - xml_findstr is in fact xml_strfind according to Mathworks vocabulary
% - problem with entity (don't know if the bug is here or in save fct.)
%-----------------------------------------------------------------------

%- Shared parser state: XML string being parsed, number of nodes created
%  so far (also the uid of the latest node) and the tree under construction
global xmlstring Xparse_count xtree;

%- Check input arguments
error(nargchk(1,1,nargin));
if isempty(filename)
    error('Not enough parameters.')
elseif ~ischar(filename) | sum(size(filename)>1)>1
    %- ischar replaces the obsolete isstr; the size test rejects
    %  character matrices (more than one non-singleton dimension)
    error('Input must be a string filename.')
end

%- Read the entire XML file as one character array
fid = fopen(filename,'rt');
if (fid==-1)
    error(sprintf('Cannot open %s for reading.',filename))
end
xmlstring = fscanf(fid,'%c');
fclose(fid);

%- Initialize number of tags (<=> uid)
Xparse_count = 0;

%- Remove prolog and white space characters from the XML string
xmlstring = normalize(prolog(xmlstring));

%- Initialize the XML tree and the top-level fragment:
%  parsing starts at the first character and has no parent node (0)
xtree = {};
root = fragment;
root.str = 1;
root.parent = 0;

%- Parse the XML string (nodes are accumulated in the global xtree)
root = compile(root);

%- Return the XML tree
tree = xtree;

%- Remove global variables from the workspace
clear global xmlstring Xparse_count xtree;
| 144 | |
---|
| 145 | %======================================================================= |
---|
| 146 | % SUBFUNCTIONS |
---|
| 147 | |
---|
| 148 | %----------------------------------------------------------------------- |
---|
function frag = compile(frag)
% Parse the content of one fragment of the global XML string.
% FORMAT frag = compile(frag)
%
% frag - structure with fields:
%          str    : index in xmlstring where parsing (re)starts
%          parent : uid of the enclosing element (0 at top level)
%          end    : name of the enclosing element ('' at top level)
%
% Nodes found are appended to the global cell array xtree. The function
% returns when the end of the string is reached or when the closing tag
% of the enclosing element has been consumed; frag.str then points just
% after what has been parsed.
global xmlstring xtree Xparse_count;

while 1,
    %- Stop when nothing is left to parse.
    %  NOTE(review): the second test compares a two-character slice with
    %  a one-character string, so it appears never to be true; it is kept
    %  unchanged.
    if length(xmlstring)<=frag.str | ...
       (frag.str == length(xmlstring)-1 & strcmp(xmlstring(frag.str:end),' '))
        return
    end
    TagStart = xml_findstr(xmlstring,'<',frag.str,1);
    if isempty(TagStart)
        %- Character data (should be an error)
        warning('[XML] Unknown data at the end of the XML file.');
        fprintf('Please send me your XML file at gflandin@sophia.inria.fr\n');
        %- chardata increments Xparse_count, which is then the node uid
        node = chardata;
        node.value = erode(entity(xmlstring(frag.str:end)));
        if frag.parent
            node.parent = frag.parent;
            xtree{frag.parent}.contents = [xtree{frag.parent}.contents Xparse_count];
        end
        xtree{Xparse_count} = node;
        %- Everything has been consumed: move past the end and stop,
        %  otherwise this branch would be entered again forever
        frag.str = length(xmlstring) + 1;
        return
    elseif TagStart > frag.str
        if strcmp(xmlstring(frag.str:TagStart-1),' ')
            %- A single white space before a tag (ignore)
            frag.str = TagStart;
        else
            %- Character data
            node = chardata;
            node.value = erode(entity(xmlstring(frag.str:TagStart-1)));
            if frag.parent
                %- Only attach to a parent when there is one (uid 0 is
                %  not a valid index into xtree)
                node.parent = frag.parent;
                xtree{frag.parent}.contents = [xtree{frag.parent}.contents Xparse_count];
            end
            xtree{Xparse_count} = node;
            frag.str = TagStart;
        end
    else
        if strcmp(xmlstring(frag.str+1),'?')
            %- Processing instruction
            frag = tag_pi(frag);
        else
            if length(xmlstring)-frag.str>4 & strcmp(xmlstring(frag.str+1:frag.str+3),'!--')
                %- Comment
                frag = tag_comment(frag);
            else
                if length(xmlstring)-frag.str>9 & strcmp(xmlstring(frag.str+1:frag.str+8),'![CDATA[')
                    %- Literal data
                    frag = tag_cdata(frag);
                else
                    %- A tag element (empty (<.../>) or not)
                    if ~isempty(frag.end)
                        endmk = ['/' frag.end '>'];
                    else
                        endmk = '/>';
                    end
                    %- Text that would hold the closing tag of the
                    %  enclosing element; clamped so that a truncated
                    %  document cannot index past the end of the string
                    last = min(frag.str+length(frag.end)+2, length(xmlstring));
                    candidate = xmlstring(frag.str+1:last);
                    if strcmp(candidate,endmk) | strcmp(strip(candidate),endmk)
                        %- Closing tag found: skip it and go back to caller
                        frag.str = frag.str + length(frag.end)+3;
                        return
                    else
                        frag = tag_element(frag);
                    end
                end
            end
        end
    end
end
| 211 | |
---|
| 212 | %----------------------------------------------------------------------- |
---|
function frag = tag_element(frag)
% Parse the element whose '<' is located at frag.str.
% FORMAT frag = tag_element(frag)
%
% Creates an 'element' node in the global xtree, attaches it to its
% parent (if any), parses its attributes and, unless the tag is empty
% (<.../>), recursively parses its content with compile.
% On return frag.str points after the element.
global xmlstring xtree Xparse_count;

tagend = xml_findstr(xmlstring,'>',frag.str,1);
if isempty(tagend)
    error('[XML] Tag < opened but not closed.');
end

%- Empty element (<name ... />): exclude the '/' from the start tag
selfclosing = strcmp(xmlstring(tagend-1:tagend),'/>');
if selfclosing
    tagend = tagend - 1;
end

%- Split the start tag into a name and an attribute string
starttag = normalize(xmlstring(frag.str+1:tagend-1));
firstspace = xml_findstr(starttag,' ',1,1);
if isempty(firstspace)
    name = starttag;
    attribs = '';
else
    name = starttag(1:firstspace-1);
    attribs = starttag(firstspace+1:end);
end

%- Build the node; element increments Xparse_count, which is its uid
node = element;
uid = Xparse_count;
node.name = strip(name);
if frag.parent
    node.parent = frag.parent;
    xtree{frag.parent}.contents = [xtree{frag.parent}.contents uid];
end
if length(attribs) > 0
    node.attributes = attribution(attribs);
end
%- The node must be stored before parsing its content because children
%  append their uid to xtree{uid}.contents
xtree{uid} = node;

if selfclosing
    %- Skip '/>'
    frag.str = tagend + 2;
else
    %- Parse what lies between the start tag and the matching end tag
    contents = fragment;
    contents.str = tagend + 1;
    contents.end = name;
    contents.parent = uid;
    contents = compile(contents);
    frag.str = contents.str;
end
| 252 | |
---|
| 253 | %----------------------------------------------------------------------- |
---|
function frag = tag_pi(frag)
% Parse the processing instruction (<?target value ?>) starting at frag.str.
% FORMAT frag = tag_pi(frag)
%
% Creates a 'pi' node in the global xtree and attaches it to its parent
% (if any). On return frag.str points after the closing '?>'.
% If the instruction is never closed, a warning is issued and frag.str is
% moved past the end of the string so that the caller stops parsing.
global xmlstring xtree Xparse_count;

tagend = xml_findstr(xmlstring,'?>',frag.str,1);
if isempty(tagend)
    warning('[XML] Tag <? opened but not closed.')
    %- Nothing more can be parsed; leaving frag.str unchanged would make
    %  the calling loop come back here forever
    frag.str = length(xmlstring) + 1;
    return
end

nextspace = xml_findstr(xmlstring,' ',frag.str,1);

%- There is no target when no space separates one from the value: no
%  space at all, first space after the closing '?>', or a space right
%  after '<?'
notarget = isempty(nextspace);
if ~notarget
    notarget = nextspace > tagend | nextspace == frag.str+2;
end

%- pri increments Xparse_count, which is then the uid of the node
node = pri;
if notarget
    node.value = erode(xmlstring(frag.str+2:tagend-1));
else
    node.value = erode(xmlstring(nextspace+1:tagend-1));
    node.target = erode(xmlstring(frag.str+2:nextspace));
end
if frag.parent
    xtree{frag.parent}.contents = [xtree{frag.parent}.contents Xparse_count];
    node.parent = frag.parent;
end
xtree{Xparse_count} = node;

%- Skip '?>'
frag.str = tagend + 2;
| 274 | |
---|
| 275 | %----------------------------------------------------------------------- |
---|
function frag = tag_comment(frag)
% Parse the comment (<!-- value -->) starting at frag.str.
% FORMAT frag = tag_comment(frag)
%
% Creates a 'comment' node in the global xtree and attaches it to its
% parent (if any). On return frag.str points after the closing '-->'.
% If the comment is never closed, a warning is issued and frag.str is
% moved past the end of the string so that the caller stops parsing.
global xmlstring xtree Xparse_count;

tagend = xml_findstr(xmlstring,'-->',frag.str,1);
if isempty(tagend)
    warning('[XML] Tag <!-- opened but not closed.')
    %- Nothing more can be parsed; leaving frag.str unchanged would make
    %  the calling loop come back here forever
    frag.str = length(xmlstring) + 1;
    return
end

%- comment increments Xparse_count, which is then the uid of the node
node = comment;
%- Text between '<!--' (4 characters) and '-->'
node.value = erode(xmlstring(frag.str+4:tagend-1));
if frag.parent
    xtree{frag.parent}.contents = [xtree{frag.parent}.contents Xparse_count];
    node.parent = frag.parent;
end
xtree{Xparse_count} = node;

%- Skip '-->'
frag.str = tagend + 3;
| 290 | |
---|
| 291 | %----------------------------------------------------------------------- |
---|
function frag = tag_cdata(frag)
% Parse the CDATA section (<![CDATA[value]]>) starting at frag.str.
% FORMAT frag = tag_cdata(frag)
%
% Creates a 'cdata' node in the global xtree and attaches it to its
% parent (if any). The value is stored verbatim (no entity replacement,
% no trimming). On return frag.str points after the closing ']]>'.
% If the section is never closed, a warning is issued and frag.str is
% moved past the end of the string so that the caller stops parsing.
global xmlstring xtree Xparse_count;

tagend = xml_findstr(xmlstring,']]>',frag.str,1);
if isempty(tagend)
    warning('[XML] Tag <![CDATA[ opened but not closed.')
    %- Nothing more can be parsed; leaving frag.str unchanged would make
    %  the calling loop come back here forever
    frag.str = length(xmlstring) + 1;
    return
end

%- cdata increments Xparse_count, which is then the uid of the node
node = cdata;
%- Text between '<![CDATA[' (9 characters) and ']]>'
node.value = xmlstring(frag.str+9:tagend-1);
if frag.parent
    xtree{frag.parent}.contents = [xtree{frag.parent}.contents Xparse_count];
    node.parent = frag.parent;
end
xtree{Xparse_count} = node;

%- Skip ']]>'
frag.str = tagend + 3;
| 306 | |
---|
| 307 | %----------------------------------------------------------------------- |
---|
function all = attribution(str)
% Split the attribute part of a start tag into key/value pairs.
% FORMAT all = attribution(str)
%
% str - attribute string, e.g. 'id="1" name=''x'''
% all - cell array of structures with fields
%         key : attribute name, white space removed
%         val : attribute value, predefined entities replaced
%       (empty cell array if no attribute is found)
%
% Values may be delimited by double or single quotes; the closing
% delimiter is the next occurrence of the opening one, so the other
% quote character may appear inside the value.
% A malformed attribute (no quote, or no closing quote) is stored with an
% empty value and ends the scan.

%- Initialize attributes
nbattr = 0;
all = cell(nbattr);

%- Look for 'key="value"' or 'key=''value''' substrings
while ~isempty(str)
    eqpos = find(str == '=');
    if isempty(eqpos), return; end
    eqpos = eqpos(1);

    nbattr = nbattr + 1;
    all{nbattr}.key = strip(str(1:(eqpos-1)));

    %- Opening delimiter: first quote character found after '='
    quotes = find(str == '"' | str == '''');
    quotes = quotes(quotes > eqpos);

    val = '';
    rest = '';
    if ~isempty(quotes)
        opening = quotes(1);
        %- Closing delimiter: next occurrence of the same character
        closing = find(str == str(opening));
        closing = closing(closing > opening);
        if ~isempty(closing)
            val = str((opening+1):(closing(1)-1));
            rest = str((closing(1)+1):end);
        end
    end
    all{nbattr}.val = entity(val);

    %- Continue with what follows the value just read
    str = rest;
end
| 323 | |
---|
| 324 | %----------------------------------------------------------------------- |
---|
function elm = element
% Create a blank 'element' node.
% Increments the global node counter Xparse_count and uses the new value
% as the uid of the node.
global Xparse_count;
Xparse_count = Xparse_count + 1;
elm.type       = 'element';
elm.name       = '';
elm.attributes = [];
elm.contents   = [];
elm.parent     = [];
elm.uid        = Xparse_count;
| 329 | |
---|
| 330 | %----------------------------------------------------------------------- |
---|
function cdat = chardata
% Create a blank 'chardata' node.
% Increments the global node counter Xparse_count and uses the new value
% as the uid of the node.
global Xparse_count;
Xparse_count = Xparse_count + 1;
cdat.type   = 'chardata';
cdat.value  = '';
cdat.parent = [];
cdat.uid    = Xparse_count;
| 335 | |
---|
| 336 | %----------------------------------------------------------------------- |
---|
function cdat = cdata
% Create a blank 'cdata' node.
% Increments the global node counter Xparse_count and uses the new value
% as the uid of the node.
global Xparse_count;
Xparse_count = Xparse_count + 1;
cdat.type   = 'cdata';
cdat.value  = '';
cdat.parent = [];
cdat.uid    = Xparse_count;
| 341 | |
---|
| 342 | %----------------------------------------------------------------------- |
---|
function proce = pri
% Create a blank 'pi' (processing instruction) node.
% Increments the global node counter Xparse_count and uses the new value
% as the uid of the node.
global Xparse_count;
Xparse_count = Xparse_count + 1;
proce.type   = 'pi';
proce.value  = '';
proce.target = '';
proce.parent = [];
proce.uid    = Xparse_count;
| 347 | |
---|
| 348 | %----------------------------------------------------------------------- |
---|
function commt = comment
% Create a blank 'comment' node.
% Increments the global node counter Xparse_count and uses the new value
% as the uid of the node.
global Xparse_count;
Xparse_count = Xparse_count + 1;
commt.type   = 'comment';
commt.value  = '';
commt.parent = [];
commt.uid    = Xparse_count;
| 353 | |
---|
| 354 | %----------------------------------------------------------------------- |
---|
function frg = fragment
% Create a blank parsing fragment: a structure with the (empty) fields
% 'str', 'parent' and 'end', in that order.
frg.str    = '';
frg.parent = '';
frg.end    = '';
| 357 | |
---|
| 358 | %----------------------------------------------------------------------- |
---|
function str = prolog(str)
% Remove the prolog (XML declaration and DOCTYPE) from an XML string.
% FORMAT str = prolog(str)
%
% str - on input, the whole XML document; on output, the document from
%       the first character following the prolog.
%
% A DOCTYPE declaration is only honoured when it is located in the
% prolog, i.e. when nothing but white space, comments and processing
% instructions separates it from the XML declaration. The text
% '<!DOCTYPE' occurring later in the document (for example inside a CDATA
% section) is left untouched.
% An error is raised if the string contains no '<' at all.

%- Initialize beginning index of elements tree
b = 1;
%- Initial tag
start = xml_findstr(str,'<',1,1);
if isempty(start)
    error('[XML] No tag found.')
end
%- Header (<?xml version="1.0" ... ?>)
%  (the slice is clamped so that a very short string cannot be indexed
%  past its end)
if strcmp(lower(str(start:min(start+2,length(str)))),'<?x')
    close = xml_findstr(str,'?>',1,1);
    if ~isempty(close)
        b = close + 2;
    else
        warning('[XML] Header tag incomplete.')
    end
end
%- Doctype (<!DOCTYPE type ... [ declarations ]>)
start = prolog_doctype(str,b); % length('<!DOCTYPE') = 9
if ~isempty(start)
    close = xml_findstr(str,'>',start+9,1);
    if ~isempty(close)
        b = close + 1;
        %- Internal subset: the declaration ends at ']>' instead of the
        %  first '>'
        dp = xml_findstr(str,'[',start+9,1);
        if (~isempty(dp) & dp < b)
            k = xml_findstr(str,']>',start+9,1);
            if ~isempty(k)
                b = k + 2;
            else
                warning('[XML] Tag [ in DOCTYPE opened but not closed.')
            end
        end
    else
        warning('[XML] Tag DOCTYPE opened but not closed.')
    end
end
%- Skip prolog from the xml string
str = str(b:end);

%-----------------------------------------------------------------------
function start = prolog_doctype(str,b)
% Return the index of '<!DOCTYPE' if it is met when scanning str from
% index b while skipping only white space, comments and processing
% instructions; return [] otherwise.
start = [];
n = length(str);
p = b;
while p <= n
    if isspace(str(p))
        p = p + 1;
    elseif p+8 <= n & strcmp(str(p:p+8),'<!DOCTYPE')
        start = p;
        return
    elseif p+3 <= n & strcmp(str(p:p+3),'<!--')
        %- Comment: resume after its closing '-->'
        e = xml_findstr(str,'-->',p,1);
        if isempty(e), return; end
        p = e + 3;
    elseif p+1 <= n & strcmp(str(p:p+1),'<?')
        %- Processing instruction: resume after its closing '?>'
        e = xml_findstr(str,'?>',p,1);
        if isempty(e), return; end
        p = e + 2;
    else
        %- Anything else means the prolog is over
        return
    end
end
| 397 | |
---|
| 398 | %----------------------------------------------------------------------- |
---|
function str = strip(str)
% Remove every white space character (as reported by isspace) from str,
% wherever it is located in the string.
str(isspace(str)) = '';
| 403 | |
---|
| 404 | %----------------------------------------------------------------------- |
---|
function str = normalize(str)
% Turn every white space character (space, newline, carriage return,
% tab, ...) into a blank and collapse each run of consecutive white
% space characters into a single blank.
ws = isspace(str);
if ~any(ws), return; end
str(ws) = ' ';
%- A white space immediately followed by another one is redundant:
%  only the last character of each run is kept
pos = find(ws);
redundant = pos(diff(pos) == 1);
str(redundant) = [];
| 416 | |
---|
| 417 | %----------------------------------------------------------------------- |
---|
function str = entity(str)
% Replace the five predefined XML entity references by the character
% they stand for.
% FORMAT str = entity(str)
%
%   &lt;   ->  <        &gt;   ->  >
%   &quot; ->  "        &apos; ->  '
%   &amp;  ->  &
%
% '&amp;' is handled last so that an escaped ampersand followed by an
% entity name (e.g. '&amp;lt;') yields the literal text '&lt;' and is not
% decoded a second time.
str = strrep(str,'&lt;','<');
str = strrep(str,'&gt;','>');
str = strrep(str,'&quot;','"');
str = strrep(str,'&apos;','''');
str = strrep(str,'&amp;','&');
| 424 | |
---|
| 425 | %----------------------------------------------------------------------- |
---|
function str = erode(str)
% Remove at most one leading blank and at most one trailing blank
% (space character) from str.
if isempty(str), return; end
if str(1) == ' '
    str(1) = [];
end
if isempty(str), return; end
if str(end) == ' '
    str(end) = [];
end