source: trunk/src/@xmltree/private/xml_parser.m @ 803

Last change on this file since 803 was 723, checked in by sommeria, 11 years ago

xmltree and toolbox_calib added to svn

File size: 14.2 KB
RevLine 
function tree = xml_parser(filename)
% XML (eXtensible Markup Language) Processor
% FORMAT tree = xml_parser(filename)
%
% filename - XML file to parse
% tree     - tree structure corresponding to the XML file
%_______________________________________________________________________
%
% xml_parser.m is an XML 1.0 (http://www.w3.org/TR/REC-xml) parser
% written in Matlab. It aims to be fully conforming. It is currently not
% a validating XML processor.
% (based on a Javascript parser available at http://www.jeremie.com)
%
% A description of the tree structure provided in output is detailed in
% the header of this m-file.
%_______________________________________________________________________
% @(#)xml_parser.m               Guillaume Flandin           2002/04/04

% XML Processor for MATLAB (The Mathworks, Inc.).
% Copyright (C) 2002  Guillaume Flandin
%
% This program is free software; you can redistribute it and/or
% modify it under the terms of the GNU General Public License
% as published by the Free Software Foundation; either version 2
% of the License, or any later version.
%
% This program is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
% GNU General Public License for more details.
%
% You should have received a copy of the GNU General Public License
% along with this program; if not, write to the Free Software
% Foundation Inc, 59 Temple Pl. - Suite 330, Boston, MA 02111-1307, USA.
%-----------------------------------------------------------------------

% Please feel free to email the author any comment/suggestion/bug report
% to improve this XML processor in Matlab.
% Email: Guillaume.Flandin@sophia.inria.fr
% Check also the latest developments on the following webpage:
% http://www-sop.inria.fr/epidaure/personnel/flandin/xml/
%-----------------------------------------------------------------------

% A mex-file xml_findstr.c is also required, to encompass some
% limitations of the built-in findstr Matlab function.
% Compile it on your architecture using 'mex -O xml_findstr.c' command
% if the compiled version for your system is not provided.
% If this function behaves badly (crash or wrong results), comment the
% line '#define __HACK_MXCHAR__' in xml_findstr.c and compile it again.
%-----------------------------------------------------------------------

% Structure of the output tree:
% The output is a cell array with one struct per node; a node's uid is
% its index in that cell array.
% There are 5 types of nodes in an XML file: element, chardata, cdata,
% pi and comment.
% Each of them contains an UID (Unique Identifier): an integer between
% 1 and the number of nodes of the XML file.
%
%    element (a tag <name key="value"> [contents] </name>
%       |_ type:       'element'
%       |_ name:       string
%       |_ attributes: cell array of struct 'key' and 'val' or []
%       |_ contents:   double array of uid's or [] if empty
%       |_ parent:     uid of the parent ([] if root)
%       |_ uid:        double
%
%    chardata (a character array)
%       |_ type:   'chardata'
%       |_ value:  string
%       |_ parent: uid of the parent
%       |_ uid:    double
%
%    cdata (a literal string <![CDATA[value]]>)
%       |_ type:   'cdata'
%       |_ value:  string
%       |_ parent: uid of the parent
%       |_ uid:    double
%
%      pi (a processing instruction <?target value ?>)
%       |_ type:   'pi'
%       |_ target: string (may be empty)
%       |_ value:  string
%       |_ parent: uid of the parent
%       |_ uid:    double
%
%    comment (a comment <!-- value -->)
%       |_ type:   'comment'
%       |_ value:  string
%       |_ parent: uid of the parent
%       |_ uid:    double
%
%-----------------------------------------------------------------------

% TODO/BUG/FEATURES:
%  - [compile] only a warning if TagStart is empty
%  - [attribution] should look for " and ' rather than only "
%  - [main] with normalize as a preprocessing, CDATA are modified
%  - [prolog] look for a DOCTYPE in the whole string even if it occurs
%    only in a far CDATA tag (for example)...
%  - [tag_element] erode should replace normalize here
%  - remove globals? uppercase globals  rather persistent (clear mfile)?
%  - xml_findstr is in fact xml_strfind according to Mathworks vocabulary
%  - problem with entity (don't know if the bug is here or in save fct.)
%-----------------------------------------------------------------------

%- XML string to parse, number of nodes read so far and the node list.
%  These globals are shared with the subfunctions of this file.
global xmlstring Xparse_count xtree;

%- Check input arguments
%  (explicit test instead of the obsolete nargchk)
if nargin < 1
    error('Not enough input arguments.');
end
if isempty(filename)
    error('Not enough parameters.')
elseif ~ischar(filename) | sum(size(filename)>1)>1
    %- ischar replaces the obsolete isstr; the size test rejects
    %  anything that is not a single row or column of characters
    error('Input must be a string filename.')
end

%- Read the entire XML file
fid = fopen(filename,'rt');
if (fid==-1)
    error(sprintf('Cannot open %s for reading.',filename))
end
xmlstring = fscanf(fid,'%c');
fclose(fid);

%- Initialize number of tags (<=> uid)
Xparse_count = 0;

%- Remove prolog and white space characters from the XML string
xmlstring = normalize(prolog(xmlstring));

%- Initialize the XML tree
xtree = {};
tree = fragment;
tree.str = 1;       % start parsing at the first character
tree.parent = 0;    % 0 means "no enclosing element"

%- Parse the XML string (nodes are accumulated in the global xtree)
tree = compile(tree);

%- Return the XML tree
tree = xtree;

%- Remove global variables from the workspace
clear global xmlstring Xparse_count xtree;
144
145%=======================================================================
146% SUBFUNCTIONS
147
148%-----------------------------------------------------------------------
function frag = compile(frag)
% Parse the global xmlstring from index frag.str onwards and append the
% nodes that are found to the global cell array xtree.
%
% frag - fragment structure with fields
%          str:    index in xmlstring where parsing starts
%          parent: uid of the enclosing element (0 if none)
%          end:    name of the enclosing element, used to recognise
%                  its closing tag ('' if none)
%
% On return frag.str has been advanced past what was consumed. The
% function returns when the closing tag of the enclosing element has
% been consumed or when the end of xmlstring is reached.
    global xmlstring xtree;

    while 1,
        %- Stop when at most one character is left to read
        if length(xmlstring) <= frag.str
            return
        end
        TagStart = xml_findstr(xmlstring,'<',frag.str,1);
        if isempty(TagStart)
            %- Character data with no tag after it (should be an error)
            warning('[XML] Unknown data at the end of the XML file.');
            fprintf('Please send me your XML file at gflandin@sophia.inria.fr\n');
            node = chardata;
            node.value = erode(entity(xmlstring(frag.str:end)));
            if frag.parent
                %- Only link to a parent when there is one (uid 0 is
                %  not a valid index into xtree)
                node.parent = frag.parent;
                xtree{frag.parent}.contents = [xtree{frag.parent}.contents node.uid];
            end
            xtree{node.uid} = node;
            %- Everything has been consumed: leave the loop instead of
            %  scanning the same trailing data again and again
            frag.str = length(xmlstring) + 1;
            return
        elseif TagStart > frag.str
            if ~strcmp(xmlstring(frag.str:TagStart-1),' ')
                %- Character data (a single white space before a tag
                %  is ignored)
                node = chardata;
                node.value = erode(entity(xmlstring(frag.str:TagStart-1)));
                if frag.parent
                    node.parent = frag.parent;
                    xtree{frag.parent}.contents = [xtree{frag.parent}.contents node.uid];
                end
                xtree{node.uid} = node;
            end
            frag.str = TagStart;
        else
            %- xmlstring(frag.str) is '<': find out which kind of tag.
            %  The length tests guard the substring extractions.
            remaining = length(xmlstring) - frag.str;
            isComment = 0;
            isCdata   = 0;
            if remaining > 4
                isComment = strcmp(xmlstring(frag.str+1:frag.str+3),'!--');
            end
            if remaining > 9
                isCdata = strcmp(xmlstring(frag.str+1:frag.str+8),'![CDATA[');
            end
            if strcmp(xmlstring(frag.str+1),'?')
                %- Processing instruction
                frag = tag_pi(frag);
            elseif isComment
                %- Comment
                frag = tag_comment(frag);
            elseif isCdata
                %- Literal data
                frag = tag_cdata(frag);
            else
                %- A tag element (empty (<.../>) or not)
                if ~isempty(frag.end)
                    endmk = ['/' frag.end '>'];
                else
                    endmk = '/>';
                end
                %- Text that would hold the closing tag of the enclosing
                %  element, clamped to the end of the string
                last = min(frag.str+length(frag.end)+2, length(xmlstring));
                candidate = xmlstring(frag.str+1:last);
                if strcmp(candidate,endmk) | strcmp(strip(candidate),endmk)
                    %- Closing tag of the enclosing element: skip it and
                    %  hand control back to the caller
                    frag.str = frag.str + length(frag.end)+3;
                    return
                else
                    frag = tag_element(frag);
                end
            end
        end
    end
211
212%-----------------------------------------------------------------------
function frag = tag_element(frag)
% Read the element whose '<' is at xmlstring(frag.str), store it in the
% global xtree and, unless it is an empty element (<.../>), parse its
% contents with compile. frag.str is moved past what was consumed.
% An error is raised when no '>' follows the opening '<'.
    global xmlstring xtree Xparse_count;

    gt = xml_findstr(xmlstring,'>',frag.str,1);
    if isempty(gt)
        error('[XML] Tag < opened but not closed.');
    end

    %- For an empty element, step back so that gt points at the '/'
    selfclosing = strcmp(xmlstring(gt-1:gt),'/>');
    if selfclosing
        gt = gt - 1;
    end

    %- Split 'name attributes' at the first space
    starttag = normalize(xmlstring(frag.str+1:gt-1));
    sp = xml_findstr(starttag,' ',1,1);
    if isempty(sp)
        name    = starttag;
        attribs = '';
    else
        name    = starttag(1:sp-1);
        attribs = starttag(sp+1:end);
    end

    %- Create the node; element increments Xparse_count, so the new
    %  node lives at index node.uid
    node = element;
    uid  = node.uid;
    xtree{uid} = node;
    xtree{uid}.name = strip(name);

    %- Link with the parent element, if any
    if frag.parent
        xtree{uid}.parent = frag.parent;
        xtree{frag.parent}.contents = [xtree{frag.parent}.contents uid];
    end

    if ~isempty(attribs)
        xtree{uid}.attributes = attribution(attribs);
    end

    if selfclosing
        %- Skip the two characters of '/>'
        frag.str = gt + 2;
    else
        %- Parse the children up to the matching closing tag
        child = fragment;
        child.str    = gt + 1;
        child.end    = name;
        child.parent = uid;
        child = compile(child);
        frag.str = child.str;
    end
252
253%-----------------------------------------------------------------------
function frag = tag_pi(frag)
% Read the processing instruction whose '<?' starts at
% xmlstring(frag.str), store it in the global xtree and move frag.str
% past the closing '?>'.
% If no '?>' is found a warning is issued and frag.str is moved to the
% end of xmlstring, so that the caller stops instead of calling this
% function again at the same position.
    global xmlstring xtree;

    closeIdx = xml_findstr(xmlstring,'?>',frag.str,1);
    if isempty(closeIdx)
        warning('[XML] Tag <? opened but not closed.')
        frag.str = length(xmlstring) + 1;
        return
    end

    nextspace = xml_findstr(xmlstring,' ',frag.str,1);
    %- There is no separate target when the instruction contains no
    %  space at all (none found, or the first one lies after '?>') or
    %  when the space directly follows '<?'
    notarget = isempty(nextspace);
    if ~notarget
        notarget = nextspace > closeIdx | nextspace == frag.str+2;
    end

    node = pri;
    if notarget
        node.value = erode(xmlstring(frag.str+2:closeIdx-1));
    else
        node.value  = erode(xmlstring(nextspace+1:closeIdx-1));
        node.target = erode(xmlstring(frag.str+2:nextspace));
    end
    if frag.parent
        xtree{frag.parent}.contents = [xtree{frag.parent}.contents node.uid];
        node.parent = frag.parent;
    end
    xtree{node.uid} = node;

    %- Skip the two characters of '?>'
    frag.str = closeIdx + 2;
274
275%-----------------------------------------------------------------------
function frag = tag_comment(frag)
% Read the comment whose '<!--' starts at xmlstring(frag.str), store it
% in the global xtree and move frag.str past the closing '-->'.
% If no '-->' is found a warning is issued and frag.str is moved to the
% end of xmlstring, so that the caller stops instead of calling this
% function again at the same position.
    global xmlstring xtree;

    closeIdx = xml_findstr(xmlstring,'-->',frag.str,1);
    if isempty(closeIdx)
        warning('[XML] Tag <!-- opened but not closed.')
        frag.str = length(xmlstring) + 1;
        return
    end

    node = comment;
    %- frag.str+4 skips the four characters of '<!--'
    node.value = erode(xmlstring(frag.str+4:closeIdx-1));
    if frag.parent
        xtree{frag.parent}.contents = [xtree{frag.parent}.contents node.uid];
        node.parent = frag.parent;
    end
    xtree{node.uid} = node;

    %- Skip the three characters of '-->'
    frag.str = closeIdx + 3;
290
291%-----------------------------------------------------------------------
function frag = tag_cdata(frag)
% Read the CDATA section whose '<![CDATA[' starts at xmlstring(frag.str),
% store it in the global xtree and move frag.str past the closing ']]>'.
% The value is stored as found in xmlstring (no entity replacement).
% If no ']]>' is found a warning is issued and frag.str is moved to the
% end of xmlstring, so that the caller stops instead of calling this
% function again at the same position.
    global xmlstring xtree;

    closeIdx = xml_findstr(xmlstring,']]>',frag.str,1);
    if isempty(closeIdx)
        warning('[XML] Tag <![CDATA[ opened but not closed.')
        frag.str = length(xmlstring) + 1;
        return
    end

    node = cdata;
    %- frag.str+9 skips the nine characters of '<![CDATA['
    node.value = xmlstring(frag.str+9:closeIdx-1);
    if frag.parent
        xtree{frag.parent}.contents = [xtree{frag.parent}.contents node.uid];
        node.parent = frag.parent;
    end
    xtree{node.uid} = node;

    %- Skip the three characters of ']]>'
    frag.str = closeIdx + 3;
306
307%-----------------------------------------------------------------------
function all = attribution(str)
% Split the attribute part of a start tag into key/value pairs.
%
% str - text following the element name, e.g. 'a="1" b=''2'''
% all - cell array with one struct per attribute, with fields
%         key: attribute name with all white space removed
%         val: attribute value with predefined entities replaced
%       (empty cell array if no attribute is found)
%
% Values may be delimited by double (") or single (') quotes; the
% closing delimiter is the next occurrence of the opening one.
% Parsing stops with a warning at the first attribute whose value is
% not quoted or not terminated.
    %- Initialize attributes
    nbattr = 0;
    all = cell(nbattr);
    %- Look for 'key="value"' or 'key=''value''' substrings
    while 1,
        eqpos = find(str == '=');
        if isempty(eqpos), return; end
        eqpos = eqpos(1);
        %- Opening quote: first " or ' located after the equal sign
        q1 = find(str == '"' | str == '''');
        q1 = q1(q1 > eqpos);
        if isempty(q1)
            warning('[XML] Attribute value is not quoted.')
            return
        end
        q1 = q1(1);
        %- Closing quote: next occurrence of the same character
        q2 = find(str == str(q1));
        q2 = q2(q2 > q1);
        if isempty(q2)
            warning('[XML] Attribute value is not terminated.')
            return
        end
        q2 = q2(1);
        nbattr = nbattr + 1;
        all{nbattr}.key = strip(str(1:(eqpos-1)));
        all{nbattr}.val = entity(str((q1+1):(q2-1)));
        %- Continue with what follows the closing quote
        str = str((q2+1):end);
    end
323
324%-----------------------------------------------------------------------
function elm = element
% Create an empty 'element' node. The global node counter Xparse_count
% is incremented and its new value becomes the uid of the node.
    global Xparse_count;
    uid = Xparse_count + 1;
    Xparse_count = uid;
    elm.type       = 'element';
    elm.name       = '';
    elm.attributes = [];
    elm.contents   = [];
    elm.parent     = [];
    elm.uid        = uid;
329   
330%-----------------------------------------------------------------------
function cdat = chardata
% Create an empty 'chardata' node. The global node counter Xparse_count
% is incremented and its new value becomes the uid of the node.
    global Xparse_count;
    uid = Xparse_count + 1;
    Xparse_count = uid;
    cdat.type   = 'chardata';
    cdat.value  = '';
    cdat.parent = [];
    cdat.uid    = uid;
335   
336%-----------------------------------------------------------------------
function cdat = cdata
% Create an empty 'cdata' node. The global node counter Xparse_count
% is incremented and its new value becomes the uid of the node.
    global Xparse_count;
    uid = Xparse_count + 1;
    Xparse_count = uid;
    cdat.type   = 'cdata';
    cdat.value  = '';
    cdat.parent = [];
    cdat.uid    = uid;
341   
342%-----------------------------------------------------------------------
function proce = pri
% Create an empty 'pi' (processing instruction) node. The global node
% counter Xparse_count is incremented and its new value becomes the uid
% of the node.
    global Xparse_count;
    uid = Xparse_count + 1;
    Xparse_count = uid;
    proce.type   = 'pi';
    proce.value  = '';
    proce.target = '';
    proce.parent = [];
    proce.uid    = uid;
347
348%-----------------------------------------------------------------------
function commt = comment
% Create an empty 'comment' node. The global node counter Xparse_count
% is incremented and its new value becomes the uid of the node.
    global Xparse_count;
    uid = Xparse_count + 1;
    Xparse_count = uid;
    commt.type   = 'comment';
    commt.value  = '';
    commt.parent = [];
    commt.uid    = uid;
353
354%-----------------------------------------------------------------------
function frg = fragment
% Create a parsing fragment with empty fields 'str', 'parent' and 'end'
% (filled in by the callers before the fragment is given to compile).
    frg.str    = '';
    frg.parent = '';
    frg.end    = '';
357
358%-----------------------------------------------------------------------
function str = prolog(str)
% Remove the prolog (XML declaration and document type declaration)
% from the beginning of an XML string.
%
% str - whole XML document on input, document without its prolog on
%       output
%
% An error is raised if the string contains no '<' at all. Incomplete
% declarations only produce a warning and are left in the string.
% A DOCTYPE is only recognised when it is separated from the start of
% the document (or from the XML declaration) by nothing but white
% space, comments and processing instructions; an occurrence of
% '<!DOCTYPE' further in the document (e.g. inside a CDATA section) is
% left alone.
    %- Initialize beginning index of elements tree
    b = 1;
    n = length(str);
    %- Initial tag
    start = xml_findstr(str,'<',1,1);
    if isempty(start)
        error('[XML] No tag found.')
    end
    %- Header (<?xml version="1.0" ... ?>)
    %  (the slice is clamped so that a very short string cannot be
    %  indexed past its end)
    if strcmp(lower(str(start:min(start+2,n))),'<?x')
        close = xml_findstr(str,'?>',1,1);
        if ~isempty(close)
            b = close + 2;
        else
            warning('[XML] Header tag incomplete.')
        end
    end
    %- Skip white space, comments and processing instructions that may
    %  precede the DOCTYPE
    pos = b;
    while pos <= n
        head = str(pos:min(pos+3,n));
        isComment = 0;
        isPI      = 0;
        if length(head) >= 4
            isComment = strcmp(head,'<!--');
        end
        if length(head) >= 2
            isPI = strcmp(head(1:2),'<?');
        end
        if isspace(head(1))
            pos = pos + 1;
        elseif isComment
            c = xml_findstr(str,'-->',pos,1);
            if isempty(c), break; end
            pos = c + 3;
        elseif isPI
            c = xml_findstr(str,'?>',pos,1);
            if isempty(c), break; end
            pos = c + 2;
        else
            break;
        end
    end
    %- Doctype (<!DOCTYPE type ... [ declarations ]>)
    start = [];
    if strcmp(str(pos:min(pos+8,n)),'<!DOCTYPE')  % length('<!DOCTYPE') = 9
        start = pos;
    end
    if ~isempty(start)
        close = xml_findstr(str,'>',start+9,1);
        if ~isempty(close)
            b = close + 1;
            %- A '[' before the first '>' opens an internal subset,
            %  which ends with ']>'
            dp = xml_findstr(str,'[',start+9,1);
            if (~isempty(dp) & dp < b)
                k = xml_findstr(str,']>',start+9,1);
                if ~isempty(k)
                    b = k + 2;
                else
                    warning('[XML] Tag [ in DOCTYPE opened but not closed.')
                end
            end
        else
            warning('[XML] Tag DOCTYPE opened but not closed.')
        end
    end
    %- Skip prolog from the xml string
    str = str(b:end);
397
398%-----------------------------------------------------------------------
function str = strip(str)
% Remove every white space character (as reported by isspace) from str.
    blanks_at = find(isspace(str));
    str(blanks_at) = '';
403
404%-----------------------------------------------------------------------
function str = normalize(str)
% Turn every white space character (space, newline, carriage return,
% tab, ...) into a space and collapse each run of them into one space.
    ws = find(isspace(str));
    str(ws) = ' ';
    if isempty(ws)
        return
    end
    % A white space is redundant when the next white space sits right
    % after it; the last one of each run is therefore kept.
    following = [ws(2:end) ws(end)];
    redundant = ws(find((following - ws) == 1));
    str(redundant) = [];
416
417%-----------------------------------------------------------------------
function str = entity(str)
% Replace the five predefined XML entities by the character they stand
% for. '&amp;' is handled last, so that the '&' it produces is not
% combined with the following text into another entity.
    table = {'&lt;',   '<'; ...
             '&gt;',   '>'; ...
             '&quot;', '"'; ...
             '&apos;', ''''; ...
             '&amp;',  '&'};
    for r = 1:size(table,1)
        str = strrep(str,table{r,1},table{r,2});
    end
424   
425%-----------------------------------------------------------------------
function str = erode(str)
% Remove at most one leading and at most one trailing space from str.
    %- Leading space
    if ~isempty(str)
        if str(1) == ' '
            str(1) = '';
        end
    end
    %- Trailing space (str may have become empty in the meantime)
    if ~isempty(str)
        if str(end) == ' '
            str(end) = '';
        end
    end
Note: See TracBrowser for help on using the repository browser.