source: trunk/src/@xmltree/private/xml_parser.m @ 831

Last change on this file since 831 was 820, checked in by sommeria, 10 years ago

xmltree_updated

File size: 14.1 KB
Line 
function tree = xml_parser(xmlstr)
% XML (eXtensible Markup Language) Processor
% FORMAT tree = xml_parser(xmlstr)
%
% xmlstr  - XML string to parse (non-empty character vector)
% tree    - cell array of node structures corresponding to the XML file;
%           the node with uid k is stored in tree{k}
%_______________________________________________________________________
%
% xml_parser.m is an XML 1.0 (http://www.w3.org/TR/REC-xml) parser
% written in Matlab. It aims to be fully conforming. It is currently not
% a validating XML processor.
%
% A description of the tree structure provided in output is detailed in
% the header of this m-file.
%_______________________________________________________________________
% @(#)xml_parser.m              Guillaume Flandin            2002/04/04

% XML Processor for MATLAB (The Mathworks, Inc.).
% Copyright (C) 2002-2003 Guillaume Flandin <Guillaume@artefact.tk>
%
% This program is free software; you can redistribute it and/or
% modify it under the terms of the GNU General Public License
% as published by the Free Software Foundation; either version 2
% of the License, or any later version.
%
% This program is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
% GNU General Public License for more details.
%
% You should have received a copy of the GNU General Public License
% along with this program; if not, write to the Free Software
% Foundation Inc, 59 Temple Pl. - Suite 330, Boston, MA 02111-1307, USA.
%-----------------------------------------------------------------------

% Suggestions for improvement and fixes are always welcome, although no
% guarantee is made whether and when they will be implemented.
% Send requests to <Guillaume@artefact.tk>
% Check also the latest developments on the following webpage:
%           <http://www.artefact.tk/software/matlab/xml/>
%-----------------------------------------------------------------------

% The implementation of this XML parser is much inspired from a
% Javascript parser available at <http://www.jeremie.com/>

% A mex-file xml_findstr.c is also required, to work around some
% limitations of the built-in findstr Matlab function.
% Compile it on your architecture using 'mex -O xml_findstr.c' command
% if the compiled version for your system is not provided.
% If this function behaves badly (crash or wrong results), comment the
% line '#define __HACK_MXCHAR__' in xml_findstr.c and compile it again.
%-----------------------------------------------------------------------

% Structure of the output tree:
% There are 5 types of nodes in an XML file: element, chardata, cdata,
% pi and comment.
% Each of them contains an UID (Unique Identifier): an integer between
% 1 and the number of nodes of the XML file.
%
%    element (a tag <name key="value"> [contents] </name>)
%       |_ type:       'element'
%       |_ name:       string
%       |_ attributes: cell array of struct 'key' and 'val' or []
%       |_ contents:   double array of uid's or [] if empty
%       |_ parent:     uid of the parent ([] if root)
%       |_ uid:        double
%
%    chardata (a character array)
%       |_ type:   'chardata'
%       |_ value:  string
%       |_ parent: uid of the parent
%       |_ uid:    double
%
%    cdata (a literal string <![CDATA[value]]>)
%       |_ type:   'cdata'
%       |_ value:  string
%       |_ parent: uid of the parent
%       |_ uid:    double
%
%      pi (a processing instruction <?target value ?>)
%       |_ type:   'pi'
%       |_ target: string (may be empty)
%       |_ value:  string
%       |_ parent: uid of the parent
%       |_ uid:    double
%
%    comment (a comment <!-- value -->)
%       |_ type:   'comment'
%       |_ value:  string
%       |_ parent: uid of the parent
%       |_ uid:    double
%
%-----------------------------------------------------------------------

% TODO/BUG/FEATURES:
%  - [compile] only a warning if TagStart is empty ?
%  - [attribution] should look for " and ' rather than only "
%  - [main] with normalize as a preprocessing, CDATA are modified
%  - [prolog] look for a DOCTYPE in the whole string even if it occurs
%    only in a far CDATA tag, bug even if the doctype is inside a comment
%  - [tag_element] erode should replace normalize here
%  - remove globals? uppercase globals  rather persistent (clear mfile)?
%  - xml_findstr is indeed xml_strfind according to Mathworks vocabulary
%  - problem with entities: do we need to convert them here? (&eacute;)
%-----------------------------------------------------------------------

%- XML string to parse, number of tags read and tree under construction
%  (shared with the subfunctions through global variables)
global xmlstring Xparse_count xtree;

%- Check input arguments
error(nargchk(1,1,nargin));
if isempty(xmlstr)
    error('[XML] Not enough parameters.')
elseif ~ischar(xmlstr) | sum(size(xmlstr)>1)>1
    %- ischar replaces the obsolete isstr; the second test rejects
    %  character arrays having more than one non-singleton dimension
    error('[XML] Input must be a string.')
end

%- Initialize number of tags (<=> uid)
Xparse_count = 0;

%- Remove prolog and white space characters from the XML string
xmlstring = normalize(prolog(xmlstr));

%- Initialize the XML tree
xtree = {};
tree = fragment;
tree.str = 1;       % parsing starts at the first character
tree.parent = 0;    % 0 means "no parent" (root level)

%- Parse the XML string (nodes are accumulated in the global xtree)
tree = compile(tree);

%- Return the XML tree
tree = xtree;

%- Remove global variables from the workspace
clear global xmlstring Xparse_count xtree;
138
139%=======================================================================
140% SUBFUNCTIONS
141
142%-----------------------------------------------------------------------
function frag = compile(frag)
% Parse the global XML string starting at index frag.str.
%
% frag - fragment structure with fields
%          str    : index in xmlstring of the next character to read
%          parent : uid of the enclosing element (0 at root level)
%          end    : name of the enclosing element ('' at root level)
%
% Nodes found are appended to the global cell array xtree. The function
% returns when the end of the string is reached or when the closing tag
% matching frag.end has been consumed; frag.str then points after the
% consumed text.
    global xmlstring xtree Xparse_count;

    while 1,
        %- Nothing left to read
        %  NOTE(review): the second test compares a two-character slice
        %  with ' ' and therefore looks like it can never be true - confirm
        if length(xmlstring)<=frag.str | ...
           (frag.str == length(xmlstring)-1 & strcmp(xmlstring(frag.str:end),' '))
            return
        end

        TagStart = xml_findstr(xmlstring,'<',frag.str,1);

        if isempty(TagStart)
            %- Trailing character data with no tag after it is rejected.
            %  (The statements that used to follow this call could never
            %  run, because error() does not return; they were removed.)
            error(sprintf(['[XML] Unknown data at the end of the XML file.\n' ...
            '      Please send me your XML file at Guillaume@artefact.tk']));

        elseif TagStart > frag.str
            if strcmp(xmlstring(frag.str:TagStart-1),' ')
                %- A single white space before a tag (ignore)
                frag.str = TagStart;
            else
                %- Character data
                %  The node is built first: chardata increments
                %  Xparse_count, which is then used as the storage index
                node = chardata;
                uid  = Xparse_count;
                xtree{uid} = node;
                xtree{uid}.value = erode(entity(xmlstring(frag.str:TagStart-1)));
                xtree{uid}.parent = frag.parent;
                xtree{frag.parent}.contents = [xtree{frag.parent}.contents uid];
                frag.str = TagStart;
            end

        elseif strcmp(xmlstring(frag.str+1),'?')
            %- Processing instruction
            frag = tag_pi(frag);

        elseif length(xmlstring)-frag.str>4 & strcmp(xmlstring(frag.str+1:frag.str+3),'!--')
            %- Comment
            frag = tag_comment(frag);

        elseif length(xmlstring)-frag.str>9 & strcmp(xmlstring(frag.str+1:frag.str+8),'![CDATA[')
            %- Literal data
            frag = tag_cdata(frag);

        else
            %- A tag element (empty (<.../>) or not), or the closing tag
            %  of the enclosing element.
            %  With an empty frag.end the marker reduces to '/>'.
            endmk     = ['/' frag.end '>'];
            candidate = xmlstring(frag.str+1:frag.str+length(frag.end)+2);
            if strcmp(candidate,endmk) | strcmp(strip(candidate),endmk)
                %- Closing tag: skip '<', the marker, and return to caller
                frag.str = frag.str + length(frag.end)+3;
                return
            else
                frag = tag_element(frag);
            end
        end
    end
204
205%-----------------------------------------------------------------------
function frag = tag_element(frag)
% Read the element whose '<' is at xmlstring(frag.str), store it in the
% global xtree and, unless it is an empty-element tag, parse its contents
% recursively with compile. On return frag.str points after the element.
    global xmlstring xtree Xparse_count;

    closepos = xml_findstr(xmlstring,'>',frag.str,1);
    if isempty(closepos)
        error('[XML] Tag < opened but not closed.');
    end

    %- '<name ... />' is an empty-element tag: exclude the '/' as well
    selfclosed = strcmp(xmlstring(closepos-1:closepos),'/>');
    if selfclosed
        closepos = closepos - 1;
    end

    %- Split the start tag into its name and its attribute string
    starttag = normalize(xmlstring(frag.str+1:closepos-1));
    blank = xml_findstr(starttag,' ',1,1);
    if isempty(blank)
        name    = starttag;
        attribs = '';
    else
        name    = starttag(1:blank-1);
        attribs = starttag(blank+1:end);
    end

    %- Create the node; element increments Xparse_count, which is the uid
    node = element;
    uid  = Xparse_count;
    xtree{uid} = node;
    xtree{uid}.name = strip(name);

    %- Link with the parent (frag.parent is 0 at root level)
    if frag.parent
        xtree{uid}.parent = frag.parent;
        xtree{frag.parent}.contents = [xtree{frag.parent}.contents uid];
    end

    if length(attribs) > 0
        xtree{uid}.attributes = attribution(attribs);
    end

    if selfclosed
        %- Skip '/>'
        frag.str = closepos+2;
    else
        %- Parse everything up to the matching closing tag
        contents = fragment;
        contents.str = closepos+1;
        contents.end = name;
        contents.parent = uid;
        contents = compile(contents);
        frag.str = contents.str;
    end
245
246%-----------------------------------------------------------------------
function frag = tag_pi(frag)
% Read the processing instruction '<?target value?>' starting at
% xmlstring(frag.str) and store it as a 'pi' node in the global xtree.
% On return frag.str points after the closing '?>'.
%
% If no '?>' is found a warning is issued and frag.str is moved past the
% end of the string, so that the parsing loop in compile terminates
% instead of calling this function again with an unchanged position.
    global xmlstring xtree Xparse_count;

    closepos = xml_findstr(xmlstring,'?>',frag.str,1);
    if isempty(closepos)
        warning('[XML] Tag <? opened but not closed.')
        frag.str = length(xmlstring) + 1;
        return
    end

    nextspace = xml_findstr(xmlstring,' ',frag.str,1);

    %- Create the node; pri increments Xparse_count, which is the uid
    node = pri;
    uid  = Xparse_count;
    xtree{uid} = node;

    %- No usable separator between target and value: there is no space
    %  at all, the first space is after '?>', or it directly follows '<?'.
    %  The empty case is tested on its own because comparing an empty
    %  index would yield an empty (false) condition.
    notarget = isempty(nextspace);
    if ~notarget
        notarget = nextspace > closepos | nextspace == frag.str+2;
    end

    if notarget
        xtree{uid}.value = erode(xmlstring(frag.str+2:closepos-1));
    else
        xtree{uid}.value = erode(xmlstring(nextspace+1:closepos-1));
        xtree{uid}.target = erode(xmlstring(frag.str+2:nextspace));
    end

    %- Link with the parent (frag.parent is 0 at root level)
    if frag.parent
        xtree{frag.parent}.contents = [xtree{frag.parent}.contents uid];
        xtree{uid}.parent = frag.parent;
    end

    %- Skip '?>'
    frag.str = closepos+2;
267
268%-----------------------------------------------------------------------
function frag = tag_comment(frag)
% Read the comment '<!-- value -->' starting at xmlstring(frag.str) and
% store it as a 'comment' node in the global xtree.
% On return frag.str points after the closing '-->'.
%
% If no '-->' is found a warning is issued and frag.str is moved past the
% end of the string, so that the parsing loop in compile terminates
% instead of calling this function again with an unchanged position.
    global xmlstring xtree Xparse_count;

    closepos = xml_findstr(xmlstring,'-->',frag.str,1);
    if isempty(closepos)
        warning('[XML] Tag <!-- opened but not closed.')
        frag.str = length(xmlstring) + 1;
        return
    end

    %- Create the node; comment increments Xparse_count, which is the uid
    node = comment;
    uid  = Xparse_count;
    xtree{uid} = node;

    %- Text between '<!--' (4 characters) and '-->'
    xtree{uid}.value = erode(xmlstring(frag.str+4:closepos-1));

    %- Link with the parent (frag.parent is 0 at root level)
    if frag.parent
        xtree{frag.parent}.contents = [xtree{frag.parent}.contents uid];
        xtree{uid}.parent = frag.parent;
    end

    %- Skip '-->'
    frag.str = closepos+3;
283
284%-----------------------------------------------------------------------
function frag = tag_cdata(frag)
% Read the literal section '<![CDATA[value]]>' starting at
% xmlstring(frag.str) and store it as a 'cdata' node in the global xtree.
% The value is stored as is (no entity replacement, no trimming).
% On return frag.str points after the closing ']]>'.
%
% If no ']]>' is found a warning is issued and frag.str is moved past the
% end of the string, so that the parsing loop in compile terminates
% instead of calling this function again with an unchanged position.
    global xmlstring xtree Xparse_count;

    closepos = xml_findstr(xmlstring,']]>',frag.str,1);
    if isempty(closepos)
        warning('[XML] Tag <![CDATA[ opened but not closed.')
        frag.str = length(xmlstring) + 1;
        return
    end

    %- Create the node; cdata increments Xparse_count, which is the uid
    node = cdata;
    uid  = Xparse_count;
    xtree{uid} = node;

    %- Text between '<![CDATA[' (9 characters) and ']]>'
    xtree{uid}.value = xmlstring(frag.str+9:closepos-1);

    %- Link with the parent (frag.parent is 0 at root level)
    if frag.parent
        xtree{frag.parent}.contents = [xtree{frag.parent}.contents uid];
        xtree{uid}.parent = frag.parent;
    end

    %- Skip ']]>'
    frag.str = closepos+3;
299
300%-----------------------------------------------------------------------
function attrs = attribution(str)
% Split the attribute string of a start tag into key/value pairs.
%
% str   - attribute part of a start tag, e.g. 'a="1" b=''2'''
% attrs - cell array of structures with fields
%           key : attribute name, all white space removed
%           val : attribute value with predefined entities replaced
%
% Values may be delimited by double or by single quotes; the closing
% delimiter is the next occurrence of the opening one. Parsing stops with
% a warning at the first attribute whose value is not properly quoted;
% the attributes read so far are returned.

    %- Initialize attributes
    nbattr = 0;
    attrs = cell(nbattr);

    %- Look for 'key="value"' or 'key=''value''' substrings
    while 1,
        eqpos = xml_findstr(str,'=',1,1);
        if isempty(str) | isempty(eqpos), return; end

        %- Opening delimiter: first quote character after the '=' sign
        if eqpos < length(str)
            dq = xml_findstr(str,'"',eqpos+1,1);
            sq = xml_findstr(str,'''',eqpos+1,1);
            qstart = min([dq sq]);
        else
            qstart = [];
        end

        %- Closing delimiter: next occurrence of the same quote character
        if ~isempty(qstart) & qstart < length(str)
            qend = xml_findstr(str,str(qstart),qstart+1,1);
        else
            qend = [];
        end

        if isempty(qend)
            warning('[XML] Attribute value is not properly quoted.')
            return
        end

        nbattr = nbattr + 1;
        attrs{nbattr}.key = strip(str(1:(eqpos-1)));
        attrs{nbattr}.val = entity(str((qstart+1):(qend-1)));

        %- Continue with the text following the closing delimiter
        str = str((qend+1):end);
    end
316
317%-----------------------------------------------------------------------
function elm = element
% Create an empty 'element' node.
% The global counter Xparse_count is incremented and used as the uid.
    global Xparse_count;
    Xparse_count = Xparse_count + 1;

    elm.type       = 'element';
    elm.name       = '';
    elm.attributes = [];
    elm.contents   = [];
    elm.parent     = [];
    elm.uid        = Xparse_count;
322   
323%-----------------------------------------------------------------------
function cdat = chardata
% Create an empty 'chardata' (character data) node.
% The global counter Xparse_count is incremented and used as the uid.
    global Xparse_count;
    Xparse_count = Xparse_count + 1;

    cdat.type   = 'chardata';
    cdat.value  = '';
    cdat.parent = [];
    cdat.uid    = Xparse_count;
328   
329%-----------------------------------------------------------------------
function cdat = cdata
% Create an empty 'cdata' (literal data) node.
% The global counter Xparse_count is incremented and used as the uid.
    global Xparse_count;
    Xparse_count = Xparse_count + 1;

    cdat.type   = 'cdata';
    cdat.value  = '';
    cdat.parent = [];
    cdat.uid    = Xparse_count;
334   
335%-----------------------------------------------------------------------
function proce = pri
% Create an empty 'pi' (processing instruction) node.
% The global counter Xparse_count is incremented and used as the uid.
    global Xparse_count;
    Xparse_count = Xparse_count + 1;

    proce.type   = 'pi';
    proce.value  = '';
    proce.target = '';
    proce.parent = [];
    proce.uid    = Xparse_count;
340
341%-----------------------------------------------------------------------
function commt = comment
% Create an empty 'comment' node.
% The global counter Xparse_count is incremented and used as the uid.
    global Xparse_count;
    Xparse_count = Xparse_count + 1;

    commt.type   = 'comment';
    commt.value  = '';
    commt.parent = [];
    commt.uid    = Xparse_count;
346
347%-----------------------------------------------------------------------
function frg = fragment
% Create a blank parsing fragment: all three fields ('str', 'parent' and
% 'end') are initialised with the empty string.
    frg.str    = '';
    frg.parent = '';
    frg.end    = '';
350
351%-----------------------------------------------------------------------
function str = prolog(str)
% Remove the prolog (XML declaration and DOCTYPE declaration, when they
% are found) from the beginning of an XML string.
% An error is raised if the string contains no '<' at all; incomplete
% declarations only produce a warning and are left in place.

    %- Index of the first character kept in the output
    first = 1;

    %- Initial tag
    tagpos = xml_findstr(str,'<',1,1);
    if isempty(tagpos)
        error('[XML] No tag found.')
    end

    %- Header (<?xml version="1.0" ... ?>)
    if strcmp(lower(str(tagpos:tagpos+2)),'<?x')
        hdrend = xml_findstr(str,'?>',1,1);
        if isempty(hdrend)
            warning('[XML] Header tag incomplete.')
        else
            first = hdrend + 2;
        end
    end

    %- Doctype (<!DOCTYPE type ... [ declarations ]>)
    doctype = xml_findstr(str,'<!DOCTYPE',first,1);
    if ~isempty(doctype)
        from = doctype + 9;          % length('<!DOCTYPE') = 9
        tagend = xml_findstr(str,'>',from,1);
        if isempty(tagend)
            warning('[XML] Tag DOCTYPE opened but not closed.')
        else
            first = tagend + 1;
            %- A '[' located before that '>' opens a declaration block:
            %  the doctype then ends at ']>' instead
            subset = xml_findstr(str,'[',from,1);
            if (~isempty(subset) & subset < first)
                subend = xml_findstr(str,']>',from,1);
                if isempty(subend)
                    warning('[XML] Tag [ in DOCTYPE opened but not closed.')
                else
                    first = subend + 2;
                end
            end
        end
    end

    %- Skip prolog from the xml string
    str = str(first:end);
390
391%-----------------------------------------------------------------------
function str = strip(str)
% Remove every white space character (as reported by isspace) from str,
% wherever it occurs in the string.
    str(isspace(str)) = '';
396
397%-----------------------------------------------------------------------
function str = normalize(str)
% Turn every white space character (space, newline, carriage return,
% tab, ...) into a plain space and collapse each run of consecutive
% white space characters into a single space.

    %- Positions of the white space characters, all replaced by ' '
    ws = find(isspace(str) == 1);
    str(ws) = ' ';
    if isempty(ws)
        return
    end

    %- A white space is dropped when the next white space position is the
    %  immediately following index (the last one of a run is kept); the
    %  last position is paired with itself and is therefore never dropped
    nxt = [ws(2:end) ws(end)];
    str(ws((ws - nxt) == -1)) = [];
409
410%-----------------------------------------------------------------------
function str = entity(str)
% Replace the five predefined XML entity references (&lt; &gt; &quot;
% &apos; &amp;) in str by the characters they stand for.

    %- The replacements are applied in this order; '&amp;' comes last so
    %  that, for instance, '&amp;lt;' yields the text '&lt;' and not '<'
    refs  = {'&lt;', '&gt;', '&quot;', '&apos;', '&amp;'};
    chars = {'<',    '>',    '"',      '''',     '&'};
    for n = 1:length(refs)
        str = strrep(str, refs{n}, chars{n});
    end
417   
418%-----------------------------------------------------------------------
function str = erode(str)
% Remove at most one leading and at most one trailing space from str.

    %- Leading space
    if ~isempty(str)
        if str(1) == ' '
            str(1) = '';
        end
    end

    %- Trailing space (str may have become empty in the previous step)
    if ~isempty(str)
        if str(end) == ' '
            str(end) = '';
        end
    end
Note: See TracBrowser for help on using the repository browser.