source: trunk/src/@xmltree/private/xml_parser.m @ 942

Last change on this file since 942 was 925, checked in by sommeria, 9 years ago

xmltree updated

File size: 16.2 KB
Line 
function tree = xml_parser(xmlstr)
% XML (eXtensible Markup Language) Processor
% FORMAT tree = xml_parser(xmlstr)
%
% xmlstr  - XML string to parse
% tree    - tree structure corresponding to the XML file
%__________________________________________________________________________
%
% xml_parser.m is an XML 1.0 (http://www.w3.org/TR/REC-xml) parser.
% It aims to be fully conforming. It is currently not a validating
% XML processor.
%
% A description of the tree structure provided in output is detailed in
% the header of this m-file.
%__________________________________________________________________________
% Copyright (C) 2002-2015  http://www.artefact.tk/

% Guillaume Flandin
% $Id: xml_parser.m 6480 2015-06-13 01:08:30Z guillaume $

% XML Processor for GNU Octave and MATLAB (The Mathworks, Inc.)
% Copyright (C) 2002-2015 Guillaume Flandin <Guillaume@artefact.tk>
%
% This program is free software; you can redistribute it and/or
% modify it under the terms of the GNU General Public License
% as published by the Free Software Foundation; either version 2
% of the License, or any later version.
%
% This program is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
% GNU General Public License for more details.
%
% You should have received a copy of the GNU General Public License
% along with this program; if not, write to the Free Software
% Foundation Inc, 59 Temple Pl. - Suite 330, Boston, MA 02111-1307, USA.
%--------------------------------------------------------------------------

% Suggestions for improvement and fixes are always welcome, although no
% guarantee is made whether and when they will be implemented.
% Send requests to <Guillaume@artefact.tk>
% Check also the latest developments on the following webpage:
%           <http://www.artefact.tk/software/matlab/xml/>
%--------------------------------------------------------------------------

% The implementation of this XML parser is much inspired from a
% Javascript parser that used to be available at <http://www.jeremie.com/>

% A C-MEX file xml_findstr.c is also required, to work around some
% limitations of the built-in FINDSTR function.
% Compile it on your architecture using 'mex -O xml_findstr.c' command
% if the compiled version for your system is not provided.
% If this function does not behave as expected, comment the line
% '#define __HACK_MXCHAR__' in xml_findstr.c and compile it again.
%--------------------------------------------------------------------------

% Structure of the output tree:
% The output is a cell array with one struct per node; a node's uid is its
% index in that cell array.
% There are 5 types of nodes in an XML file: element, chardata, cdata,
% pi and comment.
% Each of them contains an UID (Unique Identifier): an integer between
% 1 and the number of nodes of the XML file.
%
%    element (a tag <name key="value"> [contents] </name>)
%       |_ type:       'element'
%       |_ name:       string
%       |_ attributes: cell array of struct 'key' and 'val' or []
%       |_ contents:   double array of uid's or [] if empty
%       |_ parent:     uid of the parent ([] if root)
%       |_ uid:        double
%
%    chardata (a character array)
%       |_ type:   'chardata'
%       |_ value:  string
%       |_ parent: uid of the parent
%       |_ uid:    double
%
%    cdata (a literal string <![CDATA[value]]>)
%       |_ type:   'cdata'
%       |_ value:  string
%       |_ parent: uid of the parent
%       |_ uid:    double
%
%      pi (a processing instruction <?target value ?>)
%       |_ type:   'pi'
%       |_ target: string (may be empty)
%       |_ value:  string
%       |_ parent: uid of the parent
%       |_ uid:    double
%
%    comment (a comment <!-- value -->)
%       |_ type:   'comment'
%       |_ value:  string
%       |_ parent: uid of the parent
%       |_ uid:    double
%
%--------------------------------------------------------------------------

% TODO/BUG/FEATURES:
%  - [compile] only a warning if TagStart is empty ?
%  - [attribution] should look for " and ' rather than only "
%  - [main] with normalize as a preprocessing, CDATA are modified
%  - [prolog] look for a DOCTYPE in the whole string even if it occurs
%    only in a far CDATA tag, bug even if the doctype is inside a comment
%  - [tag_element] erode should replace normalize here
%  - remove globals? uppercase globals  rather persistent (clear mfile)?
%  - xml_findstr is indeed xml_strfind according to Mathworks vocabulary
%  - problem with entities: do we need to convert them here? (&eacute;)
%--------------------------------------------------------------------------

%- Parser state shared with the subfunctions: the string being parsed,
%  the number of nodes created so far (<=> last uid) and the tree itself
global xmlstring Xparse_count xtree;

%- Check input arguments
%error(nargchk(1,1,nargin));
if isempty(xmlstr)
    error('[XML] Not enough parameters.')
elseif ~ischar(xmlstr) || sum(size(xmlstr)>1)>1
    error('[XML] Input must be a string.')
end

try
    %- Initialize number of tags (<=> uid)
    Xparse_count = 0;

    %- Remove prolog and white space characters from the XML string
    xmlstring = normalize(prolog(xmlstr));

    %- Initialize the XML tree and the top-level fragment
    %  (starts at the first character, has no parent element)
    xtree = {};
    root = fragment;
    root.str = 1;
    root.parent = 0;

    %- Parse the XML string; the nodes are accumulated in the global xtree
    compile(root);

    %- Return the XML tree
    tree = xtree;
catch err
    %- Do not leave the parser state behind in the global workspace when
    %  parsing fails, then propagate the original error unchanged
    clear global xmlstring Xparse_count xtree;
    rethrow(err);
end

%- Remove global variables from the workspace
clear global xmlstring Xparse_count xtree;
141
142%==========================================================================
143% SUBFUNCTIONS
144
145%--------------------------------------------------------------------------
function frag = compile(frag)
% Parse the part of the XML string described by a fragment
% FORMAT frag = compile(frag)
%
% frag - fragment structure with fields
%          str    - index in the global xmlstring where parsing starts
%          parent - uid of the enclosing element (0 at the top level)
%          end    - name of the enclosing element ('' at the top level)
%        On output, frag.str is the index at which the caller resumes.
%
% Nodes are appended to the global xtree. The function returns when the
% end tag of the enclosing element has been consumed or when at most one
% character of the string is left. It raises an error when trailing data
% contains no further tag, or when character data is met outside of any
% element.
    global xmlstring xtree Xparse_count;

    while true
        %- Nothing (or a single character) left to parse.
        %  (The former extra test compared a two-character slice with a
        %  single space and therefore could never be true.)
        if length(xmlstring) <= frag.str
            return
        end

        TagStart = xml_findstr(xmlstring,'<',frag.str,1);

        if isempty(TagStart)
            %- Remaining characters do not contain any tag
            error('[XML] Unknown data at the end of the XML file.');

        elseif TagStart > frag.str
            between = xmlstring(frag.str:TagStart-1);
            %- A single white space before a tag is ignored
            if ~strcmp(between,' ')
                %- Character data
                if ~frag.parent
                    % There is no xtree{0}: report the problem explicitly
                    % instead of failing on an invalid index.
                    error('[XML] Character data found outside of any element.');
                end
                Xparse_count = Xparse_count + 1;
                xtree{Xparse_count} = chardata;
                xtree{Xparse_count}.value = erode(entity(between));
                xtree{Xparse_count}.parent = frag.parent;
                xtree{frag.parent}.contents = [xtree{frag.parent}.contents Xparse_count];
            end
            frag.str = TagStart;

        elseif strcmp(xmlstring(frag.str+1),'?')
            %- Processing instruction
            frag = tag_pi(frag);

        elseif length(xmlstring)-frag.str>4 && strcmp(xmlstring(frag.str+1:frag.str+3),'!--')
            %- Comment
            frag = tag_comment(frag);

        elseif length(xmlstring)-frag.str>9 && strcmp(xmlstring(frag.str+1:frag.str+8),'![CDATA[')
            %- Literal data
            frag = tag_cdata(frag);

        else
            %- A tag element (empty (<.../>) or not), or the end tag of
            %  the enclosing element
            if ~isempty(frag.end)
                endmk = ['/' frag.end '>'];
            else
                endmk = '/>';
            end
            %- Clamp the slice so that a truncated document cannot index
            %  past the end of the string; a shorter slice never matches
            last = min(frag.str+length(frag.end)+2, length(xmlstring));
            candidate = xmlstring(frag.str+1:last);
            if strcmp(candidate,endmk) || strcmp(strip(candidate),endmk)
                %- End tag of the enclosing element: skip it and return
                frag.str = frag.str + length(frag.end)+3;
                return
            else
                frag = tag_element(frag);
            end
        end
    end
208
209%--------------------------------------------------------------------------
function frag = tag_element(frag)
% Parse the element whose start tag begins at frag.str
% FORMAT frag = tag_element(frag)
%
% Creates an 'element' node in the global xtree, links it to its parent
% (when frag.parent is non-zero), parses its attributes and, unless the
% tag is empty (<.../>), parses its contents through compile.
% On output, frag.str is the index following the element.
    global xmlstring xtree Xparse_count;

    tagEnd = xml_findstr(xmlstring,'>',frag.str,1);
    if isempty(tagEnd)
        error('[XML] Tag < opened but not closed.');
    end

    %- An empty element ends with '/>': exclude the '/' from the tag text
    isEmptyTag = strcmp(xmlstring(tagEnd-1:tagEnd),'/>');
    if isEmptyTag
        tagEnd = tagEnd - 1;
    end

    %- Split the start tag into a name and an attribute string
    starttag = normalize(xmlstring(frag.str+1:tagEnd-1));
    firstSpace = xml_findstr(starttag,' ',1,1);
    if isempty(firstSpace)
        name    = starttag;
        attribs = '';
    else
        name    = starttag(1:firstSpace-1);
        attribs = starttag(firstSpace+1:end);
    end

    %- Register the new node; its uid is its index in xtree
    Xparse_count = Xparse_count + 1;
    uid = Xparse_count;
    xtree{uid} = element;
    xtree{uid}.name = strip(name);
    if frag.parent
        xtree{uid}.parent = frag.parent;
        xtree{frag.parent}.contents = [xtree{frag.parent}.contents uid];
    end
    if ~isempty(attribs)
        xtree{uid}.attributes = attribution(attribs);
    end

    if isEmptyTag
        %- Skip the two characters of '/>'
        frag.str = tagEnd+2;
    else
        %- Parse the children up to the matching end tag
        inner = fragment;
        inner.str = tagEnd+1;
        inner.end = name;
        inner.parent = uid;
        inner = compile(inner);
        frag.str = inner.str;
    end
250
251%--------------------------------------------------------------------------
function frag = tag_pi(frag)
% Parse the processing instruction (<?target value ?>) starting at frag.str
% FORMAT frag = tag_pi(frag)
%
% Creates a 'pi' node in the global xtree and links it to its parent when
% frag.parent is non-zero. On output, frag.str is the index following '?>'.
% If the instruction is not closed, a warning is issued, no node is created
% and frag.str is moved past the end of the string so that parsing stops.
    global xmlstring xtree Xparse_count;

    piEnd = xml_findstr(xmlstring,'?>',frag.str,1);
    if isempty(piEnd)
        warning('[XML] Tag <? opened but not closed.')
        %- Without this the caller would see the same '<?' again and loop
        %  forever; nothing after an unclosed tag can be parsed anyway
        frag.str = length(xmlstring) + 1;
        return
    end

    nextspace = xml_findstr(xmlstring,' ',frag.str,1);
    Xparse_count = Xparse_count + 1;
    xtree{Xparse_count} = pri;
    %- No target when there is no white space inside the instruction
    %  (including no white space at all in the rest of the string, which
    %  used to make the || test fail on an empty operand) or when the
    %  white space immediately follows '<?'
    if isempty(nextspace) || nextspace > piEnd || nextspace == frag.str+2
        xtree{Xparse_count}.value = erode(xmlstring(frag.str+2:piEnd-1));
    else
        xtree{Xparse_count}.value = erode(xmlstring(nextspace+1:piEnd-1));
        xtree{Xparse_count}.target = erode(xmlstring(frag.str+2:nextspace));
    end
    if frag.parent
        xtree{frag.parent}.contents = [xtree{frag.parent}.contents Xparse_count];
        xtree{Xparse_count}.parent = frag.parent;
    end
    %- Resume after the two characters of '?>'
    frag.str = piEnd+2;
273
274%--------------------------------------------------------------------------
function frag = tag_comment(frag)
% Parse the comment (<!-- value -->) starting at frag.str
% FORMAT frag = tag_comment(frag)
%
% Creates a 'comment' node in the global xtree and links it to its parent
% when frag.parent is non-zero. On output, frag.str is the index following
% '-->'. If the comment is not closed, a warning is issued, no node is
% created and frag.str is moved past the end of the string so that parsing
% stops.
    global xmlstring xtree Xparse_count;

    commentEnd = xml_findstr(xmlstring,'-->',frag.str,1);
    if isempty(commentEnd)
        warning('[XML] Tag <!-- opened but not closed.')
        %- Without this the caller would see the same '<!--' again and
        %  loop forever
        frag.str = length(xmlstring) + 1;
        return
    end

    Xparse_count = Xparse_count + 1;
    xtree{Xparse_count} = comment;
    %- The text starts after the four characters of '<!--'
    xtree{Xparse_count}.value = erode(xmlstring(frag.str+4:commentEnd-1));
    if frag.parent
        xtree{frag.parent}.contents = [xtree{frag.parent}.contents Xparse_count];
        xtree{Xparse_count}.parent = frag.parent;
    end
    %- Resume after the three characters of '-->'
    frag.str = commentEnd+3;
290
291%--------------------------------------------------------------------------
function frag = tag_cdata(frag)
% Parse the CDATA section (<![CDATA[value]]>) starting at frag.str
% FORMAT frag = tag_cdata(frag)
%
% Creates a 'cdata' node in the global xtree and links it to its parent
% when frag.parent is non-zero. The value is stored as found in the string
% (no entity replacement, no trimming). On output, frag.str is the index
% following ']]>'. If the section is not closed, a warning is issued, no
% node is created and frag.str is moved past the end of the string so that
% parsing stops.
    global xmlstring xtree Xparse_count;

    cdataEnd = xml_findstr(xmlstring,']]>',frag.str,1);
    if isempty(cdataEnd)
        warning('[XML] Tag <![CDATA[ opened but not closed.')
        %- Without this the caller would see the same '<![CDATA[' again
        %  and loop forever
        frag.str = length(xmlstring) + 1;
        return
    end

    Xparse_count = Xparse_count + 1;
    xtree{Xparse_count} = cdata;
    %- The text starts after the nine characters of '<![CDATA['
    xtree{Xparse_count}.value = xmlstring(frag.str+9:cdataEnd-1);
    if frag.parent
        xtree{frag.parent}.contents = [xtree{frag.parent}.contents Xparse_count];
        xtree{Xparse_count}.parent = frag.parent;
    end
    %- Resume after the three characters of ']]>'
    frag.str = cdataEnd+3;
307
308%--------------------------------------------------------------------------
function attrs = attribution(str)
% Parse the attributes of a start tag
% FORMAT attrs = attribution(str)
%
% str   - attribute string, i.e. a sequence of key="value" or key='value'
% attrs - cell array of structs with fields
%           key - attribute name with all white space removed
%           val - attribute value with predefined entities replaced
%         ({} when no attribute is found)
%
% A value ends at the next occurrence of the quote character that opened
% it, so a value delimited by " may contain ' and vice versa.
% An error is raised when a value is not quoted or not terminated.
    %- Initialize attributes
    nbattr = 0;
    attrs = cell(nbattr);
    %- Look for 'key="value"' substrings
    while true
        if isempty(str), return; end
        eqIdx = xml_findstr(str,'=',1,1);
        if isempty(eqIdx), return; end

        %- Opening quote: first " or ' in the remaining string
        openq = sort([xml_findstr(str,'"',1,1),xml_findstr(str,'''',1,1)]);
        if isempty(openq)
            error('[XML] Attribute value is not quoted.');
        end
        openq = openq(1);

        %- Closing quote: next occurrence of the same quote character
        closeq = xml_findstr(str,str(openq),openq+1,1);
        if isempty(closeq)
            error('[XML] Attribute value is not terminated.');
        end

        nbattr = nbattr + 1;
        attrs{nbattr}.key = strip(str(1:(eqIdx-1)));
        attrs{nbattr}.val = entity(str((openq+1):(closeq-1)));

        %- Continue with what follows the closing quote
        str = str((closeq+1):end);
    end
324
325%--------------------------------------------------------------------------
function elm = element
% Create a blank 'element' node
% FORMAT elm = element
% The uid field is set to the current value of the global Xparse_count;
% name is '' and attributes, contents and parent are [].
    global Xparse_count;
    elm = struct('type','element');
    elm.name       = '';
    elm.attributes = [];
    elm.contents   = [];
    elm.parent     = [];
    elm.uid        = Xparse_count;
329   
330%--------------------------------------------------------------------------
function cdat = chardata
% Create a blank 'chardata' node
% FORMAT cdat = chardata
% The uid field is set to the current value of the global Xparse_count;
% value is '' and parent is [].
    global Xparse_count;
    cdat = struct('type','chardata');
    cdat.value  = '';
    cdat.parent = [];
    cdat.uid    = Xparse_count;
334   
335%--------------------------------------------------------------------------
function cdat = cdata
% Create a blank 'cdata' node
% FORMAT cdat = cdata
% The uid field is set to the current value of the global Xparse_count;
% value is '' and parent is [].
    global Xparse_count;
    cdat = struct('type','cdata');
    cdat.value  = '';
    cdat.parent = [];
    cdat.uid    = Xparse_count;
339   
340%--------------------------------------------------------------------------
function proce = pri
% Create a blank 'pi' (processing instruction) node
% FORMAT proce = pri
% The uid field is set to the current value of the global Xparse_count;
% value and target are '' and parent is [].
    global Xparse_count;
    proce = struct('type','pi');
    proce.value  = '';
    proce.target = '';
    proce.parent = [];
    proce.uid    = Xparse_count;
344
345%--------------------------------------------------------------------------
function commt = comment
% Create a blank 'comment' node
% FORMAT commt = comment
% The uid field is set to the current value of the global Xparse_count;
% value is '' and parent is [].
    global Xparse_count;
    commt = struct('type','comment');
    commt.value  = '';
    commt.parent = [];
    commt.uid    = Xparse_count;
349
350%--------------------------------------------------------------------------
function frg = fragment
% Create a blank fragment descriptor
% FORMAT frg = fragment
% The fields str, parent and end are all initialised to ''.
    frg = struct('str','');
    frg.parent = '';
    frg.end    = '';
353
354%--------------------------------------------------------------------------
function str = prolog(str)
% Remove the prolog (XML declaration and DOCTYPE) from an XML string
% FORMAT str = prolog(str)
%
% str - XML string on input; on output, the same string starting after
%       the XML declaration (<?x ... ?>) and after the DOCTYPE declaration
%       (including its [ ... ] internal subset) when they are found.
%
% An error is raised when the string contains no '<'. Incomplete
% declarations only produce a warning and are left in the string.
% NOTE: the DOCTYPE is searched in the whole remaining string, so a
% '<!DOCTYPE' occurring later (e.g. inside a comment or a CDATA section)
% is also treated as the prolog.
    %- Initialize beginning index of elements tree
    b = 1;
    %- Initial tag
    start = xml_findstr(str,'<',1,1);
    if isempty(start)
        error('[XML] No tag found.')
    end
    %- Header (<?xml version="1.0" ... ?>)
    %  The length test prevents indexing past the end of a very short
    %  string; the end of the header is searched from its beginning so
    %  that a '?>' located before the first tag is not mistaken for it.
    if start+2 <= length(str) && strcmpi(str(start:start+2),'<?x')
        headerEnd = xml_findstr(str,'?>',start,1);
        if ~isempty(headerEnd)
            b = headerEnd + 2;
        else
            warning('[XML] Header tag incomplete.')
        end
    end
    %- Doctype (<!DOCTYPE type ... [ declarations ]>)
    start = xml_findstr(str,'<!DOCTYPE',b,1);  % length('<!DOCTYPE') = 9
    if ~isempty(start)
        doctypeEnd = xml_findstr(str,'>',start+9,1);
        if ~isempty(doctypeEnd)
            b = doctypeEnd + 1;
            %- An internal subset '[' before the first '>' means that the
            %  declaration really ends at ']>'
            dp = xml_findstr(str,'[',start+9,1);
            if (~isempty(dp) && dp < b)
                k = xml_findstr(str,']>',start+9,1);
                if ~isempty(k)
                    b = k + 2;
                else
                    warning('[XML] Tag [ in DOCTYPE opened but not closed.')
                end
            end
        else
            warning('[XML] Tag DOCTYPE opened but not closed.')
        end
    end
    %- Skip prolog from the xml string
    str = str(b:end);
393
394%--------------------------------------------------------------------------
function str = strip(str)
% Remove every white space character from a string
% FORMAT str = strip(str)
% White space is whatever isspace reports (leading, trailing and inner).
    blanks_ = isspace(str);
    str(blanks_) = [];
397
398%--------------------------------------------------------------------------
function str = normalize(str)
% Collapse every run of white space characters into a single space
% FORMAT str = normalize(str)
% White space characters (as reported by isspace: space, newline,
% carriage return, tabs, ...) are first turned into ' ', then every white
% space immediately followed by another one is removed.
    ws = find(isspace(str));
    str(ws) = ' ';
    if ~isempty(ws)
        %- A white space is redundant when the next white space index is
        %  adjacent to it; the last one of the list is always kept
        redundant = [diff(ws) == 1, false];
        str(ws(redundant)) = [];
    end
409
410%--------------------------------------------------------------------------
function str = entity(str)
% Replace the five predefined XML entities by their character
% FORMAT str = entity(str)
% '&amp;' is handled last so that the '&' it produces is not combined
% with the following text into another entity.
    table = {'&lt;',   '<'
             '&gt;',   '>'
             '&quot;', '"'
             '&apos;', ''''
             '&amp;',  '&'};
    for k = 1:size(table,1)
        str = strrep(str,table{k,1},table{k,2});
    end
417   
418%--------------------------------------------------------------------------
function str = erode(str)
% Remove at most one leading and one trailing space from a string
% FORMAT str = erode(str)
% Only the ' ' character is considered, and only one at each end.
    if ~isempty(str) && str(1) == ' '
        str = str(2:end);
    end
    if ~isempty(str) && str(end) == ' '
        str = str(1:end-1);
    end
Note: See TracBrowser for help on using the repository browser.