source: trunk/src/polyfitn.m @ 935

Last change on this file since 935 was 877, checked in by sommeria, 10 years ago

polyfitn added

File size: 12.3 KB
Line 
1function polymodel = polyfitn(indepvar,depvar,modelterms)
2% polyfitn: fits a general polynomial regression model in n dimensions
3% usage: polymodel = polyfitn(indepvar,depvar,modelterms)
4%
5% Polyfitn fits a polynomial regression model of one or more
6% independent variables, of the general form:
7%
8%   z = f(x,y,...) + error
9%
10% arguments: (input)
11%  indepvar - (n x p) array of independent variables as columns
12%        n is the number of data points
13%        p is the dimension of the independent variable space
14%
15%        IF n == 1, then I will assume there is only a
16%        single independent variable.
17%
18%  depvar   - (n x 1 or 1 x n) vector - dependent variable
19%        length(depvar) must be n.
20%
21%        Only 1 dependent variable is allowed, since I also
22%        return statistics on the model.
23%
24%  modelterms - defines the terms used in the model itself
25%
26%        IF modelterms is a scalar integer, then it designates
27%           the overall order of the model. All possible terms
28%           up to that order will be employed. Thus, if order
29%           is 2 and p == 2 (i.e., there are two variables) then
30%           the terms selected will be:
31%
32%              { x^2, x*y, x, y^2, y, constant}
33%
34%           Beware the consequences of high order polynomial
35%           models.
36%
37%        IF modelterms is a (k x p) numeric array, then each
38%           row of this array designates the exponents of one
39%           term in the model. Thus to designate a model with
40%           the above list of terms, we would define modelterms as
41%           
42%           modelterms = [0 0;1 0;2 0;0 1;1 1;0 2]
43%
44%        If modelterms is a character string, then it will be
45%           parsed as a list of terms in the regression model.
46%           The terms will be assume to be separated by a comma
47%           or by blanks. The variable names used must be legal
48%           matlab variable names. Exponents in the model may
49%           may be any real number, positive or negative.
50%
51%           For example, 'constant, x, y, x*y, x^2, x*y*y'
52%           will be parsed as a model specification as if you
53%           had supplied:
54%           modelterms = [0 0;1 0;0 1;1 1;2 0;1 2]
55%           
56%           The word 'constant' is a keyword, and will denote a
57%           constant terms in the model. Variable names will be
58%           sorted in alphabetical order as defined by sort.
59%           This order will assign them to columns of the
60%           independent array. Note that 'xy' will be parsed as
61%           a single variable name, not as the product of x and y.
62%
63%        If modelterms is a cell array, then it will be taken
64%           to be a list of character terms. Similarly,
65%           
66%           {'constant', 'x', 'y', 'x*y', 'x^2', 'x*y^-1'}
67%
68%           will be parsed as a model specification as if you
69%           had supplied:
70%
71%           modelterms = [0 0;1 0;0 1;1 1;2 0;1 -1]
72%
73% Arguments: (output)
74%  polymodel - A structure containing the regression model
75%        polymodel.ModelTerms = list of terms in the model
76%        polymodel.Coefficients = regression coefficients
77%        polymodel.ParameterVar = variances of model coefficients
78%        polymodel.ParameterStd = standard deviation of model coefficients
79%        polymodel.R2 = R^2 for the regression model
80%        polymodel.AdjustedR2 = Adjusted R^2 for the regression model
81%        polymodel.RMSE = Root mean squared error
82%        polymodel.VarNames = Cell array of variable names
83%           as parsed from a char based model specification.
84
85%        Note 1: Because the terms in a general polynomial
86%        model can be arbitrarily chosen by the user, I must
87%        package the erms and coefficients together into a
88%        structure. This also forces use of a special evaluation
89%        tool: polyvaln.
90%
91%        Note 2: A polymodel can be evaluated for any set
92%        of values with the function polyvaln. However, if
93%        you wish to manipulate the result symbolically using
94%        my own sympoly tools, this structure can be converted
95%        to a sympoly using the function polyn2sympoly. There
96%        is also a polyn2sym tool, for those who prefer the
97%        symbolic TB.
98%
99%        Note 3: When no constant term is included in the model,
100%        the traditional R^2 can be negative. This case is
101%        identified, and then a more appropriate computation
102%        for R^2 is then used.
103%
104%        Note 4: Adjusted R^2 accounts for changing degrees of
105%        freedom in the model. It CAN be negative, and will always
106%        be less than the traditional R^2 values.
107%
108% Find my sympoly toolbox here:
109% http://www.mathworks.com/matlabcentral/fileexchange/loadFile.do?objectId=9577&objectType=FILE
110%
111% See also: polyvaln, polyfit, polyval, polyn2sympoly, sympoly
112%
113% Author: John D'Errico
114% Release: 2.0
115% Release date: 2/19/06
116
117if nargin<1
118  help polyfitn
119  return
120end
121
122% get sizes, test for consistency
123[n,p] = size(indepvar);
124if n == 1
125  indepvar = indepvar';
126  [n,p] = size(indepvar);
127end
128[m,q] = size(depvar);
129if m == 1
130  depvar = depvar';
131  [m,q] = size(depvar);
132end
133% only 1 dependent variable allowed at a time
134if q~=1
135  error 'Only 1 dependent variable allowed at a time.'
136end
137
138if n~=m
139  error 'indepvar and depvar are of inconsistent sizes.'
140end
141
142% Automatically scale the independent variables to unit variance
143stdind = sqrt(diag(cov(indepvar)));
144if any(stdind==0)
145  warning 'Constant terms in the model must be entered using modelterms'
146  stdind(stdind==0) = 1;
147end
148% scaled variables
149indepvar_s = indepvar*diag(1./stdind);
150
151% do we need to parse a supplied model?
152if iscell(modelterms) || ischar(modelterms)
153  [modelterms,varlist] = parsemodel(modelterms,p);
154  if size(modelterms,2) < p
155    modelterms = [modelterms, zeros(size(modelterms,1),p - size(modelterms,2))];
156  end 
157elseif length(modelterms) == 1
158  % do we need to generate a set of modelterms?
159  [modelterms,varlist] = buildcompletemodel(modelterms,p);
160elseif size(modelterms,2) ~= p
161  error 'ModelTerms must be a scalar or have the same # of columns as indepvar'
162else
163  varlist = repmat({''},1,p);
164end
165nt = size(modelterms,1);
166
167% check for replicate terms
168if nt>1
169  mtu = unique(modelterms,'rows');
170  if size(mtu,1)<nt
171    warning 'Replicate terms identified in the model.'
172  end
173end
174
175% build the design matrix
176M = ones(n,nt);
177scalefact = ones(1,nt);
178for i = 1:nt
179  for j = 1:p
180    M(:,i) = M(:,i).*indepvar_s(:,j).^modelterms(i,j);
181    scalefact(i) = scalefact(i)/(stdind(j)^modelterms(i,j));
182  end
183end
184
185% estimate the model using QR. do it this way to provide a
186% covariance matrix when all done. Use a pivoted QR for
187% maximum stability.
188[Q,R,E] = qr(M,0);
189
190polymodel.ModelTerms = modelterms;
191polymodel.Coefficients(E) = R\(Q'*depvar);
192yhat = M*polymodel.Coefficients(:);
193
194% recover the scaling
195polymodel.Coefficients=polymodel.Coefficients.*scalefact;
196
197% variance of the regression parameters
198s = norm(depvar - yhat);
199if n > nt
200  Rinv = R\eye(nt);
201  Var(E) = s^2*sum(Rinv.^2,2)/(n-nt);
202  polymodel.ParameterVar = Var.*(scalefact.^2);
203  polymodel.ParameterStd = sqrt(polymodel.ParameterVar);
204else
205  % we cannot form variance or standard error estimates
206  % unless there are at least as many data points as
207  % parameters to estimate.
208  polymodel.ParameterVar = inf(1,nt);
209  polymodel.ParameterStd = inf(1,nt);
210end
211
212% degrees of freedom
213polymodel.DoF = n - nt;
214
215% coefficient/sd ratio for a p-value
216t = polymodel.Coefficients./polymodel.ParameterStd;
217
218% twice the upper tail probability from the t distribution,
219% as a transformation from an incomplete beta. This provides
220% a two-sided test for the corresponding coefficient.
221% I could have used tcdf, if I wanted to presume the
222% stats toolbox was present. Of course, then regstats is
223% an option. In that case, the comparable result would be
224% found in:    STATS.tstat.pval
225polymodel.p = betainc(polymodel.DoF./(t.^2 + polymodel.DoF),polymodel.DoF/2,1/2);
226
227% R^2
228% is there a constant term in the model? If not, then
229% we cannot use the standard R^2 computation, as it
230% frequently yields negative values for R^2.
231if any((M(1,:) ~= 0) & all(diff(M,1,1) == 0,1))
232  % we have a constant term in the model, so the
233  % traditional R^2 form is acceptable.
234  polymodel.R2 = max(0,1 - (s/norm(depvar-mean(depvar)) )^2);
235  % compute adjusted R^2, taking into account the number of
236  % degrees of freedom
237  polymodel.AdjustedR2 = 1 - (1 - polymodel.R2).*((n - 1)./(n - nt));
238else
239  % no constant term was found in the model
240  polymodel.R2 = max(0,1 - (s/norm(depvar))^2);
241  % compute adjusted R^2, taking into account the number of
242  % degrees of freedom
243  polymodel.AdjustedR2 = 1 - (1 - polymodel.R2).*(n./(n - nt));
244end
245
246% RMSE
247polymodel.RMSE = sqrt(mean((depvar - yhat).^2));
248
249% if a character 'model' was supplied, return the list
250% of variables as parsed out
251polymodel.VarNames = varlist;
252
253% ==================================================
254% =============== begin subfunctions ===============
255% ==================================================
256function [modelterms,varlist] = buildcompletemodel(order,p)
257%
258% arguments: (input)
259%  order - scalar integer, defines the total (maximum) order
260%
261%  p     - scalar integer - defines the dimension of the
262%          independent variable space
263%
264% arguments: (output)
265%  modelterms - exponent array for the model
266%
267%  varlist - cell array of character variable names
268
269% build the exponent array recursively
270if p == 0
271  % terminal case
272  modelterms = [];
273elseif (order == 0)
274  % terminal case
275  modelterms = zeros(1,p);
276elseif (p==1)
277  % terminal case
278  modelterms = (order:-1:0)';
279else
280  % general recursive case
281  modelterms = zeros(0,p);
282  for k = order:-1:0
283    t = buildcompletemodel(order-k,p-1);
284    nt = size(t,1);
285    modelterms = [modelterms;[repmat(k,nt,1),t]];
286  end
287end
288
289% create a list of variable names for the variables on the fly
290varlist = cell(1,p);
291for i = 1:p
292  varlist{i} = ['X',num2str(i)];
293end
294
295
296% ==================================================
297function [modelterms,varlist] = parsemodel(model,p);
298%
299% arguments: (input)
300%  model - character string or cell array of strings
301%
302%  p     - number of independent variables in the model
303%
304% arguments: (output)
305%  modelterms - exponent array for the model
306
307modelterms = zeros(0,p);
308if ischar(model)
309  model = deblank(model);
310end
311
312varlist = {};
313while ~isempty(model)
314  if iscellstr(model)
315    term = model{1};
316    model(1) = [];
317  else
318    [term,model] = strtok(model,' ,');
319  end
320 
321  % We've stripped off a model term. Now parse it.
322 
323  % Is it the reserved keyword 'constant'?
324  if strcmpi(term,'constant')
325    modelterms(end+1,:) = 0;
326  else
327    % pick this term apart
328    expon = zeros(1,p);
329    while ~isempty(term)
330      vn = strtok(term,'*/^. ,');
331      k = find(strncmp(vn,varlist,length(vn)));
332      if isempty(k)
333        % its a variable name we have not yet seen
334       
335        % is it a legal name?
336        nv = length(varlist);
337        if ismember(vn(1),'1234567890_')
338          error(['Variable is not a valid name: ''',vn,''''])
339        elseif nv>=p
340          error 'More variables in the model than columns of indepvar'
341        end
342       
343        varlist{nv+1} = vn;
344       
345        k = nv+1;
346      end
347      % variable must now be in the list of vars.
348     
349      % drop that variable from term
350      i = strfind(term,vn);
351      term = term((i+length(vn)):end);
352     
353      % is there an exponent?
354      eflag = false;
355      if strncmp('^',term,1)
356        term(1) = [];
357        eflag = true;
358      elseif strncmp('.^',term,2)
359        term(1:2) = [];
360        eflag = true;
361      end
362
363      % If there was one, get it
364      ev = 1;
365      if eflag
366        ev = sscanf(term,'%f');
367        if isempty(ev)
368            error 'Problem with an exponent in parsing the model'
369        end
370      end
371      expon(k) = expon(k) + ev;
372
373      % next monomial subterm?
374      k1 = strfind(term,'*');
375      if isempty(k1)
376        term = '';
377      else
378        term(k1(1)) = ' ';
379      end
380     
381    end
382 
383    modelterms(end+1,:) = expon; 
384   
385  end
386 
387end
388
389% Once we have compiled the list of variables and
390% exponents, we need to sort them in alphabetical order
391[varlist,tags] = sort(varlist);
392modelterms = modelterms(:,tags);
Note: See TracBrowser for help on using the repository browser.