Skip to content
Snippets Groups Projects
Select Git revision
  • c15397ab6cd3cd7b91c2c01009ccfb8b2e375b77
  • main default protected
  • 1.3.0.1
  • 1.3.0.0
4 results

TheilSen.m

Blame
  • Code owners
    Assign users and groups as approvers for specific file changes. Learn more.
    TheilSen.m 3.39 KiB
    function coef = TheilSen(X, y)
    % THEILSEN performs Theil-Sen robust, simple linear regression(s) of X on y.
    % 
    % For convenience, the input X may contain more than one predictor variable.
    % But note that two or more predictor variables in X are treated as independent
    % simple regressions; do not confuse the output with multiple regression.
    %
    % THEILSEN treats NaNs in X or y as missing values, and ignores them.
    %
    % INPUT
    %   X: One or more columns vectors containing explanatory/predictor variables.
    %      Rows contain the observed predictor variables.
    %   y: A column vector containing the observations of the response variable.
    %
    % OUTPUT
    %   coef: Estimated regression coefficients for each predictor column in X, with
    %         respect to the response variable y. Each column in coef corresponds
    %         to one predictor in X, i.e. it has as many columns as X does.
    %         The first row, i.e. coef(1, :), contains the estimated offset(s).
    %         The second row, i.e. coef(2, :), contains the estimated slope(s).
    %         (This output format is chosen to avoid confusion, e.g. with previous
    %          versions of this code.)
    %
    % EXAMPLE
    %   See accompanying file example.m.
    %
    % REFERENCE
    %   - Gilbert, Richard O. (1987), "6.5 Sen's Nonparametric Estimator of Slope",
    %     Statistical Methods for Environmental Pollution Monitoring,
    %     John Wiley and Sons, pp. 217-219, ISBN 978-0-471-28878-7
    %
    % AUTHORS
    %   2014-15 Zachary Danziger
    %   2022 Johannes Keyser
    %
    % LICENSE
    %   BSD 2-clause "simplified" license, see accompanying file license.txt.
    
    sizeX = size(X);
    sizeY = size(y);
    
    if length(sizeY) ~= 2 || sizeY(1) < 2 || sizeY(2) ~= 1 || ~isnumeric(X)
        error('Input y must be a column array of at least 2 observed responses.')
    end
    
    if length(sizeX) ~= 2 || ~isnumeric(X)
        error('Input X must be one or more column arrays of predictor variables.')
    end
    
    if sizeX(1) ~= sizeY(1)
        error('The number of rows (observations) of X and y must match.')
    end
    
    Num_Obs = sizeX(1);  % rows in X and y are observations
    Num_Pred = sizeX(2);  % columns in X are (independent) predictor variables
    
    if Num_Obs < 2
        error('Expecting a data matrix Obs x Dim with at least 2 observations.')
    end
    
    %%% For the curious, here more readable code for 1 predictor column in X.
    %%% However, this special case is omitted for the sake of code simplicity.
    % % calculate slope for all pairs of data points
    % C = nan(Num_Obs, Num_Obs);
    % for i = 1:Num_Obs-1
    %     C(i, i:end) = (y(i) - y(i:end)) ./ (X(i) - X(i:end));
    % end
    % % estimate slope as the median of all pairwise slopes
    % b1 = median(C(:), 'omitnan');
    % % estimate offset as the median of all pairwise offsets
    % b0 = median(y - b1 * X, 'omitnan');
    
    % calculate slope, per predictor, for all pairs of data points
    C = nan(Num_Obs, Num_Pred, Num_Obs);
    for i = 1:Num_Obs
        C(:, :, i) = bsxfun(@rdivide, ...
            bsxfun(@minus, y(i), y(:)), ...
            bsxfun(@minus, X(i, 1:end), X(:, 1:end)));
    end
    
    % stack layers of C to 2D
    Cprm = reshape( permute(C, [1, 3, 2]), [], size(C, 2), 1 );
    
    % estimate slope as the median of all pairwise slopes (per predictor column)
    b1s = median(Cprm, 1, 'omitnan');
    
    % estimate offset as the median of all pairwise offsets (per predictor column)
    b0s = median(bsxfun(@minus, y(:), ...
                 bsxfun(@times, b1s, X(:, 1:end))), ...
                'omitnan');
    
    coef = [b0s; b1s];
    end