diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..0c3586e8e2f7da96a7da6d067bfc6af5358c5c75 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +*.svg filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md index bf5fd18c7cf122c139ba052b1d8861c97933722c..bf2feaf0e757844f36d897e80402fa603b863942 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ ## Description -A stand-alone Theil-Sen estimator for robust regression in Matlab. +A stand-alone Theil-Sen estimator for robust simple regression in Matlab. (Stand-alone: No toolbox required.) @@ -42,13 +42,6 @@ Note how a few "unlucky" outliers can bias the least-squares estimate, but have <img src="example.svg" alt="plot from example.m" width=500px /> -## Roadmap - -- Add more documentation. - - [ ] Mention original use case. -- [ ] Change input interface to match Matlab's `robustfit()`. -- [ ] Add more checks for input sanity. - ## Changelog - October 2014 by Z. Danziger: Original version. diff --git a/TheilSen.m b/TheilSen.m index 67897e7b6d39d2fd6bfdc402050628be22d701d1..8588c2838759e312e118f0649070950da580b94a 100644 --- a/TheilSen.m +++ b/TheilSen.m @@ -1,7 +1,8 @@ -function [b1, b0] = TheilSen(X, y) +function [b0, b1] = TheilSen(X, y) % THEILSEN performs Theil-Sen robust, simple linear regression(s) of X on y. % -% Note that two or more predictor variables in X are treated as independent +% For convenience, the input X may contain more than one predictor variable. +% But note that two or more predictor variables in X are treated as independent % simple regressions; do not confuse the output with multiple regression. % % THEILSEN treats NaNs in X or y as missing values, and ignores them. @@ -12,10 +13,12 @@ function [b1, b0] = TheilSen(X, y) % y: A column vector containing the observations of the response variable. % % OUTPUT -% b1: Estimated slope(s) for each predictor variable in X with respect to the -% response variable. 
Therefore, b1 will be a vector with as many slopes as -% column vectors in X. -% b0: Estimated offset(s) for each predictor variable in X. +% b0: Estimated offset(s) for each predictor variable in X with respect to y +% (as independent simple regression offsets per predictor). +% b1: Estimated slope(s) for each predictor variable in X with respect to y +% (as independent simple regression slopes per predictor). +% +% Vectors b0 and b1 contain as many entries as column vectors in X. % % EXAMPLE % See accompanying file example.m. @@ -47,50 +50,44 @@ if sizeX(1) ~= sizeY(1) error('The number of rows (observations) of X and y must match.') end -Num_Obs = sizeX(1); % rows are number of observations -Num_Pred = sizeX(2); % columns are number of (independent) predictor variables +Num_Obs = sizeX(1); % rows in X and y are observations +Num_Pred = sizeX(2); % columns in X are (independent) predictor variables if Num_Obs < 2 error('Expecting a data matrix Obs x Dim with at least 2 observations.') end -if Num_Pred == 1 % X is a vector, i.e. normal 2D case - C = nan(Num_Obs); - - % calculate slopes of all possible data point pairs - for i = 1:Num_Obs-1 - C(i, i:end) = (y(i) - y(i:end)) ./ (X(i) - X(i:end)); - end +%%% For the curious, here is more readable code for 1 predictor column in X. +%%% However, this special case is omitted for the sake of code simplicity. 
+% % calculate slope for all pairs of data points +% C = nan(Num_Obs, Num_Obs); +% for i = 1:Num_Obs-1 +% C(i, i:end) = (y(i) - y(i:end)) ./ (X(i) - X(i:end)); +% end +% % estimate slope as the median of all pairwise slopes +% b1 = median(C(:), 'omitnan'); +% % estimate offset as the median of the per-point offsets y - b1*x +% b0 = median(y - b1 * X, 'omitnan'); - % the slope estimate is the median of all pairwise slopes - b1 = median(C(:), 'omitnan'); - - if nargout == 2 - % calculate intercept if requested - b0 = median(y - b1 * X, 'omitnan'); - end - -else % more than 1 predictor variable in X +% calculate slope, per predictor, for all pairs of data points +C = nan(Num_Obs, Num_Pred, Num_Obs); +for i = 1:Num_Obs + C(:, :, i) = bsxfun(@rdivide, ... + bsxfun(@minus, y(i), y(:)), ... + bsxfun(@minus, X(i, 1:end), X(:, 1:end))); +end - C = nan(Num_Obs, Num_Pred, Num_Obs); +% stack layers of C to 2D +Cprm = reshape( permute(C, [1, 3, 2]), [], size(C, 2), 1 ); - for i = 1:Num_Obs - % accumulate slopes - C(:, :, i) = bsxfun( @rdivide, ... - y(i) - y(:), ... - bsxfun(@minus, X(i, 1:end), X(:, 1:end)) ); - end +% estimate slope as the median of all pairwise slopes (per predictor column) +b1 = median(Cprm, 1, 'omitnan'); - % stack layers of C to 2D - Cprm = reshape( permute(C, [1, 3, 2]), [], size(C, 2), 1 ); - b1 = median(Cprm, 1, 'omitnan'); % calculate slope estimate - - if nargout == 2 - % calculate all intercepts if requested - b0 = median( bsxfun(@minus, y(:), ... - bsxfun(@times, b1, X(:, 1:end))), ... - 'omitnan'); - end +% estimate offset as the median of the per-point offsets (per predictor column) +if nargout == 2 % only compute if requested/used + b0 = median(bsxfun(@minus, y(:), ... + bsxfun(@times, b1, X(:, 1:end))), ... 
+ 'omitnan'); end -end +end \ No newline at end of file diff --git a/example.m b/example.m index 2f792ed9c842913a9b74fedd8708bc09456465e6..f383b58e254792204a49c44735e3be8939c7d034 100644 --- a/example.m +++ b/example.m @@ -1,4 +1,4 @@ -% This example demonstrates the use of TheilSen.m with simulated data. +%% Simulate data and compare Theil-Sen estimator with least squares. clearvars * % Choose total sample size and fraction of outliers: @@ -30,8 +30,8 @@ data_y(outlr_idx) = outlr_y; est_ls = [ones(N_total, 1), data_x] \ data_y; % Estimate Theil-Sen parameters. -[m, b] = TheilSen(data_x, data_y); -est_ts = [b, m]; +[b0, b1] = TheilSen(data_x, data_y); +est_ts = [b0, b1]; % Plot everything and add comparison of estimates to title. figure() @@ -40,8 +40,27 @@ normal_idx = setdiff(1:N_total, outlr_idx); plot(data_x(normal_idx), data_y(normal_idx), 'ko', ... data_x(outlr_idx), data_y(outlr_idx), 'rx', ... plims, plims * est_ls(2) + est_ls(1), '-r', ... - plims, plims * m + b, 'b-', ... + plims, plims * est_ts(2) + est_ts(1), 'b-', ... 'linewidth', 2) legend('Normal data', 'Outlier', 'Least Squares', 'Theil-Sen', 'location', 'NW') title(sprintf('True: [%.3g, %.3g], LS: [%.3g, %.3g], TS: [%.3g, %.3g]', ... - [true_b0, true_b1, est_ls(1), est_ls(2), est_ts(1), est_ts(2)])) \ No newline at end of file + [true_b0, true_b1, est_ls(1), est_ls(2), est_ts(1), est_ts(2)])) + +%% Demonstrate use of multiple predictor variables in X. +% NOTE: You must execute cell above first. +% create 2 different predictor columns +data_x1 = data_x; +data_x2 = data_x1 * -2 + 10 + randn(size(data_x1)) * SDx_usual; +X = [data_x1, data_x2]; +[b0, b1] = TheilSen(X, data_y); + +% plot both simple regressions; note mainly the difference between x-axes +num_pred = size(X, 2); +figure() +for pp = 1:num_pred + est_ts = [b0(pp), b1(pp)]; + subplot(1, num_pred, pp) + plims = [min(X(:, pp)), max(X(:, pp))]'; + plot(X(:, pp), data_y, 'k^', ... 
+ plims, plims * est_ts(2) + est_ts(1), 'b-', 'linewidth', 2) +end \ No newline at end of file