Skip to content
Snippets Groups Projects
Commit 75234a97 authored by Keyser, Johannes's avatar Keyser, Johannes
Browse files

Remove special case for 2D, flip outputs, and more

- match calls in example
- add example with >1 predictor
- clarify documentation
parent 7ca7d65a
Branches
No related tags found
No related merge requests found
*.svg filter=lfs diff=lfs merge=lfs -text
......@@ -2,7 +2,7 @@
## Description
A stand-alone Theil-Sen estimator for robust regression in Matlab.
A stand-alone Theil-Sen estimator for robust simple regression in Matlab.
(Stand-alone: No toolbox required.)
......@@ -42,13 +42,6 @@ Note how a few "unlucky" outliers can bias the least-squares estimate, but have
<img src="example.svg" alt="plot from example.m" width=500px />
## Roadmap
- Add more documentation.
- [ ] Mention original use case.
- [ ] Change input interface to match Matlab's `robustfit()`.
- [ ] Add more checks for input sanity.
## Changelog
- October 2014 by Z. Danziger: Original version.
......
function [b1, b0] = TheilSen(X, y)
function [b0, b1] = TheilSen(X, y)
% THEILSEN performs Theil-Sen robust, simple linear regression(s) of X on y.
%
% Note that two or more predictor variables in X are treated as independent
% For convenience, the input X may contain more than one predictor variable.
% But note that two or more predictor variables in X are treated as independent
% simple regressions; do not confuse the output with multiple regression.
%
% THEILSEN treats NaNs in X or y as missing values, and ignores them.
......@@ -12,10 +13,12 @@ function [b1, b0] = TheilSen(X, y)
% y: A column vector containing the observations of the response variable.
%
% OUTPUT
% b1: Estimated slope(s) for each predictor variable in X with respect to the
% response variable. Therefore, b1 will be a vector with as many slopes as
% column vectors in X.
% b0: Estimated offset(s) for each predictor variable in X.
% b0: Estimated offset(s) for each predictor variable in X with respect to y
% (as independent simple regression offsets per predictor).
% b1: Estimated slope(s) for each predictor variable in X with respect to y
% (as independent simple regression slopes per predictor).
%
% Vectors b0 and b1 contain as many entries as column vectors in X.
%
% EXAMPLE
% See accompanying file example.m.
......@@ -47,50 +50,44 @@ if sizeX(1) ~= sizeY(1)
error('The number of rows (observations) of X and y must match.')
end
Num_Obs = sizeX(1); % rows are number of observations
Num_Pred = sizeX(2); % columns are number of (independent) predictor variables
Num_Obs = sizeX(1); % rows in X and y are observations
Num_Pred = sizeX(2); % columns in X are (independent) predictor variables
if Num_Obs < 2
error('Expecting a data matrix Obs x Dim with at least 2 observations.')
end
if Num_Pred == 1 % X is a vector, i.e. normal 2D case
C = nan(Num_Obs);
% calculate slopes of all possible data point pairs
for i = 1:Num_Obs-1
C(i, i:end) = (y(i) - y(i:end)) ./ (X(i) - X(i:end));
end
% the slope estimate is the median of all pairwise slopes
b1 = median(C(:), 'omitnan');
if nargout == 2
% calculate intercept if requested
b0 = median(y - b1 * X, 'omitnan');
end
else % more than 1 predictor variable in X
%%% For the curious, here more readable code for 1 predictor column in X.
%%% However, this special case is omitted for the sake of code simplicity.
% % calculate slope for all pairs of data points
% C = nan(Num_Obs, Num_Obs);
% for i = 1:Num_Obs-1
% C(i, i:end) = (y(i) - y(i:end)) ./ (X(i) - X(i:end));
% end
% % estimate slope as the median of all pairwise slopes
% b1 = median(C(:), 'omitnan');
% % estimate offset as the median of all pairwise offsets
% b0 = median(y - b1 * X, 'omitnan');
% calculate slope, per predictor, for all pairs of data points
C = nan(Num_Obs, Num_Pred, Num_Obs);
for i = 1:Num_Obs
% accumulate slopes
C(:, :, i) = bsxfun(@rdivide, ...
y(i) - y(:), ...
bsxfun(@minus, y(i), y(:)), ...
bsxfun(@minus, X(i, 1:end), X(:, 1:end)));
end
% stack layers of C to 2D
Cprm = reshape( permute(C, [1, 3, 2]), [], size(C, 2), 1 );
b1 = median(Cprm, 1, 'omitnan'); % calculate slope estimate
if nargout == 2
% calculate all intercepts if requested
% estimate slope as the median of all pairwise slopes (per predictor column)
b1 = median(Cprm, 1, 'omitnan');
% estimate offset as the median of all pairwise offsets (per predictor column)
if nargout == 2 % only compute if requested/used
b0 = median(bsxfun(@minus, y(:), ...
bsxfun(@times, b1, X(:, 1:end))), ...
'omitnan');
end
end
end
\ No newline at end of file
% This example demonstrates the use of TheilSen.m with simulated data.
%% Simulate data and compare Theil-Sen estimator with least squares.
clearvars *
% Choose total sample size and fraction of outliers:
......@@ -30,8 +30,8 @@ data_y(outlr_idx) = outlr_y;
est_ls = [ones(N_total, 1), data_x] \ data_y;
% Estimate Theil-Sen parameters.
[m, b] = TheilSen(data_x, data_y);
est_ts = [b, m];
[b0, b1] = TheilSen(data_x, data_y);
est_ts = [b0, b1];
% Plot everything and add comparison of estimates to title.
figure()
......@@ -40,8 +40,27 @@ normal_idx = setdiff(1:N_total, outlr_idx);
plot(data_x(normal_idx), data_y(normal_idx), 'ko', ...
data_x(outlr_idx), data_y(outlr_idx), 'rx', ...
plims, plims * est_ls(2) + est_ls(1), '-r', ...
plims, plims * m + b, 'b-', ...
plims, plims * est_ts(2) + est_ts(1), 'b-', ...
'linewidth', 2)
legend('Normal data', 'Outlier', 'Least Squares', 'Theil-Sen', 'location', 'NW')
title(sprintf('True: [%.3g, %.3g], LS: [%.3g, %.3g], TS: [%.3g, %.3g]', ...
[true_b0, true_b1, est_ls(1), est_ls(2), est_ts(1), est_ts(2)]))
%% Demonstrate use of multiple predictor variables in X.
% NOTE: You must execute cell above first.
% create 2 different predictor columns
data_x1 = data_x;
data_x2 = data_x1 * -2 + 10 + randn(size(data_x1)) * SDx_usual;
X = [data_x1, data_x2];
[b0, b1] = TheilSen(X, data_y);
% plot both simple regressions; note mainly the difference between x-axes
num_pred = size(X, 2);
figure()
for pp = 1:num_pred
est_ts = [b0(pp), b1(pp)];
subplot(1, num_pred, pp)
plims = [min(X(:, pp)), max(X(:, pp))]';
plot(X(:, pp), data_y, 'k^', ...
plims, plims * est_ts(2) + est_ts(1), 'b-', 'linewidth', 2)
end
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment