diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..6d725d9bc713997003e05b713dd3cfe887a6c679 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.m~ diff --git a/README.md b/README.md index 5c14b26da865d466755779894103dba089d9e563..3c8baeb0ee0426f1cd4b657f8a04c9ec098163ae 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Theil-Sen-Matlab +# Theil-Sen estimator for Matlab ## Description @@ -6,16 +6,18 @@ A fast, stand-alone Theil-Sen estimator for robust regression in Matlab. ### Theil-Sen estimator -A [Theil-Sen estimator](https://en.wikipedia.org/wiki/Theil%E2%80%93Sen_estimator) provides robust linear regression in the 2D plane. +A [Theil-Sen estimator](https://en.wikipedia.org/wiki/Theil%E2%80%93Sen_estimator) provides robust linear regression in the 2D plane: The resulting estimates of slope and intercept are relatively insensitive to outliers. +FIXME: Mention main idea (data point pairs), and resulting complexity. + ### No dependency on Matlab toolboxes This code is based on [Theil-Sen Robust Linear Regression](https://mathworks.com/matlabcentral/fileexchange/48294-theil-sen-robust-linear-regression), version 1.2.0.0, by [Zachary Danziger](https://mathworks.com/matlabcentral/profile/authors/1044524). It was modified mainly by re-defining its call to the `nanmedian()` function, which creates a dependency on the (commercially licensed) [Statistics Toolbox](https://mathworks.com/products/statistics.html). There are several other implementations of the Theil-Sen estimator on Mathworks's [File Exchange](https://mathworks.com/matlabcentral/fileexchange). -Unfortunately all were found to depend on the Statistics Toolbox, [except one one](https://mathworks.com/matlabcentral/fileexchange/43135-regression-utilities), but that implementation was judged to be slower and less versatile. +Unfortunately all were found to depend on the Statistics Toolbox, [except one](https://mathworks.com/matlabcentral/fileexchange/43135-regression-utilities), which was judged to be slower and less versatile. ## Installation @@ -31,21 +33,27 @@ TODO: Add examples, and show some expected output. - [ ] Mention original use case. - [ ] Add usage example and plot. - [ ] Add a drop-in for `nanmean()` to become independent of Statistics Toolbox. -- [ ] Adjust formatting to my taste. -- [ ] Adapt license file. +- [ ] Change input interface to match Matlab's `robustfit()`. +- [ ] Add more checks for input sanity. + + +## Changelog + +- October 2014 by Z. Danziger: Original version. +- September 2015 by Z. Danziger: Updated help, speed increase for 2D case +- March 2022 by J. Keyser: Adjusted formatting, added documentation, ## Contributing and project status This project is relatively unmaintained, and only shared as-is, in the hope to be helpful. -The authors do not accept any liability. -If you find a bug, feel free to the author(s) know (see below). +If you find a bug, feel free to let the author(s) know. Feature requests should be directed to the original author. ## Authors -1. Zachary Danziger, principal author ([Matlab profile](https://de.mathworks.com/matlabcentral/profile/authors/1044524), [Lab webpage](https://anil.fiu.edu/)) -2. Johannes Keyser [JLU GitLab profile](@jkeyser) +1. Zachary Danziger, original author ([Matlab profile](https://de.mathworks.com/matlabcentral/profile/authors/1044524), [Lab webpage](https://anil.fiu.edu/)) +2. Johannes Keyser ## License -Please see [license.txt](license.txt). +[Simplified BSD License](https://en.wikipedia.org/wiki/BSD_licenses#2-clause_license_(%22Simplified_BSD_License%22_or_%22FreeBSD_License%22)), see [license.txt](license.txt). diff --git a/TheilSen.m b/TheilSen.m index 0f464bd6ef130f0a77c4057c44a27a9630c05f19..f66e19a159a691bcad8ebcf78bbf126e18e549eb 100644 --- a/TheilSen.m +++ b/TheilSen.m @@ -3,27 +3,30 @@ function [m, b] = TheilSen(data) % % [m, b] = TheilSen(data) % -% data: A MxD matrix with M observations. The first D-1 columns are the -% explanatory variables and the Dth column is the response such that -% data = [x1, x2, ..., x(D-1), y]; +% INPUT +% data: A MxD matrix with M observations. The first D-1 columns are the +% explanatory variables and the Dth column is the response such that +% data = [x1, x2, ..., x(D-1), y]; % -% m: Estimated slope of each explanatory variable with respect to the -% response varuable. Therefore, m will be a vector of D-1 slopes. -% b: Estimated offsets. +% OUTPUT +% m: Estimated slope of each explanatory variable with respect to the +% response varuable. Therefore, m will be a vector of D-1 slopes. +% b: Estimated offsets. % +% EXAMPLE +% See accompanying file example.m. % -% Source: -% Gilbert, Richard O. (1987), "6.5 Sen's Nonparametric Estimator of -% Slope", Statistical Methods for Environmental Pollution Monitoring, -% John Wiley and Sons, pp. 217-219, ISBN 978-0-471-28878-7 +% REFERENCE +% - Gilbert, Richard O. (1987), "6.5 Sen's Nonparametric Estimator of Slope", +% Statistical Methods for Environmental Pollution Monitoring, +% John Wiley and Sons, pp. 217-219, ISBN 978-0-471-28878-7 % +% AUTHORS +% 2014-2015 Zachary Danziger +% 2022 Johannes Keyser % -% -% %%% Z. Danziger October 2014 %%% -% edits Z. Danziger September 2015: -% - updated help -% - speed increase for 2D case -% +% LICENSE +% Simplified BSD license, see accompanying file license.txt. sz = size(data); if length(sz) ~= 2 || sz(1) < 2 @@ -64,4 +67,4 @@ else b = nanmedian( bsxfun(@minus, data(:, end), ... bsxfun(@times, m, data(:, 1:end-1))) ); end -end +end \ No newline at end of file diff --git a/example.m b/example.m index 62a4d590e9caf0989ee7eb98bc4ab151234bca12..1d88b18dc9dda4252b7a5f362d03479c9dad37d8 100644 --- a/example.m +++ b/example.m @@ -1,15 +1,47 @@ -n = 100; -outN = round(0.2 * n); -noise = randn(n, 2) * 0.1; -noise(randi(n * 2, [outN, 1])) = randn(outN, 1) * 5; -data = [linspace(0, 10, n)', linspace(0,5,n)'] + noise; -Bhat = [ones(n, 1), data(:, 1)] \ data(:, 2); -[m, b] = TheilSen(data); -plims = [min(data(:, 1)), max(data(:, 1))]'; +% This example demonstrates the use of TheilSen.m with simulated data. +clearvars * + +% Choose total sample size and fraction of outliers: +N_total = 20; +N_outlr = round(0.2 * N_total); + +% Simulate a linear data set +true_b0 = -2; +true_b1 = 10; +data_x = linspace(0, 1, N_total)'; +data_y = true_b0 + true_b1 .* data_x; + +% add some "usual" Gaussian noise to both dimensions +SDx_usual = 0.1; +SDy_usual = 2.0; +data_x = data_x + randn(size(data_x)) * SDx_usual; +data_y = data_y + randn(size(data_y)) * SDy_usual; + +% simulate "outliers" as signal-dependent, positive noise in y dimension +SDy_outlr = 20 * SDy_usual; +outlr_idx = randperm(N_total, N_outlr); % randomly select data points +outlr_y = data_y(outlr_idx); % start with original value +outlr_y = abs(outlr_y) + 3 * SDy_usual; % set to unlikely high value +outlr_y = outlr_y + abs(randn(N_outlr, 1) * SDy_outlr); % over-dispersion +outlr_y = outlr_y .* data_x(outlr_idx); % scale by x magnitude +data_y(outlr_idx) = outlr_y; + +% estimate least squares parameters +est_ls = [ones(N_total, 1), data_x] \ data_y; + +% estimate Theil-Sen parameters +[m, b] = TheilSen([data_x, data_y]); +est_ts = [b, m]; + +% plot everything figure() -plot(data(:, 1), data(:, 2), 'k.', ... - plims, plims * m + b, '-r', ... - plims, plims * Bhat(2) + Bhat(1), '-b', 'linewidth', 2) -legend('Data', 'Theil-Sen', 'Least Squares', 'location', 'NW') -title(sprintf('Acual Slope = %2.3g, LS est. = %2.3g, TS est. = %2.3g', ... - [0.5 Bhat(2) m])) +plims = [min(data_x), max(data_x)]'; +normal_idx = setdiff(1:N_total, outlr_idx); +plot(data_x(normal_idx), data_y(normal_idx), 'ko', ... + data_x(outlr_idx), data_y(outlr_idx), 'rx', ... + plims, plims * est_ls(2) + est_ls(1), '-r', ... + plims, plims * m + b, 'b-', ... + 'linewidth', 2) +legend('Normal data', 'Outlier', 'Least Squares', 'Theil-Sen', 'location', 'NW') +title(sprintf('True: [%.3g, %.3g], LS: [%.3g, %.3g], TS: [%.3g, %.3g]', ... + [true_b0, true_b1, est_ls(1), est_ls(2), est_ts(1), est_ts(2)])) \ No newline at end of file