diff --git a/TheilSen.m b/TheilSen.m index 8f1fba375210861e41950fe9a3440843129d63c2..7d2f7df396ea472e47dea90e78c5e67fa79368a8 100644 --- a/TheilSen.m +++ b/TheilSen.m @@ -5,6 +5,11 @@ function coef = TheilSen(X, y) % but note that two or more predictor variables in X are treated as independent % simple regressions: Do not confuse the output with multiple regression. % +% With x denoting a single column vector of X, the returned slope estimator is +% the median of the set of slopes of data point pairs, +% (y(j) - y(i)) / (x(j) - x(i)), with x(i) ≠ x(j). +% The returned intercept is the median of y - slope_estimator * x. +% % THEILSEN treats NaNs in X or y as missing values and ignores them. % % INPUT @@ -28,6 +33,9 @@ function coef = TheilSen(X, y) % - Gilbert, Richard O. (1987), "6.5 Sen's Nonparametric Estimator of Slope", % Statistical Methods for Environmental Pollution Monitoring, % John Wiley and Sons, pp. 217-219, ISBN 978-0-471-28878-7 +% - Sen, P. K. (1968). Estimates of the Regression Coefficient Based on +% Kendall’s Tau. Journal of the American Statistical Association, 63(324), +% 1379–1389. https://doi.org/10.1080/01621459.1968.10480934 % % AUTHORS % 2014-15 Zachary Danziger @@ -63,6 +71,8 @@ Num_Pred = sizeX(2); % columns in X are (independent) predictor variables % for i = 1:Num_Obs-1 % C(i, i:end) = (y(i) - y(i:end)) ./ (X(i) - X(i:end)); % end +% % relabel infinite values (due to div by 0 = x(i) - x(j)) as NaNs-to-ignore. +% C(isinf(C)) = NaN; % % estimate slope as the median of all pairwise slopes % b1 = median(C(:), 'omitnan'); % % estimate offset as the median of all pairwise offsets @@ -76,6 +86,11 @@ for i = 1:Num_Obs bsxfun(@minus, X(i, :), X)); end +% Relabel infinite values as NaN, to be omitted in the median calculation below. +% Infinite values can occur if a data pair has identical x coordinates, leading +% to division by zero, yielding -Inf or +Inf, depending on the numerator. +C(isinf(C)) = NaN; + % stack layers of C to 2D Cprm = reshape(permute(C, [1, 3, 2]), ... [], size(C, 2), 1); @@ -89,4 +104,9 @@ b0s = median(bsxfun(@minus, y, ... 'omitnan'); coef = [b0s; b1s]; + +if any(isnan(coef)) + warning('TheilSen:NaNoutput', ... + 'Output contains NaN; check for degenerate inputs.') +end end diff --git a/example.m b/example.m index 12ed39122635d96f93434b3fe3b62dec481e9fa5..f3c4412698624cd02986bdb9359f1c3df559a57b 100644 --- a/example.m +++ b/example.m @@ -1,5 +1,5 @@ %% Simulate data and compare Theil-Sen estimator with least squares. -clearvars * +clearvars('*') % Choose total sample size and fraction of outliers: N_total = 20; @@ -62,4 +62,9 @@ for pp = 1:num_pred plims = [min(X(:, pp)), max(X(:, pp))]'; plot(X(:, pp), data_y, 'k^', ... plims, est_ts(1, pp) + plims * est_ts(2, pp), 'b-', 'linewidth', 2) -end \ No newline at end of file +end + +%% Check that only identical x coordinates lead to warning and NaNs in output. +data_x = ones(5); +data_y = rand(5, 1); +TheilSen(data_x, data_y)