Implementation of the methodology in section 6 in the paper
Usage
RegSDChybrid(
y,
clusters = NULL,
xLocal = NULL,
xGlobal = NULL,
clusterPieces = NULL,
xClusterPieces = NULL,
groupedClusters = NULL,
xGroupedClusters = NULL,
alternative = NULL,
alpha = NULL,
ySim = NULL,
returnParts = FALSE,
epsAlpha = 1e-07,
makeunique = TRUE,
tolerance = sqrt(.Machine$double.eps)
)
Arguments
- y
Matrix of confidential variables
- clusters
Vector of cluster coding
- xLocal
Matrix of x-variables to be crossed with clusters
- xGlobal
Matrix of x-variables NOT to be crossed with clusters
- clusterPieces
Vector of coding of cluster pieces
- xClusterPieces
Matrix of x-variables to be crossed with cluster pieces
- groupedClusters
Vector of coding of grouped clusters
- xGroupedClusters
Matrix of x-variables to be crossed with grouped clusters
- alternative
One of "" (default), "a", "b" or "c"
- alpha
Possible to specify parameter used internally by alternative "c"
- ySim
Possible to specify the internally simulated data manually
- returnParts
Alternative output six matrices: y1 and y2 (fitted), e3s and e4s (new residuals), e3 and e4 (original residuals)
- epsAlpha
Precision constant for alpha calculation
- makeunique
Parameter to be used in GenQR
- tolerance
Parameter to
Cdiff
used within the algorithm
Details
Input matrices are subjected to EnsureMatrix
.
Necessary constant terms (intercept) are automatically included.
That is, a column of ones is not needed in the input matrices.
Examples
#################################################
# Generate example data for introductory examples
#################################################
y <- matrix(rnorm(30) + 1:30, 10, 3)
x <- matrix(1:10, 10, 1) # x <- 1:10 is equivalent
# Same as RegSDCipso(y)
yOut <- RegSDChybrid(y)
# With a single cluster both are same as RegSDCipso(y, x)
yOut <- RegSDChybrid(y, xLocal = x)
yOut <- RegSDChybrid(y, xGlobal = x)
# Define two clusters
clust <- rep(1:2, each = 5)
# MHa and MHb in paper
yMHa <- RegSDChybrid(y, clusters = clust, xLocal = x)
yMHb <- RegSDChybrid(y, clusterPieces = clust, xLocal = x)
# An extended variant of MHb as mentioned in paper paragraph below definition of MHa/MHb
yMHbExt <- RegSDChybrid(y, clusterPieces = clust, xClusterPieces = x)
# Identical means within clusters
aggregate(y, list(clust = clust), mean)
#> clust V1 V2 V3
#> 1 1 2.616377 12.40017 22.38400
#> 2 2 7.031693 18.11947 28.33711
aggregate(yMHa, list(clust = clust), mean)
#> clust V1 V2 V3
#> 1 1 2.616377 12.40017 22.38400
#> 2 2 7.031693 18.11947 28.33711
aggregate(yMHb, list(clust = clust), mean)
#> clust V1 V2 V3
#> 1 1 2.616377 12.40017 22.38400
#> 2 2 7.031693 18.11947 28.33711
aggregate(yMHbExt, list(clust = clust), mean)
#> clust V1 V2 V3
#> 1 1 2.616377 12.40017 22.38400
#> 2 2 7.031693 18.11947 28.33711
# Identical global regression results
summary(lm(y[, 1] ~ x))
#>
#> Call:
#> lm(formula = y[, 1] ~ x)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -0.99930 -0.36426 -0.24060 -0.09118 2.21208
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -0.6580 0.6594 -0.998 0.348
#> x 0.9967 0.1063 9.379 1.37e-05 ***
#> ---
#> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#>
#> Residual standard error: 0.9652 on 8 degrees of freedom
#> Multiple R-squared: 0.9166, Adjusted R-squared: 0.9062
#> F-statistic: 87.97 on 1 and 8 DF, p-value: 1.367e-05
#>
summary(lm(yMHa[, 1] ~ x))
#>
#> Call:
#> lm(formula = yMHa[, 1] ~ x)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -1.5952 -0.4758 -0.1294 0.4943 1.6172
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -0.6580 0.6594 -0.998 0.348
#> x 0.9967 0.1063 9.379 1.37e-05 ***
#> ---
#> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#>
#> Residual standard error: 0.9652 on 8 degrees of freedom
#> Multiple R-squared: 0.9166, Adjusted R-squared: 0.9062
#> F-statistic: 87.97 on 1 and 8 DF, p-value: 1.367e-05
#>
summary(lm(yMHb[, 1] ~ x))
#>
#> Call:
#> lm(formula = yMHb[, 1] ~ x)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -1.8857 -0.5874 0.2406 0.5186 1.1848
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -0.6580 0.6594 -0.998 0.348
#> x 0.9967 0.1063 9.379 1.37e-05 ***
#> ---
#> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#>
#> Residual standard error: 0.9652 on 8 degrees of freedom
#> Multiple R-squared: 0.9166, Adjusted R-squared: 0.9062
#> F-statistic: 87.97 on 1 and 8 DF, p-value: 1.367e-05
#>
summary(lm(yMHbExt[, 1] ~ x))
#>
#> Call:
#> lm(formula = yMHbExt[, 1] ~ x)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -1.1368 -0.6332 -0.4726 0.9707 1.2041
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -0.6580 0.6594 -0.998 0.348
#> x 0.9967 0.1063 9.379 1.37e-05 ***
#> ---
#> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#>
#> Residual standard error: 0.9652 on 8 degrees of freedom
#> Multiple R-squared: 0.9166, Adjusted R-squared: 0.9062
#> F-statistic: 87.97 on 1 and 8 DF, p-value: 1.367e-05
#>
# MHa: Identical local regression results
summary(lm(y[, 1] ~ x, subset = clust == 1))
#>
#> Call:
#> lm(formula = y[, 1] ~ x, subset = clust == 1)
#>
#> Residuals:
#> 1 2 3 4 5
#> -0.02312 -0.03791 -0.54574 1.29771 -0.69093
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -2.2644 0.9500 -2.384 0.0973 .
#> x 1.6269 0.2864 5.680 0.0108 *
#> ---
#> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#>
#> Residual standard error: 0.9058 on 3 degrees of freedom
#> Multiple R-squared: 0.9149, Adjusted R-squared: 0.8866
#> F-statistic: 32.26 on 1 and 3 DF, p-value: 0.01081
#>
summary(lm(yMHa[, 1] ~ x, subset = clust == 1))
#>
#> Call:
#> lm(formula = yMHa[, 1] ~ x, subset = clust == 1)
#>
#> Residuals:
#> 1 2 3 4 5
#> -0.61897 0.01182 1.33297 -0.22551 -0.50031
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -2.2644 0.9500 -2.384 0.0973 .
#> x 1.6269 0.2864 5.680 0.0108 *
#> ---
#> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#>
#> Residual standard error: 0.9058 on 3 degrees of freedom
#> Multiple R-squared: 0.9149, Adjusted R-squared: 0.8866
#> F-statistic: 32.26 on 1 and 3 DF, p-value: 0.01081
#>
# MHb: Different results
summary(lm(yMHb[, 1] ~ x, subset = clust == 1))
#>
#> Call:
#> lm(formula = yMHb[, 1] ~ x, subset = clust == 1)
#>
#> Residuals:
#> 1 2 3 4 5
#> 0.38023 -0.06638 -0.93097 0.54016 0.07696
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -0.6187 0.6940 -0.892 0.4383
#> x 1.0784 0.2092 5.154 0.0142 *
#> ---
#> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#>
#> Residual standard error: 0.6617 on 3 degrees of freedom
#> Multiple R-squared: 0.8985, Adjusted R-squared: 0.8647
#> F-statistic: 26.56 on 1 and 3 DF, p-value: 0.01416
#>
# MHbExt: Same estimates and different std. errors
summary(lm(yMHbExt[, 1] ~ x, subset = clust == 1))
#>
#> Call:
#> lm(formula = yMHbExt[, 1] ~ x, subset = clust == 1)
#>
#> Residuals:
#> 1 2 3 4 5
#> -0.16064 -0.27625 0.69392 0.08347 -0.34050
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -2.2644 0.5090 -4.449 0.02113 *
#> x 1.6269 0.1535 10.601 0.00179 **
#> ---
#> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#>
#> Residual standard error: 0.4853 on 3 degrees of freedom
#> Multiple R-squared: 0.974, Adjusted R-squared: 0.9653
#> F-statistic: 112.4 on 1 and 3 DF, p-value: 0.001793
#>
###################################################
# Generate example data for more advanced examples
###################################################
x <- matrix((1:90) * (1 + runif(90)), 30, 3)
x1 <- x[, 1]
x2 <- x[, 2]
x3 <- x[, 3]
y <- matrix(rnorm(90), 30, 3) + x
clust <- paste("c", rep(1:3, each = 10), sep = "")
######## Run main algorithm
z0 <- RegSDChybrid(y, clusters = clust, xLocal = x3, xGlobal = cbind(x1, x2))
# Corresponding models by lm
lmy <- lm(y ~ clust + x1 + x2 + x3:clust)
lm0 <- lm(z0 ~ clust + x1 + x2 + x3:clust)
# Preserved regression coef (x3 within clusters)
coef(lmy) - coef(lm0)
#> [,1] [,2] [,3]
#> (Intercept) -8.770762e-15 -3.241851e-14 1.178085e-13
#> clustc2 -3.996803e-15 -6.661338e-16 -1.496303e-13
#> clustc3 -4.773959e-14 -4.174439e-14 -7.949197e-14
#> x1 2.220446e-16 6.657001e-16 -1.220812e-15
#> x2 5.898060e-17 0.000000e+00 -6.834810e-16
#> clustc1:x3 8.847090e-17 -3.361027e-18 -4.440892e-16
#> clustc2:x3 8.326673e-17 7.806256e-17 8.881784e-16
#> clustc3:x3 2.853620e-16 3.586541e-16 5.551115e-16
# Preservation of x3 coef locally can also be seen by local regression
coef(lm(y ~ x3, subset = clust == "c2")) - coef(lm(z0 ~ x3, subset = clust == "c2"))
#> [,1] [,2] [,3]
#> (Intercept) -1.776357e-14 -4.263256e-14 -1.079137e-13
#> x3 6.938894e-17 2.220446e-16 6.661338e-16
# Covariance matrix preserved
cov(resid(lmy)) - cov(resid(lm0))
#> [,1] [,2] [,3]
#> [1,] -1.154632e-14 -1.665335e-15 3.566591e-15
#> [2,] -1.665335e-15 1.887379e-15 -2.081668e-15
#> [3,] 3.566591e-15 -2.081668e-15 3.774758e-15
# But not preserved within clusters
cov(resid(lmy)[clust == "c2", ]) - cov(resid(lm0)[clust == "c2", ])
#> [,1] [,2] [,3]
#> [1,] -0.05283525 0.02628186 0.041320945
#> [2,] 0.02628186 0.11017441 -0.118935218
#> [3,] 0.04132094 -0.11893522 -0.007472039
######## Modification (a)
za <- RegSDChybrid(y, clusters = clust, xLocal = x3, xGlobal = cbind(x1, x2), alternative = "a")
lma <- lm(za ~ clust + x1 + x2 + x3:clust)
# Now covariance matrices preserved within clusters
cov(resid(lmy)[clust == "c2", ]) - cov(resid(lma)[clust == "c2", ])
#> [,1] [,2] [,3]
#> [1,] 1.776357e-15 -2.053913e-15 -3.275158e-15
#> [2,] -2.053913e-15 1.132427e-14 -5.107026e-15
#> [3,] -3.275158e-15 -5.107026e-15 1.887379e-15
# If we estimate coef for x1 and x2 within clusters,
# they become identical and identical to global estimates
coef(lma)
#> [,1] [,2] [,3]
#> (Intercept) -0.853660177 -0.3350771414 -0.116021373
#> clustc2 -1.502492508 1.3682670844 -0.178200192
#> clustc3 -1.894139410 -0.4133242439 2.567418769
#> x1 1.037195424 -0.0038745052 0.003657673
#> x2 0.008993409 1.0099219019 -0.013992845
#> clustc1:x3 0.005125876 0.0009687508 1.011450684
#> clustc2:x3 0.008278076 -0.0124104955 1.007585746
#> clustc3:x3 0.005526337 0.0018204153 0.986441964
coef(lm(za ~ clust + x1:clust + x2:clust + x3:clust))
#> [,1] [,2] [,3]
#> (Intercept) -0.853660177 -0.3350771414 -0.116021373
#> clustc2 -1.502492508 1.3682670844 -0.178200192
#> clustc3 -1.894139410 -0.4133242439 2.567418769
#> clustc1:x1 1.037195424 -0.0038745052 0.003657673
#> clustc2:x1 1.037195424 -0.0038745052 0.003657673
#> clustc3:x1 1.037195424 -0.0038745052 0.003657673
#> clustc1:x2 0.008993409 1.0099219019 -0.013992845
#> clustc2:x2 0.008993409 1.0099219019 -0.013992845
#> clustc3:x2 0.008993409 1.0099219019 -0.013992845
#> clustc1:x3 0.005125876 0.0009687508 1.011450684
#> clustc2:x3 0.008278076 -0.0124104955 1.007585746
#> clustc3:x3 0.005526337 0.0018204153 0.986441964
######## Modification (c) with automatic calculation of alpha
# The result depends on the randomly generated data
# When the result is that alpha=1, modification (b) is equivalent
zc <- RegSDChybrid(y, clusters = clust, xLocal = x3, xGlobal = cbind(x1, x2), alternative = "c")
#>
#> c1: clusterAlpha = 1.880, Alpha = 1.000
#> c2: clusterAlpha = 3.065, Alpha = 1.000
#> c3: clusterAlpha = 1.321, Alpha = 1.000
lmc <- lm(zc ~ clust + x1 + x2 + x3:clust)
# Preserved regression coef as above
coef(lmy) - coef(lmc)
#> [,1] [,2] [,3]
#> (Intercept) -2.009504e-14 -1.576517e-14 5.676015e-15
#> clustc2 0.000000e+00 -6.439294e-15 -1.849076e-13
#> clustc3 -4.130030e-14 -2.686740e-14 -1.456613e-13
#> x1 2.220446e-16 2.662801e-16 -4.401861e-16
#> x2 1.543904e-16 -2.220446e-16 3.313322e-16
#> clustc1:x3 9.194034e-17 -3.068292e-17 -4.440892e-16
#> clustc2:x3 4.857226e-17 7.979728e-17 1.110223e-15
#> clustc3:x3 2.159731e-16 2.589075e-16 8.881784e-16
# Again covariance matrices preserved within clusters
cov(resid(lmy)[clust == "c2", ]) - cov(resid(lmc)[clust == "c2", ])
#> [,1] [,2] [,3]
#> [1,] 2.109424e-15 3.885781e-16 2.942091e-15
#> [2,] 3.885781e-16 2.442491e-15 5.273559e-15
#> [3,] 2.942091e-15 5.273559e-15 -1.454392e-14
# If we estimate coef for x1 and x2 within clusters,
# results are different from modification (a) above
coef(lmc)
#> [,1] [,2] [,3]
#> (Intercept) -0.853660177 -0.3350771414 -0.116021373
#> clustc2 -1.502492508 1.3682670844 -0.178200192
#> clustc3 -1.894139410 -0.4133242439 2.567418769
#> x1 1.037195424 -0.0038745052 0.003657673
#> x2 0.008993409 1.0099219019 -0.013992845
#> clustc1:x3 0.005125876 0.0009687508 1.011450684
#> clustc2:x3 0.008278076 -0.0124104955 1.007585746
#> clustc3:x3 0.005526337 0.0018204153 0.986441964
coef(lm(zc ~ clust + x1:clust + x2:clust + x3:clust))
#> [,1] [,2] [,3]
#> (Intercept) 0.148294545 0.743268966 -1.92253955
#> clustc2 -3.316413326 -0.144389429 1.16173957
#> clustc3 -4.070496639 -1.673605741 8.88917470
#> clustc1:x1 1.046242317 0.066634617 -0.02923443
#> clustc2:x1 1.054697315 -0.046662381 0.06809831
#> clustc3:x1 1.043717774 -0.004080565 -0.03469546
#> clustc1:x2 -0.011894602 0.982859060 0.02491830
#> clustc2:x2 0.016711386 1.022488986 -0.01835825
#> clustc3:x2 0.021789554 1.012583585 -0.05577610
#> clustc1:x3 0.005652463 -0.001251337 1.01126158
#> clustc2:x3 0.007517097 -0.007875766 1.00199499
#> clustc3:x3 0.005515666 0.001800353 0.98628208
####################################################
# Make groups of clusters (d) and cluster pieces (e)
####################################################
clustGr <- paste("gr", ceiling(rep(1:3, each = 10)/2 + 0.1), sep = "")
clustP <- c("a", "a", rep("b", 28))
######## Modifications (c), (d) and (e)
zGrP <- RegSDChybrid(y, clusters = clust, clusterPieces = clustP, groupedClusters = clustGr,
xLocal = x3, xGroupedClusters = x2, xGlobal = x1, alternative = "c")
#>
#> c1: clusterAlpha = 2.087, Alpha = 1.000
#> c2: clusterAlpha = 1.304, Alpha = 1.000
#> c3: clusterAlpha = 1.662, Alpha = 1.000
# Corresponding models by lm
lmGrP <- lm(zGrP ~ clust:clustP + x1 + x2:clustGr + x3:clust - 1)
lmY <- lm(y ~ clust:clustP + x1 + x2:clustGr + x3:clust - 1)
# Preserved regression coef
coef(lmY) - coef(lmGrP)
#> [,1] [,2] [,3]
#> x1 2.220446e-16 -3.382711e-16 7.285839e-17
#> clustc1:clustPa -2.886580e-15 -3.587061e-14 1.087463e-13
#> clustc2:clustPa NA NA NA
#> clustc3:clustPa NA NA NA
#> clustc1:clustPb -3.108624e-15 2.636780e-14 3.907985e-14
#> clustc2:clustPb 2.220446e-16 -8.881784e-16 -6.328271e-14
#> clustc3:clustPb -1.953993e-14 -2.120526e-14 3.086420e-14
#> x2:clustGrgr1 1.006140e-16 8.881784e-16 7.060325e-16
#> x2:clustGrgr2 -8.066464e-17 -2.220446e-16 -7.025630e-17
#> clustc1:x3 2.255141e-17 -4.386682e-16 -4.440892e-16
#> clustc2:x3 6.158268e-17 7.979728e-17 2.220446e-16
#> clustc3:x3 2.029626e-16 3.061787e-16 -1.110223e-16
# Identical means within cluster pieces
aggregate(y, list(clust = clust, clustP = clustP), mean)
#> clust clustP V1 V2 V3
#> 1 c1 a 2.875937 45.57646 98.07293
#> 2 c1 b 11.414383 57.33851 113.35811
#> 3 c2 b 22.222100 66.88903 119.61147
#> 4 c3 b 38.492482 72.32188 124.98224
aggregate(zGrP, list(clust = clust, clustP = clustP), mean)
#> clust clustP V1 V2 V3
#> 1 c1 a 2.875937 45.57646 98.07293
#> 2 c1 b 11.414383 57.33851 113.35811
#> 3 c2 b 22.222100 66.88903 119.61147
#> 4 c3 b 38.492482 72.32188 124.98224
# Covariance matrix preserved
cov(resid(lmY)) - cov(resid(lmGrP))
#> [,1] [,2] [,3]
#> [1,] -5.884182e-15 -1.179612e-15 -2.248202e-15
#> [2,] -1.179612e-15 3.552714e-15 1.053324e-14
#> [3,] -2.248202e-15 1.053324e-14 1.687539e-14
# Covariance matrices preserved within clusters
cov(resid(lmY)[clust == "c2", ]) - cov(resid(lmGrP)[clust == "c2", ])
#> [,1] [,2] [,3]
#> [1,] 5.440093e-15 -7.938095e-15 -8.326673e-16
#> [2,] -7.938095e-15 3.330669e-15 2.103873e-14
#> [3,] -8.326673e-16 2.103873e-14 2.220446e-16
# Covariance matrices not preserved within cluster pieces
cov(resid(lmY)[clustP == "a", ]) - cov(resid(lmGrP)[clustP == "a", ])
#> [,1] [,2] [,3]
#> [1,] -0.04089730 0.056294035 -0.08208031
#> [2,] 0.05629403 -0.004892891 0.01946627
#> [3,] -0.08208031 0.019466271 -0.04426915