Skip to contents

Implementation of the methodology in section 6 in the paper

Usage

RegSDChybrid(
  y,
  clusters = NULL,
  xLocal = NULL,
  xGlobal = NULL,
  clusterPieces = NULL,
  xClusterPieces = NULL,
  groupedClusters = NULL,
  xGroupedClusters = NULL,
  alternative = NULL,
  alpha = NULL,
  ySim = NULL,
  returnParts = FALSE,
  epsAlpha = 1e-07,
  makeunique = TRUE,
  tolerance = sqrt(.Machine$double.eps)
)

Arguments

y

Matrix of confidential variables

clusters

Vector of cluster coding

xLocal

Matrix of x-variables to be crossed with clusters

xGlobal

Matrix of x-variables NOT to be crossed with clusters

clusterPieces

Vector of coding of cluster pieces

xClusterPieces

Matrix of x-variables to be crossed with cluster pieces

groupedClusters

Vector of coding of grouped clusters

xGroupedClusters

Matrix of x-variables to be crossed with grouped clusters

alternative

One of "" (default), "a", "b" or "c"

alpha

Possible to specify parameter used internally by alternative "c"

ySim

Possible to specify the internally simulated data manually

returnParts

Alternative output six matrices: y1 and y2 (fitted), e3s and e4s (new residuals), e3 and e4 (original residuals)

epsAlpha

Precision constant for alpha calculation

makeunique

Parameter to be used in GenQR

tolerance

Parameter to Cdiff used within the algorithm

Value

Generated version of y

Details

Input matrices are subjected to EnsureMatrix. Necessary constant terms (intercept) are automatically included. That is, a column of ones is not needed in the input matrices.

Author

Øyvind Langsrud

Examples

#################################################
# Generate example data for introductory examples
################################################# 
y <- matrix(rnorm(30) + 1:30, 10, 3)
x <- matrix(1:10, 10, 1)  # x <- 1:10 is equivalent

# Same as RegSDCipso(y)
yOut <- RegSDChybrid(y)

# With a single cluster both are same as RegSDCipso(y, x)
yOut <- RegSDChybrid(y, xLocal = x)
yOut <- RegSDChybrid(y, xGlobal = x)

# Define two clusters
clust <- rep(1:2, each = 5)

# MHa and MHb in paper
yMHa <- RegSDChybrid(y, clusters = clust, xLocal = x)
yMHb <- RegSDChybrid(y, clusterPieces = clust, xLocal = x)

# An extended variant of MHb as mentioned in paper paragraph below definition of MHa/MHb
yMHbExt <- RegSDChybrid(y, clusterPieces = clust, xClusterPieces = x)

# Identical means within clusters
aggregate(y, list(clust = clust), mean)
#>   clust       V1       V2       V3
#> 1     1 2.616377 12.40017 22.38400
#> 2     2 7.031693 18.11947 28.33711
aggregate(yMHa, list(clust = clust), mean)
#>   clust       V1       V2       V3
#> 1     1 2.616377 12.40017 22.38400
#> 2     2 7.031693 18.11947 28.33711
aggregate(yMHb, list(clust = clust), mean)
#>   clust       V1       V2       V3
#> 1     1 2.616377 12.40017 22.38400
#> 2     2 7.031693 18.11947 28.33711
aggregate(yMHbExt, list(clust = clust), mean)
#>   clust       V1       V2       V3
#> 1     1 2.616377 12.40017 22.38400
#> 2     2 7.031693 18.11947 28.33711

# Identical global regression results
summary(lm(y[, 1] ~ x))
#> 
#> Call:
#> lm(formula = y[, 1] ~ x)
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -0.99930 -0.36426 -0.24060 -0.09118  2.21208 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)  -0.6580     0.6594  -0.998    0.348    
#> x             0.9967     0.1063   9.379 1.37e-05 ***
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 0.9652 on 8 degrees of freedom
#> Multiple R-squared:  0.9166,	Adjusted R-squared:  0.9062 
#> F-statistic: 87.97 on 1 and 8 DF,  p-value: 1.367e-05
#> 
summary(lm(yMHa[, 1] ~ x))
#> 
#> Call:
#> lm(formula = yMHa[, 1] ~ x)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -1.5952 -0.4758 -0.1294  0.4943  1.6172 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)  -0.6580     0.6594  -0.998    0.348    
#> x             0.9967     0.1063   9.379 1.37e-05 ***
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 0.9652 on 8 degrees of freedom
#> Multiple R-squared:  0.9166,	Adjusted R-squared:  0.9062 
#> F-statistic: 87.97 on 1 and 8 DF,  p-value: 1.367e-05
#> 
summary(lm(yMHb[, 1] ~ x))
#> 
#> Call:
#> lm(formula = yMHb[, 1] ~ x)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -1.8857 -0.5874  0.2406  0.5186  1.1848 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)  -0.6580     0.6594  -0.998    0.348    
#> x             0.9967     0.1063   9.379 1.37e-05 ***
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 0.9652 on 8 degrees of freedom
#> Multiple R-squared:  0.9166,	Adjusted R-squared:  0.9062 
#> F-statistic: 87.97 on 1 and 8 DF,  p-value: 1.367e-05
#> 
summary(lm(yMHbExt[, 1] ~ x))
#> 
#> Call:
#> lm(formula = yMHbExt[, 1] ~ x)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -1.1368 -0.6332 -0.4726  0.9707  1.2041 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)  -0.6580     0.6594  -0.998    0.348    
#> x             0.9967     0.1063   9.379 1.37e-05 ***
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 0.9652 on 8 degrees of freedom
#> Multiple R-squared:  0.9166,	Adjusted R-squared:  0.9062 
#> F-statistic: 87.97 on 1 and 8 DF,  p-value: 1.367e-05
#> 

# MHa: Identical local regression results
summary(lm(y[, 1] ~ x, subset = clust == 1))
#> 
#> Call:
#> lm(formula = y[, 1] ~ x, subset = clust == 1)
#> 
#> Residuals:
#>        1        2        3        4        5 
#> -0.02312 -0.03791 -0.54574  1.29771 -0.69093 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)  
#> (Intercept)  -2.2644     0.9500  -2.384   0.0973 .
#> x             1.6269     0.2864   5.680   0.0108 *
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 0.9058 on 3 degrees of freedom
#> Multiple R-squared:  0.9149,	Adjusted R-squared:  0.8866 
#> F-statistic: 32.26 on 1 and 3 DF,  p-value: 0.01081
#> 
summary(lm(yMHa[, 1] ~ x, subset = clust == 1))
#> 
#> Call:
#> lm(formula = yMHa[, 1] ~ x, subset = clust == 1)
#> 
#> Residuals:
#>        1        2        3        4        5 
#> -0.61897  0.01182  1.33297 -0.22551 -0.50031 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)  
#> (Intercept)  -2.2644     0.9500  -2.384   0.0973 .
#> x             1.6269     0.2864   5.680   0.0108 *
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 0.9058 on 3 degrees of freedom
#> Multiple R-squared:  0.9149,	Adjusted R-squared:  0.8866 
#> F-statistic: 32.26 on 1 and 3 DF,  p-value: 0.01081
#> 

# MHb: Different results
summary(lm(yMHb[, 1] ~ x, subset = clust == 1))
#> 
#> Call:
#> lm(formula = yMHb[, 1] ~ x, subset = clust == 1)
#> 
#> Residuals:
#>        1        2        3        4        5 
#>  0.38023 -0.06638 -0.93097  0.54016  0.07696 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)  
#> (Intercept)  -0.6187     0.6940  -0.892   0.4383  
#> x             1.0784     0.2092   5.154   0.0142 *
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 0.6617 on 3 degrees of freedom
#> Multiple R-squared:  0.8985,	Adjusted R-squared:  0.8647 
#> F-statistic: 26.56 on 1 and 3 DF,  p-value: 0.01416
#> 

# MHbExt: Same estimates and different std. errors
summary(lm(yMHbExt[, 1] ~ x, subset = clust == 1))
#> 
#> Call:
#> lm(formula = yMHbExt[, 1] ~ x, subset = clust == 1)
#> 
#> Residuals:
#>        1        2        3        4        5 
#> -0.16064 -0.27625  0.69392  0.08347 -0.34050 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)   
#> (Intercept)  -2.2644     0.5090  -4.449  0.02113 * 
#> x             1.6269     0.1535  10.601  0.00179 **
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 0.4853 on 3 degrees of freedom
#> Multiple R-squared:  0.974,	Adjusted R-squared:  0.9653 
#> F-statistic: 112.4 on 1 and 3 DF,  p-value: 0.001793
#> 

###################################################
#  Generate example data for more advanced examples
###################################################
x <- matrix((1:90) * (1 + runif(90)), 30, 3)
x1 <- x[, 1]
x2 <- x[, 2]
x3 <- x[, 3]
y <- matrix(rnorm(90), 30, 3) + x
clust <- paste("c", rep(1:3, each = 10), sep = "")

######## Run main algorithm
z0 <- RegSDChybrid(y, clusters = clust, xLocal = x3, xGlobal = cbind(x1, x2))

# Corresponding models by lm
lmy <- lm(y ~ clust + x1 + x2 + x3:clust)
lm0 <- lm(z0 ~ clust + x1 + x2 + x3:clust)

# Preserved regression coef (x3 within clusters)
coef(lmy) - coef(lm0)
#>                      [,1]          [,2]          [,3]
#> (Intercept) -8.770762e-15 -3.241851e-14  1.178085e-13
#> clustc2     -3.996803e-15 -6.661338e-16 -1.496303e-13
#> clustc3     -4.773959e-14 -4.174439e-14 -7.949197e-14
#> x1           2.220446e-16  6.657001e-16 -1.220812e-15
#> x2           5.898060e-17  0.000000e+00 -6.834810e-16
#> clustc1:x3   8.847090e-17 -3.361027e-18 -4.440892e-16
#> clustc2:x3   8.326673e-17  7.806256e-17  8.881784e-16
#> clustc3:x3   2.853620e-16  3.586541e-16  5.551115e-16

# Preservation of x3 coef locally can also be seen by local regression
coef(lm(y ~ x3, subset = clust == "c2")) - coef(lm(z0 ~ x3, subset = clust == "c2"))
#>                      [,1]          [,2]          [,3]
#> (Intercept) -1.776357e-14 -4.263256e-14 -1.079137e-13
#> x3           6.938894e-17  2.220446e-16  6.661338e-16

# Covariance matrix preserved
cov(resid(lmy)) - cov(resid(lm0))
#>               [,1]          [,2]          [,3]
#> [1,] -1.154632e-14 -1.665335e-15  3.566591e-15
#> [2,] -1.665335e-15  1.887379e-15 -2.081668e-15
#> [3,]  3.566591e-15 -2.081668e-15  3.774758e-15

# But not preserved within clusters
cov(resid(lmy)[clust == "c2", ]) - cov(resid(lm0)[clust == "c2", ])
#>             [,1]        [,2]         [,3]
#> [1,] -0.05283525  0.02628186  0.041320945
#> [2,]  0.02628186  0.11017441 -0.118935218
#> [3,]  0.04132094 -0.11893522 -0.007472039

######## Modification (a)
za <- RegSDChybrid(y, clusters = clust, xLocal = x3, xGlobal = cbind(x1, x2), alternative = "a")
lma <- lm(za ~ clust + x1 + x2 + x3:clust)

# Now covariance matrices preserved within clusters
cov(resid(lmy)[clust == "c2", ]) - cov(resid(lma)[clust == "c2", ])
#>               [,1]          [,2]          [,3]
#> [1,]  1.776357e-15 -2.053913e-15 -3.275158e-15
#> [2,] -2.053913e-15  1.132427e-14 -5.107026e-15
#> [3,] -3.275158e-15 -5.107026e-15  1.887379e-15

# If we estimate coef for x1 and x2 within clusters, 
# they become identical and identical to global estimates
coef(lma)
#>                     [,1]          [,2]         [,3]
#> (Intercept) -0.853660177 -0.3350771414 -0.116021373
#> clustc2     -1.502492508  1.3682670844 -0.178200192
#> clustc3     -1.894139410 -0.4133242439  2.567418769
#> x1           1.037195424 -0.0038745052  0.003657673
#> x2           0.008993409  1.0099219019 -0.013992845
#> clustc1:x3   0.005125876  0.0009687508  1.011450684
#> clustc2:x3   0.008278076 -0.0124104955  1.007585746
#> clustc3:x3   0.005526337  0.0018204153  0.986441964
coef(lm(za ~ clust + x1:clust + x2:clust + x3:clust))
#>                     [,1]          [,2]         [,3]
#> (Intercept) -0.853660177 -0.3350771414 -0.116021373
#> clustc2     -1.502492508  1.3682670844 -0.178200192
#> clustc3     -1.894139410 -0.4133242439  2.567418769
#> clustc1:x1   1.037195424 -0.0038745052  0.003657673
#> clustc2:x1   1.037195424 -0.0038745052  0.003657673
#> clustc3:x1   1.037195424 -0.0038745052  0.003657673
#> clustc1:x2   0.008993409  1.0099219019 -0.013992845
#> clustc2:x2   0.008993409  1.0099219019 -0.013992845
#> clustc3:x2   0.008993409  1.0099219019 -0.013992845
#> clustc1:x3   0.005125876  0.0009687508  1.011450684
#> clustc2:x3   0.008278076 -0.0124104955  1.007585746
#> clustc3:x3   0.005526337  0.0018204153  0.986441964

######## Modification (c) with automatic calculation of alpha 
# The result depends on the randomly generated data
# When the result is that alpha=1, modification (b) is equivalent
zc <- RegSDChybrid(y, clusters = clust, xLocal = x3, xGlobal = cbind(x1, x2), alternative = "c")
#> 
#>                c1: clusterAlpha =   1.880, Alpha = 1.000
#>                c2: clusterAlpha =   3.065, Alpha = 1.000
#>                c3: clusterAlpha =   1.321, Alpha = 1.000
lmc <- lm(zc ~ clust + x1 + x2 + x3:clust)

# Preserved regression coef as above
coef(lmy) - coef(lmc)
#>                      [,1]          [,2]          [,3]
#> (Intercept) -2.009504e-14 -1.576517e-14  5.676015e-15
#> clustc2      0.000000e+00 -6.439294e-15 -1.849076e-13
#> clustc3     -4.130030e-14 -2.686740e-14 -1.456613e-13
#> x1           2.220446e-16  2.662801e-16 -4.401861e-16
#> x2           1.543904e-16 -2.220446e-16  3.313322e-16
#> clustc1:x3   9.194034e-17 -3.068292e-17 -4.440892e-16
#> clustc2:x3   4.857226e-17  7.979728e-17  1.110223e-15
#> clustc3:x3   2.159731e-16  2.589075e-16  8.881784e-16

# Again covariance matrices preserved within clusters
cov(resid(lmy)[clust == "c2", ]) - cov(resid(lmc)[clust == "c2", ])
#>              [,1]         [,2]          [,3]
#> [1,] 2.109424e-15 3.885781e-16  2.942091e-15
#> [2,] 3.885781e-16 2.442491e-15  5.273559e-15
#> [3,] 2.942091e-15 5.273559e-15 -1.454392e-14

# If we estimate coef for x1 and x2 within clusters, 
# results are different from modification (a) above
coef(lmc)
#>                     [,1]          [,2]         [,3]
#> (Intercept) -0.853660177 -0.3350771414 -0.116021373
#> clustc2     -1.502492508  1.3682670844 -0.178200192
#> clustc3     -1.894139410 -0.4133242439  2.567418769
#> x1           1.037195424 -0.0038745052  0.003657673
#> x2           0.008993409  1.0099219019 -0.013992845
#> clustc1:x3   0.005125876  0.0009687508  1.011450684
#> clustc2:x3   0.008278076 -0.0124104955  1.007585746
#> clustc3:x3   0.005526337  0.0018204153  0.986441964
coef(lm(zc ~ clust + x1:clust + x2:clust + x3:clust))
#>                     [,1]         [,2]        [,3]
#> (Intercept)  0.148294545  0.743268966 -1.92253955
#> clustc2     -3.316413326 -0.144389429  1.16173957
#> clustc3     -4.070496639 -1.673605741  8.88917470
#> clustc1:x1   1.046242317  0.066634617 -0.02923443
#> clustc2:x1   1.054697315 -0.046662381  0.06809831
#> clustc3:x1   1.043717774 -0.004080565 -0.03469546
#> clustc1:x2  -0.011894602  0.982859060  0.02491830
#> clustc2:x2   0.016711386  1.022488986 -0.01835825
#> clustc3:x2   0.021789554  1.012583585 -0.05577610
#> clustc1:x3   0.005652463 -0.001251337  1.01126158
#> clustc2:x3   0.007517097 -0.007875766  1.00199499
#> clustc3:x3   0.005515666  0.001800353  0.98628208


####################################################
# Make groups of clusters (d) and cluster pieces (e)
####################################################
clustGr <- paste("gr", ceiling(rep(1:3, each = 10)/2 + 0.1), sep = "")
clustP <- c("a", "a", rep("b", 28))

######## Modifications (c), (d) and (e)
zGrP <- RegSDChybrid(y, clusters = clust, clusterPieces = clustP, groupedClusters = clustGr,
                     xLocal = x3, xGroupedClusters = x2, xGlobal = x1, alternative = "c")
#> 
#>                c1: clusterAlpha =   2.087, Alpha = 1.000
#>                c2: clusterAlpha =   1.304, Alpha = 1.000
#>                c3: clusterAlpha =   1.662, Alpha = 1.000

# Corresponding models by lm
lmGrP <- lm(zGrP ~ clust:clustP + x1 + x2:clustGr + x3:clust - 1)
lmY <- lm(y ~ clust:clustP + x1 + x2:clustGr + x3:clust - 1)

# Preserved regression coef
coef(lmY) - coef(lmGrP)
#>                          [,1]          [,2]          [,3]
#> x1               2.220446e-16 -3.382711e-16  7.285839e-17
#> clustc1:clustPa -2.886580e-15 -3.587061e-14  1.087463e-13
#> clustc2:clustPa            NA            NA            NA
#> clustc3:clustPa            NA            NA            NA
#> clustc1:clustPb -3.108624e-15  2.636780e-14  3.907985e-14
#> clustc2:clustPb  2.220446e-16 -8.881784e-16 -6.328271e-14
#> clustc3:clustPb -1.953993e-14 -2.120526e-14  3.086420e-14
#> x2:clustGrgr1    1.006140e-16  8.881784e-16  7.060325e-16
#> x2:clustGrgr2   -8.066464e-17 -2.220446e-16 -7.025630e-17
#> clustc1:x3       2.255141e-17 -4.386682e-16 -4.440892e-16
#> clustc2:x3       6.158268e-17  7.979728e-17  2.220446e-16
#> clustc3:x3       2.029626e-16  3.061787e-16 -1.110223e-16

# Identical means within cluster pieces
aggregate(y, list(clust = clust, clustP = clustP), mean)
#>   clust clustP        V1       V2        V3
#> 1    c1      a  2.875937 45.57646  98.07293
#> 2    c1      b 11.414383 57.33851 113.35811
#> 3    c2      b 22.222100 66.88903 119.61147
#> 4    c3      b 38.492482 72.32188 124.98224
aggregate(zGrP, list(clust = clust, clustP = clustP), mean)
#>   clust clustP        V1       V2        V3
#> 1    c1      a  2.875937 45.57646  98.07293
#> 2    c1      b 11.414383 57.33851 113.35811
#> 3    c2      b 22.222100 66.88903 119.61147
#> 4    c3      b 38.492482 72.32188 124.98224

# Covariance matrix preserved
cov(resid(lmY)) - cov(resid(lmGrP))
#>               [,1]          [,2]          [,3]
#> [1,] -5.884182e-15 -1.179612e-15 -2.248202e-15
#> [2,] -1.179612e-15  3.552714e-15  1.053324e-14
#> [3,] -2.248202e-15  1.053324e-14  1.687539e-14

# Covariance matrices preserved within clusters
cov(resid(lmY)[clust == "c2", ]) - cov(resid(lmGrP)[clust == "c2", ])
#>               [,1]          [,2]          [,3]
#> [1,]  5.440093e-15 -7.938095e-15 -8.326673e-16
#> [2,] -7.938095e-15  3.330669e-15  2.103873e-14
#> [3,] -8.326673e-16  2.103873e-14  2.220446e-16

# Covariance matrices not preserved within cluster pieces
cov(resid(lmY)[clustP == "a", ]) - cov(resid(lmGrP)[clustP == "a", ])
#>             [,1]         [,2]        [,3]
#> [1,] -0.04089730  0.056294035 -0.08208031
#> [2,]  0.05629403 -0.004892891  0.01946627
#> [3,] -0.08208031  0.019466271 -0.04426915