Regression-based SDC Tools - Generalized microaggregation

Implementation of the methodology in section 6 in the paper

Usage

RegSDChybrid(
  y,
  clusters = NULL,
  xLocal = NULL,
  xGlobal = NULL,
  clusterPieces = NULL,
  xClusterPieces = NULL,
  groupedClusters = NULL,
  xGroupedClusters = NULL,
  alternative = NULL,
  alpha = NULL,
  ySim = NULL,
  returnParts = FALSE,
  epsAlpha = 1e-07,
  makeunique = TRUE,
  tolerance = sqrt(.Machine$double.eps)
)

Arguments

y: Matrix of confidential variables
clusters: Vector of cluster coding
xLocal: Matrix of x-variables to be crossed with clusters
xGlobal: Matrix of x-variables NOT to be crossed with clusters
clusterPieces: Vector of coding of cluster pieces
xClusterPieces: Matrix of x-variables to be crossed with cluster pieces
groupedClusters: Vector of coding of grouped clusters
xGroupedClusters: Matrix of x-variables to be crossed with grouped clusters
alternative: One of "" (default), "a", "b" or "c"
alpha: Possible to specify parameter used internally by alternative "c"
ySim: Possible to specify the internally simulated data manually
returnParts: Alternative output six matrices: y1 and y2 (fitted), e3s and e4s (new residuals), e3 and e4 (original residuals)
epsAlpha: Precision constant for alpha calculation
makeunique: Parameter to be used in GenQR
tolerance: Parameter to Cdiff used within the algorithm

Value

Generated version of y

Details

Input matrices are subjected to EnsureMatrix. Necessary constant terms (intercept) are automatically included. That is, a column of ones is not needed in the input matrices.

Author

Øyvind Langsrud

Examples

#################################################
# Generate example data for introductory examples
################################################# 
y <- matrix(rnorm(30) + 1:30, 10, 3)
x <- matrix(1:10, 10, 1)  # x <- 1:10 is equivalent

# Same as RegSDCipso(y)
yOut <- RegSDChybrid(y)

# With a single cluster both are same as RegSDCipso(y, x)
yOut <- RegSDChybrid(y, xLocal = x)
yOut <- RegSDChybrid(y, xGlobal = x)

# Define two clusters
clust <- rep(1:2, each = 5)

# MHa and MHb in paper
yMHa <- RegSDChybrid(y, clusters = clust, xLocal = x)
yMHb <- RegSDChybrid(y, clusterPieces = clust, xLocal = x)

# An extended variant of MHb as mentioned in paper paragraph below definition of MHa/MHb
yMHbExt <- RegSDChybrid(y, clusterPieces = clust, xClusterPieces = x)

# Identical means within clusters
aggregate(y, list(clust = clust), mean)
#>   clust       V1       V2       V3
#> 1     1 2.616377 12.40017 22.38400
#> 2     2 7.031693 18.11947 28.33711
aggregate(yMHa, list(clust = clust), mean)
#>   clust       V1       V2       V3
#> 1     1 2.616377 12.40017 22.38400
#> 2     2 7.031693 18.11947 28.33711
aggregate(yMHb, list(clust = clust), mean)
#>   clust       V1       V2       V3
#> 1     1 2.616377 12.40017 22.38400
#> 2     2 7.031693 18.11947 28.33711
aggregate(yMHbExt, list(clust = clust), mean)
#>   clust       V1       V2       V3
#> 1     1 2.616377 12.40017 22.38400
#> 2     2 7.031693 18.11947 28.33711

# Identical global regression results
summary(lm(y[, 1] ~ x))
#> 
#> Call:
#> lm(formula = y[, 1] ~ x)
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -0.99930 -0.36426 -0.24060 -0.09118  2.21208 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)  -0.6580     0.6594  -0.998    0.348    
#> x             0.9967     0.1063   9.379 1.37e-05 ***
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 0.9652 on 8 degrees of freedom
#> Multiple R-squared:  0.9166,	Adjusted R-squared:  0.9062 
#> F-statistic: 87.97 on 1 and 8 DF,  p-value: 1.367e-05
#> 
summary(lm(yMHa[, 1] ~ x))
#> 
#> Call:
#> lm(formula = yMHa[, 1] ~ x)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -1.5952 -0.4758 -0.1294  0.4943  1.6172 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)  -0.6580     0.6594  -0.998    0.348    
#> x             0.9967     0.1063   9.379 1.37e-05 ***
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 0.9652 on 8 degrees of freedom
#> Multiple R-squared:  0.9166,	Adjusted R-squared:  0.9062 
#> F-statistic: 87.97 on 1 and 8 DF,  p-value: 1.367e-05
#> 
summary(lm(yMHb[, 1] ~ x))
#> 
#> Call:
#> lm(formula = yMHb[, 1] ~ x)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -1.8857 -0.5874  0.2406  0.5186  1.1848 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)  -0.6580     0.6594  -0.998    0.348    
#> x             0.9967     0.1063   9.379 1.37e-05 ***
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 0.9652 on 8 degrees of freedom
#> Multiple R-squared:  0.9166,	Adjusted R-squared:  0.9062 
#> F-statistic: 87.97 on 1 and 8 DF,  p-value: 1.367e-05
#> 
summary(lm(yMHbExt[, 1] ~ x))
#> 
#> Call:
#> lm(formula = yMHbExt[, 1] ~ x)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -1.1368 -0.6332 -0.4726  0.9707  1.2041 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)  -0.6580     0.6594  -0.998    0.348    
#> x             0.9967     0.1063   9.379 1.37e-05 ***
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 0.9652 on 8 degrees of freedom
#> Multiple R-squared:  0.9166,	Adjusted R-squared:  0.9062 
#> F-statistic: 87.97 on 1 and 8 DF,  p-value: 1.367e-05
#> 

# MHa: Identical local regression results
summary(lm(y[, 1] ~ x, subset = clust == 1))
#> 
#> Call:
#> lm(formula = y[, 1] ~ x, subset = clust == 1)
#> 
#> Residuals:
#>        1        2        3        4        5 
#> -0.02312 -0.03791 -0.54574  1.29771 -0.69093 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)  
#> (Intercept)  -2.2644     0.9500  -2.384   0.0973 .
#> x             1.6269     0.2864   5.680   0.0108 *
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 0.9058 on 3 degrees of freedom
#> Multiple R-squared:  0.9149,	Adjusted R-squared:  0.8866 
#> F-statistic: 32.26 on 1 and 3 DF,  p-value: 0.01081
#> 
summary(lm(yMHa[, 1] ~ x, subset = clust == 1))
#> 
#> Call:
#> lm(formula = yMHa[, 1] ~ x, subset = clust == 1)
#> 
#> Residuals:
#>        1        2        3        4        5 
#> -0.61897  0.01182  1.33297 -0.22551 -0.50031 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)  
#> (Intercept)  -2.2644     0.9500  -2.384   0.0973 .
#> x             1.6269     0.2864   5.680   0.0108 *
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 0.9058 on 3 degrees of freedom
#> Multiple R-squared:  0.9149,	Adjusted R-squared:  0.8866 
#> F-statistic: 32.26 on 1 and 3 DF,  p-value: 0.01081
#> 

# MHb: Different results
summary(lm(yMHb[, 1] ~ x, subset = clust == 1))
#> 
#> Call:
#> lm(formula = yMHb[, 1] ~ x, subset = clust == 1)
#> 
#> Residuals:
#>        1        2        3        4        5 
#>  0.38023 -0.06638 -0.93097  0.54016  0.07696 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)  
#> (Intercept)  -0.6187     0.6940  -0.892   0.4383  
#> x             1.0784     0.2092   5.154   0.0142 *
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 0.6617 on 3 degrees of freedom
#> Multiple R-squared:  0.8985,	Adjusted R-squared:  0.8647 
#> F-statistic: 26.56 on 1 and 3 DF,  p-value: 0.01416
#> 

# MHbExt: Same estimates and different std. errors
summary(lm(yMHbExt[, 1] ~ x, subset = clust == 1))
#> 
#> Call:
#> lm(formula = yMHbExt[, 1] ~ x, subset = clust == 1)
#> 
#> Residuals:
#>        1        2        3        4        5 
#> -0.16064 -0.27625  0.69392  0.08347 -0.34050 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)   
#> (Intercept)  -2.2644     0.5090  -4.449  0.02113 * 
#> x             1.6269     0.1535  10.601  0.00179 **
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 0.4853 on 3 degrees of freedom
#> Multiple R-squared:  0.974,	Adjusted R-squared:  0.9653 
#> F-statistic: 112.4 on 1 and 3 DF,  p-value: 0.001793
#> 

###################################################
#  Generate example data for more advanced examples
###################################################
x <- matrix((1:90) * (1 + runif(90)), 30, 3)
x1 <- x[, 1]
x2 <- x[, 2]
x3 <- x[, 3]
y <- matrix(rnorm(90), 30, 3) + x
clust <- paste("c", rep(1:3, each = 10), sep = "")

######## Run main algorithm
z0 <- RegSDChybrid(y, clusters = clust, xLocal = x3, xGlobal = cbind(x1, x2))

# Corresponding models by lm
lmy <- lm(y ~ clust + x1 + x2 + x3:clust)
lm0 <- lm(z0 ~ clust + x1 + x2 + x3:clust)

# Preserved regression coef (x3 within clusters)
coef(lmy) - coef(lm0)
#>                      [,1]          [,2]          [,3]
#> (Intercept) -8.770762e-15 -3.241851e-14  1.178085e-13
#> clustc2     -3.996803e-15 -6.661338e-16 -1.496303e-13
#> clustc3     -4.773959e-14 -4.174439e-14 -7.949197e-14
#> x1           2.220446e-16  6.657001e-16 -1.220812e-15
#> x2           5.898060e-17  0.000000e+00 -6.834810e-16
#> clustc1:x3   8.847090e-17 -3.361027e-18 -4.440892e-16
#> clustc2:x3   8.326673e-17  7.806256e-17  8.881784e-16
#> clustc3:x3   2.853620e-16  3.586541e-16  5.551115e-16

# Preservation of x3 coef locally can also be seen by local regression
coef(lm(y ~ x3, subset = clust == "c2")) - coef(lm(z0 ~ x3, subset = clust == "c2"))
#>                      [,1]          [,2]          [,3]
#> (Intercept) -1.776357e-14 -4.263256e-14 -1.079137e-13
#> x3           6.938894e-17  2.220446e-16  6.661338e-16

# Covariance matrix preserved
cov(resid(lmy)) - cov(resid(lm0))
#>               [,1]          [,2]          [,3]
#> [1,] -1.154632e-14 -1.665335e-15  3.566591e-15
#> [2,] -1.665335e-15  1.887379e-15 -2.081668e-15
#> [3,]  3.566591e-15 -2.081668e-15  3.774758e-15

# But not preserved within clusters
cov(resid(lmy)[clust == "c2", ]) - cov(resid(lm0)[clust == "c2", ])
#>             [,1]        [,2]         [,3]
#> [1,] -0.05283525  0.02628186  0.041320945
#> [2,]  0.02628186  0.11017441 -0.118935218
#> [3,]  0.04132094 -0.11893522 -0.007472039

######## Modification (a)
za <- RegSDChybrid(y, clusters = clust, xLocal = x3, xGlobal = cbind(x1, x2), alternative = "a")
lma <- lm(za ~ clust + x1 + x2 + x3:clust)

# Now covariance matrices preserved within clusters
cov(resid(lmy)[clust == "c2", ]) - cov(resid(lma)[clust == "c2", ])
#>               [,1]          [,2]          [,3]
#> [1,]  1.776357e-15 -2.053913e-15 -3.275158e-15
#> [2,] -2.053913e-15  1.132427e-14 -5.107026e-15
#> [3,] -3.275158e-15 -5.107026e-15  1.887379e-15

# If we estimate coef for x1 and x2 within clusters, 
# they become identical and identical to global estimates
coef(lma)
#>                     [,1]          [,2]         [,3]
#> (Intercept) -0.853660177 -0.3350771414 -0.116021373
#> clustc2     -1.502492508  1.3682670844 -0.178200192
#> clustc3     -1.894139410 -0.4133242439  2.567418769
#> x1           1.037195424 -0.0038745052  0.003657673
#> x2           0.008993409  1.0099219019 -0.013992845
#> clustc1:x3   0.005125876  0.0009687508  1.011450684
#> clustc2:x3   0.008278076 -0.0124104955  1.007585746
#> clustc3:x3   0.005526337  0.0018204153  0.986441964
coef(lm(za ~ clust + x1:clust + x2:clust + x3:clust))
#>                     [,1]          [,2]         [,3]
#> (Intercept) -0.853660177 -0.3350771414 -0.116021373
#> clustc2     -1.502492508  1.3682670844 -0.178200192
#> clustc3     -1.894139410 -0.4133242439  2.567418769
#> clustc1:x1   1.037195424 -0.0038745052  0.003657673
#> clustc2:x1   1.037195424 -0.0038745052  0.003657673
#> clustc3:x1   1.037195424 -0.0038745052  0.003657673
#> clustc1:x2   0.008993409  1.0099219019 -0.013992845
#> clustc2:x2   0.008993409  1.0099219019 -0.013992845
#> clustc3:x2   0.008993409  1.0099219019 -0.013992845
#> clustc1:x3   0.005125876  0.0009687508  1.011450684
#> clustc2:x3   0.008278076 -0.0124104955  1.007585746
#> clustc3:x3   0.005526337  0.0018204153  0.986441964

######## Modification (c) with automatic calculation of alpha 
# The result depends on the randomly generated data
# When the result is that alpha=1, modification (b) is equivalent
zc <- RegSDChybrid(y, clusters = clust, xLocal = x3, xGlobal = cbind(x1, x2), alternative = "c")
#> 
#>                c1: clusterAlpha =   1.880, Alpha = 1.000
#>                c2: clusterAlpha =   3.065, Alpha = 1.000
#>                c3: clusterAlpha =   1.321, Alpha = 1.000
lmc <- lm(zc ~ clust + x1 + x2 + x3:clust)

# Preserved regression coef as above
coef(lmy) - coef(lmc)
#>                      [,1]          [,2]          [,3]
#> (Intercept) -2.009504e-14 -1.576517e-14  5.676015e-15
#> clustc2      0.000000e+00 -6.439294e-15 -1.849076e-13
#> clustc3     -4.130030e-14 -2.686740e-14 -1.456613e-13
#> x1           2.220446e-16  2.662801e-16 -4.401861e-16
#> x2           1.543904e-16 -2.220446e-16  3.313322e-16
#> clustc1:x3   9.194034e-17 -3.068292e-17 -4.440892e-16
#> clustc2:x3   4.857226e-17  7.979728e-17  1.110223e-15
#> clustc3:x3   2.159731e-16  2.589075e-16  8.881784e-16

# Again covariance matrices preserved within clusters
cov(resid(lmy)[clust == "c2", ]) - cov(resid(lmc)[clust == "c2", ])
#>              [,1]         [,2]          [,3]
#> [1,] 2.109424e-15 3.885781e-16  2.942091e-15
#> [2,] 3.885781e-16 2.442491e-15  5.273559e-15
#> [3,] 2.942091e-15 5.273559e-15 -1.454392e-14

# If we estimate coef for x1 and x2 within clusters, 
# results are different from modification (a) above
coef(lmc)
#>                     [,1]          [,2]         [,3]
#> (Intercept) -0.853660177 -0.3350771414 -0.116021373
#> clustc2     -1.502492508  1.3682670844 -0.178200192
#> clustc3     -1.894139410 -0.4133242439  2.567418769
#> x1           1.037195424 -0.0038745052  0.003657673
#> x2           0.008993409  1.0099219019 -0.013992845
#> clustc1:x3   0.005125876  0.0009687508  1.011450684
#> clustc2:x3   0.008278076 -0.0124104955  1.007585746
#> clustc3:x3   0.005526337  0.0018204153  0.986441964
coef(lm(zc ~ clust + x1:clust + x2:clust + x3:clust))
#>                     [,1]         [,2]        [,3]
#> (Intercept)  0.148294545  0.743268966 -1.92253955
#> clustc2     -3.316413326 -0.144389429  1.16173957
#> clustc3     -4.070496639 -1.673605741  8.88917470
#> clustc1:x1   1.046242317  0.066634617 -0.02923443
#> clustc2:x1   1.054697315 -0.046662381  0.06809831
#> clustc3:x1   1.043717774 -0.004080565 -0.03469546
#> clustc1:x2  -0.011894602  0.982859060  0.02491830
#> clustc2:x2   0.016711386  1.022488986 -0.01835825
#> clustc3:x2   0.021789554  1.012583585 -0.05577610
#> clustc1:x3   0.005652463 -0.001251337  1.01126158
#> clustc2:x3   0.007517097 -0.007875766  1.00199499
#> clustc3:x3   0.005515666  0.001800353  0.98628208


####################################################
# Make groups of clusters (d) and cluster pieces (e)
####################################################
clustGr <- paste("gr", ceiling(rep(1:3, each = 10)/2 + 0.1), sep = "")
clustP <- c("a", "a", rep("b", 28))

######## Modifications (c), (d) and (e)
zGrP <- RegSDChybrid(y, clusters = clust, clusterPieces = clustP, groupedClusters = clustGr,
                     xLocal = x3, xGroupedClusters = x2, xGlobal = x1, alternative = "c")
#> 
#>                c1: clusterAlpha =   2.087, Alpha = 1.000
#>                c2: clusterAlpha =   1.304, Alpha = 1.000
#>                c3: clusterAlpha =   1.662, Alpha = 1.000

# Corresponding models by lm
lmGrP <- lm(zGrP ~ clust:clustP + x1 + x2:clustGr + x3:clust - 1)
lmY <- lm(y ~ clust:clustP + x1 + x2:clustGr + x3:clust - 1)

# Preserved regression coef
coef(lmY) - coef(lmGrP)
#>                          [,1]          [,2]          [,3]
#> x1               2.220446e-16 -3.382711e-16  7.285839e-17
#> clustc1:clustPa -2.886580e-15 -3.587061e-14  1.087463e-13
#> clustc2:clustPa            NA            NA            NA
#> clustc3:clustPa            NA            NA            NA
#> clustc1:clustPb -3.108624e-15  2.636780e-14  3.907985e-14
#> clustc2:clustPb  2.220446e-16 -8.881784e-16 -6.328271e-14
#> clustc3:clustPb -1.953993e-14 -2.120526e-14  3.086420e-14
#> x2:clustGrgr1    1.006140e-16  8.881784e-16  7.060325e-16
#> x2:clustGrgr2   -8.066464e-17 -2.220446e-16 -7.025630e-17
#> clustc1:x3       2.255141e-17 -4.386682e-16 -4.440892e-16
#> clustc2:x3       6.158268e-17  7.979728e-17  2.220446e-16
#> clustc3:x3       2.029626e-16  3.061787e-16 -1.110223e-16

# Identical means within cluster pieces
aggregate(y, list(clust = clust, clustP = clustP), mean)
#>   clust clustP        V1       V2        V3
#> 1    c1      a  2.875937 45.57646  98.07293
#> 2    c1      b 11.414383 57.33851 113.35811
#> 3    c2      b 22.222100 66.88903 119.61147
#> 4    c3      b 38.492482 72.32188 124.98224
aggregate(zGrP, list(clust = clust, clustP = clustP), mean)
#>   clust clustP        V1       V2        V3
#> 1    c1      a  2.875937 45.57646  98.07293
#> 2    c1      b 11.414383 57.33851 113.35811
#> 3    c2      b 22.222100 66.88903 119.61147
#> 4    c3      b 38.492482 72.32188 124.98224

# Covariance matrix preserved
cov(resid(lmY)) - cov(resid(lmGrP))
#>               [,1]          [,2]          [,3]
#> [1,] -5.884182e-15 -1.179612e-15 -2.248202e-15
#> [2,] -1.179612e-15  3.552714e-15  1.053324e-14
#> [3,] -2.248202e-15  1.053324e-14  1.687539e-14

# Covariance matrices preserved within clusters
cov(resid(lmY)[clust == "c2", ]) - cov(resid(lmGrP)[clust == "c2", ])
#>               [,1]          [,2]          [,3]
#> [1,]  5.440093e-15 -7.938095e-15 -8.326673e-16
#> [2,] -7.938095e-15  3.330669e-15  2.103873e-14
#> [3,] -8.326673e-16  2.103873e-14  2.220446e-16

# Covariance matrices not preserved within cluster pieces
cov(resid(lmY)[clustP == "a", ]) - cov(resid(lmGrP)[clustP == "a", ])
#>             [,1]         [,2]        [,3]
#> [1,] -0.04089730  0.056294035 -0.08208031
#> [2,]  0.05629403 -0.004892891  0.01946627
#> [3,] -0.08208031  0.019466271 -0.04426915