#================================================================ # # # R SOFTWARE FOR COMPUTING POSI CONSTANTS: SAMPLE CODE # # # Authors: Andreas Buja, Kai Zhang # 2014/12/11 # # Reference: http://arxiv.org/abs/1306.1059 # "Valid Post-Selection Inference," # by Berk, R., Brown, L., Buja, A., Zhang, K., Zhao,L., # The Annals of Statistics, 41 (2), 802-837 (2013) # # #================================================================ ## ## ## PoSI ## ## Valid Post-Selection Inference for multiple linear regression ## ## Description: ## ## 'PoSI()' creates an object of class "PoSI" from a design/predictor ## matrix. PoSI objects are used to obtain the standard error ## multipliers or z-test/t-test thresholds for valid post-selection ## inference in multiple linear regression according to Berk et ## al. (2013, Annals of Statistics). These multipliers are valid ## after performing arbitrary variable selection in the specified set ## of submodels. ## ## The functions 'summary()' and 'K()' compute the actual multipliers ## from an object of class 'PoSI' returned by the function 'PoSI()'. ## The function 'summary()' provides PoSI multipliers for given ## confidence levels as well as more conservative multipliers based ## on the Scheffe method of simultaneous inference and Bonferroni ## adjustment; 'K()' provides just the PoSI multipliers. By default ## the multipliers are provided for z-tests assuming the error ## variance is known; if error degrees of freedom are specified ## ('df.err'), the multipliers are for t-tests. ## ## ## Usage: ## ## PoSI(X, modelSZ=1:ncol(X), Nsim=1000, bundleSZ=10000, ## eps=1E-8, center=T, scale=T, verbose=1) ## ## summary(PoSI.object, confidence=c(.95,.99), alpha=NULL, df.err=NULL, ## eps.PoSI=1E-6, digits=3) ## ## K(PoSI.object, confidence=c(.95,.99), alpha=NULL, df.err=NULL, ## eps.PoSI=1E-6, digits=3 ## ## Arguments: ## ## X: a design/predictor/covariate matrix for multiple linear ## regression ## ## [When there are predictors that are 'protected', that ## is, forced into the model and not subjected to variable ## selection, one should adjust the selectable predictors ## for the protected predictors (i.e., replace the ## selectable predictors with their residual vectors when ## regressing them on the protected predictors); thereafter ## simply omit the protected predictors from X.] ## ## modelSZ: vector of submodel sizes to be searched ## Default: 1:ncol(X), i.e., all submodels of all sizes. ## If only models up to size 5 (e.g.) are considered, use 1:5. ## ## Nsim: the number of random unit vectors sampled in the column ## space of X; default: Nsim=1000 ## 'PoSI' is simulation-based. Larger 'Nsim' increases ## precision but may cause running out of memory. ## ## bundleSZ: the number of contrasts/adjusted predictor vectors ## processed at any given time; default: bundleSZ=10000 ## 'PoSI' is more efficient with larger 'bundleSZ', but it ## may cause running out of memory. ## ## eps: threshold below which singular values of X are ## considered to be zero ## In case of highly collinear columns of X this threshold ## determines the effective dimension of the column space ## of X. Default: eps=1E-6 ## ## center: whether the columns of X should be centered ## In most applications the intercept is not among ## variables to be selected. Centering effectively adjusts ## for the presence of an intercept that should be included ## in all submodels. ## ## scale: whether the columns of X should be standardized ## This is appropriate to prevent numeric problems from ## predictors with vastly differing scales. ## ## verbose: verbosity reporting levels ## 0: no reports ## 1: report bundle completion (default); ## 2: report each processed submodel (for debugging) ## ## confidence: list of confidence levels for which standard error ## multipliers should be provided ## default: confidence=c(.95,.99) ## ## alpha: if specified, sets 'confidence = 1-alpha' ## ## df.err: error degrees of freedom for t-tests ## default: df.err=NULL (z-tests) ## ## eps.PoSI: precision to which standard error multipliers are ## computed ## ## digits: number of digits to which standard error multipliers are ## rounded ## ## Details: ## ## 'PoSI' is simulation-based and requires specification of a ## number 'Nsim' of random unit vectors to be sampled in the column ## space of 'X'. Large 'Nsim' yields greater precision but ## requires more memory. The memory demands can be lowered by ## decreasing 'bundleSZ' at the cost of some efficiency. ## 'bundleSZ' determines how many adjusted predictor vectors are ## being processed at any given time. ## ## The computational limit of the PoSI method is in the exponential ## growth of the number of adjusted predictor vectors: ## - If p=ncol(X) and all submodels are being searched, the number ## of adjusted predictor vectors is p*2^(p-1). ## Example: p=20, m=1:20 ==> |{adj.vecs.}| = 10,485,760 ## - If only models of a given size 'm' are being searched, the number ## of adjusted predictor vectors is m*choose(p,m). ## Example: p=50, m=1:5 ==> |{adj.vecs.}| = 11,576,300 ## Thus limiting PoSI to small submodel sizes such as 'm=1:4' ## ("sparse PoSI") puts larger p=ncol(X) within reach. ## ## Value: ## ## 'PoSI' returns an object of 'class' '"PoSI"'. ## ## 'summary' returns a small matrix with standard error multipliers ## for all requested confidence levels as well as computed by three ## different methods: the PoSI method, Scheffe simultaneous ## inference, and Bonferroni adjustment. ## ## 'K' returns just the PoSI-based standard error multipliers for ## the requested confidence levels or for specified Type I errors ## 'alpha' (default: NULL, overwrites 'confidence = 1 - alpha' if ## non-NULL). ## ## #================================================================ # FIRST STEP: SOURCE THE POSI FUNCTIONS # Copy/paste the following line into your R interpreter: source("http://stat.wharton.upenn.edu/~buja/PAPERS/PoSI-source-code.R") #================================================================ # OBSERVATIONS ON COMPUTATIONAL EXPENSE: # *** Full PoSI: # Here is the list of total number of linear constrasts in ALL submodels. p <- 1:20; cbind("predictors"=p, "PoSI contrasts"=p*2^(p-1)) ## predictors PoSI contrasts ## [1,] 1 1 ## [2,] 2 4 ## [3,] 3 12 ## [4,] 4 32 ## [5,] 5 80 ## [6,] 6 192 ## [7,] 7 448 ## [8,] 8 1024 ## [9,] 9 2304 ## [10,] 10 5120 ## [11,] 11 11264 ## [12,] 12 24576 ## [13,] 13 53248 ## [14,] 14 114688 ## [15,] 15 245760 ## [16,] 16 524288 ## [17,] 17 1114112 ## [18,] 18 2359296 ## [19,] 19 4980736 ## [20,] 20 10485760 # E.g., p=20 predictors involves inference for about 10.5 million linear contrasts. # This takes about 25 minutes (Lenovo X230 in 2014). # *** Sparse PoSI: # If we are interested only in models of a certain size, # here is how to calculate the number of contrasts: m <- 1:5 # model sizes 1, 2, 3, 4, 5 p <- 64 # total number of predictors sum(m*choose(p,m)) ## [1] 40793152 # ==> ~41 million linear contrasts # ==> Takes about 100 min (Lenovo X230 in 2014). #================================================================ # TESTS EXAMPLES: # All variables 'X.' are for predictor matrices. # All variables 'UL.' are for PoSI objects. # The PoSI cut-offs will vary quite strongly in the examples # where the X matrices is simulated. # The PoSI cut-offs will vary mildly where X is fixed # because PoSI computations require a degree of simulation. # Just 1 predictor: X.1 <- as.matrix(rnorm(100)) UL.max.1 <- PoSI(X.1) summary(UL.max.1) ## K.PoSI K.Scheffe K.Bonferroni ## 95% 1.960 1.960 1.960 ## 99% 2.576 2.576 2.576 summary(UL.max.1, df.err=4) ## K.PoSI K.Scheffe K.Bonferroni ## 95% 2.776 2.776 2.776 ## 99% 4.604 4.604 4.604 # Small example, testing whether the automatic removal of intercept and small N works: p <- 6; N <- 4 X.small <- cbind(1,matrix(rnorm(N*p), ncol=p)) UL.max.small <- PoSI(X.small, modelSZ=c(4,3,1), Nsim=1000, bundleSZ=5, verbose=2) summary(UL.max.small, df.err=4) ## K.PoSI K.Scheffe K.Bonferroni ## 95% 4.228 4.447 9.256 ## 99% 6.733 7.077 13.988 # Orthogonal design: p <- 10; N <- 10 X.orth <- qr.Q(qr(matrix(rnorm(p*N), ncol=p))) UL.max.orth <- PoSI(X.orth) summary(UL.max.orth) ## K.PoSI K.Scheffe K.Bonferroni ## 95% 3.379 4.113 4.422 ## 99% 3.885 4.655 4.758 # Boston Housing data: X.boston <- read.table("http://stat.wharton.upenn.edu/~buja/STAT-541/boston.dat", header=T) dim(X.boston) # [1] 506 14 colnames(X.boston) UL.max.boston <- PoSI(X.boston[,-14]) # 4.87 seconds summary(UL.max.boston, df.err=506-13-1) ## K.PoSI K.Scheffe K.Bonferroni ## 95% 3.620 4.756 4.967 ## 99% 4.115 5.307 5.287 # Large p=50, small N=20: p <- 50; N <- 20 X.p100.N20 <- matrix(rnorm(p*N), ncol=p) # Sparse submodels m=4 out of p=50 in d=19 dimensions -- 1.3 minutes UL.max.p100.N20 <- PoSI(X.p100.N20, Nsim=1000, bundleSZ=100000, modelSZ=1:4) ## p = 50 , d = 19 processed 982500 tests in 251175 models. Times in seconds: ## user system elapsed ## 170.32 13.65 184.16 summary(UL.max.p100.N20) ## K.PoSI K.Scheffe K.Bonferroni ## 95% 4.297 5.490 5.448 ## 99% 4.751 6.016 5.728 # Autism data: can't give out the data, but here is the size p <- 84; N <- 2758 X.84 <- matrix(rnorm(p*N), ncol=p) # Submodel sizes m=84 and m=83 with more simulations (10,000) for precision -- 11.5 sec UL.max.84 <- PoSI(X.84, Nsim=10000, bundleSZ=10000, modelSZ=c(p-1,p), verbose=2) # Out of interest, what is the distribution of inner products? hist(UL.max.84, col="gray", breaks=40) # Sparse submodels of size m=4 out of p=84 in p=d=84 dimensions, 7.7 million contrasts -- 25.5 minutes (!!!!) UL.max.84.4 <- PoSI(X.84, Nsim=1000, bundleSZ=100000, modelSZ=4) ## p = 84 , d = 84 processed 7718004 tests in 1929501 models. Times in seconds: ## user system elapsed ## 1916.80 106.21 2023.77 summary(UL.max.84.4) ## K.PoSI K.Scheffe K.Bonferroni ## 95% 3.555 10.315 5.804 ## 99% 3.970 10.819 6.068 summary(UL.max.84.4, df.err=2758-84-1) ## K.PoSI K.Scheffe K.Bonferroni ## 95% 3.560 10.338 5.823 ## 99% 3.976 10.855 6.089 # Big experiment 1: full large PoSI (takes about 13.5 minutes (*)) p <- 20; N <- 1000 X.p20 <- matrix(rnorm(N*p), ncol=p) UL.max.p20 <- PoSI(X.p20, bundleSZ=100000) # 2014/04/19: ## p = 20 , d = 20 processed 10485760 tests in 1048575 models. Times in seconds: ## user system elapsed ## 1110.37 114.68 1225.31 # about 20min summary(UL.max.p20) ## K.PoSI K.Scheffe K.Bonferroni ## 95% 3.171 5.605 5.855 ## 99% 3.619 6.129 6.117 summary(UL.max.p20, df.err=1000-21) ## K.PoSI K.Scheffe K.Bonferroni ## 95% 3.180 5.624 5.908 ## 99% 3.631 6.160 6.177 # Big experiment 2: sparse large PoSI (models <= 5; takes about 32 minutes (*)) p <- 50; N <- 1000 X.p50.m5 <- matrix(rnorm(N*p), ncol=p) UL.max.p50.m5 <- PoSI(X.p50.m5, bundleSZ=100000, modelSZ=1:5) # 2014/04/19: ## p = 50 , d = 50 processed 11576300 tests in 2369935 models. Times in seconds: ## user system elapsed ## 1901.24 117.49 2036.86 summary(UL.max.p50.m5) ## K.PoSI K.Scheffe K.Bonferroni ## 95% 3.522 8.216 5.871 ## 99% 3.951 8.727 6.133 # (*) Elapsed times were measured in R version 3.0.2, 32 bit, # on a processor Intel(R) Core(TM), 2.9 GHz, under Windows 7. # 2014/09/11 #================================================================ # EXCHANGEABLE DESIGNS: # parametrized as quarter circle from xj=e to xj = {ej adjusted for e=(1...1)' and normalized} # {...} = (ej - e/p)/sqrt(1-1/p) # Function to create exchangeable designs this way: design.exch <- function(p,a) { (1-a)*diag(p) + a*matrix(1,p,p) } p <- 12; a <- 0.5; # Exchangeable design with parameter 'a': X.exch <- design.exch(p=p, a=a) UL.exch <- PoSI(X.exch, verbose=0, modelSZ=1:p) K(UL.exch) # ORTHOGONAL DESIGNS: p <- 12 X.orth <- diag(p) UL.orth <- PoSI(X.orth, center=F, verbose=0) K(UL.orth) # ==> Quite bit lower than exchangeable design. #================================================================