#================================================================ # # # R SOFTWARE FOR COMPUTING POSI CONSTANTS # # # Authors: Andreas Buja, Kai Zhang # 2014/09/11 # # Reference: http://arxiv.org/abs/1306.1059 # "Valid Post-Selection Inference," # by Berk, R., Brown, L., Buja, A., Zhang, K., Zhao,L., # The Annals of Statistics, 41 (2), 802-837 (2013) # # #================================================================ # Computations of inner products of random unit vectors (U) # with 'linear contrasts', that is, adjusted predictor vectors (L): PoSI <- function(X, Nsim=1000, bundleSZ=100000, modelSZ=1:ncol(X), eps=1E-8, center=T, scale=T, verbose=1) { if(verbose>=1) { proc.time.init <- proc.time() } # Standardize the columns of X for numerical stability; if intercept is to be selected, set 'center=F': X.scaled <- scale(X, center=center, scale=scale) # Remove columns with NAs: sel <- apply(X.scaled, 2, function(x) !any(is.na(x))) X.scaled <- X.scaled[,sel] if(verbose >= 1 & any(!sel)) cat("Removed column(s)",which(!sel),"due to NAs after standardizing'\n") # Map X to Scheffe canonical coordinates X.cc: X.cc' X.cc = X'X (d x p) X.svd <- svd(X.scaled) sel <- (X.svd$d > eps) if(verbose >= 1 & any(!sel)) cat("Reduced rank:",sum(sel)," (Change eps?)\n") X.cc <- X.svd$d[sel] * t(X.svd$v[,sel]) # Full-model Scheffe canonical coords. of X. # Clean up: rm(X.scaled, X.svd) # Notation: p <- ncol(X.cc) # Number of predictors d <- nrow(X.cc) # d=rank(X), can be < p when X is rank-deficient, as when p>n # Submodels can only be of size <= d; drop those exceeding d: modelSZ <- modelSZ[modelSZ <= d] # Remove models that are too large. # Random unit vectors for which inner products with adjusted predictors (L) will be formed: U <- matrix(rnorm(d*Nsim),ncol=d) # Nsim x d U <- U / sqrt(rowSums(U^2)) # Matrix of unit vectors u in R^d (=rows) for max_l || # Prepare loop over models to store all possible linear combinations l: UL.max <- rep(0,Nsim) # To contain max_l || for each random row u of U. L <- matrix(NA, nrow=bundleSZ, ncol=d) # One bundle of adjusted predictors l (bundleSZ x d) Nmodels <- 0 # To be returned among attributes() Ntests <- 0 # To be returned among attributes() ibundles <- 0 # For verbose>=1 if(verbose>=1) { cat("Number of contrasts/adjusted predictors to process:",sum(choose(p,modelSZ)*modelSZ),"\n") cat("Number of bundles:",sum(ceiling(choose(p,modelSZ)*modelSZ/bundleSZ)),"\n") } for(m in modelSZ) { # Loop over model sizes. bundleSZ <- max(bundleSZ, m) # Make sure one bundle is able to contain at least one model. i.store <- 1 # Pointer into next free row of L (= d-vectors) M <- 1:m # Current submodel, initialized to first submodel M.last <- (p-m+1):p # The last submodel of size m repeat { # Loop over bundles of adjusted predictors # Verbosity level 2: print all models & dims if(verbose>=2) cat("Model:", M,"\n") X.m <- X.cc[,M] # Select predictors for this submodel X.qr <- qr(t(X.m)%*%X.m) # QR decomposition of submodel in canon. coords. if(X.qr$rank < length(M)) next # Rank-deficient submodel ==> omit. L[i.store:(i.store+m-1),] <- t(X.m %*% qr.solve(X.qr)) # Submodel adjusted predictors (rows of L) Nmodels <- Nmodels + 1 Ntests <- Ntests + m i.store <- i.store+m # Move pointer to next free column of L. # If bundle L of adjusted predictors l is full, or if this was the last model, then # accumulate max_l || and re-initialize L: if(( (i.store+m-1) > bundleSZ ) | !(any(M < M.last) )) { L[1:(i.store-1),] <- L[1:(i.store-1),] / # Normalize rows of L to unit length: sqrt(rowSums(L[1:(i.store-1),,drop=F]^2)) # Implementation note: Against 'help()', '..%*%t(..)' is a touch faster than 'tcrossprod()'. UL <- abs(U %*% t(L[1:(i.store-1),,drop=F])) # Inner prods || of rows of U and L UL.max <- pmax(UL.max, apply(UL, 1, max)) # Accumulate max_l || i.store <- 1 # Initialize L to fill up a new bundle # Verbosity level 1: print on bundle completion if(verbose>=1) { ibundles <- ibundles + 1 cat(" Done with bundle",ibundles," model sz =",m,"\n") } } # Find next model in enumeration: if(any(M < M.last)) { # Not at the last submodel yet, hence find the next one: i <- max(which(M < M.last)) # Pointer to the highest predictor index not in the last model M[i] <- M[i]+1 # Move this predictor index to point to the next predictor. if(i < m) { M[(i+1):m] <- (M[i]+1):(M[i]+m-i) } # Set the remaining to their minima. } else break } # end of 'repeat{' } # end of 'for(m in modelSZ) {' if(verbose>=1) { cat("p =",p,", d =",d," processed",Ntests,"tests in",Nmodels,"models. Times in seconds:\n") print(proc.time() - proc.time.init) } attributes(UL.max)$d <- d attributes(UL.max)$p <- p attributes(UL.max)$Nmodels <- Nmodels attributes(UL.max)$Ntests <- Ntests class(UL.max) <- "PoSI" UL.max } # end of 'PoSI <- function(...)' #================================================================ # Computation of PoSI, Scheffe and Bonferroni constants: summary.PoSI <- function(PoSI.object, confidence=c(.95,.99), alpha=NULL, df.err=NULL, eps.PoSI=1E-6, digits=3) { if(class(PoSI.object) != "PoSI") { cat("K.PoSI: first argument is not of class 'PoSI'.\n"); return } rU <- 1/PoSI.object df <- attributes(PoSI.object)$d if(is.null(alpha)) { alpha <- 1-confidence } else { confidence <- 1-alpha } # The following functions rely on lexical scoping: if(is.null(df.err)) { # sigma known ==> z-statistics prob.PoSI <- function(K) mean(pchisq((K*rU)^2, df)) quant.Sch <- function() sqrt(qchisq(confidence, df=df)) quant.Bonf <- function() qnorm(1 - alpha/2/attributes(PoSI.object)$Ntests) } else { # sigma estimated with df.err ==> t-statistics prob.PoSI <- function(K) mean(pf((K*rU)^2/df, df, df.err)) quant.Sch <- function() sqrt(df*qf(confidence, df1=df, df2=df.err)) quant.Bonf <- function() qt(1 - alpha/2/attributes(PoSI.object)$Ntests, df=df.err) } # Bisection search for K.PoSI: K.PoSI <- sapply(confidence, function(cvg) { K.lo <- 0 K.hi <- 30 while(abs(K.lo - K.hi) > eps.PoSI) { K <- (K.lo + K.hi)/2 if(mean(prob.PoSI(K)) > cvg) { K.hi <- K } else { K.lo <- K } } K } ) K.Sch <- quant.Sch() K.Bonf <- quant.Bonf() Ks <- cbind("K.PoSI"=K.PoSI, "K.Scheffe"=K.Sch, "K.Bonferroni"=K.Bonf) rownames(Ks) <- paste(confidence*100,"%",sep="") round(Ks, digits) } # end of 'summary.PoSI <- function(...' K.PoSI <- function(PoSI.object, confidence=c(.95,.99), alpha=NULL, df.err=NULL, eps.PoSI=1E-6, digits=3) { if(class(PoSI.object) != "PoSI") { cat("K.PoSI: first argument is not of class 'PoSI'.\n"); return } rU <- 1/PoSI.object df <- attributes(PoSI.object)$d if(is.null(alpha)) { alpha <- 1-confidence } else { confidence <- 1-alpha } if(is.null(df.err)) { # sigma known ==> z-statistics prob.PoSI <- function(K) mean(pchisq((K*rU)^2, df)) } else { # sigma estimated with df.err ==> t-statistics prob.PoSI <- function(K) mean(pf((K*rU)^2/df, df, df.err)) } # Bisection search for K.PoSI: K.PoSI <- sapply(confidence, function(cvg) { K.lo <- 0 K.hi <- 30 while(abs(K.lo - K.hi) > eps.PoSI) { K <- (K.lo + K.hi)/2 if(mean(prob.PoSI(K)) > cvg) { K.hi <- K } else { K.lo <- K } } K } ) names(K.PoSI) <- confidence round(K.PoSI, digits) } # end of 'K.PoSI <- function(...' # Generic function: K <- function(object, ...) { UseMethod("K") } #================================================================ PoSI Valid Post-Selection Inference for multiple linear regression Description: 'PoSI()' creates an object of class "PoSI" from a design/predictor matrix. PoSI objects are used to obtain the standard error multipliers or z-test/t-test thresholds for valid post-selection inference in multiple linear regression according to Berk et al. (2013, Annals of Statistics). These multipliers are valid after performing arbitrary variable selection in the specified set of submodels. The functions 'summary()' and 'K()' compute the actual multipliers from an object of class 'PoSI' returned by the function 'PoSI()'. The function 'summary()' provides PoSI multipliers for given confidence levels as well as more conservative multipliers based on the Scheffe method of simultaneous inference and Bonferroni adjustment; 'K()' provides just the PoSI multipliers. By default the multipliers are provided for z-tests assuming the error variance is known; if error degrees of freedom are specified ('df.err'), the multipliers are for t-tests. Usage: PoSI(X, modelSZ=1:ncol(X), Nsim=1000, bundleSZ=10000, eps=1E-8, center=T, scale=T, verbose=1) summary(PoSI.object, confidence=c(.95,.99), alpha=NULL, df.err=NULL, eps.PoSI=1E-6, digits=3) K(PoSI.object, confidence=c(.95,.99), alpha=NULL, df.err=NULL, eps.PoSI=1E-6, digits=3 Arguments: X: a design/predictor/covariate matrix for multiple linear regression [When there are predictors that are 'protected', that is, forced into the model and not subjected to variable selection, one should adjust the selectable predictors for the protected predictors (i.e., replace the selectable predictors with their residual vectors when regressing them on the protected predictors); thereafter simply omit the protected predictors from X.] modelSZ: vector of submodel sizes to be searched Default: 1:ncol(X), i.e., all submodels of all sizes. If only models up to size 5 (e.g.) are considered, use 1:5. Nsim: the number of random unit vectors sampled in the column space of X; default: Nsim=1000 'PoSI' is simulation-based. Larger 'Nsim' increases precision but may cause running out of memory. bundleSZ: the number of contrasts/adjusted predictor vectors processed at any given time; default: bundleSZ=10000 'PoSI' is more efficient with larger 'bundleSZ', but it may cause running out of memory. eps: threshold below which singular values of X are considered to be zero In case of highly collinear columns of X this threshold determines the effective dimension of the column space of X. Default: eps=1E-6 center: whether the columns of X should be centered In most applications the intercept is not among variables to be selected. Centering effectively adjusts for the presence of an intercept that should be included in all submodels. scale: whether the columns of X should be standardized This is appropriate to prevent numeric problems from predictors with vastly differing scales. verbose: verbosity reporting levels 0: no reports 1: report bundle completion (default); 2: report each processed submodel (for debugging) confidence: list of confidence levels for which standard error multipliers should be provided default: confidence=c(.95,.99) alpha: if specified, sets 'confidence = 1-alpha' df.err: error degrees of freedom for t-tests default: df.err=NULL (z-tests) eps.PoSI: precision to which standard error multipliers are computed digits: number of digits to which standard error multipliers are rounded Details: 'PoSI' is simulation-based and requires specification of a number 'Nsim' of random unit vectors to be sampled in the column space of 'X'. Large 'Nsim' yields greater precision but requires more memory. The memory demands can be lowered by decreasing 'bundleSZ' at the cost of some efficiency. 'bundleSZ' determines how many adjusted predictor vectors are being processed at any given time. The computational limit of the PoSI method is in the exponential growth of the number of adjusted predictor vectors: - If p=ncol(X) and all submodels are being searched, the number of adjusted predictor vectors is p*2^(p-1). Example: p=20, m=1:20 ==> |{adj.vecs.}| = 10,485,760 - If only models of a given size 'm' are being searched, the number of adjusted predictor vectors is m*choose(p,m). Example: p=50, m=1:5 ==> |{adj.vecs.}| = 11,576,300 Thus limiting PoSI to small submodel sizes such as 'm=1:4' ("sparse PoSI") puts larger p=ncol(X) within reach. Value: 'PoSI' returns an object of 'class' '"PoSI"'. 'summary' returns a small matrix with standard error multipliers for all requested confidence levels as well as computed by three different methods: the PoSI method, Scheffe simultaneous inference, and Bonferroni adjustment. 'K' returns just the PoSI-based standard error multipliers for the requested confidence levels or for specified Type I errors 'alpha' (default: NULL, overwrites 'confidence = 1 - alpha' if non-NULL). #================================================================ # OBSERVATIONS ON COMPUTATIONAL EXPENSE: # *** Full PoSI: # Here is the list of total number of linear constrasts in ALL submodels. p <- 1:20; cbind("predictors"=p, "PoSI contrasts"=p*2^(p-1)) ## predictors PoSI contrasts ## [1,] 1 1 ## [2,] 2 4 ## [3,] 3 12 ## [4,] 4 32 ## [5,] 5 80 ## [6,] 6 192 ## [7,] 7 448 ## [8,] 8 1024 ## [9,] 9 2304 ## [10,] 10 5120 ## [11,] 11 11264 ## [12,] 12 24576 ## [13,] 13 53248 ## [14,] 14 114688 ## [15,] 15 245760 ## [16,] 16 524288 ## [17,] 17 1114112 ## [18,] 18 2359296 ## [19,] 19 4980736 ## [20,] 20 10485760 # E.g., p=20 predictors involves inference for about 10.5 million linear contrasts. # This takes about 25 minutes (Lenovo X230 in 2014). # *** Sparse PoSI: # If we are interested only in models of a certain size, # here is how to calculate the number of contrasts: m <- 1:5 # model sizes 1, 2, 3, 4, 5 p <- 64 # total number of predictors sum(m*choose(p,m)) ## [1] 40793152 # ==> ~41 million linear contrasts # ==> Takes about 100 min (Lenovo X230 in 2014). #================================================================ # TESTS: # Just 1 predictor: X.1 <- as.matrix(rnorm(100)) UL.max.1 <- PoSI(X.1) summary(UL.max.1) ## K.PoSI K.Scheffe K.Bonferroni ## 95% 1.960 1.960 1.960 ## 99% 2.576 2.576 2.576 summary(UL.max.1, df.err=4) ## K.PoSI K.Scheffe K.Bonferroni ## 95% 2.776 2.776 2.776 ## 99% 4.604 4.604 4.604 # Small example, testing whether removal of intercept and small N works: p <- 6; N <- 4 X.small <- cbind(1,matrix(rnorm(N*p), ncol=p)) UL.max.small <- PoSI(X.small, modelSZ=c(4,3,1), Nsim=1000, bundleSZ=5, verbose=2) summary(UL.max.small, df.err=4) ## K.PoSI K.Scheffe K.Bonferroni ## 95% 4.228 4.447 9.256 ## 99% 6.733 7.077 13.988 # Orthogonal design: p <- 10; N <- 10 X.orth <- qr.Q(qr(matrix(rnorm(p*N), ncol=p))) UL.max.orth <- PoSI(X.orth) summary(UL.max.orth) ## K.PoSI K.Scheffe K.Bonferroni ## 95% 3.379 4.113 4.422 ## 99% 3.885 4.655 4.758 # Boston Housing data: X.boston <- read.table("http://stat.wharton.upenn.edu/~buja/STAT-541/boston.dat", header=T) dim(X.boston) # [1] 506 14 colnames(X.boston) UL.max.boston <- PoSI(X.boston[,-14]) # 4.87 seconds summary(UL.max.boston, df.err=506-13-1) ## K.PoSI K.Scheffe K.Bonferroni ## 95% 3.620 4.756 4.967 ## 99% 4.115 5.307 5.287 # Large p=50, small N=20: p <- 50; N <- 20 X.p100.N20 <- matrix(rnorm(p*N), ncol=p) # Sparse submodels m=4 out of p=50 in d=19 dimensions -- 1.3 minutes UL.max.p100.N20 <- PoSI(X.p100.N20, Nsim=1000, bundleSZ=100000, modelSZ=1:4) ## p = 50 , d = 19 processed 982500 tests in 251175 models. Times in seconds: ## user system elapsed ## 170.32 13.65 184.16 summary(UL.max.p100.N20) ## K.PoSI K.Scheffe K.Bonferroni ## 95% 4.297 5.490 5.448 ## 99% 4.751 6.016 5.728 # Autism data: can't give out the data, but here is the size p <- 84; N <- 2758 X.84 <- matrix(rnorm(p*N), ncol=p) # Submodel sizes m=84 and m=83 with more simulations (10,000) for precision -- 11.5 sec UL.max.84 <- PoSI(X.84, Nsim=10000, bundleSZ=10000, modelSZ=c(p-1,p), verbose=2) # Out of interest, what is the distribution of inner products? hist(UL.max.84, col="gray", breaks=40) # Sparse submodels of size m=4 out of p=84 in p=d=84 dimensions, 7.7 million contrasts -- 25.5 minutes (!!!!) UL.max.84.4 <- PoSI(X.84, Nsim=1000, bundleSZ=100000, modelSZ=4) ## p = 84 , d = 84 processed 7718004 tests in 1929501 models. Times in seconds: ## user system elapsed ## 1916.80 106.21 2023.77 summary(UL.max.84.4) ## K.PoSI K.Scheffe K.Bonferroni ## 95% 3.555 10.315 5.804 ## 99% 3.970 10.819 6.068 summary(UL.max.84.4, df.err=2758-84-1) ## K.PoSI K.Scheffe K.Bonferroni ## 95% 3.560 10.338 5.823 ## 99% 3.976 10.855 6.089 # Big experiment 1: full large PoSI (takes about 13.5 minutes (*)) p <- 20; N <- 1000 X.p20 <- matrix(rnorm(N*p), ncol=p) UL.max.p20 <- PoSI(X.p20, bundleSZ=100000) # 2014/04/19: ## p = 20 , d = 20 processed 10485760 tests in 1048575 models. Times in seconds: ## user system elapsed ## 1110.37 114.68 1225.31 # about 20min summary(UL.max.p20) ## K.PoSI K.Scheffe K.Bonferroni ## 95% 3.171 5.605 5.855 ## 99% 3.619 6.129 6.117 summary(UL.max.p20, df.err=1000-21) ## K.PoSI K.Scheffe K.Bonferroni ## 95% 3.180 5.624 5.908 ## 99% 3.631 6.160 6.177 # Big experiment 2: sparse large PoSI (models <= 5; takes about 32 minutes (*)) p <- 50; N <- 1000 X.p50.m5 <- matrix(rnorm(N*p), ncol=p) UL.max.p50.m5 <- PoSI(X.p50.m5, bundleSZ=100000, modelSZ=1:5) # 2014/04/19: ## p = 50 , d = 50 processed 11576300 tests in 2369935 models. Times in seconds: ## user system elapsed ## 1901.24 117.49 2036.86 summary(UL.max.p50.m5) ## K.PoSI K.Scheffe K.Bonferroni ## 95% 3.522 8.216 5.871 ## 99% 3.951 8.727 6.133 # (*) Elapsed times were measured in R version 3.0.2, 32 bit, # on a processor Intel(R) Core(TM), 2.9 GHz, under Windows 7. # 2014/09/11 #================================================================ # EXCHANGEABLE DESIGNS: # parametrized as quarter circle from xj=e to xj = {ej adjusted for e=(1...1)' and normalized} # {...} = (ej - e/p)/sqrt(1-1/p) # Function to create exchangeable designs this way: design.exch <- function(p,a) { (1-a)*diag(p) + a*matrix(1,p,p) } p <- 12; a <- 0.5; # Exchangeable design with parameter 'a': K(PoSI(X=design.exch(p=p, a=a) , verbose=0, modelSZ=1:p)) # Orthogonal design: K(PoSI(X=diag(p), center=F, verbose=0)) # ==> quite bit lower than exchangeable design, # so asymptotic theory sets for much bigger p #================================================================