#================================================================
#
# R SOFTWARE FOR COMPUTING POSI CONSTANTS: SOURCE CODE
#
# Authors: Andreas Buja, Kai Zhang
# 2014/12/11
#
# Reference: http://arxiv.org/abs/1306.1059
#    "Valid Post-Selection Inference,"
#    by Berk, R., Brown, L., Buja, A., Zhang, K., Zhao, L.,
#    The Annals of Statistics, 41 (2), 802-837 (2013)
#
#================================================================
##
## PoSI
##
## Valid Post-Selection Inference for multiple linear regression
##
## Description:
##
##   'PoSI()' creates an object of class "PoSI" from a design/predictor
##   matrix.  PoSI objects are used to obtain the standard error
##   multipliers or z-test/t-test thresholds for valid post-selection
##   inference in multiple linear regression according to Berk et
##   al. (2013, Annals of Statistics).  These multipliers are valid
##   after performing arbitrary variable selection in the specified set
##   of submodels.
##
##   The functions 'summary()' and 'K()' compute the actual multipliers
##   from an object of class 'PoSI' returned by the function 'PoSI()'.
##   The function 'summary()' provides PoSI multipliers for given
##   confidence levels as well as more conservative multipliers based
##   on the Scheffe method of simultaneous inference and Bonferroni
##   adjustment; 'K()' provides just the PoSI multipliers.  By default
##   the multipliers are for z-tests, assuming the error variance is
##   known; if error degrees of freedom are specified ('df.err'), the
##   multipliers are for t-tests.
##
## Usage:
##
##   PoSI(X, Nsim=1000, bundleSZ=100000, modelSZ=1:ncol(X),
##        eps=1E-8, center=T, scale=T, verbose=1)
##
##   summary(PoSI.object, confidence=c(.95,.99), alpha=NULL, df.err=NULL,
##           eps.PoSI=1E-6, digits=3)
##
##   K(PoSI.object, confidence=c(.95,.99), alpha=NULL, df.err=NULL,
##     eps.PoSI=1E-6, digits=3)
##
## Arguments:
##
##   X:        a design/predictor/covariate matrix for multiple linear
##             regression
##             [When there are predictors that are 'protected', that
##             is, forced into the model and not subjected to variable
##             selection, one should adjust the selectable predictors
##             for the protected predictors (i.e., replace the
##             selectable predictors with their residual vectors when
##             regressing them on the protected predictors); thereafter
##             simply omit the protected predictors from X.]
##
##   modelSZ:  vector of submodel sizes to be searched
##             Default: 1:ncol(X), i.e., all submodels of all sizes.
##             If only models up to size 5 (e.g.) are considered, use 1:5.
##
##   Nsim:     the number of random unit vectors sampled in the column
##             space of X; default: Nsim=1000
##             'PoSI' is simulation-based.  Larger 'Nsim' increases
##             precision but may cause running out of memory.
##
##   bundleSZ: the number of contrasts/adjusted predictor vectors
##             processed at any given time; default: bundleSZ=100000
##             'PoSI' is more efficient with larger 'bundleSZ', but it
##             may cause running out of memory.
##
##   eps:      threshold below which singular values of X are
##             considered to be zero; default: eps=1E-8
##             In case of highly collinear columns of X this threshold
##             determines the effective dimension of the column space
##             of X.
##
##   center:   whether the columns of X should be centered
##             In most applications the intercept is not among the
##             variables to be selected.  Centering effectively adjusts
##             for the presence of an intercept that should be included
##             in all submodels.
##
##   scale:    whether the columns of X should be standardized
##             This is appropriate to prevent numeric problems from
##             predictors with vastly differing scales.
##
##   verbose:  verbosity reporting levels
##             0: no reports
##             1: report bundle completion (default)
##             2: report each processed submodel (for debugging)
##
##   confidence: vector of confidence levels for which standard error
##             multipliers should be provided
##             default: confidence=c(.95,.99)
##
##   alpha:    if specified, sets 'confidence = 1 - alpha'
##
##   df.err:   error degrees of freedom for t-tests
##             default: df.err=NULL (z-tests)
##
##   eps.PoSI: precision to which standard error multipliers are
##             computed
##
##   digits:   number of digits to which standard error multipliers are
##             rounded
##
## Details:
##
##   'PoSI' is simulation-based and requires specification of the
##   number 'Nsim' of random unit vectors to be sampled in the column
##   space of 'X'.  Larger 'Nsim' yields greater precision but
##   requires more memory.  The memory demands can be lowered by
##   decreasing 'bundleSZ', at the cost of some efficiency.
##   'bundleSZ' determines how many adjusted predictor vectors are
##   processed at any given time.
##
##   The computational limit of the PoSI method is in the exponential
##   growth of the number of adjusted predictor vectors:
##   - If p=ncol(X) and all submodels are being searched, the number
##     of adjusted predictor vectors is p*2^(p-1).
##     Example: p=20, m=1:20  ==>  |{adj.vecs.}| = 10,485,760
##   - If only models of a single size 'm' are being searched, the
##     number of adjusted predictor vectors is m*choose(p,m); for a
##     range of sizes it is the sum of these counts.
##     Example: p=50, m=1:5   ==>  |{adj.vecs.}| = 11,576,300
##   Thus limiting PoSI to small submodel sizes such as 'm=1:4'
##   ("sparse PoSI") puts larger p=ncol(X) within reach.
##
## Value:
##
##   'PoSI' returns an object of class "PoSI".
##
##   'summary' returns a small matrix with standard error multipliers
##   for all requested confidence levels, computed by three different
##   methods: the PoSI method, Scheffe simultaneous inference, and
##   Bonferroni adjustment.
##
##   'K' returns just the PoSI-based standard error multipliers for
##   the requested confidence levels or for specified Type I errors
##   'alpha' (default: NULL; if non-NULL, 'alpha' overrides
##   'confidence' via 'confidence = 1 - alpha').
##
##   (A commented usage example is appended at the end of this file.)
##
#================================================================

# Computations of inner products of random unit vectors (U)
# with 'linear contrasts', that is, adjusted predictor vectors (L):

PoSI <- function(X, Nsim=1000, bundleSZ=100000, modelSZ=1:ncol(X),
                 eps=1E-8, center=T, scale=T, verbose=1)
{
    if(verbose>=1) { proc.time.init <- proc.time() }
    # Standardize the columns of X for numerical stability;
    # if the intercept is to be selected, set 'center=F':
    X.scaled <- scale(X, center=center, scale=scale)
    # Remove columns with NAs:
    sel <- apply(X.scaled, 2, function(x) !any(is.na(x)))
    X.scaled <- X.scaled[,sel]
    if(verbose >= 1 & any(!sel))
        cat("Removed column(s)", which(!sel), "due to NAs after standardizing\n")
    # Map X to Scheffe canonical coordinates X.cc:  X.cc' X.cc = X'X  (d x p)
    X.svd <- svd(X.scaled)
    sel <- (X.svd$d > eps)
    if(verbose >= 1 & any(!sel)) cat("Reduced rank:", sum(sel), " (Change eps?)\n")
    X.cc <- X.svd$d[sel] * t(X.svd$v[,sel])  # Full-model Scheffe canonical coords. of X.
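    # [Note added for clarity] If X.scaled = U D V' is the SVD, then X.cc = D V'
    # (restricted to singular values > eps), so crossprod(X.cc) reproduces
    # crossprod(X.scaled) up to numerical error while nrow(X.cc) is only
    # d = rank(X).  An illustrative sanity check (not part of the algorithm;
    # it would have to run before the rm() below):
    #   stopifnot(isTRUE(all.equal(crossprod(X.cc), crossprod(X.scaled),
    #                              check.attributes=FALSE)))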
    # Clean up:
    rm(X.scaled, X.svd)

    # Notation:
    p <- ncol(X.cc)  # Number of predictors
    d <- nrow(X.cc)  # d=rank(X), can be < p when X is rank-deficient, as when p>n

    # Submodels can only be of size <= d; drop those exceeding d:
    modelSZ <- modelSZ[modelSZ <= d]  # Remove models that are too large.

    # Random unit vectors for which inner products with adjusted predictors (L) will be formed:
    U <- matrix(rnorm(d*Nsim), ncol=d)  # Nsim x d
    U <- U / sqrt(rowSums(U^2))         # Matrix of unit vectors u in R^d (=rows) for max_l |<u,l>|

    # Prepare loop over models to store all possible linear combinations l:
    UL.max <- rep(0, Nsim)                  # To contain max_l |<u,l>| for each random row u of U.
    L <- matrix(NA, nrow=bundleSZ, ncol=d)  # bundleSZ x d, one bundle of adjusted predictors l
    Nmodels  <- 0  # Number of processed models;  to be returned among attributes()
    Ntests   <- 0  # Number of processed t-tests; to be returned among attributes()
    ibundles <- 0  # Bundle counter, to be reported when 'verbose >= 1'.
    if(verbose>=1) {
        cat("Number of contrasts/adjusted predictors to process:", sum(choose(p,modelSZ)*modelSZ), "\n")
        cat("Number of bundles:", sum(ceiling(choose(p,modelSZ)*modelSZ/bundleSZ)), "\n")
    }
    for(m in modelSZ) {  # Loop over model sizes.
        if(m > bundleSZ) {  # Silly: bundleSZ is too small to contain a single model; fix it.
            bundleSZ <- max(bundleSZ, m)
            L <- matrix(NA, nrow=bundleSZ, ncol=d)
        }
        i.store <- 1          # Pointer to the next free row of L (= d-vectors)
        M <- 1:m              # M = current submodel, initialized to the first submodel
        M.last <- (p-m+1):p   # M.last = last submodel of size m
        repeat {  # Loop over bundles of adjusted predictors
            # Verbosity level 2: print each processed submodel
            if(verbose>=2) cat("Model:", M, "\n")
            X.m <- X.cc[,M]              # Select predictors for this submodel
            X.qr <- qr(t(X.m) %*% X.m)   # QR decomposition of submodel in canon. coords.
            if(X.qr$rank == length(M)) { # Process only if full rank; else skip to next.
                L[i.store:(i.store+m-1),] <- t(X.m %*% qr.solve(X.qr))  # Submodel adjusted predictors (rows of L)
                Nmodels <- Nmodels + 1
                Ntests  <- Ntests + m
                i.store <- i.store + m   # Move pointer to the next free row of L.
                # If the bundle L of adjusted predictors l is full, or if this was the last model,
                # then accumulate max_l |<u,l>| and re-initialize L:
                if(( (i.store-1+m) > bundleSZ ) | !any(M < M.last)) {
                    ii.store <- 1:(i.store-1)               # Filled rows in L
                    L[ii.store,] <- L[ii.store,,drop=F] /   # Normalize rows of L to unit length:
                                    sqrt(rowSums(L[ii.store,,drop=F]^2))
                    # Implementation note: Against 'help()', '..%*%t(..)' is a touch faster than 'tcrossprod()'.
                    UL <- abs(U %*% t(L[ii.store,,drop=F]))         # Inner prods |<u,l>| of rows of U and L
                    ## UL <- abs(tcrossprod(U, L[ii.store,,drop=F]))  # Inner prods |<u,l>| of rows of U and L
                    UL.max <- pmax(UL.max, apply(UL, 1, max))       # Accumulate max_l |<u,l>|
                    i.store <- 1  # Re-initialize L to fill up a new bundle.
                    # Verbosity level 1: print on bundle completion
                    if(verbose>=1) {
                        ibundles <- ibundles + 1
                        cat("  Done with bundle", ibundles, "  model sz =", m, "\n")
                    }
                }  # end of 'if(( (i.store-1+m...'
            }  # end of 'if(X.qr$rank...'
            # Find the next model in the enumeration:
            if(any(M < M.last)) {  # Not at the last submodel yet, hence find the next one:
                i <- max(which(M < M.last))  # Pointer to the highest predictor index that can still be advanced
                M[i] <- M[i] + 1             # Move this predictor index to the next predictor.
                if(i < m) { M[(i+1):m] <- (M[i]+1):(M[i]+m-i) }  # Set the remaining indices to their minima.
            } else break
        }  # end of 'repeat {'
    }  # end of 'for(m in modelSZ) {'
    if(verbose>=1) {
        cat("p =", p, ", d =", d, " processed", Ntests, "tests in", Nmodels, "models.  Times in seconds:\n")
        print(proc.time() - proc.time.init)
    }
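    # [Note added for clarity] Why UL.max suffices: for Z ~ N(0, I_d), write
    # Z = ||Z|| * U with U uniform on the unit sphere and independent of ||Z||,
    # so that max_l |<Z,l>| = ||Z|| * max_l |<U,l>| for unit-length l.  Hence
    #   P( max_l |<Z,l>| <= K ) = E_U[ pchisq( (K / max_l |<U,l>|)^2, d ) ],
    # which summary() and K() below approximate by averaging over the Nsim
    # simulated rows of U; only the per-row maxima UL.max need to be kept.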
    attributes(UL.max)$d <- d
    attributes(UL.max)$p <- p
    attributes(UL.max)$Nmodels <- Nmodels
    attributes(UL.max)$Ntests <- Ntests
    class(UL.max) <- "PoSI"
    UL.max
}  # end of 'PoSI <- function(...)'

#================================================================
# Computation of PoSI, Scheffe and Bonferroni constants:

summary.PoSI <- function(PoSI.object, confidence=c(.95,.99), alpha=NULL, df.err=NULL,
                         eps.PoSI=1E-6, digits=3)
{
    if(!inherits(PoSI.object, "PoSI")) {
        cat("summary.PoSI: first argument is not of class 'PoSI'.\n"); return(invisible(NULL))
    }
    rU <- 1/PoSI.object
    df <- attributes(PoSI.object)$d
    if(is.null(alpha)) { alpha <- 1-confidence } else { confidence <- 1-alpha }
    # The following functions rely on lexical scoping:
    if(is.null(df.err)) {  # sigma known ==> z-statistics
        prob.PoSI <- function(K) mean(pchisq((K*rU)^2, df))
        quant.Sch <- function() sqrt(qchisq(confidence, df=df))
        quant.Bonf <- function() qnorm(1 - alpha/2/attributes(PoSI.object)$Ntests)
    } else {               # sigma estimated with df.err ==> t-statistics
        prob.PoSI <- function(K) mean(pf((K*rU)^2/df, df, df.err))
        quant.Sch <- function() sqrt(df*qf(confidence, df1=df, df2=df.err))
        quant.Bonf <- function() qt(1 - alpha/2/attributes(PoSI.object)$Ntests, df=df.err)
    }
    # Bisection search for K.PoSI:
    K.PoSI <- sapply(confidence,
                     function(cvg) {
                         K.lo <- 0;  K.hi <- 30
                         while(abs(K.lo - K.hi) > eps.PoSI) {
                             K <- (K.lo + K.hi)/2
                             if(prob.PoSI(K) > cvg) { K.hi <- K } else { K.lo <- K }
                         }
                         K
                     } )
    K.Sch  <- quant.Sch()
    K.Bonf <- quant.Bonf()
    Ks <- cbind("K.PoSI"=K.PoSI, "K.Scheffe"=K.Sch, "K.Bonferroni"=K.Bonf)
    rownames(Ks) <- paste(confidence*100, "%", sep="")
    round(Ks, digits)
}  # end of 'summary.PoSI <- function(...'

K.PoSI <- function(PoSI.object, confidence=c(.95,.99), alpha=NULL, df.err=NULL,
                   eps.PoSI=1E-6, digits=3)
{
    if(!inherits(PoSI.object, "PoSI")) {
        cat("K.PoSI: first argument is not of class 'PoSI'.\n"); return(invisible(NULL))
    }
    rU <- 1/PoSI.object
    df <- attributes(PoSI.object)$d
    if(is.null(alpha)) { alpha <- 1-confidence } else { confidence <- 1-alpha }
    if(is.null(df.err)) {  # sigma known ==> z-statistics
        prob.PoSI <- function(K) mean(pchisq((K*rU)^2, df))
    } else {               # sigma estimated with df.err ==> t-statistics
        prob.PoSI <- function(K) mean(pf((K*rU)^2/df, df, df.err))
    }
    # Bisection search for K.PoSI:
    K.PoSI <- sapply(confidence,
                     function(cvg) {
                         K.lo <- 0;  K.hi <- 30
                         while(abs(K.lo - K.hi) > eps.PoSI) {
                             K <- (K.lo + K.hi)/2
                             if(prob.PoSI(K) > cvg) { K.hi <- K } else { K.lo <- K }
                         }
                         K
                     } )
    names(K.PoSI) <- confidence
    round(K.PoSI, digits)
}  # end of 'K.PoSI <- function(...'

# Generic function:
K <- function(object, ...) { UseMethod("K") }

#================================================================
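# Example usage (an illustrative sketch added for orientation, not part of the
# original code; the data are simulated and all object names are hypothetical):
#
#   set.seed(1)
#   n <- 100; p <- 6
#   X <- matrix(rnorm(n*p), n, p)   # toy design matrix
#   posi <- PoSI(X)                 # search all submodels of sizes 1:p
#   summary(posi)                   # PoSI, Scheffe, Bonferroni multipliers (z-tests)
#   summary(posi, df.err=n-p-1)     # t-test versions, assuming an intercept is fit
#   K(posi, confidence=.95)         # just the 95% PoSI multiplier
#
#================================================================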