#================================================================
#
#
#          R SOFTWARE FOR COMPUTING POSI CONSTANTS
#
#
# Authors: Andreas Buja, Kai Zhang
# 2014/09/11
#
# Reference: http://arxiv.org/abs/1306.1059
#            "Valid Post-Selection Inference,"
#            by Berk, R., Brown, L., Buja, A., Zhang, K., Zhao,L.,
#            The Annals of Statistics, 41 (2), 802-837 (2013)
#            
#
#================================================================


# Computations of inner products of random unit vectors (U)
# with 'linear contrasts', that is, adjusted predictor vectors (L):
PoSI <- function(X, Nsim=1000, bundleSZ=100000, modelSZ=1:ncol(X), 
                 eps=1E-8, center=T, scale=T, verbose=1)
{
    if(verbose>=1) { proc.time.init <- proc.time() }
    # Standardize the columns of X for numerical stability; if intercept is to be selected, set 'center=F':
    X.scaled <- scale(X, center=center, scale=scale)
    # Remove columns with NAs:
    sel <- apply(X.scaled, 2, function(x) !any(is.na(x)))
    X.scaled <- X.scaled[,sel]  
    if(verbose >= 1 & any(!sel)) cat("Removed column(s)",which(!sel),"due to NAs after standardizing'\n")
    # Map X to Scheffe canonical coordinates X.cc:  X.cc' X.cc = X'X  (d x p)
    X.svd   <- svd(X.scaled)
    sel <- (X.svd$d > eps)
    if(verbose >= 1 & any(!sel)) cat("Reduced rank:",sum(sel),"  (Change eps?)\n")
    X.cc <- X.svd$d[sel] * t(X.svd$v[,sel]) # Full-model Scheffe canonical coords. of X.
    # Clean up:
    rm(X.scaled, X.svd)
    # Notation:
    p <- ncol(X.cc)                         # Number of predictors
    d <- nrow(X.cc)                         # d=rank(X), can be < p when X is rank-deficient, as when p>n
    # Submodels can only be of size <= d; drop those exceeding d:
    modelSZ <- modelSZ[modelSZ <= d]        # Remove models that are too large.
    # Random unit vectors for which inner products with adjusted predictors (L) will be formed:
    U <- matrix(rnorm(d*Nsim),ncol=d)       # Nsim x d
    U <- U / sqrt(rowSums(U^2))             # Matrix of unit vectors u in R^d (=rows) for max_l |<u,l>|
    # Prepare loop over models to store all possible linear combinations l:
    UL.max <- rep(0,Nsim)                   # To contain  max_l |<u,l>|  for each random row u of U.
    L <- matrix(NA, nrow=bundleSZ, ncol=d)  # One bundle of adjusted predictors l (bundleSZ x d)
    Nmodels  <- 0                           # To be returned among attributes()
    Ntests   <- 0                           # To be returned among attributes()
    ibundles <- 0                           # For verbose>=1
    if(verbose>=1) {
        cat("Number of contrasts/adjusted predictors to process:",sum(choose(p,modelSZ)*modelSZ),"\n")
        cat("Number of bundles:",sum(ceiling(choose(p,modelSZ)*modelSZ/bundleSZ)),"\n")
    }
    for(m in modelSZ) {                     # Loop over model sizes.
        bundleSZ <- max(bundleSZ, m)        # Make sure one bundle is able to contain at least one model.
        i.store <- 1                        # Pointer into next free row of L (= d-vectors)
        M <- 1:m                            # Current submodel, initialized to first submodel
        M.last <- (p-m+1):p                 # The last submodel of size m
        repeat {                            # Loop over bundles of adjusted predictors
            # Verbosity level 2: print all models & dims
            if(verbose>=2) cat("Model:", M,"\n")
            X.m <- X.cc[,M]                 # Select predictors for this submodel
            X.qr <- qr(t(X.m)%*%X.m)        # QR decomposition of submodel in canon. coords.
            if(X.qr$rank < length(M)) next  # Rank-deficient submodel ==> omit.
            L[i.store:(i.store+m-1),] <- t(X.m %*% qr.solve(X.qr)) # Submodel adjusted predictors (rows of L)
            Nmodels <- Nmodels + 1
            Ntests  <- Ntests + m
            i.store <- i.store+m            # Move pointer to next free column of L.
            # If bundle L of adjusted predictors l is full, or if this was the last model, then
            # accumulate  max_l |<u,l>|  and re-initialize L:
            if(( (i.store+m-1) > bundleSZ ) | !(any(M < M.last) )) {
                L[1:(i.store-1),] <- L[1:(i.store-1),] /     # Normalize rows of L to unit length:
                                     sqrt(rowSums(L[1:(i.store-1),,drop=F]^2))
                # Implementation note: Against 'help()', '..%*%t(..)' is a touch faster than 'tcrossprod()'.
                UL <- abs(U %*% t(L[1:(i.store-1),,drop=F])) # Inner prods |<u,l>| of rows of U and L
                UL.max <- pmax(UL.max, apply(UL, 1, max))    # Accumulate  max_l |<u,l>|
                i.store <- 1                                 # Initialize L to fill up a new bundle
                # Verbosity level 1: print on bundle completion
                if(verbose>=1) {
                    ibundles <- ibundles + 1
                    cat("                         Done with bundle",ibundles,"   model sz =",m,"\n") 
                }
            }
            # Find next model in enumeration:
            if(any(M < M.last)) {            # Not at the last submodel yet, hence find the next one:
                i <- max(which(M < M.last))  # Pointer to the highest predictor index not in the last model
                M[i] <- M[i]+1   # Move this predictor index to point to the next predictor.
                if(i < m) { M[(i+1):m] <- (M[i]+1):(M[i]+m-i) } # Set the remaining to their minima.
            } else break
        } # end of 'repeat{'
    } # end of 'for(m in modelSZ) {'
    if(verbose>=1) {
        cat("p =",p,", d =",d,"  processed",Ntests,"tests in",Nmodels,"models.  Times in seconds:\n")
        print(proc.time() - proc.time.init)
    }
    attributes(UL.max)$d <- d
    attributes(UL.max)$p <- p
    attributes(UL.max)$Nmodels <- Nmodels
    attributes(UL.max)$Ntests  <- Ntests
    class(UL.max) <- "PoSI"
    UL.max
} # end of 'PoSI <- function(...)'


#================================================================


# Computation of PoSI, Scheffe and Bonferroni constants:
summary.PoSI <- function(PoSI.object, confidence=c(.95,.99), alpha=NULL, df.err=NULL, eps.PoSI=1E-6, digits=3)
{
    if(class(PoSI.object) != "PoSI") { cat("K.PoSI: first argument is not of class 'PoSI'.\n");  return }
    rU <- 1/PoSI.object
    df <- attributes(PoSI.object)$d
    if(is.null(alpha)) { alpha <- 1-confidence } else { confidence <- 1-alpha }
    # The following functions rely on lexical scoping:
    if(is.null(df.err)) { # sigma known  ==>  z-statistics
        prob.PoSI  <- function(K) mean(pchisq((K*rU)^2, df))
        quant.Sch  <- function() sqrt(qchisq(confidence, df=df))
        quant.Bonf <- function() qnorm(1 - alpha/2/attributes(PoSI.object)$Ntests)
    } else { # sigma estimated with df.err  ==>  t-statistics
        prob.PoSI  <- function(K) mean(pf((K*rU)^2/df, df, df.err)) 
        quant.Sch  <- function() sqrt(df*qf(confidence, df1=df, df2=df.err))
        quant.Bonf <- function() qt(1 - alpha/2/attributes(PoSI.object)$Ntests, df=df.err)
    }
    # Bisection search for K.PoSI:
    K.PoSI <- sapply(confidence,
                     function(cvg) {
                         K.lo <- 0
                         K.hi <- 30
                         while(abs(K.lo - K.hi) > eps.PoSI) {
                             K <- (K.lo + K.hi)/2
                             if(mean(prob.PoSI(K)) > cvg) {
                                 K.hi <- K
                             } else {
                                 K.lo <- K
                             } }
                         K }
                     )
    K.Sch <- quant.Sch()
    K.Bonf <- quant.Bonf()
    Ks <- cbind("K.PoSI"=K.PoSI, "K.Scheffe"=K.Sch, "K.Bonferroni"=K.Bonf)
    rownames(Ks) <- paste(confidence*100,"%",sep="")
    round(Ks, digits)
} # end of 'summary.PoSI <- function(...'

K.PoSI <- function(PoSI.object, confidence=c(.95,.99), alpha=NULL, df.err=NULL, eps.PoSI=1E-6, digits=3)
{
    if(class(PoSI.object) != "PoSI") { cat("K.PoSI: first argument is not of class 'PoSI'.\n");  return }
    rU <- 1/PoSI.object
    df <- attributes(PoSI.object)$d
    if(is.null(alpha)) { alpha <- 1-confidence } else { confidence <- 1-alpha }
    if(is.null(df.err)) { # sigma known  ==>  z-statistics
        prob.PoSI  <- function(K) mean(pchisq((K*rU)^2, df))
    } else { # sigma estimated with df.err  ==>  t-statistics
        prob.PoSI  <- function(K) mean(pf((K*rU)^2/df, df, df.err)) 
    }
    # Bisection search for K.PoSI:
    K.PoSI <- sapply(confidence,
                     function(cvg) {
                         K.lo <- 0
                         K.hi <- 30
                         while(abs(K.lo - K.hi) > eps.PoSI) {
                             K <- (K.lo + K.hi)/2
                             if(mean(prob.PoSI(K)) > cvg) {
                                 K.hi <- K
                             } else {
                                 K.lo <- K
                             } }
                         K }
                     )
    names(K.PoSI) <- confidence
    round(K.PoSI, digits)
} # end of 'K.PoSI <- function(...'

# Generic function:
K <- function(object, ...) { UseMethod("K") }


#================================================================

PoSI

Valid Post-Selection Inference for multiple linear regression

Description:

    'PoSI()' creates an object of class "PoSI" from a design/predictor
    matrix.  PoSI objects are used to obtain the standard error
    multipliers or z-test/t-test thresholds for valid post-selection
    inference in multiple linear regression according to Berk et
    al. (2013, Annals of Statistics).  These multipliers are valid
    after performing arbitrary variable selection in the specified set
    of submodels.

    The functions 'summary()' and 'K()' compute the actual multipliers
    from an object of class 'PoSI' returned by the function 'PoSI()'.
    The function 'summary()' provides PoSI multipliers for given
    confidence levels as well as more conservative multipliers based
    on the Scheffe method of simultaneous inference and Bonferroni
    adjustment; 'K()' provides just the PoSI multipliers.  By default
    the multipliers are provided for z-tests assuming the error
    variance is known; if error degrees of freedom are specified
    ('df.err'), the multipliers are for t-tests.
                                                           

Usage:

    PoSI(X, modelSZ=1:ncol(X), Nsim=1000, bundleSZ=10000, 
         eps=1E-8, center=T, scale=T, verbose=1)

    summary(PoSI.object, confidence=c(.95,.99), alpha=NULL, df.err=NULL,
            eps.PoSI=1E-6, digits=3)

    K(PoSI.object, confidence=c(.95,.99), alpha=NULL, df.err=NULL,
      eps.PoSI=1E-6, digits=3

Arguments:

           X: a design/predictor/covariate matrix for multiple linear
              regression

              [When there are predictors that are 'protected', that
              is, forced into the model and not subjected to variable
              selection, one should adjust the selectable predictors
              for the protected predictors (i.e., replace the
              selectable predictors with their residual vectors when
              regressing them on the protected predictors); thereafter
              simply omit the protected predictors from X.]
              
     modelSZ: vector of submodel sizes to be searched
              Default: 1:ncol(X), i.e., all submodels of all sizes.
              If only models up to size 5 (e.g.) are considered, use 1:5.

        Nsim: the number of random unit vectors sampled in the column
              space of X; default: Nsim=1000
              'PoSI' is simulation-based. Larger 'Nsim' increases
              precision but may cause running out of memory.

    bundleSZ: the number of contrasts/adjusted predictor vectors
              processed at any given time; default: bundleSZ=10000
              'PoSI' is more efficient with larger 'bundleSZ', but it
              may cause running out of memory.

         eps: threshold below which singular values of X are
              considered to be zero
              In case of highly collinear columns of X this threshold
              determines the effective dimension of the column space
              of X.  Default: eps=1E-6

      center: whether the columns of X should be centered
              In most applications the intercept is not among
              variables to be selected.  Centering effectively adjusts
              for the presence of an intercept that should be included
              in all submodels.

       scale: whether the columns of X should be standardized
              This is appropriate to prevent numeric problems from
              predictors with vastly differing scales.

     verbose: verbosity reporting levels
              0: no reports
              1: report bundle completion (default);
              2: report each processed submodel (for debugging)

  confidence: list of confidence levels for which standard error
              multipliers should be provided
              default: confidence=c(.95,.99)

       alpha: if specified, sets  'confidence = 1-alpha'

      df.err: error degrees of freedom for t-tests
              default: df.err=NULL (z-tests)

    eps.PoSI: precision to which standard error multipliers are
              computed

      digits: number of digits to which standard error multipliers are
              rounded

Details:

      'PoSI' is simulation-based and requires specification of a
      number 'Nsim' of random unit vectors to be sampled in the column
      space of 'X'.  Large 'Nsim' yields greater precision but
      requires more memory.  The memory demands can be lowered by
      decreasing 'bundleSZ' at the cost of some efficiency.
      'bundleSZ' determines how many adjusted predictor vectors are
      being processed at any given time.

      The computational limit of the PoSI method is in the exponential
      growth of the number of adjusted predictor vectors:
      - If p=ncol(X) and all submodels are being searched, the number
        of adjusted predictor vectors is p*2^(p-1).
        Example: p=20, m=1:20  ==>  |{adj.vecs.}| = 10,485,760
      - If only models of a given size 'm' are being searched, the number
        of adjusted predictor vectors is m*choose(p,m).
        Example: p=50, m=1:5   ==>  |{adj.vecs.}| = 11,576,300
      Thus limiting PoSI to small submodel sizes such as 'm=1:4'
      ("sparse PoSI") puts larger p=ncol(X) within reach.

Value:

      'PoSI' returns an object of 'class' '"PoSI"'.

      'summary' returns a small matrix with standard error multipliers
      for all requested confidence levels as well as computed by three
      different methods: the PoSI method, Scheffe simultaneous
      inference, and Bonferroni adjustment.

      'K' returns just the PoSI-based standard error multipliers for
      the requested confidence levels or for specified Type I errors
      'alpha' (default: NULL, overwrites 'confidence = 1 - alpha' if
      non-NULL).


#================================================================


# OBSERVATIONS ON COMPUTATIONAL EXPENSE:


# *** Full PoSI:
# Here is the list of total number of linear constrasts in ALL submodels.
p <- 1:20;  cbind("predictors"=p, "PoSI contrasts"=p*2^(p-1))
##         predictors   PoSI contrasts
##  [1,]            1                1
##  [2,]            2                4
##  [3,]            3               12
##  [4,]            4               32
##  [5,]            5               80
##  [6,]            6              192
##  [7,]            7              448
##  [8,]            8             1024
##  [9,]            9             2304
## [10,]           10             5120
## [11,]           11            11264
## [12,]           12            24576
## [13,]           13            53248
## [14,]           14           114688
## [15,]           15           245760
## [16,]           16           524288
## [17,]           17          1114112
## [18,]           18          2359296
## [19,]           19          4980736
## [20,]           20         10485760
# E.g., p=20 predictors involves inference for about 10.5 million linear contrasts.
# This takes about 25 minutes (Lenovo X230 in 2014).


# *** Sparse PoSI:
# If we are interested only in models of a certain size,
# here is how to calculate the number of contrasts:
m <- 1:5          # model sizes 1, 2, 3, 4, 5
p <- 64           # total number of predictors
sum(m*choose(p,m))
## [1] 40793152
# ==> ~41 million linear contrasts
# ==> Takes about 100 min (Lenovo X230 in 2014).


#================================================================


# TESTS:

# Just 1 predictor:
X.1 <- as.matrix(rnorm(100))
UL.max.1 <- PoSI(X.1)
summary(UL.max.1)
##     K.PoSI K.Scheffe K.Bonferroni
## 95%  1.960     1.960        1.960
## 99%  2.576     2.576        2.576
summary(UL.max.1, df.err=4)
##     K.PoSI K.Scheffe K.Bonferroni
## 95%  2.776     2.776       2.776
## 99%  4.604     4.604       4.604


# Small example, testing whether removal of intercept and small N works:
p <- 6;  N <- 4
X.small <- cbind(1,matrix(rnorm(N*p), ncol=p))
UL.max.small <- PoSI(X.small, modelSZ=c(4,3,1), Nsim=1000, bundleSZ=5, verbose=2)
summary(UL.max.small, df.err=4)
##     K.PoSI K.Scheffe K.Bonferroni
## 95%  4.228     4.447        9.256
## 99%  6.733     7.077       13.988


# Orthogonal design:
p <- 10; N <- 10
X.orth <- qr.Q(qr(matrix(rnorm(p*N), ncol=p)))
UL.max.orth <- PoSI(X.orth)
summary(UL.max.orth)
##     K.PoSI K.Scheffe K.Bonferroni
## 95%  3.379     4.113        4.422
## 99%  3.885     4.655        4.758


# Boston Housing data:
X.boston <- read.table("http://stat.wharton.upenn.edu/~buja/STAT-541/boston.dat", header=T)
dim(X.boston) # [1] 506  14
colnames(X.boston)
UL.max.boston <- PoSI(X.boston[,-14])   # 4.87 seconds
summary(UL.max.boston, df.err=506-13-1)
##     K.PoSI K.Scheffe K.Bonferroni
## 95%  3.620     4.756        4.967
## 99%  4.115     5.307        5.287


# Large p=50, small N=20: 
p <- 50;  N <- 20
X.p100.N20 <- matrix(rnorm(p*N), ncol=p)
# Sparse submodels m=4 out of p=50 in d=19 dimensions -- 1.3 minutes
UL.max.p100.N20 <- PoSI(X.p100.N20, Nsim=1000, bundleSZ=100000, modelSZ=1:4)
## p = 50 , d = 19   processed 982500 tests in 251175 models.  Times in seconds:
##    user  system elapsed 
##  170.32   13.65  184.16 
summary(UL.max.p100.N20)
##     K.PoSI K.Scheffe K.Bonferroni
## 95%  4.297     5.490        5.448
## 99%  4.751     6.016        5.728


# Autism data: can't give out the data, but here is the size
p <- 84;  N <- 2758
X.84 <- matrix(rnorm(p*N), ncol=p)
# Submodel sizes m=84 and m=83 with more simulations (10,000) for precision -- 11.5 sec
UL.max.84 <- PoSI(X.84, Nsim=10000, bundleSZ=10000, modelSZ=c(p-1,p), verbose=2)
# Out of interest, what is the distribution of inner products?
hist(UL.max.84, col="gray", breaks=40)

# Sparse submodels of size m=4 out of p=84 in p=d=84 dimensions, 7.7 million contrasts -- 25.5 minutes (!!!!)
UL.max.84.4 <- PoSI(X.84, Nsim=1000, bundleSZ=100000, modelSZ=4)
## p = 84 , d = 84   processed 7718004 tests in 1929501 models.  Times in seconds:
##    user  system elapsed 
## 1916.80  106.21 2023.77 
summary(UL.max.84.4)
##     K.PoSI K.Scheffe K.Bonferroni
## 95%  3.555    10.315        5.804
## 99%  3.970    10.819        6.068
summary(UL.max.84.4, df.err=2758-84-1)
##     K.PoSI K.Scheffe K.Bonferroni
## 95%  3.560    10.338        5.823
## 99%  3.976    10.855        6.089


# Big experiment 1: full large PoSI (takes about 13.5 minutes (*))
p <- 20;  N <- 1000
X.p20 <- matrix(rnorm(N*p), ncol=p)
UL.max.p20 <- PoSI(X.p20, bundleSZ=100000)  # 2014/04/19:
## p = 20 , d = 20   processed 10485760 tests in 1048575 models.  Times in seconds:
##    user  system elapsed 
## 1110.37  114.68 1225.31  # about 20min
summary(UL.max.p20)
##     K.PoSI K.Scheffe K.Bonferroni
## 95%  3.171     5.605        5.855
## 99%  3.619     6.129        6.117
summary(UL.max.p20, df.err=1000-21)
##     K.PoSI K.Scheffe K.Bonferroni
## 95%  3.180     5.624        5.908
## 99%  3.631     6.160        6.177

# Big experiment 2: sparse large PoSI (models <= 5; takes about 32 minutes (*))
p <- 50;  N <- 1000
X.p50.m5 <- matrix(rnorm(N*p), ncol=p)
UL.max.p50.m5 <- PoSI(X.p50.m5, bundleSZ=100000, modelSZ=1:5)  # 2014/04/19:
## p = 50 , d = 50   processed 11576300 tests in 2369935 models.  Times in seconds:
##    user  system elapsed 
## 1901.24  117.49 2036.86 
summary(UL.max.p50.m5)
##     K.PoSI K.Scheffe K.Bonferroni
## 95%  3.522     8.216        5.871
## 99%  3.951     8.727        6.133

# (*) Elapsed times were measured in R version 3.0.2, 32 bit,
#     on a processor Intel(R) Core(TM), 2.9 GHz, under Windows 7.
#     2014/09/11


#================================================================


# EXCHANGEABLE DESIGNS:
#   parametrized as quarter circle from  xj=e  to  xj = {ej adjusted for e=(1...1)' and normalized}
#   {...} = (ej - e/p)/sqrt(1-1/p)
# Function to create exchangeable designs this way:
design.exch <- function(p,a) { (1-a)*diag(p) + a*matrix(1,p,p) }
p <- 12;  a <- 0.5;  
# Exchangeable design with parameter 'a':
K(PoSI(X=design.exch(p=p, a=a) , verbose=0, modelSZ=1:p))
# Orthogonal design: 
K(PoSI(X=diag(p), center=F, verbose=0))   
# ==> quite bit lower than exchangeable design,
#     so asymptotic theory sets for much bigger p
      

#================================================================