#================================================================
#
#
#     R SOFTWARE FOR COMPUTING POSI CONSTANTS: SAMPLE CODE
#
#
# Authors: Andreas Buja, Kai Zhang
# 2014/12/11
#
# Reference: http://arxiv.org/abs/1306.1059
#            "Valid Post-Selection Inference,"
#            by Berk, R., Brown, L., Buja, A., Zhang, K., Zhao,L.,
#            The Annals of Statistics, 41 (2), 802-837 (2013)
#            
#
#================================================================
##
##
## PoSI
##
## Valid Post-Selection Inference for multiple linear regression
##
## Description:
##
##     'PoSI()' creates an object of class "PoSI" from a design/predictor
##     matrix.  PoSI objects are used to obtain the standard error
##     multipliers or z-test/t-test thresholds for valid post-selection
##     inference in multiple linear regression according to Berk et
##     al. (2013, Annals of Statistics).  These multipliers are valid
##     after performing arbitrary variable selection in the specified set
##     of submodels.
##
##     The functions 'summary()' and 'K()' compute the actual multipliers
##     from an object of class 'PoSI' returned by the function 'PoSI()'.
##     The function 'summary()' provides PoSI multipliers for given
##     confidence levels as well as more conservative multipliers based
##     on the Scheffe method of simultaneous inference and Bonferroni
##     adjustment; 'K()' provides just the PoSI multipliers.  By default
##     the multipliers are provided for z-tests assuming the error
##     variance is known; if error degrees of freedom are specified
##     ('df.err'), the multipliers are for t-tests.
##                                                           
##
## Usage:
##
##     PoSI(X, modelSZ=1:ncol(X), Nsim=1000, bundleSZ=10000, 
##          eps=1E-8, center=T, scale=T, verbose=1)
##
##     summary(PoSI.object, confidence=c(.95,.99), alpha=NULL, df.err=NULL,
##             eps.PoSI=1E-6, digits=3)
##
##     K(PoSI.object, confidence=c(.95,.99), alpha=NULL, df.err=NULL,
##       eps.PoSI=1E-6, digits=3
##
## Arguments:
##
##            X: a design/predictor/covariate matrix for multiple linear
##               regression
##
##               [When there are predictors that are 'protected', that
##               is, forced into the model and not subjected to variable
##               selection, one should adjust the selectable predictors
##               for the protected predictors (i.e., replace the
##               selectable predictors with their residual vectors when
##               regressing them on the protected predictors); thereafter
##               simply omit the protected predictors from X.]
##              
##      modelSZ: vector of submodel sizes to be searched
##               Default: 1:ncol(X), i.e., all submodels of all sizes.
##               If only models up to size 5 (e.g.) are considered, use 1:5.
##
##         Nsim: the number of random unit vectors sampled in the column
##               space of X; default: Nsim=1000
##               'PoSI' is simulation-based. Larger 'Nsim' increases
##               precision but may cause running out of memory.
##
##     bundleSZ: the number of contrasts/adjusted predictor vectors
##               processed at any given time; default: bundleSZ=10000
##               'PoSI' is more efficient with larger 'bundleSZ', but it
##               may cause running out of memory.
##
##          eps: threshold below which singular values of X are
##               considered to be zero
##               In case of highly collinear columns of X this threshold
##               determines the effective dimension of the column space
##               of X.  Default: eps=1E-6
##
##       center: whether the columns of X should be centered
##               In most applications the intercept is not among
##               variables to be selected.  Centering effectively adjusts
##               for the presence of an intercept that should be included
##               in all submodels.
##
##        scale: whether the columns of X should be standardized
##               This is appropriate to prevent numeric problems from
##               predictors with vastly differing scales.
##
##      verbose: verbosity reporting levels
##               0: no reports
##               1: report bundle completion (default);
##               2: report each processed submodel (for debugging)
##
##   confidence: list of confidence levels for which standard error
##               multipliers should be provided
##               default: confidence=c(.95,.99)
##
##        alpha: if specified, sets  'confidence = 1-alpha'
##
##       df.err: error degrees of freedom for t-tests
##               default: df.err=NULL (z-tests)
##
##     eps.PoSI: precision to which standard error multipliers are
##               computed
##
##       digits: number of digits to which standard error multipliers are
##               rounded
##
## Details:
##
##       'PoSI' is simulation-based and requires specification of a
##       number 'Nsim' of random unit vectors to be sampled in the column
##       space of 'X'.  Large 'Nsim' yields greater precision but
##       requires more memory.  The memory demands can be lowered by
##       decreasing 'bundleSZ' at the cost of some efficiency.
##       'bundleSZ' determines how many adjusted predictor vectors are
##       being processed at any given time.
##
##       The computational limit of the PoSI method is in the exponential
##       growth of the number of adjusted predictor vectors:
##       - If p=ncol(X) and all submodels are being searched, the number
##         of adjusted predictor vectors is p*2^(p-1).
##         Example: p=20, m=1:20  ==>  |{adj.vecs.}| = 10,485,760
##       - If only models of a given size 'm' are being searched, the number
##         of adjusted predictor vectors is m*choose(p,m).
##         Example: p=50, m=1:5   ==>  |{adj.vecs.}| = 11,576,300
##       Thus limiting PoSI to small submodel sizes such as 'm=1:4'
##       ("sparse PoSI") puts larger p=ncol(X) within reach.
##
## Value:
##
##       'PoSI' returns an object of 'class' '"PoSI"'.
##
##       'summary' returns a small matrix with standard error multipliers
##       for all requested confidence levels as well as computed by three
##       different methods: the PoSI method, Scheffe simultaneous
##       inference, and Bonferroni adjustment.
##
##       'K' returns just the PoSI-based standard error multipliers for
##       the requested confidence levels or for specified Type I errors
##       'alpha' (default: NULL, overwrites 'confidence = 1 - alpha' if
##       non-NULL).
##
##
#================================================================

# FIRST STEP: SOURCE THE POSI FUNCTIONS 

# Copy/paste the following line into your R interpreter:

    source("http://stat.wharton.upenn.edu/~buja/PAPERS/PoSI-source-code.R")


#================================================================


# OBSERVATIONS ON COMPUTATIONAL EXPENSE:


# *** Full PoSI:
# Here is the list of total number of linear constrasts in ALL submodels.
p <- 1:20;  cbind("predictors"=p, "PoSI contrasts"=p*2^(p-1))
##         predictors   PoSI contrasts
##  [1,]            1                1
##  [2,]            2                4
##  [3,]            3               12
##  [4,]            4               32
##  [5,]            5               80
##  [6,]            6              192
##  [7,]            7              448
##  [8,]            8             1024
##  [9,]            9             2304
## [10,]           10             5120
## [11,]           11            11264
## [12,]           12            24576
## [13,]           13            53248
## [14,]           14           114688
## [15,]           15           245760
## [16,]           16           524288
## [17,]           17          1114112
## [18,]           18          2359296
## [19,]           19          4980736
## [20,]           20         10485760
# E.g., p=20 predictors involves inference for about 10.5 million linear contrasts.
# This takes about 25 minutes (Lenovo X230 in 2014).


# *** Sparse PoSI:
# If we are interested only in models of a certain size,
# here is how to calculate the number of contrasts:
m <- 1:5          # model sizes 1, 2, 3, 4, 5
p <- 64           # total number of predictors
sum(m*choose(p,m))
## [1] 40793152
# ==> ~41 million linear contrasts
# ==> Takes about 100 min (Lenovo X230 in 2014).


#================================================================


# TESTS EXAMPLES:

# All variables 'X.' are for predictor matrices.
# All variables 'UL.' are for PoSI objects.

# The PoSI cut-offs will vary quite strongly in the examples
# where the X matrices is simulated.
# The PoSI cut-offs will vary mildly where X is fixed
# because PoSI computations require a degree of simulation.

# Just 1 predictor:
X.1 <- as.matrix(rnorm(100))
UL.max.1 <- PoSI(X.1)
summary(UL.max.1)
##     K.PoSI K.Scheffe K.Bonferroni
## 95%  1.960     1.960        1.960
## 99%  2.576     2.576        2.576
summary(UL.max.1, df.err=4)
##     K.PoSI K.Scheffe K.Bonferroni
## 95%  2.776     2.776       2.776
## 99%  4.604     4.604       4.604


# Small example, testing whether the automatic removal of intercept and small N works:
p <- 6;  N <- 4
X.small <- cbind(1,matrix(rnorm(N*p), ncol=p))
UL.max.small <- PoSI(X.small, modelSZ=c(4,3,1), Nsim=1000, bundleSZ=5, verbose=2)
summary(UL.max.small, df.err=4)
##     K.PoSI K.Scheffe K.Bonferroni
## 95%  4.228     4.447        9.256
## 99%  6.733     7.077       13.988


# Orthogonal design:
p <- 10; N <- 10
X.orth <- qr.Q(qr(matrix(rnorm(p*N), ncol=p)))
UL.max.orth <- PoSI(X.orth)
summary(UL.max.orth)
##     K.PoSI K.Scheffe K.Bonferroni
## 95%  3.379     4.113        4.422
## 99%  3.885     4.655        4.758


# Boston Housing data:
X.boston <- read.table("http://stat.wharton.upenn.edu/~buja/STAT-541/boston.dat", header=T)
dim(X.boston) # [1] 506  14
colnames(X.boston)
UL.max.boston <- PoSI(X.boston[,-14])   # 4.87 seconds
summary(UL.max.boston, df.err=506-13-1)
##     K.PoSI K.Scheffe K.Bonferroni
## 95%  3.620     4.756        4.967
## 99%  4.115     5.307        5.287


# Large p=50, small N=20: 
p <- 50;  N <- 20
X.p100.N20 <- matrix(rnorm(p*N), ncol=p)
# Sparse submodels m=4 out of p=50 in d=19 dimensions -- 1.3 minutes
UL.max.p100.N20 <- PoSI(X.p100.N20, Nsim=1000, bundleSZ=100000, modelSZ=1:4)
## p = 50 , d = 19   processed 982500 tests in 251175 models.  Times in seconds:
##    user  system elapsed 
##  170.32   13.65  184.16 
summary(UL.max.p100.N20)
##     K.PoSI K.Scheffe K.Bonferroni
## 95%  4.297     5.490        5.448
## 99%  4.751     6.016        5.728


# Autism data: can't give out the data, but here is the size
p <- 84;  N <- 2758
X.84 <- matrix(rnorm(p*N), ncol=p)
# Submodel sizes m=84 and m=83 with more simulations (10,000) for precision -- 11.5 sec
UL.max.84 <- PoSI(X.84, Nsim=10000, bundleSZ=10000, modelSZ=c(p-1,p), verbose=2)
# Out of interest, what is the distribution of inner products?
hist(UL.max.84, col="gray", breaks=40)

# Sparse submodels of size m=4 out of p=84 in p=d=84 dimensions, 7.7 million contrasts -- 25.5 minutes (!!!!)
UL.max.84.4 <- PoSI(X.84, Nsim=1000, bundleSZ=100000, modelSZ=4)
## p = 84 , d = 84   processed 7718004 tests in 1929501 models.  Times in seconds:
##    user  system elapsed 
## 1916.80  106.21 2023.77 
summary(UL.max.84.4)
##     K.PoSI K.Scheffe K.Bonferroni
## 95%  3.555    10.315        5.804
## 99%  3.970    10.819        6.068
summary(UL.max.84.4, df.err=2758-84-1)
##     K.PoSI K.Scheffe K.Bonferroni
## 95%  3.560    10.338        5.823
## 99%  3.976    10.855        6.089


# Big experiment 1: full large PoSI (takes about 13.5 minutes (*))
p <- 20;  N <- 1000
X.p20 <- matrix(rnorm(N*p), ncol=p)
UL.max.p20 <- PoSI(X.p20, bundleSZ=100000)  # 2014/04/19:
## p = 20 , d = 20   processed 10485760 tests in 1048575 models.  Times in seconds:
##    user  system elapsed 
## 1110.37  114.68 1225.31  # about 20min
summary(UL.max.p20)
##     K.PoSI K.Scheffe K.Bonferroni
## 95%  3.171     5.605        5.855
## 99%  3.619     6.129        6.117
summary(UL.max.p20, df.err=1000-21)
##     K.PoSI K.Scheffe K.Bonferroni
## 95%  3.180     5.624        5.908
## 99%  3.631     6.160        6.177

# Big experiment 2: sparse large PoSI (models <= 5; takes about 32 minutes (*))
p <- 50;  N <- 1000
X.p50.m5 <- matrix(rnorm(N*p), ncol=p)
UL.max.p50.m5 <- PoSI(X.p50.m5, bundleSZ=100000, modelSZ=1:5)  # 2014/04/19:
## p = 50 , d = 50   processed 11576300 tests in 2369935 models.  Times in seconds:
##    user  system elapsed 
## 1901.24  117.49 2036.86 
summary(UL.max.p50.m5)
##     K.PoSI K.Scheffe K.Bonferroni
## 95%  3.522     8.216        5.871
## 99%  3.951     8.727        6.133

# (*) Elapsed times were measured in R version 3.0.2, 32 bit,
#     on a processor Intel(R) Core(TM), 2.9 GHz, under Windows 7.
#     2014/09/11


#================================================================


# EXCHANGEABLE DESIGNS:
#   parametrized as quarter circle from  xj=e  to  xj = {ej adjusted for e=(1...1)' and normalized}
#   {...} = (ej - e/p)/sqrt(1-1/p)
# Function to create exchangeable designs this way:
design.exch <- function(p,a) { (1-a)*diag(p) + a*matrix(1,p,p) }
p <- 12;  a <- 0.5;  
# Exchangeable design with parameter 'a':
X.exch <- design.exch(p=p, a=a)
UL.exch <- PoSI(X.exch, verbose=0, modelSZ=1:p)
K(UL.exch)

# ORTHOGONAL DESIGNS: 
p <- 12
X.orth <- diag(p)
UL.orth <- PoSI(X.orth, center=F, verbose=0)
K(UL.orth)   
# ==> Quite bit lower than exchangeable design.
      

#================================================================