###################################### # Bayesian Model Averaging Program # ###################################### # This version: adjusted on 2011-05-05 # Martin Feldkircher # martin.feldkircher@gzpace.net, http://feldkircher.gzpace.net # Stefan Zeugner # stefan.zeugner@ulb.ac.be, http://www.zeugner.eu ##################### # The main code starts at line 169 with the function "bms=function(....)" and is written # by Martin Feldkircher and Stefan Zeugner as part of their work at the Institute for Advanced Studies (IHS), # Vienna in 2006/2007. Descriptions of the algorithms and priors used can be found in # Gary Koop ("Bayesian Econometrics", Wiley & Sons), Fernandez, C., E. Ley and M.F.J. Steel (2001b) # ?Model Uncertainty in CrossCountry Growth Regressions,? Journal of Applied Econometrics, # Fernandez, C., E. Ley and M.F.J. Steel (2001a) ?Benchmark Priors for Bayesian Model Averaging,? # Journal of Econometrics, 100: 381?427. and # Liang, F., R. Paulo, G. Molina, M.A. Clyde, and J.O. Berger (2008): "Mixtures of g Priors for Bayesian Variable Selection", # Journal of the American Statistical Association, 103: 410-423 #################### # USAGE # ################################################################################################### # bms <-function(X.data,burn=1000,iter=NA,nmodel=100,mcmc="bd",g="UIP",mprior="random",mprior.size=NA,user.int=TRUE, # start.value=NA,g.stats=TRUE,logfile=FALSE,logstep=10000,force.full.ols=FALSE) # ###################### # FUNCTION ARGUMENTS # ################################################################################################### #X.data data.frame or matrix: submit a data frame or a matrix, where the first column corresponds to the dependent variable # followed by the covariates, including a constant term is not necessary since y and X is # demeaned automatically. #burn integer >=0: is the number of burn-in draws, default 1000 (not taken into account if mcmc="enumerate") #iter integer >=0: is the number of posterior draws, default 3000 (not taken into account if mcmc="enumerate"); # if mcmc="enumerate", then iter is the number of models to be sampled, starting from 0 (default 2^K-1) - cf. start.value #nmodel integer >=0: is the number of best models for which information is stored. Convergence analysis of # the sampler by means of the correlation between analytic posterior model probabilities # (pmp's) and hat of the MCMC sampler is based on the number you have set in nmodels. Also # if you want to save the regression coefficients (beta.save=T), they are taken from the # nmodel best models. Setting nmodel500 slows down the MCMC sampler. Note that posterior # inclusion probabilites, and mean calculations are based on the MCMC frequencies as opposed # to exact analytical calculations (as in Fernandez, Ley and Steel). # Set nmodel=0 to speed up sampling (if topmodel information is not needed) #mcmc character: * MC3 samplers: default is "bd" which corresponds to a birth / death MCMC alogrithm. You can choose # also "rev.jump" where we have implemented a true reversible jump algorithm adding a "move" step to the birth / death steps from "bd". # * enumeraton sampler: moreover there is mcmc="enumerate"/"enum" which will enumerate all possible regressor combinations (not advisable for K>23) # * interaction sampler: adding an ".int" to an MC3 sampler (e.g. "mcmc="bd.int") does an interaction sampler # interaction terms will only be sampled along with their component variables, in the colnumn names of X.data interaction terms need to be # denominated by names consisting of the base terms separated by "#" (e.g. an interaction term of base variables "A", "B" and "C" needs column name "A#B#C") #g character or integer: is the hyperparameter on Zellner's g-prior for the regression coefficients. You can specify g="UIP", corresponding to g=N (default) # g="bric" corresponding to the benchmark prior suggestion from FLS (2001), i.e g=max(N, K^2) # with K denoting the total number of regressors and N the number of observations, g="EBL" estimates a # local empirical Bayes g-parameter (as in Liang et a. (2008), JASA). g="hyper", takes the 'hyper-g' # prior distribution (As in Liang et al.) with the default hyper-parameter a=3; This hyperparameter can # be adjusted (between 20.2. # The default value start.value=NA corresponds to start.value=min(nrow(X)-2,ncol(X),nrow(X)-3) # start.value=0 or start.value=NULL starts from the null model # * if mcmc="enumerate" then start.value is the index to start the iteration (default=0) . Any number between 0 and K^2-1 is admissible #g.stats TRUE or FALSE whether statistics on the shrinkage factor should be collected. default=TRUE # set g.stats=FALSE for faster iteration #logfile setting logfile=TRUE produces a logfile named "test.log" in your current working directory, # in order to keep track of the sampling procedure. # setting logfile equal to some filepath (like logfile="subfolder/bla.txt") puts the logfile # into that specified position. (default: logfile=FALSE) # Note that logfile="" implies log printouts on the console #logstep specifies at which number of posterior draws information is written to the log file; default: 100 000 iterations #force.full.ols default FALSE. If force.full.ols=TRUE, the ols estimation part of the sampling procedure relies on slower matrix inversion, # instead of streamlined routines. force.full.ols=TRUE can slow down sampling but may deal better with highly collinear data #beta.save obsolete #int obsolete, see "mcmc" #exact obsolete: see estimates.bma() #ask.set obsolete, with no replacement #printRes obsolete; see user.int #return.g.stats renamed intop 'g.stats' #theta renamed into 'mprior' #prior.msize renamed into 'mprior.size' # ################ # OUTPUT # ################################################################################################### # A call to bms returns a list/"bma" object with the following elements: #info a list containing some posterior statistics. it has the following elements # iter (numeric(1)) The number of iteration runs in the sampler (without burn-ins) # burn (numeric(1)) The number of burn-in iterations # inccount (numeric(K)) The (weighted) cumulative sum of the binary vector with regressor inclusions # models.visited (numeric(1)) the number of model candidates that have been accepted (including burn-ins) # b1mo ((numeric(K)) the (weighted) cumulative sum of first posterior moment of beta: Sum p(M)E(beta|X,y,M) # b2mo ((numeric(K)) the (weighted) cumulative sum of second posterior moment of beta: Sum p(M)E(beta^2|X,y,M) # add.otherstats (numeric(0 or some integer)) cumulative sum of some additional statistics (such as posterior moments of g) # cumsumweights (numeric(1)) the denominator to turn cumulative sums into (weighted) averages # K (numeric(1)) number of covariates in X.data # N (numeric(1)) number of observations in X.data # corr.pmp (numeric(1)) correlation between analytical and MCMC frequencies for the best nmodel models; if mcmc="enum", then this is NA # msize (numeric(1)) the cumulative sum of model sizes # timed (difftime) time taken for the main sampling routine in seconds # k.vec (numeric(K+1)) cumulative sum of the different model siyes, from zero to K # cons (numeric(1)) scalar value of E(constant|X,y), same as BMAOBJECT$cons # pos.sign (numeric(K)) cumulative sum of the coefficients>0, from 1 to K # arguments named list with the all the function arguments to the function bms, after being adjusted for inconsistencies # topmod object/list containing the best nmodel models; for mor information see help on function .top10 # start.pos (numeric(<=K)) the indexes of covariates in the actual starting model, that started the MCMC sampling chain # gprior.info (list) A list containing information on the gprior: its type ("BRIC", "EBL", "hyper" or "numeric"), whether it is constant, and its # first moment (in case of the hyper-prior, also the second moment) # X.data (data.frame) the data in the bms function argument X.data; the same as BMAOBJECT$arguments$X.data, retained for backward compatibility # reg.names (character(K)) the covariates' column names (generic ones, if no names were provided with X.data) # bms.call (call) the function call to bms() as it was entered into the command line (regularized by match.call()) # ############################### # Deprecated output elements # ################################################################################################### # in order to convert your bma object into a bma object of versions before this version: 20090612, # use the function as.oldbma(BMAOBJECT) # ----- The following are deprecated elements of BMAOBJECT =bms(...) ------- # estimates use estimates.bma(BMAOBJECT) # estimates when exact=TRUE: use estimates.bma(BMAOBJECT, exact=TRUE) # info use info.bma(BMAOBJECT) # topmodels use topmodels.bma(BMAOBJECT) # beta.draws use beta.draws.bma(BMAOBJECT) # pmp.10 use pmp.bma(BMAOBJECT) # # # ###### BMS MAIN FUNCTION ######################### #' Bayesian Model Sampling and Averaging #' #' Given data and prior information, this function samples all possible model #' combinations via MC3 or enumeration and returns aggregate results. #' #' Ad \code{mcmc}: \cr Interaction sampler: adding an ".int" to an MC3 sampler #' (e.g. "mcmc="bd.int") provides for special treatment of interaction terms. #' Interaction terms will only be sampled along with their component variables: #' In the colnumn names of X.data, interaction terms need to be denominated by #' names consisting of the base terms separated by \code{#} (e.g. an #' interaction term of base variables \code{"A"}, \code{"B"} and \code{"C"} #' needs column name \code{"A#B#C"}). Then variable \code{"A#B#C"} will only be #' included in a model if all of the component variables ("A", "B", and "C") #' are included. #' #' The MC3 samplers "\code{bd}", "\code{rev.jump}", "\code{bd.int}" and #' "\code{rev.jump.int}", iterate away from a starting model by adding, #' dropping or swapping (only in the case of rev.jump) covariates. #' #' In an MCMC fashion, they thus randomly draw a candidate model and then move #' to it in case its marginal likelihood (marg.lik.) is superior to the #' marg.lik. of the current model. #' #' In case the candidate's marg.lik is inferior, it is randomly accepted or #' rejected according to a probability formed by the ratio of candidate #' marg.lik over current marg.lik. Over time, the sampler should thus converge #' to a sensible distribution. For aggregate results based on these MC3 #' frequencies, the first few iterations are typically disregarded (the #' 'burn-ins'). #' #' Ad \code{g} and the hyper-g prior: The hyper-g prior introduced by Liang et #' al. (2008) puts a prior distribution on the shrinkage factor \eqn{g/(1+g)}, #' namely a Beta distribution \eqn{ Beta(1, 1/2-1)} that is governed by the #' parameter \eqn{a}. \eqn{a=4} means a uniform prior distribution of the #' shrinkage factor, while \eqn{a>2} close to 2 concentrates the prior #' shrinkage factor close to one. \cr The prior expected value is #' \eqn{E(g/1+g)) = 2/a}. In this sense \code{g="hyper=UIP"} and #' \code{g="hyper=BRIC"} set the prior expected shrinkage such that it conforms #' to a fixed UIP-g (eqng=N) or BRIC-g (\eqn{g=max(K^2,N)} ). #' #' #' @param X.data a data frame or a matrix, with the dependent variable in the #' first column, followed by the covariates (alternatively, \code{X.data} can #' also be provided as a \code{\link{formula}}). Note that \code{bms} #' automatically estimates a constant, therefore including constant terms is #' not necessary. #' @param burn The (positive integer) number of burn-in draws for the MC3 #' sampler, defaults to 1000. (Not taken into account if mcmc="enumerate") #' @param iter If mcmc is set to an MC3 sampler, then this is the number of #' iteration draws to be sampled (ex burn-ins), default 3000 draws. \cr If #' \code{mcmc="enumerate"}, then iter is the number of models to be sampled, #' starting from 0 (defaults to \eqn{2^K-1}) - cf. \code{start.value}. #' @param nmodel the number of best models for which information is stored #' (default 500). Best models are used for convergence analysis between #' likelihoods and MCMC frequencies, as well as likelihood-based inference.\cr #' Note that a very high value for \code{nmodel} slows down the sampler #' significantly. Set nmodel=0 to speed up sampling (if best model information #' is not needed). #' @param mcmc a character denoting the model sampler to be used.\cr The MC3 #' sampler \code{mcmc="bd"} corresponds to a birth/death MCMC algogrithm. #' \code{mcmc="rev.jump"} enacts a reversible jump algorithm adding a "swap" #' step to the birth / death steps from "bd".\cr Alternatively, the entire #' model space may be fully enumerated by setting \code{mcmc="enumerate"} which #' will iterate all possible regressor combinations (Note: consider that this #' means \eqn{2^K} iterations, where K is the number of covariates.)\cr Default #' is full enumeration (\code{mcmc="enumerate"}) with less then 15 covariates, #' and the birth-death MC3 sampler (\code{mcmc="bd"}) with 15 covariates or #' more. Cf. section 'Details' for more options. #' @param g the hyperparameter on Zellner's g-prior for the regression #' coefficients.\cr \code{g="UIP"} corresponds to \eqn{g=N}, the number of #' observations (default);\cr \code{g="BRIC"} corresponds to the benchmark #' prior suggested by Fernandez, Ley and Steel (2001), i.e \eqn{g=max(N, K^2)}, #' where K is the total number of covariates;\cr \code{g="RIC"} sets #' \eqn{g=K^2} and conforms to the risk inflation criterion by George and #' Foster (1994)\cr \code{g="HQ"} sets \eqn{g=log(N)^3} and asymptotically #' mimics the Hannan-Quinn criterion with \eqn{C_{HQ}=3} (cf. Fernandez, Ley #' and Steel, 2001, p.395)\cr \code{g="EBL"} estimates a local empirical Bayes #' g-parameter (as in Liang et al. (2008));\cr \code{g="hyper"} takes the #' 'hyper-g' prior distribution (as in Liang et al., 2008) with the default #' hyper-parameter \eqn{a} set such that the prior expected shrinkage factor #' conforms to 'UIP';\cr This hyperparameter \eqn{a} can be adjusted (between #' \eqn{20.2.\cr The default value #' \code{start.value=NA} corresponds to #' \code{start.value=min(ncol(X.data),nrow(X.data)-3)}. Note that #' \code{start.value=0} or \code{start.value=NULL} starts from the null #' model.\cr If \code{mcmc="enumerate"} then \code{start.value} is the index to #' start the iteration (default: 0, the null model) . Any number between 0 and #' \eqn{K^2-1} is admissible. #' @param g.stats \code{TRUE} if statistics on the shrinkage factor g/(1+g) #' should be collected, defaulting to TRUE (Note: set \code{g.stats=FALSE} for #' faster iteration.) #' @param logfile setting \code{logfile=TRUE} produces a logfile named #' \code{"test.log"} in your current working directory, in order to keep track #' of the sampling procedure. \code{logfile} equal to some filepath (like #' \code{logfile="subfolder/log.txt"}) puts the logfile into that specified #' position. (default: \code{logfile=FALSE}). Note that \code{logfile=""} #' implies log printouts on the console. #' @param logstep specifies at which number of posterior draws information is #' written to the log file; default: 10 000 iterations #' @param force.full.ols default FALSE. If \code{force.full.ols=TRUE}, the OLS #' estimation part of the sampling procedure relies on slower matrix inversion, #' instead of streamlined routines. \code{force.full.ols=TRUE} can slow down #' sampling but may deal better with highly collinear data #' @param fixed.reg indices or variable names of \code{X.data} that are fixed #' regressors to be always included in every sampled model. Note: the parameter #' \code{mprior.size} refers to prior model size including these fixed #' regressors. #' @return A list of class \code{bma}, that may be displayed using e.g. #' \code{\link{summary.bma}} or \code{\link{coef.bma}}. The list contains the #' following elements: \item{info}{a list of aggregate statistics: \code{iter} #' is the number of iterations, \code{burn} the number of burn-ins.\cr The #' following have to be divided by \code{cumsumweights} to get posterior #' expected values: \code{inccount} are the posterior inclusion probabilities, #' \code{b1mo} and \code{b2mo} the first and second moment of coefficients, #' \code{add.otherstats} other statistics of interest (typically the moments of #' the shrinkage factor), \code{msize} is the post. expected model size, #' \code{k.vec} the posterior model size distribution, \code{pos.sign} the #' unconditional post. probability of positive coefficients, \code{corr.pmp} is #' the correlation between the best models' MCMC frequencies and their marg. #' likelihoods.\cr \code{timed} is the time that was needed for MCMC sampling, #' \code{cons} is the posterior expected value of the constant. \code{K} and #' \code{N} are the maximum number of covariates and the sample size, #' respectively.} \item{arguments}{a list of the evaluated function arguments #' provided to \code{bms} (see above)} \item{topmod}{a 'topmod' object #' containing the best drawn models. see \code{\link{topmod}} for more details} #' \item{start.pos}{the positions of the starting model. If bmao is a'bma' #' object this corresponds to covariates bmao$reg.names[bmao$start.pos]. If #' bmao is a chain that resulted from several starting models (cf. #' \code{\link{c.bma}}, then \code{start.pos} is a list detailing all of them.} #' \item{gprior.info}{a list of class \code{\link{gprior-class}}, detailing #' information on the g-prior: \code{gtype} corresponds to argument \code{g} #' above, \code{is.constant} is FALSE if \code{gtype} is either "hyper" or #' "EBL", \code{return.g.stats} corresponds to argument \code{g.stats} above, #' \code{shrinkage.moments} contains the first and second moments of the #' shrinkage factor (only if \code{return.g.stats==TRUE}), \code{g} details the #' fixed g (if \code{is.constant==TRUE}), \code{hyper.parameter} corresponds to #' the hyper-g parameter \eqn{a} as in Liang et al. (2008) } #' \item{mprior.info}{a list of class \code{\link{mprior-class}}, detailing #' information on the model prior: \code{origargs} lists the original arguments #' to \code{mprior} and \code{mprior.size} above; \code{mp.msize} denotes the #' prior mode size; \code{mp.Kdist} is a (K+1) vector with the prior model size #' distribution from 0 to K} \item{X.data}{data.frame or matrix: corresponds to #' argument \code{X.data} above, possibly cleaned for NAs} #' \item{reg.names}{character vector: the covariate names to be used for X.data #' (corresponds to \code{\link{variable.names.bma}} } \item{bms.call}{the #' original call to the \code{bms} function} #' @note There are several ways to speed-up sampling: \code{nmodel=10} saves #' only the ten best models, at most a marginal improvement. \code{nmodels=0} #' does not save the best (500) models, however then posterior convergence and #' likelihood-based inference are not possible. %\code{beta.save=FALSE} saves #' the best models, but not their coefficients, which renders the use of #' \code{image.bma} and the paramer \code{exact=TRUE} in functions such as #' \code{coef.bma} infeasible. \code{g.stats=FALSE} saves some time by not #' retaining the shrinkage factors for the MC3 chain (and the best models). #' \code{force.fullobject=TRUE} in contrast, slows sampling down significantly #' if \code{mcmc="enumerate"}. #' @section Theoretical background: The models analyzed are Bayesian #' normal-gamma conjugate models with improper constant and variance priors #' akin to Fernandez, Ley and Steel (2001): A model \eqn{M} can be described as #' follows, with \eqn{\epsilon} ~ \eqn{N(0,\sigma^2 I)}: \deqn{latex}{ y= #' \alpha + X \beta + \epsilon} \deqn{f(\beta | \sigma, M, g) ~ N(0, g \sigma^2 #' (X'X)^-1) } #' #' Moreover, the (improper) prior on the constant \eqn{f(\alpha)} is put #' proportional to 1. Similarly, the variance prior \eqn{f(\sigma)} is #' proportional to \eqn{1/\sigma}. #' @author Martin Feldkircher, Paul Hofmarcher, and Stefan Zeugner #' @seealso \code{\link{coef.bma}}, \code{\link{plotModelsize}} and #' \code{\link{density.bma}} for some operations on the resulting 'bma' object, #' \code{\link{c.bma}} for integrating separate MC3 chains and splitting of #' sampling over several runs. #' #' Check \url{http://bms.zeugner.eu} for additional help. #' @references #' \url{http://bms.zeugner.eu}: BMS package homepage with help and tutorials #' #' Feldkircher, M. and S. Zeugner (2015): Bayesian Model Averaging Employing #' Fixed and Flexible Priors: The BMS Package for R, Journal of Statistical Software 68(4). #' #' Feldkircher, M. and S. Zeugner (2009): Benchmark Priors #' Revisited: On Adaptive Shrinkage and the Supermodel Effect in Bayesian Model #' Averaging, IMF Working Paper 09/202. #' #' Fernandez, C. E. Ley and M. Steel (2001): Benchmark priors for Bayesian #' model averaging. Journal of Econometrics 100(2), 381--427 #' #' Ley, E. and M. Steel (2008): On the Effect of Prior Assumptions in Bayesian #' Model Averaging with Applications to Growth Regressions. working paper #' #' Liang, F., Paulo, R., Molina, G., Clyde, M. A., and Berger, J. O. (2008). #' Mixtures of g Priors for Bayesian Variable Selection. Journal of the #' American Statistical Association 103, 410-423. #' #' Sala-i-Martin, X. and G. Doppelhofer and R.I. Miller (2004): Determinants of #' long-term growth: a Bayesian averaging of classical estimates (BACE) #' approach. American Economic Review 94(4), 813--835 #' @keywords models #' @examples #' #' data(datafls) #' #estimating a standard MC3 chain with 1000 burn-ins and 2000 iterations and uniform model priors #' bma1 = bms(datafls,burn=1000, iter=2000, mprior="uniform") #' #' ##standard coefficients based on exact likelihoods of the 100 best models: #' coef(bma1,exact=TRUE, std.coefs=TRUE) #' #' #suppressing user-interactive output, using a customized starting value, and not saving the best #' # ...models for only 19 observations (but 41 covariates) #' bma2 = bms(datafls[20:39,],burn=1000, iter=2000, nmodel=0, start.value=c(1,4,7,30), #' user.int=FALSE) #' coef(bma2) #' #' #MC3 chain with a hyper-g prior (custom coefficient a=2.1), saving only the 20 best models, #' # ...and an alternative sampling procedure; putting a log entry to console every 1000th step #' bma3 = bms(datafls,burn=1000, iter=5000, nmodel=20, g="hyper=2.1", mcmc="rev.jump", #' logfile="",logstep=1000) #' image(bma3) #showing the coefficient signs of the 20 best models #' #' #enumerating with 10 covariates (= 1024 models), keeping the shrinkage factors #' # ...of the best 200 models #' bma4 = bms(datafls[,1:11],mcmc="enumerate",nmodel=200,g.stats=TRUE) #' #' #using an interaction sampler for two interaction terms #' dataint=datafls #' dataint=cbind(datafls,datafls$LifeExp*datafls$Abslat/1000, #' datafls$Protestants*datafls$Brit-datafls$Muslim) #' names(dataint)[ncol(dataint)-1]="LifeExp#Abslat" #' names(dataint)[ncol(dataint)]="Protestants#Brit#Muslim" #' bma5 = bms(X.data=dataint,burn=1000,iter=9000,start.value=0,mcmc="bd.int") #' #' density(bma5,reg="English") # plot posterior density for covariate "English" #' #' # a matrix as X.data argument #' bms(matrix(rnorm(1000),100,10)) #' #' # keeping a set of fixed regressors: #' bms(datafls, mprior.size=7, fixed.reg = c("PrScEnroll", "LifeExp", "GDP60")) #' # Note that mprior.size=7 means prior model size of 3 fixed to 4 'uncertain' regressors #' #' @export bms <-function(X.data,burn=1000,iter=NA,nmodel=500,mcmc="bd",g="UIP",mprior="random",mprior.size=NA,user.int=TRUE, start.value=NA,g.stats=TRUE,logfile=FALSE,logstep=10000,force.full.ols=FALSE, fixed.reg=numeric(0)) { # beta.save=TRUE,exact=NA,int=NA,printRes=NA,ask.set=NA,return.g.stats=NA,theta=NULL,prior.msize=NULL #deprecated function arguments, retained for compatibility with older versions ### getting data dimensions #################### if (class(X.data)[[1]]=="formula") { X.data=stats::model.frame(X.data); if (!is.null(ncol(X.data[[2]]))) X.data=cbind(X.data[[1]],X.data[[2]][,-1]) } if (any(is.na(X.data))) { X.data=na.omit(X.data) if (nrow(X.data)<3) {stop("Too few data observations. Please provide at least three data rows without NA entries.") } warning("Argument 'X.data' contains NAs. The corresponding rows have not been taken into account.") } N<-nrow(X.data) K=ncol(X.data)-1 maxk=N-3 #maximum number of admissible k per model ############################################################################################################################################ #### User Checks: ########################################################################################################################## ############################################################################################################################################ # check for deprecated arguments # if (!is.na(exact)) { warning("Function argument 'exact' has been deprecated, please refer to function 'estimates.bma' instead.") } # if (!is.na(int)) { mcmc=paste(mcmc,".int",sep=""); warning("Function argument 'int' has been deprecated, please add an 'int' to the argument 'mcmc' instead.") } # if (!is.na(printRes)) { user.int=printRes; warning("Function argument 'printRes' has been deprecated, please refer to the argument 'user.int' instead.") } # if (!is.na(ask.set)) { warning("Function argument 'ask.set' has been deprecated, with no replacement.") } # if (!is.na(return.g.stats)) { g.stats=return.g.stats; warning("Function argument 'return.g.stats' has been renamed into 'g.stats'.") } # if (!is.null(theta)) { mprior=theta; warning("Function argument 'theta' has been renamed into 'mprior'.") } # if (!is.null(prior.msize)) { mprior.size=prior.msize; warning("Function argument 'prior.msize' has been renamed into 'mprior.size'.") } # return.g.stats=g.stats; #theta=mprior; prior.msize=mprior.size if (is.null(nmodel[1])||is.na(nmodel[1])||nmodel[1]<=0) {dotop=FALSE;nmodel=0} else {dotop=TRUE} ########################################################################### nameix=1:K; names(nameix)=colnames(X.data[,-1,drop=FALSE]); fixed.pos=nameix[fixed.reg]; rm(nameix) ###################################################################################################################################### #assign the sampling procedure if (missing(mcmc)&&((K-length(fixed.pos))<15)) {mcmc="enum"} int=FALSE; is.enum=FALSE #int: whether interaction sampler is wanted; is.enum: whether the sampler is enumeration if (is.function(mcmc)) { samplingfun=mcmc mcmc="custom" } else { if (length(grep("int",mcmc,ignore.case=TRUE))) {int=TRUE} if (length(grep("enum",mcmc,ignore.case=TRUE))) { is.enum=TRUE; samplingfun=.iterenum if (K>maxk) samplingfun=.iterenum.KgtN } else if(length(grep("bd",mcmc,ignore.case=TRUE))){ samplingfun=switch(int+1,.fls.samp,.fls.samp.int) } else { samplingfun=switch(int+1,.rev.jump,.rev.jump.int) } } if (int&&(length(fixed.pos)>0L)) { warning("interaction sampler does not allow for non-zero argument fixed.pos; consequently it was set fixed.pos=0"); fixed.pos=numeric(0); } sampling=.fixedset.sampler(samplingfun,fullK=K,fixed.pos=fixed.pos, X.data=X.data); ###################################################################################################################################### # specific enumeration user checks & init if (is.enum) { #check for a start.value index to start enumeration from seomewhere in between (and not do all possible models) burn=0; int=FALSE; mcmc="enum"; is.enum=TRUE tmp=.enum_startend(iter=iter, start.value=start.value, K=K, maxk=maxk, fixed.pos=fixed.pos); iter=tmp$iter; start.value=tmp$start.value } else { if (is.na(iter)) {iter=3000}; #if no enumeration and iter not given, set to default value 3000 } ###################################################################################################################################### # generate logfile if desired if(logfile!=FALSE){ if (is.character(logfile)) { sfilename=logfile} else { sfilename="test.log" } if (nchar(sfilename)>0) file.create(sfilename) logfile=TRUE cat(as.character(Sys.time()),": starting loop ... \n",append=TRUE, file=sfilename) #write one line if (logstep!=10000) fact=logstep else fact=max(floor((burn+iter)/100),logstep) } ###################################################################################################################################### ###################################################################################################################################### # subtract mean from all regressors as in FLS y<-as.matrix(X.data[,1]) X<-as.matrix(X.data[,2:ncol(X.data)]) y<-y-mean(y) X<-X-matrix(colMeans(X),N,K,byrow=TRUE) # multiply the whole matrix stuff out before going into the simulation loops XtX.big=crossprod(X) Xty.big=crossprod(X,y) yty = as.vector(crossprod(y)) # user check: whether we have to use force.full.ols coreig=eigen(cor(X),symmetric=TRUE,only.values=TRUE)$values if (!force.full.ols) { #this line was added due to feature requests if (sum(coreig>1e-7)1e-13))) } ###################################################################################################################################### # for the case that X contains interaction terms if(int){ if(length(grep("#",colnames(X.data),fixed=TRUE))==0) stop("Please separate column names of interaction terms by # (e.g. A#B)") mPlus=.constr.intmat(X,K) } else{ mPlus<-NA } ###################################################################################################################################### #Prior Initialization #model prior: outsourced to stupid function for cleanliness pmplist=.choose.mprior(mprior,mprior.size,K=K,X.data=X.data,fixed.pos=fixed.pos) mprior=pmplist$mp.mode; #g-prior gprior.info = .choose.gprior(g,N,K,return.g.stats=g.stats,yty=yty,X.data=X.data) # gprior.info is a list that summarizes info about the choice of the g-prior lprobcalc = gprior.info$lprobcalc ###################################################################################################################################### #The function Starter selects randomly a start matrix and runs a #regression. From this regression, the #start Design matrix is that for which the t-stats are >0.2. So we #can circumvent starting from a very bad start point. start.list=.starter(K,start.value,y,N=N,XtX.big=XtX.big,Xty.big=Xty.big,X=X,fixed.pos=fixed.pos) molddraw=start.list$molddraw; start.position=start.list$start.position kold=sum(molddraw) position=(1:K)[molddraw==1] ######################################################################################################################################## ################################################################################################ # Initializing # ################################################################################################ # initialize sampler-specific variables ######################################## # these are to select additional statistics (such as g) collect.otherstats=FALSE otherstats=numeric(0) add.otherstats=numeric(0) # initialize the vector for collecting the empirical shrinkage factor moments if (gprior.info$return.g.stats & !(gprior.info$is.constant)) { add.otherstats=gprior.info$shrinkage.moments; collect.otherstats=TRUE } cumsumweights=iter null.lik=lprobcalc$just.loglik(ymy=yty,k=0) # calculate Likelihood for NullModel if (K < N-3) { mid.lik=lprobcalc$just.loglik(ymy=yty*(1-as.vector(crossprod(crossprod(chol2inv(chol(XtX.big)),Xty.big),Xty.big)/yty)),k=ceiling(K/2)) # calculate Likelihood for max model } else { mid.lik=lprobcalc$just.loglik(ymy=yty*.001,k=ceiling(K/2)) # calculate Likelihood for max model } if (!is.finite(mid.lik)) { mid.lik=sapply(as.list(seq(.1,.9,.1)),function(x) lprobcalc$just.loglik(ymy=yty*x,k=ceiling(K/2))); mid.lik=max(mid.lik[is.finite(mid.lik)]) } #adding up posterior stats has been outsourced to sub-functions for speed reasons if (collect.otherstats) { addup<- function() { inccount <<- inccount + molddraw #PIPs msize<<-msize + kold # average size of models #for speed reasons, iterative adding with indexing should be done in one stacked vector if (kold!=0) { bm[c(position,K+position,2*K+kold,3*K+position)]=c(b1,b2,1,b1>0); bmo <<- bmo+bm #bmo is partitioned: first K entries have cum. b1 ("b1mo"), second K entries have cum. b2 ("b2mo"), third K entries have model size dist ("k.vec"), and fourth K entries are like inccount for positive betas (add up pos. sign covariate selections) } else { null.count<<-null.count+1 } # collect e.g. estimated g-priors, etc otherstats<<-lik.list[["otherstats"]]; add.otherstats<<-add.otherstats + otherstats } } else { addup <- function() { inccount <<- inccount + molddraw #PIPs msize<<-msize + kold # average size of models #for speed reasons, iterative adding with indexing should be done in one stacked vector if (kold!=0) { bm[c(position,K+position,2*K+kold,3*K+position)]=c(b1,b2,1,b1>0); bmo <<- bmo+bm #bmo is partitioned: first K entries have cum. b1 ("b1mo"), second K entries have cum. b2 ("b2mo"), third K entries have model size dist ("k.vec"), and fourth K entries are like inccount for positive betas (add up pos. sign covariate selections) } else { null.count<<-null.count+1 } } } if (is.enum) { cumsumweights=0 if (collect.otherstats) { addup<- function() { weight= exp(pmpold+lprobold-mid.lik) inccount <<- inccount + weight*molddraw #PIPs msize<<-msize + weight*kold # average size of models cumsumweights<<-cumsumweights+weight #denominator to get at sum of PMPs=1 #browser() #for speed reasons, iterative adding with indexing should be done in one stacked vector if (kold!=0) { bm[c(position,K+position,2*K+kold,3*K+position)]=weight*c(b1,b2,1,b1>0); bmo <<- bmo+bm #bmo is partitioned: first K entries have cum. b1 ("b1mo"), second K entries have cum. b2 ("b2mo"), third K entries have model size dist ("k.vec"), and fourth K entries are like inccount for positive betas (add up pos. sign covariate selections) } else { null.count<<-null.count+weight } otherstats<<-lik.list[["otherstats"]]; add.otherstats<<-add.otherstats + weight*otherstats } } else { addup <- function() { weight= exp(pmpold+lprobold-mid.lik) #browser() inccount <<- inccount + weight*molddraw #PIPs msize<<-msize + weight*kold # average size of models cumsumweights<<-cumsumweights+weight #denominator to get at sum of PMPs=1 #for speed reasons, iterative adding with indexing should be done in one stacked vector if (kold!=0) { bm[c(position,K+position,2*K+kold,3*K+position)]=weight*c(b1,b2,1,b1>0); bmo <<- bmo+bm #bmo is partitioned: first K entries have cum. b1 ("b1mo"), second K entries have cum. b2 ("b2mo"), third K entries have model size dist ("k.vec"), and fourth K entries are like inccount for positive betas (add up pos. sign covariate selections) } else { null.count<<-null.count+weight } } } } environment(addup) <- environment() ################################################################################## ##initialize model varibales with starter model ################################### ols.object=.ols.terms2(positions=(1:K)[molddraw==1],yty=yty,k=kold,N,K=K,XtX.big=XtX.big,Xty.big=Xty.big) #OLS results from starter model lik.list=lprobcalc$lprob.all(ymy=ols.object$ymy, k=kold, bhat=ols.object$bhat, diag.inverse=ols.object$diag.inverse) #likelihood and expected values for starter model lprobold=lik.list$lprob b1=lik.list$b1new b2=lik.list$b2new ## calculate the posterior model probability for the first model pmpold=pmplist$pmp(ki=kold,mdraw=molddraw) ################################################################################## ## initialize top 10 function #################################################### #topmods=.top10(nmaxregressors=K,nbmodel=nmodel,bbeta=FALSE,bbeta2=FALSE,lengthfixedvec=length(add.otherstats)) topmods=topmod(nbmodels=nmodel,nmaxregressors=K,bbeta=FALSE,lengthfixedvec=length(add.otherstats)) if (mcmc=="enum") { try(topmods$duplicates_possible(FALSE), silent=TRUE) } if (dotop && (burn==0L)) topmods$addmodel(mylik=pmpold+lprobold,vec01=molddraw,fixedvec=lik.list$otherstats) ################################################################################## ## Initialize the rest ########################################################### null.count=0 #number the null model has been drawn models.visited=0 #how often a model has been accepted (in burn-ins and iterations) inccount=numeric(K) #how often the respective covariate has been included msize=0 #average model size k.vec=numeric(K) #how often the respective model size has been accepted b1mo=numeric(K) #holds aggregate first moment of all coefficients ab=numeric(K) #Initialize them here b2mo=numeric(K) #holds aggregate second moment of all coefficients bb=numeric(K) possign=inccount # the number of times the respective coefficent has been positive mnewdraw=numeric(K) #holds the binary vector denoting the proposed model if (force.full.ols) {candi.is.full.object=TRUE} else {candi.is.full.object=FALSE} #candi.is.full: if TRUE, standard OLS, else OLS via Frisch-Waugh tricks bmo=numeric(4*K); bm=bmo #common placeholder for b1mo, b2mo, k.vec and possign if (is.enum) { addup() } # in case the sampler is enumeration then count the starting value as well (no burn-ins) if (!is.finite(pmpold)) pmpold = -1e90 # this is if in case of an MCMC sampler the starting model got 0 PMP ############################################################################################################### ############################################################################################################### ############################################################################################# set.seed(as.numeric(Sys.time())) #Set Seed randomly for number generator t1<-Sys.time() #Save time before going into the loop ########################################################################################### #START MAIN LOOP ########################################################################################### nrep=burn+iter; i=0; while(i2|i<3|force.full.ols) {candi.is.full.object=TRUE} else {candi.is.full.object=FALSE}} #candi.is.full.object = TRUE if there were multiple regs dropped or added due to interaction terms if (candi.is.full.object) { ols.candidate = .ols.terms2(positions=positionnew,yty=yty,k=knew,N,K=K,XtX.big=XtX.big,Xty.big=Xty.big) #in case of changing interaction terms, draw the big OLS stuff ymy.candi =ols.candidate[["ymy"]] } else { ymy.candi=ols.object[["child.ymy"]](a$addi,a$dropi,k=knew) #if standard sampling, use Frisch-Waugh to get the new ResidSS (faster) } if ( (ymy.candi<0) | is.na(ymy.candi) ) stop(paste("stumbled on rank-deficient model" )) lprobnew = lprobcalc[["just.loglik"]](ymy=ymy.candi,k=knew) # get the log-likelihood out of the ResidSS #Now decide whether to accept candidate draw accept.candi = as.logical(log(stats::runif(1,0,1))< lprobnew-lprobold + pmpnew-pmpold) } else { accept.candi=TRUE candi.is.full.object=FALSE } if(accept.candi){ if (!candi.is.full.object) { #in case one has used Frisch-Waugh and the new model got accepted, #calculate the 'real' inverse in order not to make copying mistakes ols.res = ols.object[["mutate"]](addix=a$addi, dropix=a$dropi, newpos=positionnew, newk=knew) } else { ols.object = ols.candidate ols.res = ols.candidate[["full.results"]]() } lik.list = lprobcalc[["lprob.all"]](max(0,ols.res$ymy), knew, ols.res$bhat, ols.res$diag.inverse) lprobold=lik.list[["lprob"]] position = positionnew pmpold=pmpnew #get posterior odds for new model if accepted molddraw=mnewdraw kold=knew models.visited=models.visited+1 #does not account for revisiting models } # Collect Posterior Draws ######################################################################## if (i>burn){ b1=lik.list[["b1new"]]; b2=lik.list[["b2new"]]; addup() #addup does iterative, cumulative sums of quantities of interest (betas, model size, etc.) # add log(lik)*p(M) to topmodels if (dotop) topmods[["addmodel"]](mylik=pmpold+lprobold,vec01=molddraw,fixedvec=otherstats) } } ########################################################################################### #END MAIN LOOP ########################################################################################### ########################################################################################### #adjust the topmod object and calculate all the betas after sampling #similar to havving set bbeta=T, and bbeta2=T in the call to .top10 above if (dotop) topmods=.topmod.as.bbetaT(topmods,gprior.info,X.data) ########################################################################################### ########################################################################################### timed<-difftime(Sys.time(),t1) # do aggregating calculations if (is.enum) {iter=iter+1; models.visited=models.visited+1} bmo=matrix(bmo,4,byrow=TRUE); b1mo=bmo[1,]; b2mo=bmo[2,]; k.vec=bmo[3,]; possign=bmo[4,]; rm(bmo) post.inf=.post.calc(gprior.info,add.otherstats,k.vec,null.count,X.data,topmods,b1mo,b2mo,iter,burn,inccount,models.visited,K,N,msize,timed,cumsumweights,mcmc,possign) result=list(info=post.inf$info,arguments=.construct.arglist(bms, environment()),topmod=topmods,start.pos=sort(start.position),gprior.info=post.inf$gprior.info,mprior.info=pmplist,reg.names=post.inf$reg.names,bms.call=try(match.call(bms,sys.call(0)),silent=TRUE)) if (!is.null(result$X.data)) { result$X.data<-NULL } class(result)=c("bma") ########################################################################################### # print results to console if(user.int){ print(result) print(timed) plot.bma(result) # do modelsize plot } return(invisible(result)) }