# install.packages("remotes")
# remotes::install_github("rdevito/MSFA")
library(MSFA)
A Tutorial for Bayesian Integrative Factor Models
Welcome
This is the tutorial to guide the use of Bayesian integrative factor models. Including naive methods Stack FA (Stacking data from all the studies and apply FA), Ind FA (applying FA to each study separatly), and advanced methods PFA (Perturbed Factor Analysis) (Roy et al. 2021), MOM-SS (Bayesian Factor Regression with the non-local spike-and-slab priors) (Avalos-Pacheco, Rossell, and Savage 2022), SUFA (Subspace Factor Analysis) (Chandra, Dunson, and Xu 2024), BMSFA (Bayesian Multi-study Factor Analysis) (De Vito et al. 2021) and Tetris (Grabski et al. 2023).
Full intro to the multi-study data and comparison for these methods can be found in the manuscript. Raw code can be found the the GitHub repository (https://github.com/Mavis-Liang/Bayesian_integrative_FA_tutorial/tree/main)
Quick start
Step 1: Prepare the package and data
A small simulated data can be downloaded here:
A small simulated data (if it does not trigger instant download, you can go to the repo provided above and find it under folder RDS
).
<- readRDS("Data/sim_B_small.rds") data
A glance at the data:
dim(data$Y_list[[1]])
[1] 50 20
For study 1, we have 50 samples and 20 features.
length(data$Y_list)
[1] 3
We have 3 studies in total.
sapply(data$Y_list, dim)
[,1] [,2] [,3]
[1,] 50 50 50
[2,] 20 20 20
The second and third studies also have 50 samples and 20 features.
Step 2: Run BMSFA
<- MSFA::sp_msfa(data$Y_list, k = 5, j_s = c(4, 4, 4),
fit1 scaling = FALSE, centering = TRUE,
control = list(nrun = 5000, burn = 4000))
r= 1000
r= 2000
r= 3000
r= 4000
r= 5000
Step 3: Post-processing
We use sp_OP
to apply orthogonal Procrustes rotation to the posterior samples of the common loading matrix \(\Phi\) and the study-specific loading matrices \(\Lambda_s\). The function sp_OP
is from the MSFA
package.
Common covariance and study-specific covariance matrices are calculated by the cross-product of the loading matrices.
The marginal covariance matrix is calculated by \(\Sigma_s = \Phi \Phi^T + \Lambda_s \Lambda_s^T + diag(\Psi_s)\), where \(\Psi_s\) is the diagonal matrix of the residual variances for study \(s\).
library(tidyverse) # for data wrangling
<- function(fit){
post_BMSFA # Common covariance matrix and loading
<- MSFA::sp_OP(fit$Phi, trace=FALSE)$Phi
est_Phi <- tcrossprod(est_Phi)
est_SigmaPhi
# Study-specific covariance matrices and loadings
<- lapply(fit$Lambda, function(x) MSFA::sp_OP(x, trace=FALSE)$Phi)
est_LambdaList <- lapply(est_LambdaList, function(x) tcrossprod(x))
est_SigmaLambdaList
# Marginal covariance matrices
<- length(est_SigmaLambdaList)
S # Get point estimate of each Psi_s
<- lapply(1:S, function(s) {
est_PsiList apply(fit$psi[[s]], c(1, 2), mean)
})<- lapply(1:S, function(s) {
est_margin_cov + est_SigmaLambdaList[[s]] + diag(est_PsiList[[s]] %>% as.vector())
est_SigmaPhi
})
return(list(Phi = est_Phi, SigmaPhi = est_SigmaPhi,
LambdaList = est_LambdaList, SigmaLambdaList = est_SigmaLambdaList,
PsiList = est_PsiList,
SigmaMarginal = est_margin_cov))
}<- post_BMSFA(fit1) out1
Now, determine the number of factors using eigen value decomposition (EVD).
# Obtain the eigen values from the common covariance matrix
<- eigen(out1$SigmaPhi)$values
val_eigen_SigmaPhi # Proportion of variance explained by each eigen value
<- val_eigen_SigmaPhi/sum(val_eigen_SigmaPhi)
prop_var_SigmaPhi # Choose the number of factors - factors that explain more than 5% of the variance
<- length(which(prop_var_SigmaPhi > 0.05))
choose_K_SigmaPhi
# Similarly, for each study-specific covariance matrix:
<- lapply(out1$SigmaLambdaList, function(x) eigen(x)$values)
val_eigen_SigmaLambdaList <- lapply(val_eigen_SigmaLambdaList, function(x) x/sum(x))
prop_var_SigmaLambdaList <- lapply(prop_var_SigmaLambdaList, function(x) length(which(x > 0.05)))
choose_K_SigmaLambdaList
# Print the output
choose_K_SigmaPhi
[1] 4
for(s in 1:length(choose_K_SigmaLambdaList)){
cat("Study", s, "has", choose_K_SigmaLambdaList[[s]], "factors.\n")
}
Study 1 has 1 factors.
Study 2 has 1 factors.
Study 3 has 2 factors.
Now we rerun the model with the number of factors we just determined. And again we post-process the MCMC chains with post_BMSFA()
.
<- MSFA::sp_msfa(data$Y_list, k = 4, j_s = c(1, 1, 2),
fit2 scaling = FALSE, centering = TRUE,
control = list(nrun = 5000, burn = 4000))
r= 1000
r= 2000
r= 3000
r= 4000
r= 5000
<- post_BMSFA(fit2) out2
Step 4: Check our results
# For example, view the loading matrix
head(out2$Phi)
[,1] [,2] [,3] [,4]
[1,] 0.16952713 0.020636365 -1.1875126 0.3430111
[2,] -0.08185122 0.117941438 -1.2051410 0.3426028
[3,] -0.24166569 0.009744298 -0.9142091 0.3266155
[4,] -0.23449935 0.057977792 -0.7576149 0.1696956
[5,] -0.30719242 0.025246159 -0.4643306 0.2115390
[6,] -0.42761733 -0.030917582 -0.3497078 0.2136760
dim(out2$Phi)
[1] 20 4