wwxkenmo / enigma Goto Github PK
View Code? Open in Web Editor NEWA fast and accurate deconvolution algorithm based on regularized matrix completion algorithm (ENIGMA)
License: MIT License
A fast and accurate deconvolution algorithm based on regularized matrix completion algorithm (ENIGMA)
License: MIT License
Hi,
I am running the code available here step by step (except step 2, because my ref_type is single_cell), and I am using the same datasets. I have loaded the reference as a Seurat object. Running get_cell_proportion
is taking forever. I wanted to ask whether this happened for you too. I am using RLR
since the documentation says that CIBERSORT
takes longer.
Thank you.
Hi! Thank you for developing ENIGMA. I tried to reproduce the example code, but failed. I replaced "result_CSE_normalized" with "result_CSE", but I did not get results as good as those in your description. Is the problem caused by the version of ENIGMA?
# loading the packages we need
suppressPackageStartupMessages(library(scater))
suppressPackageStartupMessages(library(splatter))
suppressPackageStartupMessages(library(ENIGMA))
suppressPackageStartupMessages(library(cowplot))
## Simulate CSE ----

# Simulate the expression of one cell type with splatter and embed it.
#
# The three cell types below differ only in seed, group structure and the
# fraction of differentially expressed genes, so the shared pipeline lives
# here instead of being repeated three times.
#
# seed       seed passed to newSplatParams()
# group_prob group proportions passed to splatSimulateGroups(group.prob = )
# de_prob    DE probability passed to splatSimulateGroups(de.prob = )
# n_genes    number of simulated genes (nGenes)
# n_cells    number of simulated cells/samples (batchCells)
#
# Returns the simulated object after logNormCounts(), runPCA() and runTSNE().
simulate_cell_type <- function(seed, group_prob, de_prob,
                               n_genes = 8368, n_cells = 200) {
  params <- newSplatParams(seed = seed)
  params <- setParams(
    params,
    update = list(nGenes = n_genes, batchCells = n_cells)
  )
  sim <- splatSimulateGroups(
    params,
    group.prob = group_prob,
    de.prob = de_prob,
    verbose = FALSE
  )
  sim <- logNormCounts(sim)
  sim <- runPCA(sim)
  runTSNE(sim, dimred = "PCA", n_dimred = 5)
}

# Cell type 1: four equally sized groups, 30% DE genes.
sim.groups.c1 <- simulate_cell_type(2022, c(0.25, 0.25, 0.25, 0.25), c(0.3))
p_c1 <- plotTSNE(sim.groups.c1, colour_by = "Group", point_size = 3)
label.c1 <- sim.groups.c1$Group

# Cell type 2: two groups, 1% DE genes.
sim.groups.c2 <- simulate_cell_type(1002, c(0.5, 0.5), c(0.01))
p_c2 <- plotTSNE(sim.groups.c2, colour_by = "Group", point_size = 3)
label.c2 <- sim.groups.c2$Group

# Cell type 3: two groups, 10% DE genes.
sim.groups.c3 <- simulate_cell_type(1004, c(0.5, 0.5), c(0.1))
p_c3 <- plotTSNE(sim.groups.c3, colour_by = "Group", point_size = 3)
label.c3 <- sim.groups.c3$Group

# Side-by-side t-SNE panels of the three simulated cell types.
p_true <- plot_grid(p_c1, p_c2, p_c3, nrow = 1) +
  ggtitle("Ground Truth") +
  theme(plot.title = element_text(hjust = 0.5))
p_true
################
## Build per-cell-type expression tensors and the pseudo-bulk mixture ----

# Simulation dimensions, declared before first use so that every array and
# loop below is sized from the same three numbers.
k <- 3 # number of cell types
ng <- 8368 # number of genes
p <- 200 # number of samples

# Rescale every sample (column) of the raw counts to a total of 10,000.
normalize_library_size <- function(sce) {
  cnt <- counts(sce)
  cnt %*% diag(10000 / colSums(cnt))
}
sim.groups.c1 <- normalize_library_size(sim.groups.c1)
sim.groups.c2 <- normalize_library_size(sim.groups.c2)
sim.groups.c3 <- normalize_library_size(sim.groups.c3)
colnames(sim.groups.c2) <- colnames(sim.groups.c3) <- colnames(sim.groups.c1) <-
  paste0("Sample-", seq_len(ncol(sim.groups.c1)))
mat <- list(ct1 = sim.groups.c1, ct2 = sim.groups.c2, ct3 = sim.groups.c3)

# H1_array: cell type x gene x sample expression tensor.
# H1:       cell type x gene reference profile (mean over samples).
H1_array <- array(0, dim = c(k, ng, p))
### construct reference profile
H1 <- array(0, dim = c(k, ng))
for (i in seq_along(mat)) {
  H1_array[i, , ] <- mat[[i]]
  H1[i, ] <- rowMeans(mat[[i]])
}

### Simulate cell type-specific fractions
# NOTE(review): no seed is set before runif()/rnorm(), so the fractions and the
# noise differ between runs; add set.seed() here if reproducibility is needed.
cc <- matrix(runif(p * k), ncol = k)
# Divide each row by its sum so that every sample's fractions add up to one.
cc <- t(scale(t(cc), center = FALSE, scale = rowSums(cc)))
colnames(cc) <- paste0("cellType", seq_len(ncol(cc)))

## Mix the cell type-specific profiles of each sample with its fractions.
# vapply() fills a preallocated ng x p matrix instead of growing G with
# cbind() on every iteration.
G <- vapply(
  seq_len(p),
  function(i) as.vector(cc[i, ] %*% H1_array[, , i]),
  numeric(ng)
)

# Additive noise; negative draws are truncated to zero.
noise <- t(matrix(rnorm(p * ng) * 15, ncol = ng))
noise[noise < 0] <- 0
G <- G + noise

# Drop genes that are (numerically) constant across samples. The mask gets its
# own name so that `k` keeps meaning "number of cell types".
keep <- apply(G, 1, var) > 10^-8
G <- G[keep, ]
H1 <- H1[, keep]
H1_array <- H1_array[, keep, ]

rownames(G) <- colnames(H1) <- paste0("gene", seq_len(nrow(G)))
colnames(G) <- paste0("Sample", seq_len(ncol(G)))
rownames(H1) <- colnames(cc)
## Embed the pseudo-bulk samples and colour them by each cell type's groups ----
raw <- SingleCellExperiment(assays = list(logcounts = G))
raw$Group <- label.c1
raw <- runPCA(raw)
p <- plotPCA(raw, colour_by = "Group")
# Optional UMAP embedding (disabled):
# raw <- runUMAP(raw, dimred = "PCA", n_dimred = 5)
# p_raw_umap <- plotUMAP(raw, colour_by = "Group", point_size = 3)
raw <- runTSNE(raw, dimred = "PCA", n_dimred = 5)

# One t-SNE embedding, drawn three times with a different labelling each time.
p_raw1 <- plotTSNE(raw, colour_by = "Group", point_size = 3)
raw$Group <- label.c2
p_raw2 <- plotTSNE(raw, colour_by = "Group", point_size = 3)
raw$Group <- label.c3
p_raw3 <- plotTSNE(raw, colour_by = "Group", point_size = 3)

p_raw <- plot_grid(p_raw1, p_raw2, p_raw3, nrow = 1) +
  ggtitle("Pseudo-Bulk") +
  theme(plot.title = element_text(hjust = 0.5))
p_raw
#############################
# Estimate cell type fractions of bulk samples by robust linear regression.
#
# Every bulk sample (column of `X`) is z-scored and regressed on the z-scored
# reference profiles with MASS::rlm(). Negative coefficients are clipped to
# zero and the remaining ones are rescaled to sum to one.
#
# X    numeric matrix, genes in rows (row names are gene IDs), samples in
#      columns.
# ref  numeric matrix, genes in rows (row names are gene IDs), one column per
#      cell type.
#
# Returns a list with one element, `theta`: a samples x cell types matrix of
# estimated fractions. A row is NaN (with a warning) when every coefficient of
# that sample was clipped to zero.
get_proportion <- function(X, ref) {
  cat( date(), "Calculating cell type proportion of bulk samples... \n" )
  gene_id <- intersect(rownames(X), rownames(ref))
  if (length(gene_id) == 0) {
    stop("`X` and `ref` share no row names (gene IDs)", call. = FALSE)
  }

  # drop = FALSE keeps both inputs as matrices even for a single sample or a
  # single cell type; otherwise ncol() below would be NULL.
  X_m <- X[gene_id, , drop = FALSE]
  ref_m <- ref[gene_id, , drop = FALSE]
  ref_m <- apply(ref_m, 2, scale)

  n_samples <- ncol(X_m)
  n_types <- ncol(ref_m)

  # Preallocate the result instead of growing it with rbind() per sample.
  theta <- matrix(
    NA_real_,
    nrow = n_samples, ncol = n_types,
    dimnames = list(colnames(X_m), colnames(ref_m))
  )

  for (i in seq_len(n_samples)) {
    Exp <- scale(as.matrix(X_m[, i]))
    fit <- MASS::rlm(Exp ~ as.matrix(ref_m), maxit = 100)
    # Drop the intercept; keep one coefficient per cell type.
    coef.v <- coef(fit)[-1]
    coef.v[which(coef.v < 0)] <- 0
    total <- sum(coef.v)
    if (isTRUE(total == 0)) {
      warning(
        "all coefficients were clipped to zero for sample ", i,
        "; its fractions are NaN",
        call. = FALSE
      )
    }
    theta[i, ] <- coef.v / total
  }

  list(theta = theta)
}
suppressPackageStartupMessages(library(MASS))
# Estimate fractions from the pseudo-bulk matrix and the reference profile,
# then compare them with the simulated ground truth, one panel per cell type.
Fra_Simulate <- get_proportion(G, t(H1))
par(mfrow = c(1, 3))
invisible(lapply(1:3, function(ct) {
  predicted <- Fra_Simulate$theta[, ct]
  truth <- cc[, ct]
  plot(
    predicted, truth,
    main = paste0("Cor = ", round(cor(predicted, truth), 2)),
    xlab = paste0("Predict Fractions of CT", ct),
    ylab = paste0("Ground Truth of CT", ct)
  )
}))
suppressPackageStartupMessages(library(TCA))
# Baseline: TCA deconvolution of the pseudo-bulk matrix.
# The TCA model is fitted with the estimated fractions as W, then tensor()
# produces Z_hat_simulate; both calls are timed together in time.tca.
bulk <- G
time.tca <- system.time({tca.mdl <- tca(X = bulk, W = Fra_Simulate$theta, C1 = NULL, C2 = NULL,
parallel = TRUE,num_cores=4,max_iters=5)
Z_hat_simulate <- tensor(X = (as.matrix(bulk)), tca.mdl)})
# Panel 1: element [[1]] of the TCA output, labelled with cell type 1's groups.
TCA <- SingleCellExperiment(assays=list(logcounts = Z_hat_simulate[[1]]))
TCA$Group <- label.c1
# NOTE(review): the mask has one entry per sample, but a single-index `[` on a
# SingleCellExperiment may subset rows (genes), not columns; the intent looks
# like TCA[, mask] (keep samples with fraction > 0.05) - confirm.
TCA <- TCA[Fra_Simulate$theta[,1]>0.05]
TCA <- runPCA(TCA)
p <- plotPCA(TCA, colour_by = "Group")
#TCA <- runUMAP(TCA,dimred="PCA",n_dimred=5)
#p_TCA_umap <- plotUMAP(TCA, colour_by = "Group",point_size=3)
TCA <- runTSNE(TCA,dimred="PCA",n_dimred=5)
p_TCA_tsne1 <- plotTSNE(TCA, colour_by = "Group",point_size=3)
# Panel 2: element [[2]], labelled with cell type 2's groups.
TCA <- SingleCellExperiment(assays=list(logcounts = Z_hat_simulate[[2]]))
TCA$Group <- label.c2
# NOTE(review): same single-index subsetting concern as in panel 1.
TCA <- TCA[Fra_Simulate$theta[,2]>0.05]
TCA <- runPCA(TCA)
p <- plotPCA(TCA, colour_by = "Group")
#TCA <- runUMAP(TCA,dimred="PCA",n_dimred=5)
#p_TCA_umap <- plotUMAP(TCA, colour_by = "Group",point_size=3)
TCA <- runTSNE(TCA,dimred="PCA",n_dimred=5)
p_TCA_tsne2 <- plotTSNE(TCA, colour_by = "Group",point_size=3)
# Panel 3: element [[3]], labelled with cell type 3's groups.
TCA <- SingleCellExperiment(assays=list(logcounts = Z_hat_simulate[[3]]))
TCA$Group <- label.c3
# NOTE(review): same single-index subsetting concern as in panel 1.
TCA <- TCA[Fra_Simulate$theta[,3]>0.05]
TCA <- runPCA(TCA)
p <- plotPCA(TCA, colour_by = "Group")
#TCA <- runUMAP(TCA,dimred="PCA",n_dimred=5)
#p_TCA_umap <- plotUMAP(TCA, colour_by = "Group",point_size=3)
TCA <- runTSNE(TCA,dimred="PCA",n_dimred=5)
p_TCA_tsne3 <- plotTSNE(TCA, colour_by = "Group",point_size=3)
# Combine the three t-SNE panels into one titled figure and print it.
p_TCA <- plot_grid(p_TCA_tsne1,p_TCA_tsne2,p_TCA_tsne3,nrow=1)
p_TCA <- p_TCA + ggtitle("TCA") +
theme(plot.title = element_text(hjust = 0.5))
p_TCA
############################
# Run ENIGMA on the same pseudo-bulk matrix, using the transposed reference
# profile (genes x cell types) declared as a "bulk"-type reference.
G <- as.matrix(G)
H1 <- as.matrix(H1)
egm = create_ENIGMA(bulk = G, ref = t(H1), ref_type = "bulk", meta_ref = as.matrix(colnames(t(H1))))
## Wed Sep 15 11:05:22 2021 Reference from FACS Bulk RNA-seq/microarray.
egm = batch_correct(egm)
## Here we don't need batch effect correction therefore I set the correct bulk expression profile back to the raw!
egm@bulk <- G
## Calculate cell type-specific fractions
egm = get_cell_proportion(egm)
## Using ENIGMA to infer CSE
egm = ENIGMA_L2_max_norm(egm, epsilon=0.001, alpha=0.7,
beta=4500,tao_k=0.01,max.iter=1000,verbose=TRUE)
###################################################################
#Error: no slot of name "result_CSE_normalized" for this object of class "ENIGMA"
###################################################################
##########################################
#replaced by following code for the version of ENIGMA
##########################################
# Panel 1: columns of the result whose cell_type is "cellType1".
#enigma <- egm@result_CSE_normalized[,egm@result_CSE_normalized$cell_type %in% "cellType1"]
enigma <- egm@result_CSE[,egm@result_CSE$cell_type %in% "cellType1"]
enigma$Group <- label.c1
# NOTE(review): the mask has one entry per sample, but a single-index `[` on a
# SingleCellExperiment may subset rows (genes), not columns; the intent looks
# like enigma[, mask] (keep samples with fraction > 0.05) - confirm.
enigma <- enigma[Fra_Simulate$theta[,1]>0.05]
enigma <- runPCA(enigma)
p <- plotPCA(enigma, colour_by = "Group")
#enigma <- runUMAP(enigma,dimred="PCA",n_dimred=5)
#p_enigma_umap <- plotUMAP(enigma, colour_by = "Group",point_size=3)
enigma <- runTSNE(enigma,dimred="PCA",n_dimred=5)
p_enigma_tsne1 <- plotTSNE(enigma, colour_by = "Group",point_size=3)
# Panel 3: columns of the result whose cell_type is "cellType3".
#replaced by following code for the version of ENIGMA
#enigma <- egm@result_CSE_normalized[,egm@result_CSE_normalized$cell_type %in% "cellType3"]
enigma <- egm@result_CSE[,egm@result_CSE$cell_type %in% "cellType3"]
enigma$Group <- label.c3
# NOTE(review): same single-index subsetting concern as in panel 1.
enigma <- enigma[Fra_Simulate$theta[,3]>0.05]
enigma <- runPCA(enigma)
p <- plotPCA(enigma, colour_by = "Group")
#enigma <- runUMAP(enigma,dimred="PCA",n_dimred=5)
#p_enigma_umap <- plotUMAP(enigma, colour_by = "Group",point_size=3)
enigma <- runTSNE(enigma,dimred="PCA",n_dimred=5)
p_enigma_tsne3 <- plotTSNE(enigma, colour_by = "Group",point_size=3)
# Panel 2: columns of the result whose cell_type is "cellType2".
# Fixed: this panel previously selected "cellType3" while being labelled with
# label.c2 and filtered by the cell type 2 fractions.
#replaced by following code for the version of ENIGMA
#enigma <- egm@result_CSE_normalized[,egm@result_CSE_normalized$cell_type %in% "cellType2"]
enigma <- egm@result_CSE[,egm@result_CSE$cell_type %in% "cellType2"]
enigma$Group <- label.c2
# NOTE(review): the mask has one entry per sample, but a single-index `[` on a
# SingleCellExperiment may subset rows (genes), not columns; the intent looks
# like enigma[, mask] (keep samples with fraction > 0.05) - confirm.
enigma <- enigma[Fra_Simulate$theta[,2]>0.05]
enigma <- runPCA(enigma)
p <- plotPCA(enigma, colour_by = "Group")
#enigma <- runUMAP(enigma,dimred="PCA",n_dimred=5)
#p_enigma_umap <- plotUMAP(enigma, colour_by = "Group",point_size=3)
enigma <- runTSNE(enigma,dimred="PCA",n_dimred=5)
p_enigma_tsne2 <- plotTSNE(enigma, colour_by = "Group",point_size=3)
# Combine the three ENIGMA t-SNE panels into one titled figure and print it.
p_enigma <- plot_grid(p_enigma_tsne1,p_enigma_tsne2,p_enigma_tsne3,nrow=1)
p_enigma <- p_enigma + ggtitle("ENIGMA") +
theme(plot.title = element_text(hjust = 0.5))
p_enigma
sessionInfo()
R version 4.1.0 (2021-05-18)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19042)
Matrix products: default
locale:
[1] LC_COLLATE=Chinese (Simplified)_China.936 LC_CTYPE=Chinese (Simplified)_China.936
[3] LC_MONETARY=Chinese (Simplified)_China.936 LC_NUMERIC=C
[5] LC_TIME=Chinese (Simplified)_China.936
attached base packages:
[1] parallel stats4 stats graphics grDevices utils datasets methods base
other attached packages:
[1] MASS_7.3-54 pheatmap_1.0.12 magrittr_2.0.1
[4] BiocManager_1.30.16 scater_1.20.1 ggplot2_3.3.5
[7] scuttle_1.2.1 TCA_1.2.1 cowplot_1.1.1
[10] splatter_1.16.1 SingleCellExperiment_1.14.1 SummarizedExperiment_1.22.0
[13] Biobase_2.52.0 GenomicRanges_1.44.0 GenomeInfoDb_1.28.4
[16] IRanges_2.26.0 S4Vectors_0.30.2 BiocGenerics_0.38.0
[19] MatrixGenerics_1.4.3 matrixStats_0.61.0 ENIGMA_0.1.1
loaded via a namespace (and not attached):
[1] Rtsne_0.15 ggbeeswarm_0.6.0 colorspace_2.0-2
[4] ellipsis_0.3.2 futile.logger_1.4.3 XVector_0.32.0
[7] BiocNeighbors_1.10.0 rstudioapi_0.13 farver_2.1.0
[10] bit64_4.0.5 AnnotationDbi_1.54.1 fansi_0.5.0
[13] splines_4.1.0 sparseMatrixStats_1.4.2 cachem_1.0.6
[16] config_0.3.1 nloptr_1.2.2.3 annotate_1.70.0
[19] png_0.1-7 compiler_4.1.0 httr_1.4.2
[22] backports_1.4.1 assertthat_0.2.1 Matrix_1.4-0
[25] fastmap_1.1.0 limma_3.48.3 BiocSingular_1.8.1
[28] formatR_1.11 tools_4.1.0 rsvd_1.0.5
[31] gtable_0.3.0 glue_1.6.0 GenomeInfoDbData_1.2.6
[34] reshape2_1.4.4 dplyr_1.0.7 gmodels_2.18.1
[37] tinytex_0.36 Rcpp_1.0.7 vctrs_0.3.8
[40] Biostrings_2.60.2 gdata_2.18.0 nlme_3.1-153
[43] DelayedMatrixStats_1.14.3 xfun_0.29 stringr_1.4.0
[46] beachmat_2.8.1 lifecycle_1.0.1 irlba_2.3.5
[49] gtools_3.9.2 XML_3.99-0.8 edgeR_3.34.1
[52] zlibbioc_1.38.0 scales_1.1.1 RColorBrewer_1.1-2
[55] lambda.r_1.2.4 yaml_2.2.1 memoise_2.0.1
[58] pbapply_1.5-0 gridExtra_2.3 stringi_1.7.6
[61] RSQLite_2.2.9 genefilter_1.74.1 ScaledMatrix_1.0.0
[64] checkmate_2.0.0 BiocParallel_1.26.2 rlang_0.4.12
[67] pkgconfig_2.0.3 bitops_1.0-7 pracma_2.3.6
[70] lattice_0.20-45 purrr_0.3.4 labeling_0.4.2
[73] bit_4.0.4 tidyselect_1.1.1 plyr_1.8.6
[76] R6_2.5.1 generics_0.1.1 nnls_1.4
[79] DelayedArray_0.18.0 DBI_1.1.2 pillar_1.6.4
[82] withr_2.4.3 mgcv_1.8-38 survival_3.2-13
[85] KEGGREST_1.32.0 RCurl_1.98-1.5 tibble_3.1.6
[88] crayon_1.4.2 futile.options_1.0.1 utf8_1.2.2
[91] viridis_0.6.2 locfit_1.5-9.4 grid_4.1.0
[94] sva_3.40.0 data.table_1.14.2 blob_1.2.2
[97] matrixcalc_1.0-5 digest_0.6.29 xtable_1.8-4
[100] tidyr_1.1.4 munsell_0.5.0 beeswarm_0.4.0
[103] viridisLite_0.4.0 vipor_0.4.5
Hi,
I've tried to run Enigma trace norm for ~500 TCGA samples using my own scRNA data (15 cell types with ~ 10000 genes) as the reference (in the aggregated 10000x15 matrix but not the Seurat object). It has been running over 24 hrs. I wonder if there is any tip to speed up the calculation or if it is possible to make a multi-core version?
Thanks,
YC
Thank you for developing and making this powerful tool available.
I tried to follow your toy example of bulk RNA-seq deconvolution using ENIGMA. However, I cannot find the example RDS data files (below). Could you please provide an example file or a pointer to where I can download the files?
dataNSCLC <- readRDS("/load/Data/Path/dataNSCLC.rds")
ref_sc <- readRDS("/load/Data/Path/ref.rds")
I would appreciate your help.
Zhengyan 'George' Kan
Thank you so much for designing this wonderful program. For the Cell Type Specific Differential Expression (CTS-DE) Analysis could you please offer more context for setting this up after normalizing the ENIGMA? In vignette the analysis is performed by:
# p is used below only as the length of y.
# NOTE(review): presumably the number of bulk samples in the vignette's data.
p <- 100
# gl(2, p/2) is a two-level factor with p/2 consecutive copies of each level;
# subtracting 1 from its integer codes gives p/2 zeros followed by p/2 ones,
# i.e. a binary group label of length p.
y <- as.numeric(gl(2, p/2)) - 1
# NOTE(review): y presumably needs one entry per sample in `egm`; the
# "CSE estimates and y have different length" message quoted in this report
# points that way - confirm against the FindCSE_DEG documentation.
DEG <- FindCSE_DEG(egm, y)
How are the values and formula for 'p' and 'y' determined? Doing 'Help?' for FindCSE_DEG does not define 'y'
Currently I cannot get this step to work and get error:
"CSE estimates and y have different length"
Error in h(simpleError(msg, call)) :
error in evaluating the argument 'object' in selecting a method for function 'coef': variable lengths differ (found for 'Epithelium')
Hello, Thank you for making this code available for use.
I am new to this kind of work and had 2 queries:
1. I was exploring the code to make sense of the deconvolution results, and saw that the calculation in sub_loss function under L2 Norm method is different at 'ENIGMA/blob/main/ENIGMA_analysis/ENIGMA_Script/ENIGMA.R' (which I ran) compared to the 'ENIGMA/blob/main/R/ENIGMA_L2_norm.R'
The algorithm details in the publication seem to match with ENIGMA_L2_norm.R
Please let me know if I am missing something, Thanks.
2. You have mentioned in the publication "We multiplied β with the expectation of θi, which means that if the average proportion of this cell type is low in bulk samples, the rank constraint will be relatively loose"
Can you please guide me as to where in the code this multiplication is done for the L2 Norm Method.
Dear Developer
Hello, thank you for developing a great algorithm.
Using the algorithm you developed, I deconvolved the bulk RNA sequencing data I have to find pseudo - cell type specific genes.
Example ) DEG = FindCSE_DEG(egm,y,FoldChange=TRUE)
But I have a question here.
If I arbitrarily apply a log2 to the "FoldChange" value that comes out as a result, making it in the form of log2(FoldChange), is it safe to assume that it is the same as the log2FC value that we usually get when we analyze with Deseq2 or limma or EdgeR ?
If the arbitrarily applied log2 is the same as the log2FC values we get from a typical Bulk RNA DEG, then I would like to go further and use these values to do GSEA analysis etc.
Thanks for reading this long post :)
attempt to replicate an object of type 'S4'
Thank you so much for uploading such a great tool, but I'm having problems using single-cell data as a reference.
I managed to create the ENIGMA object, but got an error when running the batch effect correction.
It seems that I have a problem with my reference, but my reference is single-cell data of 7 cell types. Here is my code and its output; I would be grateful if you could help me.
mode batch effect correction
+ egm = batch_correct(egm, + varname_cell_type = "cellType", + n_pseudo_bulk=100)
Tue Sep 28 11:28:58 2021 Reference is from Single Cell RNA-seq, doing batch correction in S mode.
Tue Sep 28 11:28:58 2021 Generating pseudo bulk...
Tue Sep 28 11:29:24 2021 Doing ComBat...
Found 693 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
Found2batches
Adjusting for0covariate(s) or covariate level(s)
Standardizing Data across genes
Fitting L/S model and finding priors
Finding parametric adjustments
Adjusting the DataTue Sep 28 11:29:27 2021 Restore reference...
Error in rep(value, length.out = nrows) :
attempt to replicate an object of type 'S4'
Hi,
Thank you for this package. I have been looking for something like this. I tried to use this with my data but I got the following error.
Error in rowSums(Ma) : 'x' must be an array of at least two dimensions
My codes are:
# Build the ENIGMA object from the bulk count matrix and the single-cell
# reference, then run batch correction with 100 pseudo-bulk samples.
egm <- create_ENIGMA(
  bulk = as.matrix(countdata_all),
  ref = sce,
  ref_type = "single_cell",
  meta_ref = colData(sce)
)
egm <- batch_correct(
  egm,
  varname_cell_type = "celltype_global",
  n_pseudo_bulk = 100
)
Here, I am using a SingleCellExperiment object. Can you please help with the error?
Nurun
Hi,
First of all I'd like to thank you for the tool, I find it really helpful as it addresses key challenge while using bulk data. I'm trying to reproduce your work of deconvolution analysis for RA patient data. I have downloaded all the required packages and have followed all the steps exactly in the same way as mentioned in the paper (all input file downloaded from the given location in the paper) but facing an error while performing concordance analysis at the FACS reference.
An Error is thrown after running " bMIND_B <- evaluate_concordance(genes,B,label_sc,deconv2$A[,1,],label) "
Error: Error in pre_exp[i, pre_label == "OA"] : incorrect number of dimensions
Calls: evaluate_concordance -> wilcox.test
Execution halted
System Configuration: CentOS 7
In addition to this, the step:
# Trace-norm deconvolution of the RA vs OA bulk data on log2(x + 1) values.
# The identifiers `tmp_ra` and `Normalize` were split across lines by hard
# wrapping, which made the call unparseable; they are rejoined here.
res_alg_all_ra_l2_norm_trace <- cell_deconvolve_trace(
  O = log2(exprs(bulk_ra_vs_oa)[rownames(tmp_ra$type), ] + 1),
  theta = Fra_ra$theta,
  R = log2(tmp_ra$type + 1),
  alpha = 0.5, beta = 1, gamma = 1,
  max.iter = 1000, solver = "admm", verbose = FALSE, Normalize = FALSE
)
in the paper is converging in 21 steps but for me it is converging in 22 steps.
Could you please help me resolve this error?
Hi! I have checked the code in ENIGMA_v1.2.tar.gz, and I found that some of it differs from the former code. For example, batch_correct() in ENIGMA_v1.2.tar.gz does not return egm@bulk; does this influence the follow-up calculation? Why is the adjusted bulk matrix not used in many examples whose references are from scRNA-seq? I have read the descriptions in CIBERSORTx, but I'm still confused. Could you give me some suggestions? They need not focus on the algorithm itself; a guide to its use would be better.
library(sva)
library(purrr)
library(nnls)
suppressPackageStartupMessages(library(magrittr))
suppressPackageStartupMessages(library(scater))
suppressPackageStartupMessages(library(Biobase))
suppressPackageStartupMessages(library(ENIGMA))
# Load the NSCLC example data and the single-cell reference.
dataNSCLC <- readRDS("dataNSCLC.rds")
ref_sc <- readRDS("ref.rds")

# The reference is built from patient 3 only, restricted to cells flagged
# CellFromTumor == "1" and excluding the "Alveolar" and "Epi" cell types.
ref_sc_sub <- ref_sc[, ref_sc$PatientID %in% "3"]
ref_sc_sub <- ref_sc_sub[, ref_sc_sub$CellFromTumor %in% "1"]
ref_sc_sub <- ref_sc_sub[, !(ref_sc_sub$main_celltype %in% c("Alveolar", "Epi"))]

# Unpack the elements of the data list by position.
Tumor <- dataNSCLC[[1]]
Immune <- dataNSCLC[[2]]
Endothelial <- dataNSCLC[[3]]
Fibroblast <- dataNSCLC[[4]]
Bulk <- dataNSCLC[[5]]
pheno <- dataNSCLC[[6]]
# The pheno variable contains the label of each sample (LUSC vs LUAD)
names(pheno) <- colnames(Tumor)

# Create ENIGMA object; t1 keeps a copy taken before batch correction.
egm <- create_ENIGMA(
  bulk = Bulk,
  ref = exprs(ref_sc_sub),
  ref_type = "single_cell",
  meta_ref = pData(ref_sc_sub)
)
t1 <- egm
#Sat Jan 15 21:41:54 2022 Reference from Single Cell RNA-seq.
#Sat Jan 15 21:41:54 2022 Obtain reference from a matrix
egm@bulk
#<0 x 0 matrix>
egm@ref
#<0 x 0 matrix>
egm = batch_correct(egm, varname_cell_type = "main_celltype", n_pseudo_bulk=100)
#Sat Jan 15 21:43:35 2022 Reference is from Single Cell RNA-seq, doing batch correction in S mode.
#Sat Jan 15 21:43:35 2022 Generating pseudo bulk...
#Sat Jan 15 21:43:50 2022 Doing ComBat...
#Found 897 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
#Found2batches
#Adjusting for0covariate(s) or covariate level(s)
#Standardizing Data across genes
#Fitting L/S model and finding priors
#Finding parametric adjustments
#Adjusting the Data
#Sat Jan 15 21:43:51 2022 Restore reference...
#check the reference profile
head(egm@ref)
# B_cell EC Fibro Myeloid T cell Tumor
#A1BG 90.18397 0.000000e+00 54.99012 93.5680176 96.709796 0.0000000
#A1CF 0.00000 0.000000e+00 0.00000 0.0000000 0.000000 0.0000000
#A2M 125.57136 4.151109e+03 3173.31839 262.3662606 61.825663 43.4396281
#A2ML1 0.00000 0.000000e+00 0.00000 0.0000000 0.000000 0.0000000
#A4GALT 12.46785 1.072874e+02 137.38238 8.0684443 7.066553 60.2433877
#A4GNT 0.00000 7.730739e-02 0.00000 0.3495604 0.000000 0.2039968
head(egm@bulk)
<0 x 0 matrix>
# codes from https://github.com/WWXkenmo/ENIGMA/tree/main/R
egm2 <- remove_batch_effect_S_mode(t1, varname_cell_type = "main_celltype", n_pseudo_bulk=100)
Sat Jan 15 22:27:04 2022 Generating pseudo bulk...
Sat Jan 15 22:27:17 2022 Doing ComBat...
Found 897 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
Found2batches
Adjusting for0covariate(s) or covariate level(s)
Standardizing Data across genes
Fitting L/S model and finding priors
Finding parametric adjustments
Adjusting the Data
Sat Jan 15 22:27:18 2022 Restore reference...
> head(egm2@bulk)[,1:5]
T15 T17 T18 T20 T3
A1BG 58.813456 33.5832470 43.6152390 36.8567380 56.288493
A1CF 1.024028 1.0111199 1.0101987 1.0193313 1.025897
A2M 97.351834 51.6655504 285.5168587 484.3324470 616.635303
A2ML1 12.076414 1.7723418 1.6704341 16.5629186 3.269301
A4GALT 45.154681 12.0121822 70.6796462 132.9285592 72.904058
A4GNT 1.019937 0.9863782 0.9625496 0.9935557 1.070991
> head(assay(t1@raw_input$bulk)[rownames(egm2@bulk),])[,1:5]
T15 T17 T18 T20 T3
A1BG 1.5159640 0.0000000 0.5378489 0.1655012 1.3405893
A1CF 0.0240282 0.0111199 0.0101987 0.0193313 0.0258966
A2M 5.4094990 1.4748340 31.2635647 70.3592077 101.5633070
A2ML1 11.0764140 0.7723418 0.6704341 15.5629186 2.2693010
A4GALT 7.3931706 0.3733317 14.4859500 35.7236510 15.1560200
A4GNT 0.0938299 0.0385983 0.0000000 0.0503258 0.1797800
Best,
Qin
A declarative, efficient, and flexible JavaScript library for building user interfaces.
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
An Open Source Machine Learning Framework for Everyone
The Web framework for perfectionists with deadlines.
A PHP framework for web artisans
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
Some thing interesting about web. New door for the world.
A server is a program made to process requests and deliver data to clients.
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
Some thing interesting about visualization, use data art
Some thing interesting about game, make everyone happy.
We are working to build community through open source technology. NB: members must have two-factor auth.
Open source projects and samples from Microsoft.
Google ❤️ Open Source for everyone.
Alibaba Open Source for everyone
Data-Driven Documents codes.
China tencent open source team.