背景#

先简单讲一下我在使用 spotlight 前的工作状态

处理 gem 的空间转录组数据, 使用的是华大写的 stereopy 库。到注释这一步, stereopy 包装了 singleR 工具, 试用后发现速度奇慢无比, 集群中的 GPU 资源也比较有限, 排不上队, 而同组的师姐使用 SPOTlight 很快就注释好了, 遂尝试使用 SPOTlight

工作流#

在 R 中读取 h5ad#

因为我之前的数据都是使用 python 进行处理的, 这里需要在 R 中读取 h5ad 文件。在使用 stereopy 保存数据的时候, flavor 选择 "seurat"

# Export the stereopy data object (`data`, prepared earlier and not shown
# here) to an .h5ad file. flavor="seurat" selects the export layout meant
# for loading on the R side.
st.io.stereo_to_anndata(
    data,
    flavor="seurat",
    output="./out/tissue_seurat.h5ad",
)

接下来在 R 中读取时空组数据(adata)和已经注释好的单细胞参考数据(seurat_ref)。由于前者是一个 anndata 对象, SPOTlight 不支持, 故接下来进一步从中获取表达谱, 构成稀疏矩阵

library(reticulate)

# Bind reticulate to the conda environment that provides the `anndata`
# Python module. The argument is spelled out as `required` (the original
# `require` only worked through partial argument matching).
use_condaenv(
  condaenv = "/path/to/anaconda/envs/env_name/bin/python",
  required = TRUE
)
ad <- import("anndata")

cat("1. loading data ...\n")
# Spatial data exported from stereopy, and the annotated single-cell reference.
adata <- ad$read_h5ad("./out/tissue_seurat.h5ad")
seurat_ref <- readRDS("data/12T_harmony_celltype.rds")

# AnnData keeps X as observations x variables; transpose it so that
# rows are genes and columns are capture spots.
expr_matrix <- t(adata$X)
colnames(expr_matrix) <- as.character(adata$obs_names$tolist())  # capture spots
rownames(expr_matrix) <- as.character(adata$var_names$tolist())  # genes

CAUTION
注意, 注意!!! 如果你看了 NMF 的原理, 你就会知道, 对于已经注释好的单细胞表达谱(也就是上面的 seurat_ref), 应该构造出行=基因, 列=细胞的矩阵, 而对于待注释的空间组数据(expr_matrix), 应该构造行=基因, 列=捕获位置的矩阵

预处理#

cat("2. preprocessing ...\n")
# Convert the annotated Seurat reference to a SingleCellExperiment and
# add log-normalized expression values.
sce <- as.SingleCellExperiment(seurat_ref)
sce <- logNormCounts(sce)

# Disabled: restrict the reference and the spatial matrix to shared genes.
# NOTE(review): `seurat_obj` is not defined anywhere in this snippet.
# common_gene <- intersect(rownames(sce), rownames(seurat_obj))
#
# sce <- sce[common_gene, ]
# expr_matrix <- expr_matrix[common_gene, ]
# seurat_obj <- CreateSeuratObject(counts = expr_matrix)
# sce <- as.SingleCellExperiment(seurat_ref)

# Exclude ribosomal (RPL*/RPS*) and mitochondrial (MT-*) genes.
# Both alternatives are anchored at the start of the symbol: the previous
# pattern "^RP[L|S]|MT" matched "MT" anywhere in a name and therefore also
# discarded unrelated genes such as MTOR or DNMT1.
# NOTE(review): assumes upper-case, human-style gene symbols -- confirm.
gene <- !grepl(
  pattern = "^RP[LS]|^MT-",
  x = rownames(sce)
)
dec <- modelGeneVar(sce, subset.row = gene)

# Select the top highly variable genes; the number comes from `config`,
# which is expected to be defined elsewhere.
hvg <- getTopHVGs(dec, n = config$n_top_genes)

# Attach the reference annotation as the cell labels. Fail early with a
# clear message if the annotation column is missing.
stopifnot("celltype" %in% colnames(colData(sce)))
colLabels(sce) <- colData(sce)$celltype

# Compute marker-gene statistics for every label, on the filtered gene set.
mgs <- scoreMarkers(sce, subset.row = gene)

# Keep only the most informative markers of each cell type.
mgs_fil <- lapply(names(mgs), function(cluster_id) {
  stats <- mgs[[cluster_id]]
  # Keep genes with mean AUC > 0.8 ...
  stats <- stats[stats$mean.AUC > 0.8, ]
  # ... ordered from highest to lowest weight.
  stats <- stats[order(stats$mean.AUC, decreasing = TRUE), ]
  # Record gene and cluster ids as columns; rep() keeps the assignment
  # length-safe when no gene passes the filter.
  stats$gene <- rownames(stats)
  stats$cluster <- rep(cluster_id, nrow(stats))
  data.frame(stats)
})
mgs_df <- do.call(rbind, mgs_fil)

跑起来#

cat("3. spotlight runing ...\n")
# Reference expression matrix taken from the Seurat object.
# NOTE(review): layer = "data" is read here but stored under the "counts"
# assay below -- confirm whether layer = "counts" was intended.
x_matrix <- GetAssayData(seurat_ref, assay = "RNA", layer = "data")
# x_matrix <- as(x_matrix, "RsparseMatrix")

# Wrap the reference (x) and the spatial matrix (y) as SingleCellExperiment
# objects, each with a single "counts" assay.
x <- SingleCellExperiment(list(counts = x_matrix))
y <- SingleCellExperiment(list(counts = expr_matrix))

# One label per reference cell; coerced to character in case the
# annotation column is a factor.
groups <- as.character(sce$celltype)

print(class(x_matrix))
print(class(expr_matrix))

# Drop the large intermediates before the deconvolution step.
rm(seurat_ref, adata, expr_matrix, x_matrix, sce, dec)
# Run the deconvolution. `weight_id`, `group_id` and `gene_id` name the
# columns of `mgs_df` holding the marker weight, the cell type and the gene.
res <- SPOTlight(
  x = x,
  y = y,
  groups = groups,
  mgs = mgs_df,
  hvg = hvg,
  weight_id = "mean.AUC",
  group_id = "cluster",
  gene_id = "gene"
)

cat("saving ... \n")
# The output path comes from `config`, expected to be defined elsewhere.
saveRDS(res, file = config$spotlight_res_path)