两样本间同种细胞的差异分析之火山图遍历绘制
概述:
本文介绍了一个用于分析单细胞/空间转录组数据的R脚本,可自动生成两样本间同种细胞的差异基因火山图。该脚本要求输入实验组和对照组的Seurat对象(.rds文件),通过设置log2FC和p值阈值筛选差异基因,并自动过滤细胞数量不足100的亚群。输出包括每类细胞的全部差异基因列表、top200基因列表及火山图可视化结果。
环境配置:
if (!base::requireNamespace("BiocManager", quietly = TRUE)){ install.packages("BiocManager") } BiocManager::install(c("Seurat","dplyr","ggplot2","tidyr","ggrepel"))demo数据及脚本“volcano.r”下载方式:
通过网盘分享的文件:18-火山图
链接: https://pan.baidu.com/s/1O0_qXB4zThXGBTkvO-7VMw?pwd=qabn 提取码: qabn
脚本使用说明:
Rscript ./volcano.r --help Options: --obj1_path_T path obj1 rds path --obj2_path_C path obj2 rds path --prefix1 character prefix1 of obj1 --prefix2 character prefix2 of obj2 --anno character annotation column --outdir path output directory --avg_log2FC 0.5 avg_log2FC threshold --p_val_adj 0.05 p_val_adj threshold --help help message Example: Rscript volcano.R --obj1_path_T obj1.rds --obj2_path_C obj2.rds --prefix1 Treat --prefix2 Ctrl --outdir output/ --anno seurat_clusters --avg_log2FC 0.5 --p_val_adj 0.05参数说明:--obj1_path_T:是实验组的rds路径;--obj2_path_T:是对照组的rds路径;--prefix1:是实验组的标签,自行设定;--prefix2:是对照组的标签;--anno:是rds中的meta.data中其中一列标注细胞注释或者聚类的列命;--outdir:是结果输出路径;--avg_log2FC:是筛选差异表达基因(DEGs)时的核心统计量之一,平均 log2 倍数变化,默认0.5;--p_val_adj:校正后的P值,通常作为差异基因的统计显著性阈值,默认0.05。
脚本使用示例:
Rscript volcano.r --obj1_path_T treat.rds --obj2_path_C ctrl.rds --prefix1 treat --prefix2 ctrl --outdir outdir --anno seurat_clusters --avg_log2FC 0.5 --p_val_adj 0.05脚本输出结果:
通过该脚本会得到两个目录 marker_treat_ctrl和 volcano_treat_ctrl ,marker_treat_ctrl存放的是全部marker基因列表和top200的marker基因列表,volcano_treat_ctrl存放的是满足绘制要求的火山图。
$tree ./ ./ ├── marker_treat_ctrl │ ├── treat_ctrl_0_ALL_markers.csv │ ├── treat_ctrl_0_top200_markers.csv │ ├── treat_ctrl_11_ALL_markers.csv │ ├── treat_ctrl_11_top200_markers.csv │ ├── treat_ctrl_13_ALL_markers.csv │ ├── treat_ctrl_13_top200_markers.csv │ ├── treat_ctrl_14_ALL_markers.csv │ ├── treat_ctrl_14_top200_markers.csv │ ├── treat_ctrl_15_ALL_markers.csv │ ├── treat_ctrl_15_top200_markers.csv │ ├── treat_ctrl_16_ALL_markers.csv │ ├── treat_ctrl_16_top200_markers.csv │ ├── treat_ctrl_17_ALL_markers.csv │ ├── treat_ctrl_17_top200_markers.csv │ ├── treat_ctrl_1_ALL_markers.csv │ ├── treat_ctrl_1_top200_markers.csv │ ├── treat_ctrl_2_ALL_markers.csv │ ├── treat_ctrl_2_top200_markers.csv │ ├── treat_ctrl_4_ALL_markers.csv │ ├── treat_ctrl_4_top200_markers.csv │ ├── treat_ctrl_5_ALL_markers.csv │ ├── treat_ctrl_5_top200_markers.csv │ ├── treat_ctrl_6_ALL_markers.csv │ ├── treat_ctrl_6_top200_markers.csv │ ├── treat_ctrl_8_ALL_markers.csv │ ├── treat_ctrl_8_top200_markers.csv │ ├── treat_ctrl_9_ALL_markers.csv │ └── treat_ctrl_9_top200_markers.csv └── volcano_treat_ctrl ├── treat_ctrl_0_volcano.png ├── treat_ctrl_11_volcano.png ├── treat_ctrl_13_volcano.png ├── treat_ctrl_16_volcano.png ├── treat_ctrl_1_volcano.png ├── treat_ctrl_4_volcano.png ├── treat_ctrl_6_volcano.png ├── treat_ctrl_8_volcano.png └── treat_ctrl_9_volcano.png展示其中一个的火山图:
另附上脚本内容:
library(getopt) # 定义参数矩阵 【修复版】 spec <- matrix(c( 'obj1_path_T', 'T', 1, "character", 'obj2_path_C', 'C', 1, "character", 'prefix1', 'x', 1, "character", 'prefix2', 'y', 1, "character", 'anno', 'a', 1, "character", 'outdir', 'o', 1, "character", 'avg_log2FC', 'f', 2, "character", 'p_val_adj', 'q', 2, "character", 'help', 'h', 0, 'logical' ), byrow = TRUE, ncol = 4) # 帮助信息 print_usage <- function() { cat(" Options: --obj1_path_T path obj1 rds path --obj2_path_C path obj2 rds path --prefix1 character prefix1 of obj1 --prefix2 character prefix2 of obj2 --anno character annotation column --outdir path output directory --avg_log2FC 0.5 avg_log2FC threshold --p_val_adj 0.05 p_val_adj threshold --help help message Example: Rscript volcano.R --obj1_path_T obj1.rds --obj2_path_C obj2.rds --prefix1 Treat --prefix2 Ctrl --outdir output/ --anno seurat_clusters --avg_log2FC 0.5 --p_val_adj 0.05 \n") q(status = 1) } # 获取参数 opt <- getopt(spec) # 帮助 if (!is.null(opt$help) || length(opt) == 0) { print_usage() } # 默认值 if(is.null(opt$avg_log2FC)) opt$avg_log2FC = 0.5 if(is.null(opt$p_val_adj)) opt$p_val_adj = 0.05 # 转成数值型 【关键!】 opt$avg_log2FC <- as.numeric(opt$avg_log2FC) opt$p_val_adj <- as.numeric(opt$p_val_adj) # 检查必需参数 if (is.null(opt$obj1_path_T) || is.null(opt$obj2_path_C) || is.null(opt$prefix1) || is.null(opt$prefix2) || is.null(opt$outdir) || is.null(opt$anno)) { cat("Error: Missing required arguments!\n") print_usage() } # 创建输出目录 if (!dir.exists(opt$outdir)) { dir.create(opt$outdir, recursive = TRUE) } # 提取参数 【全部修复对应】 obj1_path_T = opt$obj1_path_T obj2_path_C = opt$obj2_path_C prefix1 = opt$prefix1 prefix2 = opt$prefix2 outdir = opt$outdir anno = opt$anno log2F = opt$avg_log2FC p_val_adj = opt$p_val_adj library(Seurat) library(dplyr) library(ggplot2) library(tidyr) library(ggrepel) marker_path = paste0(outdir,'/marker_',prefix1,'_',prefix2) volcano_path = paste0(outdir,'/volcano_',prefix1,'_',prefix2) dir.create(volcano_path) dir.create(marker_path) obj1 = readRDS(obj1_path_T) obj2 = readRDS(obj2_path_C) obj1$orig.ident = prefix1 obj2$orig.ident = prefix2 combined <- merge(obj1, y = obj2, add.cell.ids = c(prefix1, prefix2)) Idents(combined) = anno clusters = unique(unique(combined[[anno]][,1])) for (cluster in clusters){ print(paste0('--------------',cluster,'-analysis!','--------------')) obj = subset(combined, idents = cluster) len1 = length(rownames(obj@meta.data[obj$orig.ident %in% prefix1, ])) len2 = length(rownames(obj@meta.data[obj$orig.ident %in% prefix2, ])) if (len1 >= 100 & len2 >= 100){ Idents(obj) = 'orig.ident' markers = FindMarkers(obj, ident.1 = prefix1, ident.2 = prefix2, min.pct = 0.1,logfc.threshold =0.1,recorrect_umi = FALSE) if (dim(markers)[1]==0){ print(paste0(cluster,' : no-DEG-genes')) print(paste0(cluster,'-no-volcano!')) next } Markers = FindAllMarkers(obj, only.pos = TRUE, min.pct = 0.1,logfc.threshold =0.1,assay ="Spatial",slot = "data",recorrect_umi = FALSE) if (dim(Markers)[1]==0){ print(paste0(cluster,' : no-DEG-genes')) print(paste0(cluster,'-no-volcano!')) next } Markers = Markers %>% group_by(cluster) %>% arrange(cluster, desc(avg_log2FC)) top200_markers = Markers %>% group_by(cluster) %>% top_n(n = 200, wt = avg_log2FC) %>% arrange(cluster, desc(avg_log2FC)) write.csv(Markers,paste0(marker_path,'/',prefix1,'_',prefix2,'_',cluster,'_ALL_markers.csv')) write.csv(top200_markers,paste0(marker_path,'/',prefix1,'_',prefix2,'_',cluster,'_top200_markers.csv')) markers = markers %>% arrange( desc(avg_log2FC)) feature_df = Markers %>% group_by(cluster) %>% top_n(n = 5, wt = avg_log2FC) markers$gene = rownames(markers) markers$label = markers$gene markers <- markers %>% mutate(label = ifelse(gene %in% feature_df$gene, gene, NA)) markers$threshold="ns"; idx1 <- which(markers$avg_log2FC > log2F & markers$p_val_adj < p_val_adj) idx2 = which(markers$avg_log2FC < (-log2F) & markers$p_val_adj < p_val_adj) if (length(idx1) > 0 & length(idx2) >0 ){ markers[which(markers$avg_log2FC > log2F & markers$p_val_adj <p_val_adj),]$threshold="up"; markers[which(markers$avg_log2FC < (-log2F) & markers$p_val_adj < p_val_adj),]$threshold="down"; markers$threshold=factor(markers$threshold, levels=c('down','ns','up')) markers =markers%>% mutate(label = ifelse((!is.na(label) &threshold =='ns' ), NA, label)) p = ggplot(data=markers, aes(x=avg_log2FC, y=-log10(p_val_adj), color=threshold)) + geom_point(alpha=0.8, size=0.8) + geom_vline(xintercept = c(-log2F, log2F), linetype=2, color="grey")+ geom_hline(yintercept = -log10(p_val_adj), linetype=2, color="grey")+ #labs(title= ifelse(""==title, "", paste("DEG:", title)))+ xlab(bquote(Log[2]*FoldChange))+ ylab(bquote(-Log[10]*italic(P.adj)) )+ theme_classic(base_size = 14) + ggtitle(paste0(prefix1,'(Treat) vs ', prefix2, '(Ctrl)','cell:',cluster))+ scale_color_manual('',labels=c(paste0(prefix2,"(",table(markers$threshold)[[1]],')'),'ns', paste0(prefix1,"(",table(markers$threshold)[[3]],')' )), values=c("blue", "grey","red" ) )+ guides(color=guide_legend(override.aes = list(size=3, alpha=1)))+ geom_label_repel(data = markers, aes(x = avg_log2FC, y = -log10(p_val_adj), label = label), size = 5,color="black", box.padding = unit(0.4, "lines"), segment.color = "black", #连线的颜色 segment.size = 0.4, #连线的粗细 ) ggsave(plot = p,paste0(volcano_path,'/',prefix1,'_',prefix2,'_',cluster,'_volcano.png'),width = 8,height = 6) print(paste0(cluster,'-done!')) } else if(length(idx1) > 0 & length(idx2) == 0) { markers[which(markers$avg_log2FC > log2F & markers$p_val_adj < p_val_adj), ]$threshold = "up" markers$threshold = factor(markers$threshold, levels = c('ns', 'up')) markers =markers%>% mutate(label = ifelse((!is.na(label) &threshold =='ns' ), NA, label)) p = ggplot(data = markers, aes(x = avg_log2FC, y = -log10(p_val_adj), color = threshold)) + geom_point(alpha = 0.8, size = 0.8) + geom_vline(xintercept = c(-log2F, log2F), linetype = 2, color = "grey") + geom_hline(yintercept = -log10(p_val_adj), linetype = 2, color = "grey") + xlab(bquote(Log[2] * FoldChange)) + ylab(bquote(-Log[10] * italic(P.adj))) + theme_classic(base_size = 14) + ggtitle(paste0(prefix1,'(Treat) vs ', prefix2, '(Ctrl)','cell:',cluster))+ scale_color_manual('', labels = c('ns', paste0( prefix1, "(", table(markers$threshold)[['up']], ')')), values = c("grey", "red")) + guides(color = guide_legend(override.aes = list(size = 3, alpha = 1))) + geom_label_repel(data = markers, aes(x = avg_log2FC, y = -log10(p_val_adj), label = label), size = 5, color = "black", box.padding = unit(0.4, "lines"), segment.color = "black", segment.size = 0.4) ggsave(plot = p,paste0(volcano_path,'/',prefix1,'_',prefix2,'_',cluster,'_volcano.png'),width = 8,height = 6) print(paste0(cluster,'-done!')) }else if(length(idx1) == 0 &length(idx2) >0 ){ markers[which(markers$avg_log2FC < (-log2F) & markers$p_val_adj < p_val_adj),]$threshold="down"; markers$threshold=factor(markers$threshold, levels=c('down','ns')) markers =markers%>% mutate(label = ifelse((!is.na(label)&threshold =='ns' ), NA, label)) p = ggplot(data=markers, aes(x=avg_log2FC, y=-log10(p_val_adj), color=threshold)) + geom_point(alpha=0.8, size=0.8) + geom_vline(xintercept = c(-log2F, log2F), linetype=2, color="grey")+ geom_hline(yintercept = -log10(p_val_adj), linetype=2, color="grey")+ #labs(title= ifelse(""==title, "", paste("DEG:", title)))+ xlab(bquote(Log[2]*FoldChange))+ ylab(bquote(-Log[10]*italic(P.adj)) )+ theme_classic(base_size = 14) + ggtitle(paste0(prefix1,'(Treat) vs ', prefix2, '(Ctrl)','cell:',cluster))+ scale_color_manual('',labels=c(paste0(paste0(prefix2,"(",table(markers$threshold)[[1]],')' ),'ns')), values=c("blue","grey" ) )+ guides(color=guide_legend(override.aes = list(size=3, alpha=1)))+ geom_label_repel(data = markers, aes(x = avg_log2FC, y = -log10(p_val_adj), label = label), size = 5,color="black", box.padding = unit(0.4, "lines"), segment.color = "black", #连线的颜色 segment.size = 0.4, #连线的粗细 ) ggsave(plot = p,paste0(volcano_path,'/',prefix1,'_',prefix2,'_',cluster,'_volcano.png'),width = 8,height = 6) print(paste0(cluster,'-done!')) } } else { print(paste0(cluster,'cell number < 100, no-volcano!')) next } }