TCGA转录组差异基因分析

数据前期准备(shell&perl)

#这个是从TCGA官网下载的文件,并将文件夹下的count文件放到一个文件夹下(此处以rawcount为例)
ls [!wenjian]* > wenjian
les wenjian | grep gz$ > jieguo
les wenjian | grep -v gz$ > guocheng1
les guocheng1 | grep -v annotation > guocheng2
les guocheng2 | grep -v log > guocheng3
awk NF guocheng3 > guocheng4
sed 's/://g' guocheng4 > guocheng5
sed 's/:/\//g' guocheng4 > guocheng6
paste guocheng6 jieguo > hebing1
sed 's/\t//g' hebing1 > hebing2
sed 's/^/cp /g' hebing2 > hebing3
mkdir rawcount
sed 's/$/ rawcount\//g' hebing3 > hebingfinish
les hebingfinish
chmod 755 hebingfinish
./hebingfinish

#解压所有gz压缩的文件
ls | grep gz$ > zhunbei1
sed 's/^/gunzip /g' zhunbei1
sed 's/^/gunzip /g' zhunbei1 > zhunbei2
bash zhunbei2

#对所有count数据进行合并,并统计出normal和tumor数目
./merge.pl  metadata.cart.2018-07-09.json 
#normal count: 52
#tumor count: 499

#下载gtf文件并用于将整理的count文件中的ensemble_id转换为gene_symbol
wget ftp://ftp.ensembl.org/pub/release-92/gtf/homo_sapiens/Homo_sapiens.GRCh38.92.chr.gtf.gz 
gunzip Homo_sapiens.GRCh38.92.chr.gtf.gz
chmod 755 ensembl2genename.pl
./ensembl2genename.pl Homo_sapiens.GRCh38.92.chr.gtf mRNAmatrix.txt mRNA_gene_symbol

R语言分析

library(parallel)
no_cores <- detectCores() - 1
cl <- makeCluster(no_cores)
#cl
#R语言分析--edgeR
#R3.3.1
source("https://bioconductor.org/biocLite.R")
biocLite("edgeR")
library(edgeR)
library(gplots)
setwd("D:/TCGA_ProstateCancer")
rt <- read.table("mRNA_gene_symbol",sep="\t",header=T,check.names=F)
rt <- as.matrix(rt)
rownames(rt) <- rt[,1]
exp <- rt[,2:ncol(rt)]
dimnames <- list(rownames(exp),colnames(exp))
data <- matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames=dimnames)
data <- avereps(data) #去除重复基因名
data <- data[rowMeans(data)>1,]

group <- c(rep("normal",52),rep("tumor",499)) #需要按照自己的数据填写
design <- model.matrix(~group)
y <- DGEList(counts=data,group=group)
y <- calcNormFactors(y)
y <- estimateCommonDisp(y)
y <- estimateTagwiseDisp(y)
et <- exactTest(y,pair=c("normal","tumor"))
topTags(et)
ordered_tags <- topTags(et,n=100000)

allDiff <- ordered_tags$table
allDiff <- allDiff[is.na(allDiff$FDR)==FALSE,]
diff <- allDiff
newData <- y$pseudo.counts

write.table(diff,file="edgerOut.xls",sep="\t",quote=F)
foldChange=1
padj=0.05
diffSig <- diff[(diff$FDR < padj & (diff$logFC > foldChange | diff$logFC < (-foldChange))),]
write.table(diffSig,file="diffSig.xls",sep="\t",quote=F)
diffUp <- diff[(diff$FDR < padj & (diff$logFC > foldChange)),]
write.table(diffUp,file="up.xls",sep="\t",quote=F)
diffDown <- diff[(diff$FDR < padj & (diff$logFC < (-foldChange))),]

normalizeExp <- rbind(id=colnames(newData),newData)
diffExp <- rbind(id=colnames(newData),newData[rownames(diffSig),])
diffExp <- rbind(id=colnames(newData),newData[rownames(diffSig),])
write.table(diffExp,file="diffmRNAExp.txt",sep="\t",quote=F,col.names=F)

heatmapData <- newData[rownames(diffSig),]

#火山图
allDiff <- allDiff[(allDiff$FDR > 0),] #防止出现这个Error in plot.window(...) : need finite 'xlim' values报错
pdf(file="vol.pdf")
plot(-log10(allDiff$FDR),allDiff$logFC,xlab="-log10(FDR)",ylab="logFC",main="Volcano",xlim=c(0,xMax),ylim=c(-yMax,yMax),yaxs="i",pch=20,cex=0.4)
diffSub <- allDiff[(allDiff$FDR < padj & allDiff$logFC > foldChange),]
points(-log10(diffSub$FDR),diffSub$logFC,pch=20,col="red",cex=0.4)
diffSub <- allDiff[(allDiff$FDR < padj & allDiff$logFC < (-foldChange)),]
points(-log10(diffSub$FDR),diffSub$logFC,pch=20,col="green",cex=0.4)
abline(h=0,lty=2,lwd=3)
dev.off()

#热图
hmExp <- log10(heatmapData+0.001)
pdf(file="heatmap.pdf",width=60,height=90)
par(oma=c(10,3,3,7))
heatmap.2(hmExp,col='greenred',key=T,keysize=0.8,cexCol=0.8,cexRow=0.1,trace="none")
dev.off()

#R语言分析---DESeq
source("https://bioconductor.org/biocLite.R")
biocLite("DESeq")
biocLite("limma")
#如果提示RCurl的Error,则需要运行下面的命令
#install.packages("RCurl")
library(DESeq)
library(limma)

setwd("D:/TCGA_ProstateCancer/DESeq")
rt <- read.table("mRNA_gene_symbol",sep="\t",header=T,check.name=F)
rt <- as.matrix(rt)
rownames(rt) <- rt[,1]
exp <- rt[,2:ncol(rt)]
dimnames <- list(rownames(exp),colnames(exp))
data <- matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames=dimnames)
#dim(data)
#[1] 56867   551
data <- avereps(data)
#dim(data)
#[1] 55402   551
data <- data[rowMeans(data)>1,]
#dim(data)
#[1] 31726   551
data <- round(data,0) #去除小数点

group <- c(rep("normal",52),rep("tumor",499))
design <- factor(group)
newTab <- newCountDataSet(data,design) #对元数据处理,构建DESeq内部数据结构
newTab <- estimateSizeFactors(newTab) #对内部的数据进行Normalization
newData <- counts(newTab,normalized=TRUE) #将Normalization的数据输出

#重复样品
newTab <- estimateDispersions(newTab) #数据变异系数估计
diff <- nbinomTest(newTab,"normal","tumor") #计算差异基因,计算要求比较高内存至少16G以上(551)
diff <- diff[is.na(diff$padj)==FALSE,]
diff <- diff[order(diff$pval),]
write.table(diff,file="DESeqOut.xls",sep="\t",quote=F,row.name=F)
padj=0.05
foldChange=1
diffSig <- diff[(diff$padj < padj & (diff$log2FoldChange > foldChange | diff$log2FoldChange < (-foldChange))),]
write.table(diffSig,file="diffSig.xls",sep="\t",quote=F,row.names=F)
diffUp <- diff[(diff$padj < padj & (diff$log2FoldChange > foldChange)),]
write.table(diffUp,file="up.xls",sep="\t",quote=F,row.names=F)
diffDown <- diff[(diff$padj < padj & (diff$log2FoldChange < (-foldChange))),]
write.table(diffDown,file="down.xls",sep="\t",quote=F,row.names=F)

normalizeExp <- rbind(id=colnames(newData),newData)
write.table(normalizeExp,file="normalizeExp.txt",sep="\t",quote=F,col.names=F)
diffExp <- rbind(id=colnames(newData),newData[diffSig$id,])
write.table(diffExp,file="diffmRNAExp.txt",sep="\t",quote=F,col.names=F)

#热图
hmExp <- log10(newData[diffSig$id,]+0.001)
library(gplots)
pdf(file="heatmap.pdf",width=60,height=90)
par(oma=c(10,4,4,7))
heatmap.2(hmExp,col='greenred',key=T,keysize=0.8,cexCol=0.8,cexRow=0.1,trace="none")
dev.off()

#火山图
pdf(file="vol.pdf")
allDiff <- diff[is.na(diff$padj)==FALSE,]
xMax <- max(-log10(allDiff$padj)) + 1
yMax <- 10
plot(-log10(allDiff$padj), allDiff$log2FoldChange, xlab="-log10(padj)",ylab="log2FoldChange",main="Volcano", xlim=c(0,xMax),ylim=c(-yMax,yMax),yaxs="i",pch=20, cex=0.4)
diffSub <- allDiff[(allDiff$padj < padj & allDiff$log2FoldChange > foldChange),]
points(-log10(diffSub$padj),diffSub$log2FoldChange,pch=20,col="red",cex=0.4)
diffSub <- allDiff[(allDiff$padj < padj & allDiff$log2FoldChange < (-foldChange)),]
points(-log10(diffSub$padj),diffSub$log2FoldChange,pch=20,col="green",cex=0.4)
abline(h=0,lty=2,lwd=3)
dev.off()

#venn取交集差异基因
setwd("D:/TCGA_ProstateCancer/Venn")
#getwd()

file1="edgeR_diffmRNA"
file2="DESeq_diffmRNA"

list1 <- list()
library(VennDiagram)
rt1 <- read.table(file1,header=F)
list1[["edgeR"]] <- as.vector(rt1[,1])
rt2 <- read.table(file2,header=F)
list1[["DESeq"]] <- as.vector(rt2[,1])

sect1 <- intersect(list1[[1]],list1[[2]])
write.table(file="intersect.xls",sect1,sep="\t",quote=F,col.names=F,row.names=F)
stopCluster(cl)
此条目发表在TCGA分类目录。将固定链接加入收藏夹。

TCGA转录组差异基因分析》有6条回应

  1. 一一说:

    您好,请教一下,我在做到R语言分析—DESeq时,出现了这个错误,后面一直没有调试正确,可以请您帮我看看是哪里出问题了吗

    newTab <- newCountDataSet(data,design)

    Error in newCountDataSet(data, design) :
    length(conditions) == ncol(countData) is not TRUE

  2. 说:

    热图

    hmExp <- log10(newData[diffSig$id,]+0.001)
    library(gplots)
    pdf(file=”heatmap.pdf”,width=60,height=90)
    par(oma=c(10,4,4,7))
    heatmap.2(hmExp,col=’greenred’,key=T,keysize=0.8,cexCol=0.8,cexRow=0.1,trace=”none”)
    dev.off()

    想请教一下,这里c(10,4,4,7)的参数10、4、4、7是怎么确定的?谢谢!!

发表评论

邮箱地址不会被公开。 必填项已用*标注

To create code blocks or other preformatted text, indent by four spaces:

    This will be displayed in a monospaced font. The first four 
    spaces will be stripped off, but all other whitespace
    will be preserved.
    
    Markdown is turned off in code blocks:
     [This is not a link](http://example.com)

To create not a block, but an inline code span, use backticks:

Here is some inline `code`.

For more help see http://daringfireball.net/projects/markdown/syntax

Protected with IP Blacklist CloudIP Blacklist Cloud