TCGA_生存分析

#R 生存曲线
setwd("D:/TCGA_ProstateCancer_Survival/Survival_analysis")
library(survival)
library(survminer)
rt <- read.table("survivalInput.txt",header=T,sep="\t")
rt$futime <- rt$futime/365 
a <- (rt$expression < median(rt$expression))
diff <- survdiff(Surv(futime,fustat)~a,data=rt) 
pValue <- 1 - pchisq(diff$chisq,df=1)
pValue <- round(pValue,5)
fit <- survfit(Surv(futime,fustat) ~ a,data=rt) 
summary(fit) 
ggsurvplot(fit,pval=TRUE,linetype="solid",palette=c("red","blue"),legend.title="",legend=c(0.95,0.95),legend.labs=c("AR High-expression","AR low-expression"),conf.int=T,conf.int.style="ribbon",conf.int.alpha=0.1,font.legend=c(6))
ggsave(file="a.pdf",width = 10, height = 10)
发表在 TCGA | 留下评论

TCGA_简单的WGCNA分析(并行计算)

source("https://bioconductor.org/biocLite.R")
biocLite(c("GO.db","preprocessCore","impute"))
install.packages(c("matrixStats","Hmisc","splines","foreach","doParallel","reshape","fastcluster","dynamicTreeCut","survival"))
install.packages("WGCNA")

setwd("D:/TCGA_coexpression/WGCNA")
library(WGCNA)
allowWGCNAThreads() #查看核心数
ALLOW_WGCNA_THREADS=4 #设置为4核心运行

rt <- read.table(file="merge.txt",sep="\t",row.names=1,header=T,check.names=F,quote="!")
datExpr <- t(rt)
power1 <- c(seq(1,10,by=1),seq(12,30,by=2))
RpowerTable <- pickSoftThreshold(datExpr,powerVector=power1)[[2]]
#上面的输出结果
#   Power SFT.R.sq  slope truncated.R.sq mean.k. median.k. max.k.
#1      1    0.115 -0.643          0.860 639.000  6.13e+02 1360.0
#2      2    0.527 -1.160          0.816 209.000  1.73e+02  653.0
#3      3    0.693 -1.030          0.781  96.300  6.48e+01  359.0
#4      4    0.945 -0.919          0.954  54.000  2.95e+01  236.0
#5      5    0.929 -1.080          0.963  34.200  1.44e+01  195.0
#6      6    0.945 -1.160          0.986  23.400  7.41e+00  167.0
#7      7    0.951 -1.210          0.989  16.900  4.02e+00  147.0
#8      8    0.967 -1.250          0.999  12.800  2.29e+00  130.0
#9      9    0.975 -1.270          0.998   9.930  1.34e+00  117.0
#10    10    0.979 -1.280          0.998   7.920  8.25e-01  106.0
#11    12    0.982 -1.280          0.997   5.340  3.62e-01   88.9
#12    14    0.981 -1.270          0.991   3.820  1.75e-01   76.0
#13    16    0.983 -1.270          0.992   2.860  8.68e-02   66.0
#14    18    0.982 -1.260          0.989   2.210  4.28e-02   57.9
#15    20    0.981 -1.250          0.988   1.750  2.24e-02   51.4
#16    22    0.961 -1.250          0.966   1.420  1.21e-02   46.0
#17    24    0.961 -1.240          0.961   1.170  6.49e-03   41.4
#18    26    0.977 -1.230          0.982   0.981  3.44e-03   37.5
#19    28    0.981 -1.240          0.987   0.832  1.87e-03   34.2
#20    30    0.968 -1.240          0.970   0.713  1.01e-03   31.3

cex1 <- 0.7
pdf(file="softThresholding.pdf")
par(mfrow=c(1,2))
plot(RpowerTable[,1],-sign(RpowerTable[,3])*RpowerTable[,2],xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n")
text(RpowerTable[,1],-sign(RpowerTable[,3])*RpowerTable[,2],labels=power1,cex=cex1,col="red")
abline(h=0.85,col="red")
plot(RpowerTable[,1],RpowerTable[,5],xlab="Soft Threshold (power)",ylab="Mean Connectivity",type="n")
text(RpowerTable[,1],RpowerTable[,5],labels=power1,cex=cex1,col="red")
dev.off()

beta1 <- 9
ADJ <- adjacency(datExpr,power=beta1)
visual <- exportNetworkToCytoscape(ADJ,edgeFile="edge.txt",nodeFile="node.txt",threshold=0.8)

disableWGCNAThreads()
发表在 TCGA | 留下评论

TCGA_miRNA数据和mRNA数据整合

R语言方法

setwd("D:/TCGA_coexpression")
miRNA <- read.table("diff_miRNAExp.txt",row.names=1,header=T,sep="\t",check.names=F)
mRNA <- read.table("diffmRNAExp.txt",row.names=1,header=T,sep="\t",check.names=F)
colnames(miRNA) <- gsub("(....)?-(..)?-(....)?-(...)?.*","\\1-\\2-\\3-\\4",colnames(miRNA)) #此处的?号用于非贪婪匹配,最后的.*可以让最后的替换成功
colnames(mRNA) <- gsub("(....)?-(..)?-(....)?-(...)?.*","\\1-\\2-\\3-\\4",colnames(mRNA))
sameid <- intersect(colnames(miRNA),colnames(mRNA))
merge <- rbind(id=same,miRNA[,sameid],mRNA[,sameid])
write.table(merge,file="merge.txt",sep="\t",quote=F,col.names=F)

miRNALable <- cbind(rownames(miRNA),"miRNA")
mRNALable <- cbind(rownames(mRNA),"gene")
node <- rbind(c("ID","Classify"),miRNALable,mRNALable)
write.table(node,file="nodelable.txt",sep="\t",quote=F,col.names=F,row.names=F)
发表在 TCGA | 留下评论

TCGA-miRNA差异表达分析

数据前期整理(perl)

./merge.pl metadata.cart.2018-07-14.json
#normal count: 52
#tumor count: 499

R语言分析

#R语言分析---edgeR
setwd("D:/TCGA_miRNA_ProstateCancer")
foldChange <- 1
padj <- 0.05

library(edgeR)
rt <- read.table(file="miRNAmatrix.txt",sep="\t",header=T,check.names=F)
rt <- as.matrix(rt)
rownames(rt) <- rt[,1]
exp <- rt[,2:ncol(rt)]
dimnames <- list(rownames(exp),colnames(exp))
data <- matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames=dimnames)
data <- avereps(data)
# dim(data)
# [1] 1881  551
data <- data[rowMeans(data)>0,]
# dim(data)
#[1] 1514  551
group <- c(rep("normal",52),rep("tumor",499))
y <- DGEList(counts=data,group=group)
y <- calcNormFactors(y)
y <- estimateCommonDisp(y)
y <- estimateTagwiseDisp(y)
et <- exactTest(y,pair=c("normal","tumor"))
topTags(et)
ordered_tags <- topTags(et,n=100000)

allDiff <- ordered_tags$table
allDiff <- allDiff[is.na(allDiff$FDR)==FALSE,]
diff <- allDiff
newData <- y$pseudo.counts

write.table(diff,file="edgeOut.xls",sep="\t",quote=F)
diffSig <- diff[(diff$FDR < padj & (diff$logFC > foldChange | diff$logFC < (-foldChange))),]
write.table(diffSig,file="diffSig.xls",sep="\t",quote=F)
diffUp <- diff[(diff$FDR < padj & (diff$logFC > foldChange)),]
write.table(diffUp,file="up.xls",sep="\t",quote=F)
diffDown <- diff[(diff$FDR < padj & (diff$logFC < (-foldChange))),]
write.table(diffDown,file="down.xls",sep="\t",quote=F)

normalizeExp <- rbind(id=colnames(newData),newData)
write.table(normalizeExp,file="normalizeExp.txt",sep="\t",quote=F,col.names=F)
diffExp <-rbind(id=colnames(newData),newData[rownames(diffSig),])
write.table(diffExp,file="diffmiRNAExp.txt",sep="\t",quote=F,col.names=F)

heatmapData <- newData[rownames(diffSig),]
#火山图
pdf(file="vol.pdf")
xMax <- max(-log10(allDiff$FDR))+1
yMax <- 12
plot(-log10(allDiff$FDR), allDiff$logFC, xlab="-log10(FDR)",ylab="logFC",
     main="Volcano", xlim=c(0,xMax),ylim=c(-yMax,yMax),yaxs="i",pch=20, cex=0.4)
diffSub <- allDiff[(allDiff$FDR < padj & allDiff$logFC > foldChange),]
points(-log10(diffSub$FDR), diffSub$logFC, pch=20, col="red",cex=0.4)
diffSub <- allDiff[(allDiff$FDR < padj & allDiff$logFC < (-foldChange)),]
points(-log10(diffSub$FDR), diffSub$logFC, pch=20, col="green",cex=0.4)
abline(h=0,lty=2,lwd=3)
dev.off()

#热图
library(gplots)
hmExp <- log10(heatmapData+0.001)
hmMat <- as.matrix(hmExp) #这句没必要,因为都是matrix
pdf(file="heatmap.pdf",width=60,height=30)
par(oma=c(10,3,3,7))
heatmap.2(hmMat,col='greenred',key=T,keysize=0.8,cexCol=0.8,cexRow=0.5,trace="none")
dev.off()

#R语言DESeq
foldChange=1
padj=0.05

setwd("D:/TCGA_miRNA_ProstateCancer/DESeq_miRNA_ProstateCancer")
rt <-read.table("miRNAmatrix.txt",sep="\t",header=T,check.names=F)  #改成自己的文件名
rt <- as.matrix(rt)
rownames(rt) <- rt[,1]
exp <- rt[,2:ncol(rt)]
dimnames <- list(rownames(exp),colnames(exp))
data <- matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames=dimnames)
data <- avereps(data)
data <- data[rowMeans(data)>0,] 
data <- round(data,0)


group <- c(rep("normal",52),rep("tumor",499))
design <- factor(group)
newTab <- newCountDataSet(data,design) #对元数据处理,构建DESeq内部数据结构
newTab <- estimateSizeFactors(newTab) #对内部的数据进行Normalization
newData <- counts(newTab,normalized=TRUE) #将Normalization的数据输出

#重复样品
newTab <- estimateDispersions(newTab) #数据变异系数估计
diff <- nbinomTest(newTab,"normal","tumor") #计算差异基因
diff <- diff[is.na(diff$padj)==FALSE,]
diff <- diff[order(diff$pval),]
write.table(diff,file="DESeq_miRNA_Out.xls",sep="\t",quote=F,row.name=F)
diffSig <- diff[(diff$padj < padj & (diff$log2FoldChange > foldChange | diff$log2FoldChange < (-foldChange))),]
write.table(diffSig,file="diffSig_miRNA.xls",sep="\t",quote=F,row.names=F)
diffUp <- diff[(diff$padj < padj & (diff$log2FoldChange > foldChange)),]
write.table(diffUp,file="up_miRNA.xls",sep="\t",quote=F,row.names=F)
diffDown <- diff[(diff$padj < padj & (diff$log2FoldChange < (-foldChange))),]
write.table(diffDown,file="down_miRNA.xls",sep="\t",quote=F,row.names=F)

normalizeExp <- rbind(id=colnames(newData),newData)
write.table(normalizeExp,file="normalizeExp_miRNA.txt",sep="\t",quote=F,col.names=F)
diffExp <- rbind(id=colnames(newData),newData[diffSig$id,])
write.table(diffExp,file="diffmiRNAExp.txt",sep="\t",quote=F,col.names=F)

#热图
hmExp <- log10(newData[diffSig$id,]+0.001)
library(gplots)
pdf(file="heatmap.pdf",width=60,height=90)
par(oma=c(10,4,4,7))
heatmap.2(hmExp,col='greenred',key=T,keysize=0.8,cexCol=0.5,cexRow=0.3,trace="none")
dev.off()

#火山图
pdf(file="vol.pdf")
allDiff <- diff[is.na(diff$padj)==FALSE,]
xMax <- max(-log10(allDiff$padj)) + 1
yMax <- 10
plot(-log10(allDiff$padj), allDiff$log2FoldChange, xlab="-log10(padj)",ylab="log2FoldChange",main="Volcano", xlim=c(0,xMax),ylim=c(-yMax,yMax),yaxs="i",pch=20, cex=0.4)
diffSub <- allDiff[(allDiff$padj < padj & allDiff$log2FoldChange > foldChange),]
points(-log10(diffSub$padj),diffSub$log2FoldChange,pch=20,col="red",cex=0.4)
diffSub <- allDiff[(allDiff$padj < padj & allDiff$log2FoldChange < (-foldChange)),]
points(-log10(diffSub$padj),diffSub$log2FoldChange,pch=20,col="green",cex=0.4)
abline(h=0,lty=2,lwd=3)
dev.off()
发表在 TCGA | 34条评论

TCGA转录组差异基因分析

数据前期准备(shell&perl)

#这个是从TCGA官网下载的文件,并将文件夹下的count文件放到一个文件夹下(此处以rawcount为例)
ls [!wenjian]* > wenjian
les wenjian | grep gz$ > jieguo
les wenjian | grep -v gz$ > guocheng1
les guocheng1 | grep -v annotation > guocheng2
les guocheng2 | grep -v log > guocheng3
awk NF guocheng3 > guocheng4
sed 's/://g' guocheng4 > guocheng5
sed 's/:/\//g' guocheng4 > guocheng6
paste guocheng6 jieguo > hebing1
sed 's/\t//g' hebing1 > hebing2
sed 's/^/cp /g' hebing2 > hebing3
mkdir rawcount
sed 's/$/ rawcount\//g' hebing3 > hebingfinish
les hebingfinish
chmod 755 hebingfinish
./hebingfinish

#解压所有gz压缩的文件
ls | grep gz$ > zhunbei1
sed 's/^/gunzip /g' zhunbei1
sed 's/^/gunzip /g' zhunbei1 > zhunbei2
bash zhunbei2

#对所有count数据进行合并,并统计出normal和tumor数目
./merge.pl  metadata.cart.2018-07-09.json 
#normal count: 52
#tumor count: 499

#下载gtf文件并用于将整理的count文件中的ensemble_id转换为gene_symbol
wget ftp://ftp.ensembl.org/pub/release-92/gtf/homo_sapiens/Homo_sapiens.GRCh38.92.chr.gtf.gz 
gunzip Homo_sapiens.GRCh38.92.chr.gtf.gz
chmod 755 ensembl2genename.pl
./ensembl2genename.pl Homo_sapiens.GRCh38.92.chr.gtf mRNAmatrix.txt mRNA_gene_symbol

R语言分析

library(parallel)
no_cores <- detectCores() - 1
cl <- makeCluster(no_cores)
#cl
#R语言分析--edgeR
#R3.3.1
source("https://bioconductor.org/biocLite.R")
biocLite("edgeR")
library(edgeR)
library(gplots)
setwd("D:/TCGA_ProstateCancer")
rt <- read.table("mRNA_gene_symbol",sep="\t",header=T,check.names=F)
rt <- as.matrix(rt)
rownames(rt) <- rt[,1]
exp <- rt[,2:ncol(rt)]
dimnames <- list(rownames(exp),colnames(exp))
data <- matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames=dimnames)
data <- avereps(data) #去除重复基因名
data <- data[rowMeans(data)>1,]

group <- c(rep("normal",52),rep("tumor",499)) #需要按照自己的数据填写
design <- model.matrix(~group)
y <- DGEList(counts=data,group=group)
y <- calcNormFactors(y)
y <- estimateCommonDisp(y)
y <- estimateTagwiseDisp(y)
et <- exactTest(y,pair=c("normal","tumor"))
topTags(et)
ordered_tags <- topTags(et,n=100000)

allDiff <- ordered_tags$table
allDiff <- allDiff[is.na(allDiff$FDR)==FALSE,]
diff <- allDiff
newData <- y$pseudo.counts

write.table(diff,file="edgerOut.xls",sep="\t",quote=F)
foldChange=1
padj=0.05
diffSig <- diff[(diff$FDR < padj & (diff$logFC > foldChange | diff$logFC < (-foldChange))),]
write.table(diffSig,file="diffSig.xls",sep="\t",quote=F)
diffUp <- diff[(diff$FDR < padj & (diff$logFC > foldChange)),]
write.table(diffUp,file="up.xls",sep="\t",quote=F)
diffDown <- diff[(diff$FDR < padj & (diff$logFC < (-foldChange))),]

normalizeExp <- rbind(id=colnames(newData),newData)
diffExp <- rbind(id=colnames(newData),newData[rownames(diffSig),])
diffExp <- rbind(id=colnames(newData),newData[rownames(diffSig),])
write.table(diffExp,file="diffmRNAExp.txt",sep="\t",quote=F,col.names=F)

heatmapData <- newData[rownames(diffSig),]

#火山图
allDiff <- allDiff[(allDiff$FDR > 0),] #防止出现这个Error in plot.window(...) : need finite 'xlim' values报错
pdf(file="vol.pdf")
plot(-log10(allDiff$FDR),allDiff$logFC,xlab="-log10(FDR)",ylab="logFC",main="Volcano",xlim=c(0,xMax),ylim=c(-yMax,yMax),yaxs="i",pch=20,cex=0.4)
diffSub <- allDiff[(allDiff$FDR < padj & allDiff$logFC > foldChange),]
points(-log10(diffSub$FDR),diffSub$logFC,pch=20,col="red",cex=0.4)
diffSub <- allDiff[(allDiff$FDR < padj & allDiff$logFC < (-foldChange)),]
points(-log10(diffSub$FDR),diffSub$logFC,pch=20,col="green",cex=0.4)
abline(h=0,lty=2,lwd=3)
dev.off()

#热图
hmExp <- log10(heatmapData+0.001)
pdf(file="heatmap.pdf",width=60,height=90)
par(oma=c(10,3,3,7))
heatmap.2(hmExp,col='greenred',key=T,keysize=0.8,cexCol=0.8,cexRow=0.1,trace="none")
dev.off()

#R语言分析---DESeq
source("https://bioconductor.org/biocLite.R")
biocLite("DESeq")
biocLite("limma")
#如果提示RCurl的Error,则需要运行下面的命令
#install.packages("RCurl")
library(DESeq)
library(limma)

setwd("D:/TCGA_ProstateCancer/DESeq")
rt <- read.table("mRNA_gene_symbol",sep="\t",header=T,check.name=F)
rt <- as.matrix(rt)
rownames(rt) <- rt[,1]
exp <- rt[,2:ncol(rt)]
dimnames <- list(rownames(exp),colnames(exp))
data <- matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames=dimnames)
#dim(data)
#[1] 56867   551
data <- avereps(data)
#dim(data)
#[1] 55402   551
data <- data[rowMeans(data)>1,]
#dim(data)
#[1] 31726   551
data <- round(data,0) #去除小数点

group <- c(rep("normal",52),rep("tumor",499))
design <- factor(group)
newTab <- newCountDataSet(data,design) #对元数据处理,构建DESeq内部数据结构
newTab <- estimateSizeFactors(newTab) #对内部的数据进行Normalization
newData <- counts(newTab,normalized=TRUE) #将Normalization的数据输出

#重复样品
newTab <- estimateDispersions(newTab) #数据变异系数估计
diff <- nbinomTest(newTab,"normal","tumor") #计算差异基因,计算要求比较高内存至少16G以上(551)
diff <- diff[is.na(diff$padj)==FALSE,]
diff <- diff[order(diff$pval),]
write.table(diff,file="DESeqOut.xls",sep="\t",quote=F,row.name=F)
padj=0.05
foldChange=1
diffSig <- diff[(diff$padj < padj & (diff$log2FoldChange > foldChange | diff$log2FoldChange < (-foldChange))),]
write.table(diffSig,file="diffSig.xls",sep="\t",quote=F,row.names=F)
diffUp <- diff[(diff$padj < padj & (diff$log2FoldChange > foldChange)),]
write.table(diffUp,file="up.xls",sep="\t",quote=F,row.names=F)
diffDown <- diff[(diff$padj < padj & (diff$log2FoldChange < (-foldChange))),]
write.table(diffDown,file="down.xls",sep="\t",quote=F,row.names=F)

normalizeExp <- rbind(id=colnames(newData),newData)
write.table(normalizeExp,file="normalizeExp.txt",sep="\t",quote=F,col.names=F)
diffExp <- rbind(id=colnames(newData),newData[diffSig$id,])
write.table(diffExp,file="diffmRNAExp.txt",sep="\t",quote=F,col.names=F)

#热图
hmExp <- log10(newData[diffSig$id,]+0.001)
library(gplots)
pdf(file="heatmap.pdf",width=60,height=90)
par(oma=c(10,4,4,7))
heatmap.2(hmExp,col='greenred',key=T,keysize=0.8,cexCol=0.8,cexRow=0.1,trace="none")
dev.off()

#火山图
pdf(file="vol.pdf")
allDiff <- diff[is.na(diff$padj)==FALSE,]
xMax <- max(-log10(allDiff$padj)) + 1
yMax <- 10
plot(-log10(allDiff$padj), allDiff$log2FoldChange, xlab="-log10(padj)",ylab="log2FoldChange",main="Volcano", xlim=c(0,xMax),ylim=c(-yMax,yMax),yaxs="i",pch=20, cex=0.4)
diffSub <- allDiff[(allDiff$padj < padj & allDiff$log2FoldChange > foldChange),]
points(-log10(diffSub$padj),diffSub$log2FoldChange,pch=20,col="red",cex=0.4)
diffSub <- allDiff[(allDiff$padj < padj & allDiff$log2FoldChange < (-foldChange)),]
points(-log10(diffSub$padj),diffSub$log2FoldChange,pch=20,col="green",cex=0.4)
abline(h=0,lty=2,lwd=3)
dev.off()

#venn取交集差异基因
setwd("D:/TCGA_ProstateCancer/Venn")
#getwd()

file1="edgeR_diffmRNA"
file2="DESeq_diffmRNA"

list1 <- list()
library(VennDiagram)
rt1 <- read.table(file1,header=F)
list1[["edgeR"]] <- as.vector(rt1[,1])
rt2 <- read.table(file2,header=F)
list1[["DESeq"]] <- as.vector(rt2[,1])

sect1 <- intersect(list1[[1]],list1[[2]])
write.table(file="intersect.xls",sect1,sep="\t",quote=F,col.names=F,row.names=F)
stopCluster(cl)
发表在 TCGA | 6条评论

TCGA转录组数据下载–(TCGA官网GDC Tool的命令行方式下载)

首先打开TCGA的 Launch Data Portal,并点击 Data Transfer Tool enter image description here

然后下载 GDC Data Transfer Tool Client enter image description here

进入命令行终端,下载之前在网页下载自己所需的Manifest文件,然后输入如下命令(此处以windows为例):

gdc-client.exe -h
gdc-client.exe download -h
gdc-client.exe download -m gdc_manifest_20180709_091154.txt 
#若下载中断,再次执行程序可以扫描出已下载的文件,当发现中断的文件,会删除中断的文件重新下载,然后继续下载未下载的文件
#gdc-client.exe download -m gdc_manifest_20180801_083429.txt -d methylayion_rawdata --log-file methylayion_rawdata/download.log
#如果有token,可以按照如下操作下载controlled data
gdc-client.exe download -n 4 -m gdc_manifest_20191125_093941.txt -t gdc-user-token_xxx.txt -d atac_bam --log-file download.log
Centos 7 beta
#nohup ./gdc-client download -m gdc_manifest_20180801_083429.txt -d methylayion_rawdata --log-file methylayion_rawdata/download.log > /dev/null 2>&1 &
nohup gdc-client download -n 2 -m gdc_manifest_20190324_172540.txt -d methy_rawdata/ --log-file methy_rawdata/download.log > /dev/null 2>&1 &

enter image description here enter image description here

便可下载所需文件

发表在 TCGA | 留下评论