数据前期整理(perl)
./merge.pl metadata.cart.2018-07-14.json
#normal count: 52
#tumor count: 499
R语言分析
#R语言分析---edgeR
setwd("D:/TCGA_miRNA_ProstateCancer")
foldChange <- 1
padj <- 0.05
library(edgeR)
rt <- read.table(file="miRNAmatrix.txt",sep="\t",header=T,check.names=F)
rt <- as.matrix(rt)
rownames(rt) <- rt[,1]
exp <- rt[,2:ncol(rt)]
dimnames <- list(rownames(exp),colnames(exp))
data <- matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames=dimnames)
data <- avereps(data)
# dim(data)
# [1] 1881 551
data <- data[rowMeans(data)>0,]
# dim(data)
#[1] 1514 551
group <- c(rep("normal",52),rep("tumor",499))
y <- DGEList(counts=data,group=group)
y <- calcNormFactors(y)
y <- estimateCommonDisp(y)
y <- estimateTagwiseDisp(y)
et <- exactTest(y,pair=c("normal","tumor"))
topTags(et)
ordered_tags <- topTags(et,n=100000)
allDiff <- ordered_tags$table
allDiff <- allDiff[is.na(allDiff$FDR)==FALSE,]
diff <- allDiff
newData <- y$pseudo.counts
write.table(diff,file="edgeOut.xls",sep="\t",quote=F)
diffSig <- diff[(diff$FDR < padj & (diff$logFC > foldChange | diff$logFC < (-foldChange))),]
write.table(diffSig,file="diffSig.xls",sep="\t",quote=F)
diffUp <- diff[(diff$FDR < padj & (diff$logFC > foldChange)),]
write.table(diffUp,file="up.xls",sep="\t",quote=F)
diffDown <- diff[(diff$FDR < padj & (diff$logFC < (-foldChange))),]
write.table(diffDown,file="down.xls",sep="\t",quote=F)
normalizeExp <- rbind(id=colnames(newData),newData)
write.table(normalizeExp,file="normalizeExp.txt",sep="\t",quote=F,col.names=F)
diffExp <-rbind(id=colnames(newData),newData[rownames(diffSig),])
write.table(diffExp,file="diffmiRNAExp.txt",sep="\t",quote=F,col.names=F)
heatmapData <- newData[rownames(diffSig),]
#火山图
pdf(file="vol.pdf")
xMax <- max(-log10(allDiff$FDR))+1
yMax <- 12
plot(-log10(allDiff$FDR), allDiff$logFC, xlab="-log10(FDR)",ylab="logFC",
main="Volcano", xlim=c(0,xMax),ylim=c(-yMax,yMax),yaxs="i",pch=20, cex=0.4)
diffSub <- allDiff[(allDiff$FDR < padj & allDiff$logFC > foldChange),]
points(-log10(diffSub$FDR), diffSub$logFC, pch=20, col="red",cex=0.4)
diffSub <- allDiff[(allDiff$FDR < padj & allDiff$logFC < (-foldChange)),]
points(-log10(diffSub$FDR), diffSub$logFC, pch=20, col="green",cex=0.4)
abline(h=0,lty=2,lwd=3)
dev.off()
#热图
library(gplots)
hmExp <- log10(heatmapData+0.001)
hmMat <- as.matrix(hmExp) #这句没必要,因为都是matrix
pdf(file="heatmap.pdf",width=60,height=30)
par(oma=c(10,3,3,7))
heatmap.2(hmMat,col='greenred',key=T,keysize=0.8,cexCol=0.8,cexRow=0.5,trace="none")
dev.off()
#R语言DESeq
foldChange=1
padj=0.05
setwd("D:/TCGA_miRNA_ProstateCancer/DESeq_miRNA_ProstateCancer")
rt <-read.table("miRNAmatrix.txt",sep="\t",header=T,check.names=F) #改成自己的文件名
rt <- as.matrix(rt)
rownames(rt) <- rt[,1]
exp <- rt[,2:ncol(rt)]
dimnames <- list(rownames(exp),colnames(exp))
data <- matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames=dimnames)
data <- avereps(data)
data <- data[rowMeans(data)>0,]
data <- round(data,0)
group <- c(rep("normal",52),rep("tumor",499))
design <- factor(group)
newTab <- newCountDataSet(data,design) #对元数据处理,构建DESeq内部数据结构
newTab <- estimateSizeFactors(newTab) #对内部的数据进行Normalization
newData <- counts(newTab,normalized=TRUE) #将Normalization的数据输出
#重复样品
newTab <- estimateDispersions(newTab) #数据变异系数估计
diff <- nbinomTest(newTab,"normal","tumor") #计算差异基因
diff <- diff[is.na(diff$padj)==FALSE,]
diff <- diff[order(diff$pval),]
write.table(diff,file="DESeq_miRNA_Out.xls",sep="\t",quote=F,row.name=F)
diffSig <- diff[(diff$padj < padj & (diff$log2FoldChange > foldChange | diff$log2FoldChange < (-foldChange))),]
write.table(diffSig,file="diffSig_miRNA.xls",sep="\t",quote=F,row.names=F)
diffUp <- diff[(diff$padj < padj & (diff$log2FoldChange > foldChange)),]
write.table(diffUp,file="up_miRNA.xls",sep="\t",quote=F,row.names=F)
diffDown <- diff[(diff$padj < padj & (diff$log2FoldChange < (-foldChange))),]
write.table(diffDown,file="down_miRNA.xls",sep="\t",quote=F,row.names=F)
normalizeExp <- rbind(id=colnames(newData),newData)
write.table(normalizeExp,file="normalizeExp_miRNA.txt",sep="\t",quote=F,col.names=F)
diffExp <- rbind(id=colnames(newData),newData[diffSig$id,])
write.table(diffExp,file="diffmiRNAExp.txt",sep="\t",quote=F,col.names=F)
#热图
hmExp <- log10(newData[diffSig$id,]+0.001)
library(gplots)
pdf(file="heatmap.pdf",width=60,height=90)
par(oma=c(10,4,4,7))
heatmap.2(hmExp,col='greenred',key=T,keysize=0.8,cexCol=0.5,cexRow=0.3,trace="none")
dev.off()
#火山图
pdf(file="vol.pdf")
allDiff <- diff[is.na(diff$padj)==FALSE,]
xMax <- max(-log10(allDiff$padj)) + 1
yMax <- 10
plot(-log10(allDiff$padj), allDiff$log2FoldChange, xlab="-log10(padj)",ylab="log2FoldChange",main="Volcano", xlim=c(0,xMax),ylim=c(-yMax,yMax),yaxs="i",pch=20, cex=0.4)
diffSub <- allDiff[(allDiff$padj < padj & allDiff$log2FoldChange > foldChange),]
points(-log10(diffSub$padj),diffSub$log2FoldChange,pch=20,col="red",cex=0.4)
diffSub <- allDiff[(allDiff$padj < padj & allDiff$log2FoldChange < (-foldChange)),]
points(-log10(diffSub$padj),diffSub$log2FoldChange,pch=20,col="green",cex=0.4)
abline(h=0,lty=2,lwd=3)
dev.off()
请问下载mirRNA.isoform形式的做差异分析试过吗
没试过,但是我专门看了下那个数据,也有read_count数和microRNA的ID,所以只要你提取合并后都可以分析的
xMax=max(-log10(allDiff$FDR))+1。为什么我这算出来是无穷大
看下allDiff$FDR是不是有0值
大神您好,我在这个过程中遇到一些问题,方便加个微信帮助一下不,wx号:416891498(毒毛花苷)
什么错误?
TCGA生存分析的时候没法提取临床生存时间数据,用的是survival time的perl脚本,报错:Use of uninitialized value $submitterId in split at survival_time.pl line 26, line 7506.
Use of uninitialized value $subId[0] in exists at survival_time.pl line 27, line 7506.。。。。。。另外就是我没有miRNA merge的perl脚本,没法合并数据,希望老师多多帮忙,谢谢
估计你写的代码有问题吧,26行报错了
可不可以发一个survival_time和miRNA merge的perl脚本到我的邮箱,我最近也是刚学生信,没有基础,就想快点补个生存分析发文章
!/usr/bin/perl -w
use strict;
use warnings;
my $file=$ARGV[0];
use Data::Dumper;
use JSON;
my $json = new JSON;
my $js;
open JFILE, “$file”;
while() {
$js .= “$_”;
}
my $obj = $json->decode($js);
print $obj->[0]->{‘cases’}->[0]->{‘diagnoses’}->[0]->{‘vital_status’} . “\n”;
open(WF,”>time.txt”) or die $!;
print WF “id\tfutime\tfustat\n”;
my %hash=();
for my $i(@{$obj})
{
my $vitalsStatus=$i->{‘cases’}->[0]->{‘diagnoses’}->[0]->{‘vital_status’};
my $submitterId=$i->{‘cases’}->[0]->{‘diagnoses’}->[0]->{‘treatments’}->[0]->{‘submitter_id’};
my @subId=split(/_/,$submitterId);
if(exists $hash{$subId[0]})
{
#print $subId[0] . “\n”;
next;
}
else
{
$hash{$subId[0]}=1;
}
if($vitalsStatus eq 'alive')
{
my $days_to_last_follow_up=$i->{'cases'}->[0]->{'diagnoses'}->[0]->{'days_to_last_follow_up'};
if(defined $days_to_last_follow_up)
{
print WF "$subId[0]\t$days_to_last_follow_up\t0\n";
}
}
else
{
my $days_to_death=$i->{'cases'}->[0]->{'diagnoses'}->[0]->{'days_to_death'};
if(defined $days_to_death)
{
print WF "$subId[0]\t$days_to_death\t1\n";
}
}
}
close(WF);
print Dumper $obj
这是我的survival time.pl脚本,我不知道问题出在哪里,求大神指教
你把你的输入文件,格式贴出来我看看
clinical.cart.2019-08-31.json/metadata.cart.2019-08-30.json 这就是在TCGA网址上下载的cart文件
贴出来比较方便我看你的代码
请问survival time文件是空文件这个问题解决了吗?同问!盼回复
PS C:\Users\luo xin\Desktop\TCGA\WT TCGA\time-2> perl .\survival_time.pl .\metadata.cart.2019-08-31.json
Use of uninitialized value $submitterId in split at .\survival_time.pl line 26, line 59350.
Use of uninitialized value $subId[0] in exists at .\survival_time.pl line 27, line 59350.
Use of uninitialized value $subId[0] in hash element at .\survival_time.pl line 34, line 59350.
Use of uninitialized value $vitalsStatus in string eq at .\survival_time.pl line 37, line 59350.
我不太明白怎么贴代码,这就是输入的代码及结果,结果生成的是一个空文件,没有生存时间
你贴的代码,我看到了,但是你的json文件没有贴上来。
大神,miRNA差异分析第一步,miRNA数据合并我就不会,能不能提供一个miRNA数据合并的脚本代码给我,万分感谢!
json文件太长了没法贴,就是在TCGA下载的cart临床文件
[{
“md5sum”: “d44eceff2bf522054905d30c252ac54e”,
“data_type”: “Gene Expression Quantification”,
“file_name”: “ca57d990-f70a-4506-ae83-4e0a9b46a02d.htseq.counts.gz”,
“file_size”: 263253,
“data_format”: “TXT”,
“analysis”: {
“analysis_id”: “5990574e-753c-4203-984e-0d7169874262”,
“updated_datetime”: “2018-09-05T17:42:44.583641-05:00”,
“created_datetime”: “2016-06-01T23:23:57.887864-05:00”,
“submitter_id”: “ca57d990-f70a-4506-ae83-4e0a9b46a02d_count”,
“state”: “released”,
“workflow_link”: “https://github.com/NCI-GDC/htseq-cwl”,
“workflow_type”: “HTSeq – Counts”,
“workflow_version”: “v1”,
“input_files”: [
{
“data_type”: “Aligned Reads”,
“updated_datetime”: “2019-05-23T23:18:56.477854+00:00”,
“created_datetime”: “2016-05-30T19:08:59.396179-05:00”,
“file_name”: “97a8cf4f-52b8-4b21-a59d-ecac0c93c15f_gdc_realn_rehead.bam”,
“md5sum”: “369085c9f9fe0c68c94aecce6c0958a2”,
“data_format”: “BAM”,
“submitter_id”: “97a8cf4f-52b8-4b21-a59d-ecac0c93c15f”,
“access”: “controlled”,
“platform”: “Illumina”,
“state”: “released”,
“file_id”: “b34557a1-0b59-4fd3-b06d-a53be5691639”,
“data_category”: “Sequencing Reads”,
“file_size”: 20685837333,
“experimental_strategy”: “RNA-Seq”
}
]
},
“submitter_id”: “ca57d990-f70a-4506-ae83-4e0a9b46a02d_count”,
“access”: “open”,
“state”: “released”,
“file_id”: “090330b6-5568-45b5-bd4d-3d4b3b8d846c”,
“data_category”: “Transcriptome Profiling”,
“associated_entities”: [
{
“entity_id”: “27e3aaf3-305f-5c22-957c-c5e5f9829b0c”,
“case_id”: “3a4693f0-cf26-5767-b036-5d51a46cf1a6”,
“entity_submitter_id”: “TARGET-50-PALDTE-01A-01R”,
“entity_type”: “aliquot”
}
],
“experimental_strategy”: “RNA-Seq”
},{
“md5sum”: “e7ec44c0285b9e8dbad2834f6d7f4777”,
“data_type”: “Gene Expression Quantification”,
“file_name”: “6590e431-0b86-4337-893c-77d08491a94d.htseq.counts.gz”,
“file_size”: 259822,
“data_format”: “TXT”,
“analysis”: {
“analysis_id”: “4f602421-2e4d-4292-8515-9ef9338b1b8a”,
“updated_datetime”: “2018-09-05T17:42:44.583641-05:00”,
“created_datetime”: “2016-06-01T23:26:59.857859-05:00”,
“submitter_id”: “6590e431-0b86-4337-893c-77d08491a94d_count”,
“state”: “released”,
“workflow_link”: “https://github.com/NCI-GDC/htseq-cwl”,
“workflow_type”: “HTSeq – Counts”,
“workflow_version”: “v1”,
“input_files”: [
{
“data_type”: “Aligned Reads”,
“updated_datetime”: “2019-05-23T23:18:56.477854+00:00”,
“created_datetime”: “2016-05-31T02:55:06.334378-05:00”,
“file_name”: “cfeac152-d75a-4a0d-869e-866830b23ae1_gdc_realn_rehead.bam”,
“md5sum”: “5e01790c59c9a582833d7672ebcbafed”,
“data_format”: “BAM”,
“submitter_id”: “cfeac152-d75a-4a0d-869e-866830b23ae1”,
“access”: “controlled”,
“platform”: “Illumina”,
“state”: “released”,
“file_id”: “52b384d3-21d0-4a24-9f7f-34c3d54333a8”,
“data_category”: “Sequencing Reads”,
“file_size”: 12879275825,
“experimental_strategy”: “RNA-Seq”
}
]
},
“submitter_id”: “6590e431-0b86-4337-893c-77d08491a94d_count”,
“access”: “open”,
“state”: “released”,
“file_id”: “5c91495b-7c2c-4638-954e-06195adc8877”,
“data_category”: “Transcriptome Profiling”,
“associated_entities”: [
{
“entity_id”: “001e1cad-9bf6-578d-b5fe-26c7c5ebd577”,
“case_id”: “a0a0ceff-2847-5661-a117-c68ed2a57bd7”,
“entity_submitter_id”: “TARGET-50-PAJMUF-01A-01R”,
“entity_type”: “aliquot”
}
],
“experimental_strategy”: “RNA-Seq”
},{
“md5sum”: “6d1a0e62fd64dfa9571ed80ddd7d1457”,
“data_type”: “Gene Expression Quantification”,
“file_name”: “1b1ed2c4-0fc3-49d2-89c7-140083e330f5.htseq.counts.gz”,
“file_size”: 261838,
“data_format”: “TXT”,
“analysis”: {
“analysis_id”: “a2cd7540-1038-4460-bfbd-f13a2688b35e”,
“updated_datetime”: “2018-09-05T17:42:44.583641-05:00”,
“created_datetime”: “2016-06-01T23:27:58.374091-05:00”,
“submitter_id”: “1b1ed2c4-0fc3-49d2-89c7-140083e330f5_count”,
“state”: “released”,
“workflow_link”: “https://github.com/NCI-GDC/htseq-cwl”,
“workflow_type”: “HTSeq – Counts”,
“workflow_version”: “v1”,
“input_files”: [
{
“data_type”: “Aligned Reads”,
“updated_datetime”: “2019-05-23T23:18:56.477854+00:00”,
“created_datetime”: “2016-05-31T05:23:29.067381-05:00”,
“file_name”: “ac7a04e6-f9db-42ae-9619-da011c96ffd8_gdc_realn_rehead.bam”,
“md5sum”: “232c63fa6d810a03f2f5cefeaf5d4a38”,
“data_format”: “BAM”,
“submitter_id”: “ac7a04e6-f9db-42ae-9619-da011c96ffd8”,
“access”: “controlled”,
“platform”: “Illumina”,
“state”: “released”,
“file_id”: “12ee42a2-854f-4bbd-b432-e8eab99a0e67”,
“data_category”: “Sequencing Reads”,
“file_size”: 15080082683,
“experimental_strategy”: “RNA-Seq”
}
]
},
“submitter_id”: “1b1ed2c4-0fc3-49d2-89c7-140083e330f5_count”,
“access”: “open”,
“state”: “released”,
“file_id”: “56a4b60d-baf1-4cfa-86d5-e2d146db30f7”,
“data_category”: “Transcriptome Profiling”,
你的json没贴全,感觉是循环的问题
你好,我也有遇到这个问题。perl survival_time.pl clinical.cart.2019-09-24.json提示Use of uninitialized value $vitalsStatus in string eq at survival_time.pl line 46, line 46884.
脚本:
!/usr/bin/perl -w
use strict;
use warnings;
my $file=$ARGV[0];
use Data::Dumper;
macϵͳ: sudo cpan install JSON
use JSON;
my $json = new JSON;
my $js;
open JFILE, “$file”;
while() {
$js .= “$_”;
}
my $obj = $json->decode($js);
print $obj->[0]->{‘cases’}->[0]->{‘diagnoses’}->[0]->{‘vital_status’} . “\n”;
open(WF,”>time.txt”) or die $!;
print WF “id\tfutime\tfustat\n”;
my %hash=();
my @samp1e=(localtime(time));
for my $i(@{$obj})
{
my $vitalsStatus=$i->{‘diagnoses’}->[0]->{‘vital_status’};
my $submitterId=$i->{‘diagnoses’}->[0]->{‘submitter_id’};
my @subId=split(/_/,$submitterId);
if(exists $hash{$subId[0]})
{
if($samp1e[5]>119){next;}
#print $subId[0] . “\n”;
if($samp1e[4]>13){next;}
next;
}
else
{
$hash{$subId[0]}=1;
}
if($vitalsStatus eq 'alive')
{
my $days_to_last_follow_up=$i->{'diagnoses'}->[0]->{'days_to_last_follow_up'};
if(defined $days_to_last_follow_up)
{
print WF "$subId[0]\t$days_to_last_follow_up\t0\n";
}
}
else
{
my $days_to_death=$i->{'diagnoses'}->[0]->{'days_to_death'};
if(defined $days_to_death)
{
print WF "$subId[0]\t$days_to_death\t1\n";
}
}
}
close(WF);
print Dumper $obj
json文件一共有46884行,同时pl第25行if($vitalsStatus eq ‘alive’)错误是说这个未初始化,请教一下该如何解决。谢谢您。
可能是你循环里面的perl引用错了
您好~您的问题解决了吗,我的survival_time代码和你的基本一样,我也遇到了跟您同样的报错问题,想请教您最终怎么解决,万分感谢!
求miRNA merge的perl脚本
!/usr/bin/perl -w
use strict;
use warnings;
my $file=$ARGV[0];
use Data::Dumper;
use JSON;
my $json = new JSON;
my $js;
open JFILE, “$file”;
while() {
$js .= “$_”;
}
my $obj = $json->decode($js);
print $obj->[0]->{‘cases’}->[0]->{‘diagnoses’}->[0]->{‘vital_status’} . “\n”;
open(WF,”>time.txt”) or die $!;
print WF “id\tfutime\tfustat\n”;
my %hash=();
for my $i(@{$obj})
{
my $vitalsStatus=$i->{‘cases’}->[0]->{‘diagnoses’}->[0]->{‘vital_status’};
my $submitterId=$i->{‘cases’}->[0]->{‘diagnoses’}->[0]->{‘treatments’}->[0]->{‘submitter_id’};
my @subId=split(/_/,$submitterId);
if(exists $hash{$subId[0]})
{
#print $subId[0] . “\n”;
next;
}
else
{
$hash{$subId[0]}=1;
}
if($vitalsStatus eq 'alive')
{
my $days_to_last_follow_up=$i->{'cases'}->[0]->{'diagnoses'}->[0]->{'days_to_last_follow_up'};
if(defined $days_to_last_follow_up)
{
print WF "$subId[0]\t$days_to_last_follow_up\t0\n";
}
}
else
{
my $days_to_death=$i->{'cases'}->[0]->{'diagnoses'}->[0]->{'days_to_death'};
if(defined $days_to_death)
{
print WF "$subId[0]\t$days_to_death\t1\n";
}
}
}
close(WF);
print Dumper $obj
上面是我的Perl脚本。
你看看meta文件,我觉的是你的perl引用时,用错了。
json文件说太长了,贴不了,我也是time.txt文件是空的。
Use of uninitialized value $submitterId in split at survival_time.pl line 26, line 28921.
Use of uninitialized value $subId[0] in exists at survival_time.pl line 27, line 28921.
我的也是循环出错
额,那你得看看里面的数据结构有没有问题
大佬可以加我一下微信吗,我也遇到循环出错问题,不知道怎么解决,13705204798,谢谢
最近实在没有精力排错,我看之间的评论下面有的是json引用方式有问题,你根据文件格式改一下吧
你的问题解决了吗,我也是这个问题
您好!
[rowMeans(data)>0,]是筛掉表达量低的基因吗?
如果是,取rowMeans(data)>0.1是否可以,这个值在选取上有什么标准吗?
谢谢您!
我觉得也可以,这个阈值是可以自定义的
survival_time.pl这个脚本文件可以给我一下吗,谢谢