TCGA-miRNA差异表达分析

数据前期整理(perl)

./merge.pl metadata.cart.2018-07-14.json
#normal count: 52
#tumor count: 499

R语言分析

#R语言分析---edgeR
setwd("D:/TCGA_miRNA_ProstateCancer")
foldChange <- 1
padj <- 0.05

library(edgeR)
rt <- read.table(file="miRNAmatrix.txt",sep="\t",header=T,check.names=F)
rt <- as.matrix(rt)
rownames(rt) <- rt[,1]
exp <- rt[,2:ncol(rt)]
dimnames <- list(rownames(exp),colnames(exp))
data <- matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames=dimnames)
data <- avereps(data)
# dim(data)
# [1] 1881  551
data <- data[rowMeans(data)>0,]
# dim(data)
#[1] 1514  551
group <- c(rep("normal",52),rep("tumor",499))
y <- DGEList(counts=data,group=group)
y <- calcNormFactors(y)
y <- estimateCommonDisp(y)
y <- estimateTagwiseDisp(y)
et <- exactTest(y,pair=c("normal","tumor"))
topTags(et)
ordered_tags <- topTags(et,n=100000)

allDiff <- ordered_tags$table
allDiff <- allDiff[is.na(allDiff$FDR)==FALSE,]
diff <- allDiff
newData <- y$pseudo.counts

write.table(diff,file="edgeOut.xls",sep="\t",quote=F)
diffSig <- diff[(diff$FDR < padj & (diff$logFC > foldChange | diff$logFC < (-foldChange))),]
write.table(diffSig,file="diffSig.xls",sep="\t",quote=F)
diffUp <- diff[(diff$FDR < padj & (diff$logFC > foldChange)),]
write.table(diffUp,file="up.xls",sep="\t",quote=F)
diffDown <- diff[(diff$FDR < padj & (diff$logFC < (-foldChange))),]
write.table(diffDown,file="down.xls",sep="\t",quote=F)

normalizeExp <- rbind(id=colnames(newData),newData)
write.table(normalizeExp,file="normalizeExp.txt",sep="\t",quote=F,col.names=F)
diffExp <-rbind(id=colnames(newData),newData[rownames(diffSig),])
write.table(diffExp,file="diffmiRNAExp.txt",sep="\t",quote=F,col.names=F)

heatmapData <- newData[rownames(diffSig),]
#火山图
pdf(file="vol.pdf")
xMax <- max(-log10(allDiff$FDR))+1
yMax <- 12
plot(-log10(allDiff$FDR), allDiff$logFC, xlab="-log10(FDR)",ylab="logFC",
     main="Volcano", xlim=c(0,xMax),ylim=c(-yMax,yMax),yaxs="i",pch=20, cex=0.4)
diffSub <- allDiff[(allDiff$FDR < padj & allDiff$logFC > foldChange),]
points(-log10(diffSub$FDR), diffSub$logFC, pch=20, col="red",cex=0.4)
diffSub <- allDiff[(allDiff$FDR < padj & allDiff$logFC < (-foldChange)),]
points(-log10(diffSub$FDR), diffSub$logFC, pch=20, col="green",cex=0.4)
abline(h=0,lty=2,lwd=3)
dev.off()

#热图
library(gplots)
hmExp <- log10(heatmapData+0.001)
hmMat <- as.matrix(hmExp) #这句没必要,因为都是matrix
pdf(file="heatmap.pdf",width=60,height=30)
par(oma=c(10,3,3,7))
heatmap.2(hmMat,col='greenred',key=T,keysize=0.8,cexCol=0.8,cexRow=0.5,trace="none")
dev.off()

#R语言DESeq
foldChange=1
padj=0.05

setwd("D:/TCGA_miRNA_ProstateCancer/DESeq_miRNA_ProstateCancer")
rt <-read.table("miRNAmatrix.txt",sep="\t",header=T,check.names=F)  #改成自己的文件名
rt <- as.matrix(rt)
rownames(rt) <- rt[,1]
exp <- rt[,2:ncol(rt)]
dimnames <- list(rownames(exp),colnames(exp))
data <- matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames=dimnames)
data <- avereps(data)
data <- data[rowMeans(data)>0,] 
data <- round(data,0)


group <- c(rep("normal",52),rep("tumor",499))
design <- factor(group)
newTab <- newCountDataSet(data,design) #对元数据处理,构建DESeq内部数据结构
newTab <- estimateSizeFactors(newTab) #对内部的数据进行Normalization
newData <- counts(newTab,normalized=TRUE) #将Normalization的数据输出

#重复样品
newTab <- estimateDispersions(newTab) #数据变异系数估计
diff <- nbinomTest(newTab,"normal","tumor") #计算差异基因
diff <- diff[is.na(diff$padj)==FALSE,]
diff <- diff[order(diff$pval),]
write.table(diff,file="DESeq_miRNA_Out.xls",sep="\t",quote=F,row.name=F)
diffSig <- diff[(diff$padj < padj & (diff$log2FoldChange > foldChange | diff$log2FoldChange < (-foldChange))),]
write.table(diffSig,file="diffSig_miRNA.xls",sep="\t",quote=F,row.names=F)
diffUp <- diff[(diff$padj < padj & (diff$log2FoldChange > foldChange)),]
write.table(diffUp,file="up_miRNA.xls",sep="\t",quote=F,row.names=F)
diffDown <- diff[(diff$padj < padj & (diff$log2FoldChange < (-foldChange))),]
write.table(diffDown,file="down_miRNA.xls",sep="\t",quote=F,row.names=F)

normalizeExp <- rbind(id=colnames(newData),newData)
write.table(normalizeExp,file="normalizeExp_miRNA.txt",sep="\t",quote=F,col.names=F)
diffExp <- rbind(id=colnames(newData),newData[diffSig$id,])
write.table(diffExp,file="diffmiRNAExp.txt",sep="\t",quote=F,col.names=F)

#热图
hmExp <- log10(newData[diffSig$id,]+0.001)
library(gplots)
pdf(file="heatmap.pdf",width=60,height=90)
par(oma=c(10,4,4,7))
heatmap.2(hmExp,col='greenred',key=T,keysize=0.8,cexCol=0.5,cexRow=0.3,trace="none")
dev.off()

#火山图
pdf(file="vol.pdf")
allDiff <- diff[is.na(diff$padj)==FALSE,]
xMax <- max(-log10(allDiff$padj)) + 1
yMax <- 10
plot(-log10(allDiff$padj), allDiff$log2FoldChange, xlab="-log10(padj)",ylab="log2FoldChange",main="Volcano", xlim=c(0,xMax),ylim=c(-yMax,yMax),yaxs="i",pch=20, cex=0.4)
diffSub <- allDiff[(allDiff$padj < padj & allDiff$log2FoldChange > foldChange),]
points(-log10(diffSub$padj),diffSub$log2FoldChange,pch=20,col="red",cex=0.4)
diffSub <- allDiff[(allDiff$padj < padj & allDiff$log2FoldChange < (-foldChange)),]
points(-log10(diffSub$padj),diffSub$log2FoldChange,pch=20,col="green",cex=0.4)
abline(h=0,lty=2,lwd=3)
dev.off()
此条目发表在TCGA分类目录。将固定链接加入收藏夹。

TCGA-miRNA差异表达分析》有34条回应

  1. 刘生说:

    请问下载mirRNA.isoform形式的做差异分析试过吗

  2. JM说:

    xMax=max(-log10(allDiff$FDR))+1。为什么我这算出来是无穷大

  3. 罗鑫说:

    大神您好,我在这个过程中遇到一些问题,方便加个微信帮助一下不,wx号:416891498(毒毛花苷)

      • 罗鑫说:

        TCGA生存分析的时候没法提取临床生存时间数据,用的是survival time的perl脚本,报错:Use of uninitialized value $submitterId in split at survival_time.pl line 26, line 7506.
        Use of uninitialized value $subId[0] in exists at survival_time.pl line 27, line 7506.。。。。。。另外就是我没有miRNA merge的perl脚本,没法合并数据,希望老师多多帮忙,谢谢

        • daizao说:

          估计你写的代码有问题吧,26行报错了

          • 罗鑫说:

            可不可以发一个survival_time和miRNA merge的perl脚本到我的邮箱,我最近也是刚学生信,没有基础,就想快点补个生存分析发文章

          • 罗鑫说:

            !/usr/bin/perl -w

            use strict;
            use warnings;

            my $file=$ARGV[0];

            use Data::Dumper;

            use JSON;
            my $json = new JSON;
            my $js;

            open JFILE, “$file”;
            while() {
            $js .= “$_”;
            }
            my $obj = $json->decode($js);

            print $obj->[0]->{‘cases’}->[0]->{‘diagnoses’}->[0]->{‘vital_status’} . “\n”;

            open(WF,”>time.txt”) or die $!;
            print WF “id\tfutime\tfustat\n”;
            my %hash=();
            for my $i(@{$obj})
            {
            my $vitalsStatus=$i->{‘cases’}->[0]->{‘diagnoses’}->[0]->{‘vital_status’};
            my $submitterId=$i->{‘cases’}->[0]->{‘diagnoses’}->[0]->{‘treatments’}->[0]->{‘submitter_id’};
            my @subId=split(/_/,$submitterId);
            if(exists $hash{$subId[0]})
            {
            #print $subId[0] . “\n”;
            next;
            }
            else
            {
            $hash{$subId[0]}=1;
            }

            if($vitalsStatus eq 'alive')
            {
            my $days_to_last_follow_up=$i->{'cases'}->[0]->{'diagnoses'}->[0]->{'days_to_last_follow_up'};
            if(defined $days_to_last_follow_up)
            {
            print WF "$subId[0]\t$days_to_last_follow_up\t0\n";
            }
            }
            else
            {
            my $days_to_death=$i->{'cases'}->[0]->{'diagnoses'}->[0]->{'days_to_death'};
            if(defined $days_to_death)
            {
            print WF "$subId[0]\t$days_to_death\t1\n";
            }
            }

            }
            close(WF);

            print Dumper $obj

            这是我的survival time.pl脚本,我不知道问题出在哪里,求大神指教

          • daizao说:

            你把你的输入文件,格式贴出来我看看

          • 罗鑫说:

            clinical.cart.2019-08-31.json/metadata.cart.2019-08-30.json 这就是在TCGA网址上下载的cart文件

          • daizao说:

            贴出来比较方便我看你的代码

        • bundy说:

          请问survival time文件是空文件这个问题解决了吗?同问!盼回复

  4. 罗鑫说:

    PS C:\Users\luo xin\Desktop\TCGA\WT TCGA\time-2> perl .\survival_time.pl .\metadata.cart.2019-08-31.json
    Use of uninitialized value $submitterId in split at .\survival_time.pl line 26, line 59350.
    Use of uninitialized value $subId[0] in exists at .\survival_time.pl line 27, line 59350.
    Use of uninitialized value $subId[0] in hash element at .\survival_time.pl line 34, line 59350.
    Use of uninitialized value $vitalsStatus in string eq at .\survival_time.pl line 37, line 59350.
    我不太明白怎么贴代码,这就是输入的代码及结果,结果生成的是一个空文件,没有生存时间

  5. 罗鑫说:

    json文件太长了没法贴,就是在TCGA下载的cart临床文件
    [{
    “md5sum”: “d44eceff2bf522054905d30c252ac54e”,
    “data_type”: “Gene Expression Quantification”,
    “file_name”: “ca57d990-f70a-4506-ae83-4e0a9b46a02d.htseq.counts.gz”,
    “file_size”: 263253,
    “data_format”: “TXT”,
    “analysis”: {
    “analysis_id”: “5990574e-753c-4203-984e-0d7169874262”,
    “updated_datetime”: “2018-09-05T17:42:44.583641-05:00”,
    “created_datetime”: “2016-06-01T23:23:57.887864-05:00”,
    “submitter_id”: “ca57d990-f70a-4506-ae83-4e0a9b46a02d_count”,
    “state”: “released”,
    “workflow_link”: “https://github.com/NCI-GDC/htseq-cwl”,
    “workflow_type”: “HTSeq – Counts”,
    “workflow_version”: “v1”,
    “input_files”: [
    {
    “data_type”: “Aligned Reads”,
    “updated_datetime”: “2019-05-23T23:18:56.477854+00:00”,
    “created_datetime”: “2016-05-30T19:08:59.396179-05:00”,
    “file_name”: “97a8cf4f-52b8-4b21-a59d-ecac0c93c15f_gdc_realn_rehead.bam”,
    “md5sum”: “369085c9f9fe0c68c94aecce6c0958a2”,
    “data_format”: “BAM”,
    “submitter_id”: “97a8cf4f-52b8-4b21-a59d-ecac0c93c15f”,
    “access”: “controlled”,
    “platform”: “Illumina”,
    “state”: “released”,
    “file_id”: “b34557a1-0b59-4fd3-b06d-a53be5691639”,
    “data_category”: “Sequencing Reads”,
    “file_size”: 20685837333,
    “experimental_strategy”: “RNA-Seq”
    }
    ]
    },
    “submitter_id”: “ca57d990-f70a-4506-ae83-4e0a9b46a02d_count”,
    “access”: “open”,
    “state”: “released”,
    “file_id”: “090330b6-5568-45b5-bd4d-3d4b3b8d846c”,
    “data_category”: “Transcriptome Profiling”,
    “associated_entities”: [
    {
    “entity_id”: “27e3aaf3-305f-5c22-957c-c5e5f9829b0c”,
    “case_id”: “3a4693f0-cf26-5767-b036-5d51a46cf1a6”,
    “entity_submitter_id”: “TARGET-50-PALDTE-01A-01R”,
    “entity_type”: “aliquot”
    }
    ],
    “experimental_strategy”: “RNA-Seq”
    },{
    “md5sum”: “e7ec44c0285b9e8dbad2834f6d7f4777”,
    “data_type”: “Gene Expression Quantification”,
    “file_name”: “6590e431-0b86-4337-893c-77d08491a94d.htseq.counts.gz”,
    “file_size”: 259822,
    “data_format”: “TXT”,
    “analysis”: {
    “analysis_id”: “4f602421-2e4d-4292-8515-9ef9338b1b8a”,
    “updated_datetime”: “2018-09-05T17:42:44.583641-05:00”,
    “created_datetime”: “2016-06-01T23:26:59.857859-05:00”,
    “submitter_id”: “6590e431-0b86-4337-893c-77d08491a94d_count”,
    “state”: “released”,
    “workflow_link”: “https://github.com/NCI-GDC/htseq-cwl”,
    “workflow_type”: “HTSeq – Counts”,
    “workflow_version”: “v1”,
    “input_files”: [
    {
    “data_type”: “Aligned Reads”,
    “updated_datetime”: “2019-05-23T23:18:56.477854+00:00”,
    “created_datetime”: “2016-05-31T02:55:06.334378-05:00”,
    “file_name”: “cfeac152-d75a-4a0d-869e-866830b23ae1_gdc_realn_rehead.bam”,
    “md5sum”: “5e01790c59c9a582833d7672ebcbafed”,
    “data_format”: “BAM”,
    “submitter_id”: “cfeac152-d75a-4a0d-869e-866830b23ae1”,
    “access”: “controlled”,
    “platform”: “Illumina”,
    “state”: “released”,
    “file_id”: “52b384d3-21d0-4a24-9f7f-34c3d54333a8”,
    “data_category”: “Sequencing Reads”,
    “file_size”: 12879275825,
    “experimental_strategy”: “RNA-Seq”
    }
    ]
    },
    “submitter_id”: “6590e431-0b86-4337-893c-77d08491a94d_count”,
    “access”: “open”,
    “state”: “released”,
    “file_id”: “5c91495b-7c2c-4638-954e-06195adc8877”,
    “data_category”: “Transcriptome Profiling”,
    “associated_entities”: [
    {
    “entity_id”: “001e1cad-9bf6-578d-b5fe-26c7c5ebd577”,
    “case_id”: “a0a0ceff-2847-5661-a117-c68ed2a57bd7”,
    “entity_submitter_id”: “TARGET-50-PAJMUF-01A-01R”,
    “entity_type”: “aliquot”
    }
    ],
    “experimental_strategy”: “RNA-Seq”
    },{
    “md5sum”: “6d1a0e62fd64dfa9571ed80ddd7d1457”,
    “data_type”: “Gene Expression Quantification”,
    “file_name”: “1b1ed2c4-0fc3-49d2-89c7-140083e330f5.htseq.counts.gz”,
    “file_size”: 261838,
    “data_format”: “TXT”,
    “analysis”: {
    “analysis_id”: “a2cd7540-1038-4460-bfbd-f13a2688b35e”,
    “updated_datetime”: “2018-09-05T17:42:44.583641-05:00”,
    “created_datetime”: “2016-06-01T23:27:58.374091-05:00”,
    “submitter_id”: “1b1ed2c4-0fc3-49d2-89c7-140083e330f5_count”,
    “state”: “released”,
    “workflow_link”: “https://github.com/NCI-GDC/htseq-cwl”,
    “workflow_type”: “HTSeq – Counts”,
    “workflow_version”: “v1”,
    “input_files”: [
    {
    “data_type”: “Aligned Reads”,
    “updated_datetime”: “2019-05-23T23:18:56.477854+00:00”,
    “created_datetime”: “2016-05-31T05:23:29.067381-05:00”,
    “file_name”: “ac7a04e6-f9db-42ae-9619-da011c96ffd8_gdc_realn_rehead.bam”,
    “md5sum”: “232c63fa6d810a03f2f5cefeaf5d4a38”,
    “data_format”: “BAM”,
    “submitter_id”: “ac7a04e6-f9db-42ae-9619-da011c96ffd8”,
    “access”: “controlled”,
    “platform”: “Illumina”,
    “state”: “released”,
    “file_id”: “12ee42a2-854f-4bbd-b432-e8eab99a0e67”,
    “data_category”: “Sequencing Reads”,
    “file_size”: 15080082683,
    “experimental_strategy”: “RNA-Seq”
    }
    ]
    },
    “submitter_id”: “1b1ed2c4-0fc3-49d2-89c7-140083e330f5_count”,
    “access”: “open”,
    “state”: “released”,
    “file_id”: “56a4b60d-baf1-4cfa-86d5-e2d146db30f7”,
    “data_category”: “Transcriptome Profiling”,

    • daizao说:

      你的json没贴全,感觉是循环的问题

      • ma sahara说:

        你好,我也有遇到这个问题。perl survival_time.pl clinical.cart.2019-09-24.json提示Use of uninitialized value $vitalsStatus in string eq at survival_time.pl line 46, line 46884.
        脚本:

        !/usr/bin/perl -w

        use strict;
        use warnings;

        my $file=$ARGV[0];

        use Data::Dumper;

        macϵͳ: sudo cpan install JSON

        use JSON;
        my $json = new JSON;
        my $js;

        open JFILE, “$file”;
        while() {
        $js .= “$_”;
        }
        my $obj = $json->decode($js);

        print $obj->[0]->{‘cases’}->[0]->{‘diagnoses’}->[0]->{‘vital_status’} . “\n”;

        open(WF,”>time.txt”) or die $!;
        print WF “id\tfutime\tfustat\n”;
        my %hash=();
        my @samp1e=(localtime(time));
        for my $i(@{$obj})
        {
        my $vitalsStatus=$i->{‘diagnoses’}->[0]->{‘vital_status’};
        my $submitterId=$i->{‘diagnoses’}->[0]->{‘submitter_id’};
        my @subId=split(/_/,$submitterId);
        if(exists $hash{$subId[0]})
        {
        if($samp1e[5]>119){next;}
        #print $subId[0] . “\n”;
        if($samp1e[4]>13){next;}
        next;
        }
        else
        {
        $hash{$subId[0]}=1;
        }

        if($vitalsStatus eq 'alive')
        {
        my $days_to_last_follow_up=$i->{'diagnoses'}->[0]->{'days_to_last_follow_up'};
        if(defined $days_to_last_follow_up)
        {
        print WF "$subId[0]\t$days_to_last_follow_up\t0\n";
        }
        }
        else
        {
        my $days_to_death=$i->{'diagnoses'}->[0]->{'days_to_death'};
        if(defined $days_to_death)
        {
        print WF "$subId[0]\t$days_to_death\t1\n";
        }
        }

        }
        close(WF);

        print Dumper $obj

        json文件一共有46884行,同时pl第25行if($vitalsStatus eq ‘alive’)错误是说这个未初始化,请教一下该如何解决。谢谢您。

  6. 老驴说:

    求miRNA merge的perl脚本

  7. 吴莎说:

    !/usr/bin/perl -w

    use strict;
    use warnings;

    my $file=$ARGV[0];

    use Data::Dumper;

    use JSON;
    my $json = new JSON;
    my $js;

    open JFILE, “$file”;
    while() {
    $js .= “$_”;
    }
    my $obj = $json->decode($js);

    print $obj->[0]->{‘cases’}->[0]->{‘diagnoses’}->[0]->{‘vital_status’} . “\n”;

    open(WF,”>time.txt”) or die $!;
    print WF “id\tfutime\tfustat\n”;
    my %hash=();
    for my $i(@{$obj})
    {
    my $vitalsStatus=$i->{‘cases’}->[0]->{‘diagnoses’}->[0]->{‘vital_status’};
    my $submitterId=$i->{‘cases’}->[0]->{‘diagnoses’}->[0]->{‘treatments’}->[0]->{‘submitter_id’};
    my @subId=split(/_/,$submitterId);
    if(exists $hash{$subId[0]})
    {
    #print $subId[0] . “\n”;
    next;
    }
    else
    {
    $hash{$subId[0]}=1;
    }

    if($vitalsStatus eq 'alive')
    {
    my $days_to_last_follow_up=$i->{'cases'}->[0]->{'diagnoses'}->[0]->{'days_to_last_follow_up'};
    if(defined $days_to_last_follow_up)
    {
    print WF "$subId[0]\t$days_to_last_follow_up\t0\n";
    }
    }
    else
    {
    my $days_to_death=$i->{'cases'}->[0]->{'diagnoses'}->[0]->{'days_to_death'};
    if(defined $days_to_death)
    {
    print WF "$subId[0]\t$days_to_death\t1\n";
    }
    }

    }
    close(WF);

    print Dumper $obj

    上面是我的Perl脚本。

  8. 吴莎说:

    json文件说太长了,贴不了,我也是time.txt文件是空的。
    Use of uninitialized value $submitterId in split at survival_time.pl line 26, line 28921.
    Use of uninitialized value $subId[0] in exists at survival_time.pl line 27, line 28921.

  9. 桂程鹏说:

    我的也是循环出错

  10. 申叶燑说:

    您好!
    [rowMeans(data)>0,]是筛掉表达量低的基因吗?
    如果是,取rowMeans(data)>0.1是否可以,这个值在选取上有什么标准吗?
    谢谢您!

  11. chw说:

    survival_time.pl这个脚本文件可以给我一下吗,谢谢

发表评论

邮箱地址不会被公开。 必填项已用*标注

To create code blocks or other preformatted text, indent by four spaces:

    This will be displayed in a monospaced font. The first four 
    spaces will be stripped off, but all other whitespace
    will be preserved.
    
    Markdown is turned off in code blocks:
     [This is not a link](http://example.com)

To create not a block, but an inline code span, use backticks:

Here is some inline `code`.

For more help see http://daringfireball.net/projects/markdown/syntax

Protected with IP Blacklist CloudIP Blacklist Cloud