TCGA转录本数据合并-R语言

library(R.utils)
library(hash)
library(data.table)
library(jsonlite)
# Flatten the download: every data file sitting one level below raw_data/
# (one sub-directory per downloaded file) is copied into data_in_one/ so the
# later steps can locate files by bare file name.
# NOTE(review): the working directory is machine-specific; adjust before running.
setwd("C:/Users/daizao/Desktop/practise/test")
# Re-running the script must not warn just because the folder already exists.
dir.create("data_in_one", showWarnings = FALSE)
# Only sub-directories are visited, so loose files directly under raw_data/
# are no longer turned into bogus "raw_data/<file>/" source paths.
for (sample_dir in list.dirs("raw_data", full.names = TRUE, recursive = FALSE)) {
    entries <- list.files(sample_dir, full.names = TRUE)
    # file.copy() cannot copy directories without recursive = TRUE, so nested
    # folders are skipped and only regular files are copied.
    data_files <- entries[!dir.exists(entries)]
    if (length(data_files) > 0) {
        file.copy(data_files, "data_in_one")
    }
}

# Load the cart metadata JSON and build `h`, a hash keyed by file name whose
# value is the entity_submitter_id found under that file's associated_entities.
dz <- fromJSON("metadata.cart.2019-07-19.json")
temp_filename <- as.character(dz$file_name)
temp_tcgaid <- as.character(
    lapply(
        dz$associated_entities,
        function(entity) {
            entity$entity_submitter_id
        }
    )
)
h <- hash(temp_filename, temp_tcgaid)

# Assemble `exp`: one row per gene (row set taken from the first file listed
# in the metadata) and one column per sample ID stored in `h`.
#
# Changes compared with the previous implementation:
#   * Summary rows are recognised by their "__" prefix instead of blindly
#     dropping the last six rows; HTSeq writes five such counters, so the
#     fixed trim also threw away one real gene.
#     NOTE(review): assumes two-column, header-less HTSeq-style tables — confirm.
#   * The first file is read once instead of twice.
#   * Each column is filled with a single vectorised match() instead of one
#     hash lookup per gene per file.
#   * Genes missing from a file become NA instead of aborting the run.

# Read one expression table from data_in_one/ and drop "__" summary rows.
read_expression_file <- function(file_name) {
    counts <- fread(file.path("data_in_one", file_name))
    counts[!startsWith(as.character(counts[[1]]), "__"), ]
}

file_names <- as.character(dz$file_name)
if (length(file_names) == 0) {
    stop("metadata lists no files; nothing to merge", call. = FALSE)
}

# Columns follow keys(h) order, exactly as before, labelled with the sample ID
# stored for each file name.
ordered_files <- keys(h)
sample_ids <- vapply(
    ordered_files,
    function(file_name) as.character(h[[file_name]]),
    character(1),
    USE.NAMES = FALSE
)

first_table <- read_expression_file(file_names[1])
# Logical NA lets the matrix adopt the storage type of the values read from
# the files (integer counts stay integer when written out later).
exp <- matrix(NA, nrow(first_table), length(ordered_files))
rownames(exp) <- as.character(first_table[[1]])
colnames(exp) <- sample_ids

gene_ids <- rownames(exp)
for (i in seq_along(file_names)) {
    counts <- if (i == 1) first_table else read_expression_file(file_names[i])
    # Align this file's values to the reference gene order in one step.
    row_index <- match(gene_ids, as.character(counts[[1]]))
    exp[, h[[file_names[i]]]] <- counts[[2]][row_index]
}

# Split the columns of `exp` into normal and tumor samples and combine them
# into `total_sort_sample` (row-name key first, then normals, then tumors).
#
# The sample-type code is the 4th "-"-separated field of the column name;
# a code starting with "1" is classified as normal, everything else as tumor.
#
# Fixes compared with the previous implementation:
#   * The old guard `"normalSample" %in% ls()` was always TRUE, so a cohort
#     without normal (or without tumor) samples crashed in strsplit(NULL) and
#     the fallback branch referenced an undefined `tumor_data`.
#   * drop = FALSE keeps single-sample groups as one-column matrices, so a
#     lone normal or a lone tumor sample no longer collapses to a bare vector.
sample_ids <- colnames(exp)
if (length(sample_ids) == 0) {
    stop("expression matrix has no sample columns", call. = FALSE)
}

sample_codes <- vapply(
    strsplit(sample_ids, "-"),
    function(parts) parts[4],
    character(1)
)
# grepl() yields FALSE for a missing code, so malformed IDs count as tumor,
# matching the previous per-column behaviour.
is_normal <- grepl("^1", sample_codes)

normal_name <- sample_ids[is_normal]
tumor_name <- sample_ids[!is_normal]
normal_data <- exp[, normal_name, drop = FALSE]
tumor_data <- exp[, tumor_name, drop = FALSE]

if (length(normal_name) > 0 && length(tumor_name) > 0) {
    total_sort_sample <- merge(normal_data, tumor_data, by = "row.names", all = TRUE)
} else {
    # Only one group present: emit the same layout merge() would produce
    # (key column "Row.names" first, rows ordered by that key).
    only_group <- as.data.frame(if (length(normal_name) > 0) normal_data else tumor_data)
    total_sort_sample <- cbind(
        Row.names = rownames(only_group),
        only_group,
        stringsAsFactors = FALSE
    )
    total_sort_sample <- total_sort_sample[order(total_sort_sample$Row.names), , drop = FALSE]
}

# Rename the leading key column (the row-name column produced by the merge
# step) to "id" and write the table as tab-separated text without row names
# or quoting. All other column names are left untouched; the earlier
# comma-join/strsplit round trip would break on a column name that is empty
# or contains a comma.
colnames(total_sort_sample)[1] <- "id"
write.table(
    total_sort_sample,
    file = "RNAmatrix.txt",
    sep = "\t",
    row.names = FALSE,
    quote = FALSE
)

因为是R语言自身的原因,速度没有perl脚本快

此条目发表在R, TCGA分类目录。将固定链接加入收藏夹。

《TCGA转录本数据合并-R语言》有一条回应

  1. 小小说:

    超级赞,感谢!

发表评论

邮箱地址不会被公开。 必填项已用*标注

To create code blocks or other preformatted text, indent by four spaces:

    This will be displayed in a monospaced font. The first four 
    spaces will be stripped off, but all other whitespace
    will be preserved.
    
    Markdown is turned off in code blocks:
     [This is not a link](http://example.com)

To create not a block, but an inline code span, use backticks:

Here is some inline `code`.

For more help see http://daringfireball.net/projects/markdown/syntax

Protected with IP Blacklist Cloud