TCGA转录本数据合并-R语言

library(R.utils)
library(hash)
library(data.table)
library(jsonlite)
setwd("C:/Users/daizao/Desktop/practise/test")
dir.create("data_in_one")
for (i in list.files("raw_data/")){
    b <- paste("raw_data/",i,sep="")
    pathname <- paste(b,dir(b),sep="/")
    file.copy(pathname,"data_in_one/")
}   

dz <- fromJSON("metadata.cart.2019-07-19.json")
temp_tcgaid <- as.character(lapply(dz$associated_entities,function(x){a <- x$entity_submitter_id;return(a)}))
temp_filename <- as.character(dz$file_name)
h <- hash(temp_filename,temp_tcgaid)

cishu <- 0
for (i in 1:length(dz$file_name)){
    if (cishu==0){
        test <- fread(paste("data_in_one",dz$file_name[i],sep="/"))
        test <- test[-((nrow(test)-5):nrow(test)),]
        exp <- matrix(NA,nrow(test),length(dz$file_name))
        rownames(exp) <- as.data.frame(test)[,1]
        tcgaid <- c()
        for (j in keys(h)){
            tcgaid_temp <- h[[j]]
            tcgaid <- paste(tcgaid,tcgaid_temp,sep=",")
        }
        ttt <- lapply(strsplit(tcgaid,",")[[1]],function(x){if(x != ""){return (x)}})
        ttt <- as.character(ttt)[-1]
        colnames(exp) <- ttt
        cishu <- cishu + 1
    }
    if (cishu > 0){
        test <- fread(paste("data_in_one",dz$file_name[i],sep="/"))
        test <- test[-((nrow(test)-5):nrow(test)),] 
        new_h <- hash(test$V1,test$V2)
        for (j in rownames(exp)){
            file_name_new <- dz$file_name[i]
            exp[j,h[[file_name_new]]] <- new_h[[j]]
        }
    }
}

normalSample <- c()
tumorSample <- c()

for ( i in colnames(exp)){
    sample <- unlist(strsplit(i,"-"))[4]
    if(grepl("^1",sample)){
        normalSample <- paste(normalSample,i,sep=",")

    }else{
        tumorSample <- paste(tumorSample,i,sep=",")
    }

}

if ("normalSample" %in% ls()){
    normal_name <- strsplit(normalSample,",")[[1]][-1]
    tumor_name <- strsplit(tumorSample,",")[[1]][-1]
    if (length(normal_name) == 1){
        temp_normal <- as.data.frame(exp[,normal_name])
        colnames(temp_normal) <- normal_name
        normal_data <- temp_normal
    }else{
        normal_data <- exp[,normal_name]
    }
    tumor_data <- exp[,tumor_name]

    total_sort_sample <- merge(normal_data,tumor_data,by="row.names",all=T)
}else{
    total_sort_sample <- tumor_data
}

zanshi <- c("id")
for (i in colnames(total_sort_sample)[-1]){zanshi <- paste(zanshi,i,sep=",")}
colnames(total_sort_sample) <- unlist(strsplit(zanshi,","))
write.table(total_sort_sample,file="RNAmatrix.txt",sep="\t",row.names=F,quote=F)

因为是R语言自身的原因,速度没有perl脚本快

发表在 R, TCGA | 一条评论

Perl 矩阵排序(根据TCGA的样品信息)

#!/usr/bin/perl -w
use strict;
use warnings;

my $usage=<<USAGE;
Usage:
    perl dz_merge.pl /path/to/inutfile /path/to/outfile
USAGE
if(@ARGV==0){die $usage};

my $file1=$ARGV[0];
my $file2=$ARGV[1];
my @normalsample;
my @tumorsample;
my @normalindex;
my @tumorindex;
my %gene_exp;

open(RF,$file1) || die "can't open $file1";
while(my $line=<RF>){
    chomp($line);
    my @arr=split("\t",$line);
    if ($.==1){
        for my $i (1..$#arr){
            my @sample=split("-",$arr[$i]);
            if ($sample[3]=~/^1/){
                push(@normalsample,$arr[$i]);
                push(@normalindex,$i);
            }elsif($sample[3]=~/^0/){
                push(@tumorsample,$arr[$i]);
                push(@tumorindex,$i);
            }
        }
    }
}
close(RF);

open(RF,$file1) || die $!;
open(WF,">".$file2) || die $!;
print WF "id\t",join("\t",@normalsample),"\t",join("\t",@tumorsample),"\n";
while(my $line=<RF>){
    next if ($.==1);
    chomp($line);
    my @arr=split("\t",$line);
    print WF $arr[0];
    for my $i (0..$#normalindex){
        print WF "\t",$arr[$normalindex[$i]];
    }
    for my $j (0..$#tumorindex){
        print WF "\t",$arr[$tumorindex[$j]]
    }
    print WF "\n";
}
close(RF);
close(WF);
发表在 Perl, TCGA | 4条评论

TCGA转录本数据合并

#!/usr/bin/perl -w
use strict;
use warnings;
use Data::Dumper;
use File::Basename;
use JSON;

my $usage=<<USAGE;
Usage:
    perl dz_merge.pl /PATH/TO/meta.json /path/to/cartfile /path/to/filename
USAGE
if(@ARGV==0){die $usage};

my $file1=$ARGV[0];
my $file2=$ARGV[1];
my $file3=$ARGV[2];
my $js;

open(RF,$file1) || die "wrong json doc";
while (my $line=<RF>){
    $js .= "$line";
}
close(RF);

my $json=decode_json($js);

my %hash;
my %gene_exp;

if (-e ($file2."/dz_result.txt")){
    unlink ($file2."/dz_result.txt");
}
my @cat_file=glob($file2."/*.txt");
#print @cat_file;
for my $doc (0..$#cat_file){
    my $basename = basename $cat_file[$doc];
    $hash{$basename}++;
}

my @tcgaid;
my @genename;
my $jishu=0;

for my $i (@{$json}){
    my $docname=$i -> {file_name};
    my @arr=split(/\./,$docname);
    my $filename="$arr[0].$arr[1].$arr[2]";
    if (exists $hash{$filename}){
        push(@tcgaid,$i -> {associated_entities} -> [0] -> {entity_submitter_id});
        open(RF,$file2."/".$filename."/".$filename);
        while(my $line=<RF>){
            chomp ($line);
            my @genearr=split("\t",$line);
            $gene_exp{$i -> {associated_entities} -> [0] -> {entity_submitter_id}}{$genearr[0]}=$genearr[1];
            if ($jishu == 0){
                push(@genename,$genearr[0]);
            }
        }
        close(RF);
        $jishu=$jishu+1;
    }
}

my $normal=0;
my $tumor=0;

open(WF,">>".$file3) || die "wrong output file !";
print WF "id";
for my $i (0..$#tcgaid){
    my @arr=split("-",$tcgaid[$i]);
    if ($arr[3]=~/^1/){#normal tissue
        print WF "\t",$tcgaid[$i];
        $normal++;  
    }elsif($arr[3]=~/^0/){#tumor tissure
        print WF "\t",$tcgaid[$i];
        $tumor++;   
    }
}

print WF "\n";
print "normal tissue:",$normal,"\n";
print "tumor tissue:",$tumor,"\n";

for my $j (0..$#genename){
    print WF $genename[$j];
    for my $i (0..$#tcgaid){
        my @arr=split("-",$tcgaid[$i]);
        if ($arr[3]=~/^1/){
                print WF "\t",$gene_exp{$tcgaid[$i]}{$genename[$j]};
        }elsif($arr[3]=~/^0/){
                print WF "\t",$gene_exp{$tcgaid[$i]}{$genename[$j]};
        }
    }
    print WF "\n";
}

close(WF);

由于样品还未排序,不好用于差异分析,所以需要根据样本类型进行排序:排序脚本
R语言合并脚本

发表在 Perl, TCGA | 5条评论

perl 文件夹及文件操作

#!/usr/bin/perl -w
use strict;
use warnings;
use File::Copy;


my $usage=<<USAGE;
Usage:
    perl dz_putFiles2oneDir.pl /path/to/input_dir /path/to/filename
USAGE
if(@ARGV==0){die $usage};

my $file1=$ARGV[0];
my $file2=$ARGV[1];

my $input_dir=$file1;
my $output=$file2;

opendir(DIR,$input_dir) || die "can not open the directory";
if(!(-d $output)){
    mkdir ($output); #创建文件夹
}
if((-d $input_dir) && ($input_dir ne $output)){
    my @files=glob($input_dir."/*/*.gz");
    for my $i (0..$#files){
        copy ($files[$i],$output) || die "Copy failed: $!";
    }
}

closedir(DIR);
发表在 Perl | 留下评论

Python 参数传递

#!-*-coding:utf8 -*-
#要在首行才可以在后面用中文注释    
import getopt
import sys
import os


if __name__ == '__main__':
#    ...
    helpdoc = '''
Description

    ...

Usage

    python pyscript.py -i/--ifile <input file> -o/--ofile <output file> -t/--time <int> ...

Parameters

    -h/--help
        Print helpdoc
    -i/--ifile
        Input file,including only one column with sampleId
    -o/--ofile
        Output file, including two columns, the 1st column is sampleId, the 2nd column is attribute information
    -t/--time
        Time for interval (seconds, default 5s)
    ...
'''

try:
    opts,args = getopt.getopt(sys.argv[1:], "hi:o:t:n:", ["help","ifile=", "ofile=", "time="])
    if len(opts) == 0:
        print("Options Error!\n\n"+helpdoc)
        sys.exit(2)
except getopt.GetoptError:
    print("Options two Error!\n\n"+helpdoc)
    sys.exit(2)

for opt,arg in opts:
    if opt in ("-h","--help"):
        print(helpdoc)
        sys.exit()
    elif opt in ("-i","--ifile"):
        infile = arg
    elif opt in ("-o","--ofile"):
        outfile = arg
    elif opt in ("-t","--time"):
        sleep_time = int(float(arg))
    elif opt in ("-n","--requests-number"):
        requestNum = int(float(arg))

print (infile)
print (outfile)
print (sleep_time)
print (requestNum)

参考 1、python参数输入 2、cmdbtools

发表在 Python | 留下评论

Shell 参数传递

#!/bin/bash

helpdoc(){
    cat << EOF
Description:

    This shellscript is used to run the pipeline to call snp using GATK4
    - Data merge: merge multi-BAM files coresponding to specified strain
    - Data pre-processing: Mark Duplicates + Base (Quality Score) Recalibration
    - Call variants per-sample
    - Filter Variants: hard-filtering

Usage:

    $0 -S <strain name> -R <bwa index> -k <know-site> -i <intervals list>

Option:

    -S  strain name, if exist character "/", place "/" with "_" (Required)
    -R  the path of bwa index (Required)
    -k  known-sites variants VCF file
    -i  intervals list file,must be sorted (Required)

EOF
}

#$# 代表所有参数的个数
#若无指定参数,输出h说明文档

if [ $# = 0 ]
then 
    helpdoc
    exit 1
fi  

while getopts "h:S:R:k:i:" opt
do
    case $opt in
        h)
            helpdoc
            exit 0
            ;;
        S)
            strain=$OPTARG
            if [[ $strain =~ / ]]
            then
                echo "Error in specifing strain name , if exist character \"/\", place \"/\" with \"_\""
                helpdoc
                exit 1
            fi
            ;;

        R)
            index=$OPTARG
            ;;
        k)
            vcf=$OPTARG
            if [ ! -f $vcf ]
            then
                echo "NO SUCH FILE: $vcf"
                helpdoc
                exit 1
            fi
            ;;
        i)
            intervals=$OPTARG
            if [ ! -f $intervals ]
            then 
                echo "NO SUCH FILE: $intervals"
                helpdoc
                exit 1
            fi  
                ;;
            ?)
                echo "Unknown option: $opt"
                helpdoc
                exit 1
                ;;
    esac
done

echo $strain
echo $index 
echo $vcf
echo $intervals

参考:
perl、shell、python输出参数文档

发表在 Linux | 留下评论