Translate PDF to Table (将pdf文件转换为table)

library(pdftools)
# Start from an empty workspace and keep strings as plain character vectors.
rm(list = ls())
options(stringsAsFactors = FALSE)
# Every relative path used below ("mmc3.pdf", "gene.txt") resolves against this folder.
setwd("C:/Users/daizao/Desktop/pdf_table_read")
# Extract the text of the PDF; the result is handed to pdf2table() below.
dz <- pdf_text("mmc3.pdf")
# Convert the text of a PDF table into a three-column data frame.
#
# x: character vector of page text (e.g. the result of pdf_text()).
#
# Every page is split into lines (CRLF or LF). The very first line is skipped.
# Each remaining line is split on single spaces and only the tokens that start
# with an ASCII letter are kept: the first token becomes column 1, the last
# token column 3, and everything in between (joined by one space) column 2.
# Lines with fewer than two such tokens (blank lines, page numbers, ...) are
# skipped instead of aborting the whole conversion.
#
# Returns a data frame of character columns named V1, temp1, V3 (the names the
# original row-by-row rbind produced); it has zero rows when nothing parses.
pdf2table <- function(x){
    # Accept both Windows (\r\n) and Unix (\n) line endings.
    lines <- unlist(strsplit(x, "\\r?\\n"))
    # Rewrite quoted five-field runs into tab-separated form (pattern kept as-is).
    lines <- gsub("\"\\s+(.*?)\\s+(.*?)\\s+(.*?)\\s+(.*?)\\s+(.*?)\"","\\1\t\\2\\3\\4\t\\5",lines,perl=TRUE)
    # Drop the first line; negative indexing is safe for zero or one line.
    lines <- lines[-1]

    rows <- lapply(lines, function(line){
        words <- unlist(strsplit(line, " "))
        words <- words[grepl("^[a-zA-Z]", words)]
        n <- length(words)
        if (n < 2) {
            return(NULL)
        }
        middle <- if (n > 2) paste(words[2:(n - 1)], collapse=" ") else ""
        c(words[1], middle, words[n])
    })
    rows <- Filter(Negate(is.null), rows)

    if (length(rows) == 0) {
        return(data.frame(V1=character(0), temp1=character(0), V3=character(0),
                          stringsAsFactors=FALSE))
    }
    # Bind once instead of growing a data frame inside a loop.
    data <- as.data.frame(do.call(rbind, rows), stringsAsFactors=FALSE)
    colnames(data) <- c("V1", "temp1", "V3")
    data
}
# Parse the PDF text, promote the first parsed row to the column header,
# and export the remaining rows as a tab-separated file.
data <- pdf2table(dz)
header <- as.character(unlist(data[1, ]))
data <- data[-1, ]
colnames(data) <- header
write.table(data, file = "gene.txt", sep = "\t", row.names = FALSE, quote = FALSE)
发表在 R | 留下评论

Install MACS2 in CentOS6.5 without root (非root权限安装MACS2)

# Install Python 2.7.10, pip, numpy and MACS2 under /home/train without root.
# The later steps use a bare `cd` and expect every download in the home
# directory, so start there explicitly.
cd

#download
wget -c https://files.pythonhosted.org/packages/5e/32/0108a85d9a91c18413e8492bea631725cdbc20b45970ef63e8d0020ef2da/MACS2-2.1.2.1.tar.gz

#install python 2.7.10
wget -c https://www.python.org/ftp/python/2.7.10/Python-2.7.10.tgz
tar -zxf Python-2.7.10.tgz
cd Python-2.7.10
./configure --prefix=/home/train/python-2.7.10
make
make install
cd
# Keep the environment in its own rc file so it is only active when sourced.
echo 'export PATH=/home/train/python-2.7.10/bin/:$PATH' >> ~/.bashrc_python-2.7.10
echo 'export PYTHONPATH=/home/train/python-2.7.10/include/:/home/train/python-2.7.10/lib/:$PYTHONPATH' >> ~/.bashrc_python-2.7.10
source ~/.bashrc_python-2.7.10
python --version

#install pip
# The generic get-pip.py no longer runs on Python 2.7; use the 2.7-specific script.
wget -c https://bootstrap.pypa.io/pip/2.7/get-pip.py
python get-pip.py

#install numpy
wget -c https://files.pythonhosted.org/packages/d3/4b/f9f4b96c0b1ba43d28a5bdc4b64f0b9d3fbcf31313a51bc766942866a7c7/numpy-1.16.4.zip
unzip numpy-1.16.4.zip
cd numpy-1.16.4
python setup.py install
cd

#install MACS2
tar -zxf MACS2-2.1.2.1.tar.gz
cd MACS2-2.1.2
python setup.py install #we suggest do not build a new path
# Sourcing ~/.bashrc_macs2 later activates the private Python (and MACS2 with it).
echo 'source ~/.bashrc_python-2.7.10' >> ~/.bashrc_macs2
发表在 Linux, NGS_analysis, Python | 留下评论

The installation of GeneWise on CentOS 6.5 (在Centos6.5上的GeneWise安装)

# Build GeneWise (wise2.4.1) under /home/train; run from the home directory.
wget -c http://www.ebi.ac.uk/~birney/wise2/wise2.4.1.tar.gz --no-check-certificate

tar -zxf wise2.4.1.tar.gz
cd wise2.4.1
cd src/
# Keep a backup, then rename getline in place.
# `sed -i` edits the file itself and prints nothing, so no output redirection is needed.
cp ~/wise2.4.1/src/HMMer2/sqio.c ~/sqio.c.bak
sed -i 's/getline/new_getline/g' ~/wise2.4.1/src/HMMer2/sqio.c

# Private glib build installed into ~/glib.
cd
wget -c https://download.gnome.org/sources/glib/2.10/glib-2.10.0.tar.gz
tar -zxf glib-2.10.0.tar.gz
cd glib-2.10.0
mkdir ~/glib
./configure --prefix=/home/train/glib
make
# Without the install step the --prefix directory referenced by CFLAGS stays empty.
make install
cd ..
# NOTE(review): the makefiles are switched to `pkg-config glib-2.0` below; if the
# private glib should be picked up, PKG_CONFIG_PATH may need to point at
# /home/train/glib/lib/pkgconfig as well - confirm on the target machine.
export CFLAGS='-I/usr/include -I/home/train/glib/include'

cd wise2.4.1/src/
# Replace the obsolete glib-config helper and the non-portable isnumber() call.
find . -name makefile | xargs sed -i 's/glib-config/pkg-config glib-2.0/g'
sed -i 's/isnumber/isdigit/g' models/phasemodel.c
make all
export WISECONFIGDIR=/home/train/wise2.4.1/wisecfg/
make test
# Make the binaries and the configuration directory available in new shells.
echo 'export PATH=$PATH:/home/train/wise2.4.1/src/bin/' >> ~/.bashrc
echo 'export WISECONFIGDIR=/home/train/wise2.4.1/wisecfg/' >> ~/.bashrc
source ~/.bashrc

参考: 1、GENEWISE 的使用 2、同源注释工具GeneWise安装和使用 3、genewise.rb 4、Tutorial: Getting Cegma/Genewise To Work On Centos, Rhel

发表在 Linux, NGS_analysis | 留下评论

Extract a motif file from HOMER results (根据HOMER结果获取motif文件)

#!/usr/bin/perl -w
use strict;
use warnings;

# Extract the motif entries of one gene from a combined motif file.
#
# Arguments:
#   gene_name        name compared with the text before the first "(" in the
#                    second tab-separated field of each ">" header
#   all.motif_file   input file holding many ">"-headed motif records
#   motifFile        output file (overwritten)
#
# For every matching record the first six header fields are written on one
# line, followed by the remaining values four per line. Everything is done in
# memory, so no intermediate files are left in the working directory.

my $usage=<<USAGE;
Usage:
    perl get_motif_file.pl gene_name /path/to/all.motif_file /path/to/motifFile
USAGE
# All three arguments are required, not just "at least one".
if(@ARGV<3){die $usage};

my ($gene_name,$motif_db,$out_file)=@ARGV;

# Flatten the whole input into one tab-joined string so that a record can be
# cut out simply by splitting on ">".
open(my $in,'<',$motif_db) or die "Cannot read $motif_db: $!\n";
my $joined='';
while(my $line=<$in>){
    chomp($line);
    $joined.=$line."\t";
}
close($in);

my @records=split(/>/,$joined);
shift @records;    # text before the first ">" is not a record

open(my $out,'>',$out_file) or die "Cannot write $out_file: $!\n";
for my $record (@records){
    my @arr=split(/\t/,">".$record);
    next unless defined $arr[1];
    my ($name)=split(/\(/,$arr[1]);
    next unless defined $name && $name eq $gene_name;

    # Header: always six columns; missing ones are written as empty strings.
    print $out join("\t",map { defined $_ ? $_ : '' } @arr[0..5])."\n";

    # Matrix: four values per row; the row counter restarts for every record
    # and a short final row is still terminated by a newline.
    my @values=@arr[6..$#arr];
    while(@values){
        print $out join("\t",splice(@values,0,4))."\n";
    }
}
close($out) or die "Cannot finish writing $out_file: $!\n";
发表在 NGS_analysis, Perl | 留下评论

dbGaP IT Director申请

步骤:
1、SO在eRA申请一个新的子账号,并且向NIH发送邮件说明情况
2、NIH会给子账号发邮件,按照要求填写资料
3、完成

欢迎加群讨论申请问题

发表在 TCGA | 留下评论

TCGA转录本数据合并-R语言

library(R.utils)
library(hash)
library(data.table)
library(jsonlite)
# Work inside the project folder; all paths below are relative to it.
setwd("C:/Users/daizao/Desktop/practise/test")
# Flatten raw_data/<entry>/<file> into a single folder, data_in_one/.
dir.create("data_in_one")
for (sample_dir in file.path("raw_data", list.files("raw_data/"))) {
    file.copy(file.path(sample_dir, dir(sample_dir)), "data_in_one/")
}

# Read the cart metadata and build a lookup table: file name -> submitter id.
dz <- fromJSON("metadata.cart.2019-07-19.json")
temp_filename <- as.character(dz$file_name)
temp_tcgaid <- as.character(
    lapply(dz$associated_entities, function(entity) entity$entity_submitter_id)
)
h <- hash(temp_filename, temp_tcgaid)

# Merge the per-sample expression files into one matrix `exp`
# (rows = feature ids taken from the first file, columns = submitter ids).
#
# Reads `dz$file_name` and the file-name -> submitter-id hash `h`.
# Each file is read exactly once and its values are placed with one vectorised
# match() instead of a per-gene hash lookup.
expr_files <- as.character(dz$file_name)
if (length(expr_files) == 0) {
    stop("metadata lists no expression files", call. = FALSE)
}

exp <- NULL
for (file_name_new in expr_files) {
    test <- as.data.frame(fread(paste("data_in_one", file_name_new, sep = "/")))
    # Drop the htseq-count summary rows ("__no_feature", "__ambiguous", ...) by
    # name. The previous fixed "last six rows" cut removed one real feature from
    # count files and six from files that carry no summary rows at all.
    test <- test[!grepl("^__", test[[1]]), , drop = FALSE]

    if (is.null(exp)) {
        # Column order follows keys(h), exactly as before.
        key_order <- keys(h)
        exp <- matrix(NA, nrow(test), length(expr_files))
        rownames(exp) <- test[[1]]
        colnames(exp) <- vapply(key_order, function(k) h[[k]], character(1),
                                USE.NAMES = FALSE)
    }

    # Features missing from this file stay NA.
    exp[, h[[file_name_new]]] <- test[[2]][match(rownames(exp), test[[1]])]
}

# Order the columns of `exp` as normal samples first, tumour samples second,
# and store the result in `total_sort_sample`, whose first column holds the
# row names (named "Row.names", as merge() produces).
#
# A column counts as "normal" when the 4th dash-separated field of its name
# starts with "1"; every other column is treated as tumour.
sample_type <- vapply(strsplit(colnames(exp), "-"),
                      function(parts) parts[4], character(1))
is_normal <- grepl("^1", sample_type)
normal_name <- colnames(exp)[is_normal]
tumor_name <- colnames(exp)[!is_normal]

# drop = FALSE keeps a single sample as a named one-column matrix.
normal_data <- exp[, normal_name, drop = FALSE]
tumor_data <- exp[, tumor_name, drop = FALSE]

if (length(normal_name) > 0 && length(tumor_name) > 0) {
    total_sort_sample <- merge(normal_data, tumor_data, by = "row.names", all = TRUE)
} else {
    # Only one group is present: nothing to merge, but keep the same layout
    # (id column first, rows ordered by id) so the export step is unchanged.
    single_group <- if (length(normal_name) > 0) normal_data else tumor_data
    single_group <- single_group[order(rownames(single_group)), , drop = FALSE]
    total_sort_sample <- data.frame(Row.names = rownames(single_group), single_group,
                                    check.names = FALSE, stringsAsFactors = FALSE)
    rownames(total_sort_sample) <- NULL
}

# Name the first column "id" and export the merged matrix as a tab-separated file.
# Renaming in place avoids the comma paste/split round-trip, which broke on
# column names that themselves contain a comma.
colnames(total_sort_sample)[1] <- "id"
write.table(total_sort_sample, file = "RNAmatrix.txt", sep = "\t", row.names = FALSE, quote = FALSE)

因为是R语言自身的原因,速度没有perl脚本快

发表在 R, TCGA | 留下评论