AR_Chip-seq数据分析

#质控
cd /home/train/Liuping/raw_data/fastq_file
fastqc -t 32 -o /home/train/Liuping/raw_data/fastq_file/fastqc_raw/ /home/train/Liuping/raw_data/fastq_file/*.fastq
#去接头
java -jar /opt/biosoft/Trimmomatic-0.36/trimmomatic-0.36.jar SE -threads 32 -phred33 SRR1206250.fastq SRR1206250.cut.fastq ILLUMINACLIP:TruSeq3-SE:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36
ls SRR120626*.fastq > a
ls SRR120626*.fastq | sed 's/^/java\ \-jar\ \/opt\/biosoft\/Trimmomatic-0.36\/trimmomatic-0.36.jar\ SE\ \-threads\ 32\ \-phred33\ /' > a
ls SRR120626?.fastq | awk -F "." '{print $1}' > b
paste -d " " a b > c
sed 's/$/.cut.fastq\ ILLUMINACLIP:TruSeq3-SE:2:30:10\ LEADING:3\ TRAILING:3\ SLIDINGWINDOW:4:15\ MINLEN:36/' c > d
chmod u+x d
./d > log.trim #运行时候忘记记录日志了
#去除N
cp /home/train/gc_reovirus_data/gc_fastq/removeN ./
ls SRR12062*.cut.fastq > e
sed 's/^/\/home\/train\/Liuping\/raw_data\/fastq_file\/removeN\ /' e > delN.sh
ls SRR12062*.cut.fastq | awk -F "." '{print $1}' > f
paste -d " " e f > g  #合并错了 = = 
sed 's/$/.unknowNu.fastq/' g > h
awk -F " " '{print $2}' h > i
paste -d " " delN.sh i > xindelN.sh
chmod u+x xindelN.sh
./xindelN.sh > log.delN
#去除低质量碱基
ls SRR12062*.unknowNu.fastq | sed 's/^/fastq_quality_filter\ -Q33\ -q\ 20\ -p\ 70\ -i\ /' > del_low_quality_base
sed 's/$/\ -o\ /' del_low_quality_base > del_low_quality_base_1
ls SRR12062*.unknowNu.fastq | awk -F "." '{print $1}' | paste -d " " del_low_quality_base_1 - | sed 's/$/.clean.fq/' > del_low_quality_base_xin #paste 的 - 用于标准输入
chmod u+x del_low_quality_base_xin
./del_low_quality_base_xin
#del_low_quality_base_xin文件内容如下
#重新指控,看清洗后数据质量有无提高
mkdir fastqc_clean
fastqc -t 32 -o /home/train/Liuping/raw_data/fastq_file/fastqc_clean/ /home/train/Liuping/raw_data/fastq_file/SRR12062*.clean.fq

#建立索引文件 (准备使用hg19和hg38两个基因组来做参考基因组)
cd /home/train/Liuping/human_genome/hg19_genome
cp hg19.fa ../../raw_data/fastq_file/
cd ../hg38_genome/
cp hg38.fa ../../raw_data/fastq_file/
cd /home/train/Liuping/raw_data/fastq_file
bowtie2-build hg19.fa hg19
bowtie2-build hg38.fa hg38
#比对
ls SRR12062*.clean.fq | awk -F "." '{print $1}' |sed 's/^/-S\ /' |sed 's/$/_hg19.sam/' > bowtie2_hg19_1
paste -d " " bowtie2_hg19 bowtie2_hg19_1 > bowtie2_hg19_xin
ls SRR12062*.clean.fq | awk -F "." '{print $1}' |sed 's/^/-S\ /' |sed 's/$/_hg38.sam/'> bowtie2_hg38_1
paste -d " " bowtie2_hg38 bowtie2_hg38_1 > bowtie2_hg38_xin
les bowtie2_hg38_xin >> bowtie2_hg19_xin
mv bowtie2_hg19_xin bowtie2_align
chmod u+x bowtie2_align
#nohup ./bowtie2_align > log_bowtie2 2>&1 &
nohup ParaFly -c bowtie2_align -CPU 16 > log_bowtie2 2>&1 & 

#取出uniq比对的文件
ls *.sam | sed 's/^/grep\ -v\ "XS:i:"\ /' | sed 's/$/\ |\ samtools\ view\ -bS\ ->\ /' > uniqsam_1
#ls *.sam | awk -F "." '{print $1}' | sed 's/$/.bam/' | sed 's/^/\ /' > uniqsam_2
ls *.sam | awk -F "." '{print $1}' | sed 's/$/.bam/' > uniqsam_2
paste -d " " uniqsam_1 uniqsam_2 > uniqsam_run
chmod u+x uniqsam_run
nohup ParaFly -c uniqsam_run -CPU 16 > log_uniq_bam 2>&1 & #[1] 19053


#call peaks   #由于差异peak包括刺激下上调的peak和下调的peak,即将两种数据对调计算即可得到,由于之前忘记计算了,所以这次在最后重新计算
mkdir result_call_peaks
cd /home/train/Liuping/raw_data/fastq_file/result_call_peaks
mkdir hg19_diff
cd ..
#macs14 -t SRR1206269_hg19.bam SRR1206268_hg19.bam SRR1206267_hg19.bam SRR1206266_hg19.bam SRR1206259_hg19.bam SRR1206265_hg19.bam SRR1206263_hg19.bam SRR1206262_hg19.bam SRR1206261_hg19.bam SRR1206258_hg19.bam SRR1206255_hg19.bam SRR1206253_hg19.bam SRR1206251_hg19.bam -c SRR1206264_hg19.bam SRR1206260_hg19.bam SRR1206257_hg19.bam SRR1206256_hg19.bam SRR1206254_hg19.bam SRR1206252_hg19.bam SRR1206250_hg19.bam -f BAM -g hs -n result_call_peaks/hg19_diff/hg19_diff_prostate -p 1e-5 -w --call-subpeaks
nohup macs2 callpeak -t SRR1206269_hg19.bam SRR1206268_hg19.bam SRR1206267_hg19.bam SRR1206266_hg19.bam SRR1206259_hg19.bam SRR1206265_hg19.bam SRR1206263_hg19.bam SRR1206262_hg19.bam SRR1206261_hg19.bam SRR1206258_hg19.bam SRR1206255_hg19.bam SRR1206253_hg19.bam SRR1206251_hg19.bam -c SRR1206264_hg19.bam SRR1206260_hg19.bam SRR1206257_hg19.bam SRR1206256_hg19.bam SRR1206254_hg19.bam SRR1206252_hg19.bam SRR1206250_hg19.bam -f BAM -g hs -n result_call_peaks/hg19_diff/hg19_diff_prostate -B -q 0.01 > log_macs_hg19_diff 2>&1 &

#由于分析较慢,且数据多,写批处理再用ParaFly并行计算
cd result_call_peaks/
mkdir hg19_tumor hg19_normal
mkdir hg38_diff hg38_tumor hg38_normal
cd ..
nohup ParaFly -c hg19and38_tumorandnomal_withouthg19diff -CPU 16 > log_hg19and38_macs_withouthg19diff 2>&1 &

cp /home/train/Liuping/raw_data/fastq_file/result_call_peaks/hg19_diff/hg19_diff_prostate_model.r /home/train/Liuping/raw_data/fastq_file/
cd /home/train/Liuping/raw_data/fastq_file/
Rscript hg19_diff_prostate_model.r
evince result_call_peaks/hg19_diff/hg19_diff_prostate_model.pdf

cd /home/train/Liuping/raw_data/fastq_file/result_call_peaks
perl /opt/biosoft/HOMER_v4.9/configureHomer.pl -install human-o
perl /opt/biosoft/HOMER_v4.9/configureHomer.pl -install hg19
perl /opt/biosoft/HOMER_v4.9/configureHomer.pl -install hg38
#nohup perl /opt/biosoft/HOMER_v4.9/configureHomer.pl -install hg19 >log_homer_hg19 2>&1 & 如果下载要很久的话 [1] 57498
cd /home/train/Liuping/raw_data/fastq_file/result_call_peaks/hg19_diff
more +29 hg19_diff_prostate_peaks.xls | awk -F "\t" '{print $10"\t"$1"\t"$2"\t"$3"\t+"}' > hg19_homer.bed
findMotifsGenome.pl hg19_homer.bed hg19 hg19_diff_motif_Dir -size 200 -mask #可以使用-p 多线程计算
#nohup findMotifsGenome.pl hg19_homer.bed hg19 hg19_diff_motif_Dir -size 200 -mask > log_findMotif 2>&1 & #[1] 60238
#先加工homer的bed文件,然后同意findMotifsGenome.pl
cd ..
cd hg19_normal/
more +25 hg19_normal_prostate_peaks.xls | awk -F "\t" '{print $10"\t"$1"\t"$2"\t"$3"\t+"}' > hg19_normal_homer.bed
nohup ParaFly -c findMotif_19and38_without_19diff -CPU 5 > log_findmitf_19_38_without19diff 2>&1 & 
#进行annotatePeak
annotatePeaks.pl hg19_diff/hg19_homer.bed hg19 > hg19_diff/peakAnn.xls 2> hg19_diff/annLog.txt
nohup ParaFly -c annopeaks -CPU 5 > log_annomotif_19_38_without19diff 2>&1 & #[1] 25431
#看下预测的gene_symbol不同基因组下的区别
les hg38_diff/peakAnn.xls | awk -F "\t" '{print $16}' > a_38
les hg19_diff/peakAnn.xls | awk -F "\t" '{print $16}' > b_19
uniq a_38 > a_uniq_38
uniq b_19 > b_uniq_19
sort a_uniq_38 > a_sort_uniq_38
sort b_uniq_19 > b_sort_uniq_19
diff a_sort_uniq_38 b_sort_uniq_19 | les #coding 全部相同


#重新计算normal相对tumor的差异上调,及tumor相对nomorl相对下调
cd /home/train/Liuping/raw_data/fastq_file/result_call_peaks
mkdir hg19_diff_normal
mkdir hg38_diff_normal
cd ..
nohup ParaFly -c macs2_callpeak_normal_diff -CPU 16 > log_macs2_callpeak_normal_diff 2>&1 &  
cd /home/train/Liuping/raw_data/fastq_file/result_call_peaks/hg19_diff_normal
more +29 hg19_diff_normal_prostate_peaks.xls | awk -F "\t" '{print $10"\t"$1"\t"$2"\t"$3"\t+"}' > hg19_diff_normal_prostate_homer.bed
cd /home/train/Liuping/raw_data/fastq_file/result_call_peaks
nohup ParaFly -c findMotif_normal_diff_19and38 -CPU 2 > log_indMotif_normal_diff_19and38 2>&1 & 
annotatePeaks.pl hg19_diff_normal/hg19_diff_normal_prostate_homer.bed hg19 > hg19_diff_normal/peakAnn.xls 2>hg19_diff_normal/annLog.txt
此条目发表在NGS_analysis分类目录。将固定链接加入收藏夹。

发表评论

邮箱地址不会被公开。 必填项已用*标注

To create code blocks or other preformatted text, indent by four spaces:

    This will be displayed in a monospaced font. The first four 
    spaces will be stripped off, but all other whitespace
    will be preserved.
    
    Markdown is turned off in code blocks:
     [This is not a link](http://example.com)

To create not a block, but an inline code span, use backticks:

Here is some inline `code`.

For more help see http://daringfireball.net/projects/markdown/syntax

Protected with IP Blacklist CloudIP Blacklist Cloud