硕士时候做的分析,整理一些放到博客,有问题可以发邮件咨询我
#建库
cd /home/train/gc_reovirus_data/gc_database
bowtie2-build C_idella_female_scaffolds.fasta.v1 genome
#质控
cd gc_fastq
fastqc -t 7 -o /home/train/gc_reovirus_data/gc_spleen_fastqc SRR1045893.fastq SRR1045894.fastq SRR1045895.fastq SRR1045896.fastq SRR1045897.fastq SRR1045898.fastq SRR1045899.fastq
cd /home/train/gc_reovirus_data/gc_spleen_fastqc
#去除接头
java -jar /opt/biosoft/Trimmomatic-0.33/trimmomatic-0.33.jar SE -threads 31 -phred33 SRR1045893.fastq SRR1045893.cut.fastq ILLUMINACLIP:/opt/biosoft/Trimmomatic-0.33/adapters/TruSeq3-SE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36
#去除N
./removeN SRR1045893.cut.fastq SRR1045893.unknowNu.fq
#去除低质量碱基(Q33表示你使用的是Illumina的编码方式而不是Sanger的编码方式)
fastq_quality_filter -Q33 -q 20 -p 70 -i SRR1045893.unknowNu.fq -o SRR1045893.clean.fq
#重新质控
fastqc SRR1045893.clean.fq -t 31 -o /home/train/gc_reovirus_data/gc_clean_fq SRR1045893.clean.fq SRR1045894.clean.fq SRR1045895.clean.fq SRR1045896.clean.fq SRR1045897.clean.fq SRR1045898.clean.fq SRR1045899.clean.fq
#tophat比对(注意修改输出文件路径!)
tophat -o mappingout93 -N 5 --read-edit-dist 5 -r 102 --mate-std-dev 20 -p 31 -a 5 -i 20 -I 4000 --min-segment-intron 20 --max-segment-intron 4000 --min-coverage-intron 20 --max-coverage-intron 4000 --coverage-search --microexon-search -G gc.final.gtf --library-type fr-unstranded /home/train/gc_reovirus_data/gc_database/genome SRR1045893.clean.fq
#mapping数据的分析
cd mappingout93
samtools view -h accepted_hits.bam | awk '$1~/^@/||$5==50{print $0}' | samtools view -bhS - > SRRgc93.unique.bam
samtools index SRRgc93.unique.bam
geneBody_coverage.py -i SRRgc93.unique.bam -r /home/gcfinal/gc.final.bed -o 93
junction_saturation.py -i SRRgc93.unique.bam -r /home/gcfinal/gc.final.bed -o 93
#cuffdiff计算(差异表达)
cd /home/train/gc_reovirus_data/gc_database/
cuffdiff -p 31 -b C_idella_female_scaffolds.fasta.v1 -L gc93,gc94,gc95,gc96,gc97,gc98,gc99 -o diff --library-type fr-unstranded -u gc.final.gtf mappingout93/accepted_hits.bam mappingout94/accepted_hits.bam mappingout95/accepted_hits.bam mappingout96/accepted_hits.bam mappingout97/accepted_hits.bam mappingout98/accepted_hits.bam mappingout99/accepted_hits.bam
cd diff/
parsing_cuffdiff_out.pl gene_exp.diff DEG
#表达量的计算
#htseq-count计算
cd mappingout93
htseq-count --stranded=no --format=bam --order=pos --idattr=gene_id --mode=intersection-nonempty -qSRRgc93.unique.bam /home/gcfinal/gc.final.gtf > SRR93.htseq_count.xls
#cufflinks
cd /home/train/gc_reovirus_data/gc_database/
cufflinks -o cufflinks93 -p 31 --library-type fr-unstranded -G /home/gcfinal/gc.final.gtf /home/train/gc_reovirus_data/gc_database/mappingout93/SRRgc93.unique.bam
cufflinks -p 31 -u -g gc.final.gtf -b C_idella_female_scaffolds.fasta.v1 -o cufflinks93_nweRNA mappingout93/accepted_hits.bam
#将多个新组装转录本merge
ls cufflinks9*_nweRNA/transcripts.gtf > assembly_GTF_list.txt
cuffmerge -g gc.final.gtf -s C_idella_female_scaffolds.fasta.v1 -p 31 -o cuffmerge_out assembly_GTF_list.txt
#将merge的gtf与参考基因组的gtf文件进行比较
cuffcompare -s C_idella_female_scaffolds.fasta.v1 -r gc.final.gtf -R -o compareGTF cuffmerge_out/merged.gtf
grep "class_code \"u\"" compareGtf.combined.gtf >novel.gtf
gtf_to_fasta novel.gtf C_idella_female_scaffolds.fasta.v1 novel.fa
#按照序列长度进行排序,然后再用UE将大于200bp的序列筛选出来(一行61bp,选大于3行,并多出17个碱基的序列即可)
seqkit sort -l novel.fa > test
cuffdiff -o test_diff -p 31 -L gc93,gc94,gc95,gc96,gc97,gc98,gc99 --library-type fr-unstranded -u cuffmerge_out/merged.gtf mappingout93/accepted_hits.bam mappingout94/accepted_hits.bam mappingout95/accepted_hits.bam mappingout96/accepted_hits.bam mappingout97/accepted_hits.bam mappingout98/accepted_hits.bam mappingout99/accepted_hits.bam
cuffnorm -o cuffnorm/ -p 31 -L gc93,gc94,gc95,gc96,gc97,gc98,gc99 cuffmerge_out/merged.gtf mappingout93/accepted_hits.bam mappingout94/accepted_hits.bam mappingout95/accepted_hits.bam mappingout96/accepted_hits.bam mappingout97/accepted_hits.bam mappingout98/accepted_hits.bam mappingout99/accepted_hits.bam
文章名:
Transcriptome data analysis of grass carp (Ctenopharyngodon idella) infected by reovirus provides insights into two immune-related genes