草鱼转录组数据分析部分流程

硕士时候做的分析,整理一些放到博客,有问题可以发邮件咨询我

#建库
 cd /home/train/gc_reovirus_data/gc_database
 bowtie2-build C_idella_female_scaffolds.fasta.v1 genome 
#质控
 cd gc_fastq
 fastqc -t 7 -o /home/train/gc_reovirus_data/gc_spleen_fastqc SRR1045893.fastq SRR1045894.fastq SRR1045895.fastq SRR1045896.fastq SRR1045897.fastq SRR1045898.fastq SRR1045899.fastq
 cd /home/train/gc_reovirus_data/gc_spleen_fastqc
#去除接头
 java -jar /opt/biosoft/Trimmomatic-0.33/trimmomatic-0.33.jar SE -threads 31 -phred33 SRR1045893.fastq SRR1045893.cut.fastq ILLUMINACLIP:/opt/biosoft/Trimmomatic-0.33/adapters/TruSeq3-SE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36
#去除N
 ./removeN SRR1045893.cut.fastq SRR1045893.unknowNu.fq
#去除低质量碱基(Q33表示你使用的是Illumina的编码方式而不是Sanger的编码方式)
 fastq_quality_filter -Q33 -q 20 -p 70 -i SRR1045893.unknowNu.fq -o SRR1045893.clean.fq
#重新质控
 fastqc SRR1045893.clean.fq -t 31 -o /home/train/gc_reovirus_data/gc_clean_fq SRR1045893.clean.fq SRR1045894.clean.fq SRR1045895.clean.fq SRR1045896.clean.fq SRR1045897.clean.fq SRR1045898.clean.fq SRR1045899.clean.fq 
#tophat比对(注意修改输出文件路径!)
 tophat -o mappingout93 -N 5 --read-edit-dist 5 -r 102 --mate-std-dev 20 -p 31 -a 5 -i 20 -I 4000 --min-segment-intron 20 --max-segment-intron 4000 --min-coverage-intron 20 --max-coverage-intron 4000 --coverage-search --microexon-search -G gc.final.gtf  --library-type fr-unstranded /home/train/gc_reovirus_data/gc_database/genome  SRR1045893.clean.fq
#mapping数据的分析 
 cd mappingout93
 samtools view -h accepted_hits.bam | awk '$1~/^@/||$5==50{print $0}' | samtools view -bhS - > SRRgc93.unique.bam
 samtools index SRRgc93.unique.bam
 geneBody_coverage.py -i SRRgc93.unique.bam -r /home/gcfinal/gc.final.bed -o 93
 junction_saturation.py -i SRRgc93.unique.bam -r /home/gcfinal/gc.final.bed  -o 93 
#cuffdiff计算(差异表达)
 cd /home/train/gc_reovirus_data/gc_database/
 cuffdiff -p 31 -b C_idella_female_scaffolds.fasta.v1 -L gc93,gc94,gc95,gc96,gc97,gc98,gc99 -o diff --library-type fr-unstranded -u gc.final.gtf mappingout93/accepted_hits.bam mappingout94/accepted_hits.bam mappingout95/accepted_hits.bam mappingout96/accepted_hits.bam mappingout97/accepted_hits.bam mappingout98/accepted_hits.bam mappingout99/accepted_hits.bam
 cd diff/
 parsing_cuffdiff_out.pl gene_exp.diff DEG
#表达量的计算
 #htseq-count计算
 cd mappingout93
 htseq-count --stranded=no --format=bam --order=pos --idattr=gene_id --mode=intersection-nonempty -qSRRgc93.unique.bam /home/gcfinal/gc.final.gtf > SRR93.htseq_count.xls    
#cufflinks
 cd /home/train/gc_reovirus_data/gc_database/
 cufflinks -o cufflinks93 -p 31 --library-type fr-unstranded -G /home/gcfinal/gc.final.gtf /home/train/gc_reovirus_data/gc_database/mappingout93/SRRgc93.unique.bam
cufflinks -p 31 -u -g gc.final.gtf -b C_idella_female_scaffolds.fasta.v1 -o cufflinks93_nweRNA mappingout93/accepted_hits.bam
#将多个新组装转录本merge
ls cufflinks9*_nweRNA/transcripts.gtf > assembly_GTF_list.txt
cuffmerge -g gc.final.gtf -s C_idella_female_scaffolds.fasta.v1 -p 31 -o cuffmerge_out assembly_GTF_list.txt 
#将merge的gtf与参考基因组的gtf文件进行比较
cuffcompare -s C_idella_female_scaffolds.fasta.v1 -r gc.final.gtf -R -o compareGTF cuffmerge_out/merged.gtf
grep "class_code \"u\"" compareGtf.combined.gtf >novel.gtf
gtf_to_fasta novel.gtf C_idella_female_scaffolds.fasta.v1 novel.fa
#按照序列长度进行排序,然后再用UE将大于200bp的序列筛选出来(一行61bp,选大于3行,并多出17个碱基的序列即可)
seqkit sort -l novel.fa > test
cuffdiff -o test_diff -p 31 -L gc93,gc94,gc95,gc96,gc97,gc98,gc99 --library-type fr-unstranded -u cuffmerge_out/merged.gtf mappingout93/accepted_hits.bam mappingout94/accepted_hits.bam mappingout95/accepted_hits.bam mappingout96/accepted_hits.bam mappingout97/accepted_hits.bam mappingout98/accepted_hits.bam mappingout99/accepted_hits.bam
cuffnorm -o cuffnorm/ -p 31 -L gc93,gc94,gc95,gc96,gc97,gc98,gc99 cuffmerge_out/merged.gtf mappingout93/accepted_hits.bam mappingout94/accepted_hits.bam mappingout95/accepted_hits.bam mappingout96/accepted_hits.bam mappingout97/accepted_hits.bam mappingout98/accepted_hits.bam mappingout99/accepted_hits.bam

文章名:
Transcriptome data analysis of grass carp (Ctenopharyngodon idella) infected by reovirus provides insights into two immune-related genes

此条目发表在NGS_analysis分类目录。将固定链接加入收藏夹。

发表评论

邮箱地址不会被公开。 必填项已用*标注

To create code blocks or other preformatted text, indent by four spaces:

    This will be displayed in a monospaced font. The first four 
    spaces will be stripped off, but all other whitespace
    will be preserved.
    
    Markdown is turned off in code blocks:
     [This is not a link](http://example.com)

To create not a block, but an inline code span, use backticks:

Here is some inline `code`.

For more help see http://daringfireball.net/projects/markdown/syntax

Protected with IP Blacklist CloudIP Blacklist Cloud