构建自定义注释包(make annotation package)

发表于2019年9月21日由daizao

#以拟南芥为例
library(RSQLite)
library(AnnotationForge)
options(stringsAsFactors=F)

setwd("C:/Users/daizao/Desktop/do_R_annotation/TAIR_DATA_20190711/TAIR_DATA_20190711")
#修改错误行，保存为ATH_GO_GOSLIM_new_dz.txt
test <- read.table("ATH_GO_GOSLIM_new_dz.txt",sep="\t",header=F,check.names=F,quote="",stringsAsFactors=F)
nrow(test)
go_df <- test[,c(1,6,8,10)]
go_df[,3] <- ifelse(go_df[,3]=="C","CC",go_df[,3])
go_df[,3] <- ifelse(go_df[,3]=="P","BP",go_df[,3])
go_df[,3] <- ifelse(go_df[,3]=="F","MF",go_df[,3])
go_df <- go_df[grepl("^AT\\d+",go_df[,1]),]
colnames(go_df) <- c("GID","GO","ONTOLOGY","EVIDENCE")
pub_df <- read.table("Locus_Published_20170630.txt",sep="\t",header=F,quote="",stringsAsFactors=F)
colnames(pub_df) <- c("name","reference_id","pubmed_id","publication_year")
pub_df[,3] <- ifelse((is.na(pub_df[,3]) | is.null(pub_df[,3]) | pub_df[,3] == "NULL"),"",pub_df[,3])
pub_df <- pub_df[grepl("^AT\\d+",pub_df$name),]

#循环好慢
# b <- data.frame()
# for (i in (1:nrow(pub_df))){
    # if((length(unlist(strsplit(pub_df$name, split = "\\.")[i])) == 2)){(b <- rbind(b,cbind(unlist(strsplit(pub_df$name, split = "\\.")[i])[1],paste(unlist(strsplit(pub_df$name, split = "\\.")[i])[1],unlist(strsplit(pub_df$name, split = "\\.")[i])[2],sep="."))))}else{b<- rbind(b,(cbind(unlist(strsplit(pub_df$name, split = "\\.")[i])[1],"")))}
# }
# b <- cbind(b,pub_df[,2:4])

pub_df <- cbind(GID=do.call(rbind,strsplit(pub_df$name,"\\."))[,1],pub_df[,1:4])
colnames(pub_df) <- c("GID","GENEID","REFID","PMID","PUBYEAR")

symbol_df <- read.table("gene_aliases_20170630.txt",sep="\t",header=F,quote="",stringsAsFactors=F,check.names=F,comment.char="")
symbol_df <- symbol_df[grepl("^AT\\d+",symbol_df[,1]),]
colnames(symbol_df) <- c("GID","SYMBOL","SYMBOL_NAME")

func_df <- read.table("Araport11_functional_descriptions_20170630.txt",sep="\t",quote="",header=F,stringsAsFactors=F,check.names=F,comment.char="")
func_df <- func_df[grepl("^AT\\d+",func_df[,1]),]
func_df <- cbind(GID=do.call(rbind,strsplit(func_df[,1],"\\."))[,1],func_df[,1:5])
colnames(func_df) <- c("GID","TXID","GENE_MODEL_TYPE","SHORT_DESCRIPTION","CURATED_DESCRIPTION","DESCRIPTION")
func_df[,4] <- ifelse((is.na(func_df[,4]) | is.null(func_df[,4]) | func_df[,4] == "NULL"),"",func_df[,4])
func_df$DESCRIPTION <- gsub("\\(source\\:Araport11\\)","",func_df$DESCRIPTION)
func_df <- func_df[order(func_df$GID),]


go_df <- go_df[!duplicated(go_df),]
go_df <- go_df[,c(1,2,4)]
pub_df <- pub_df[!duplicated(pub_df),]
symbol_df <- symbol_df[!duplicated(symbol_df),]
func_df <- func_df[!duplicated(func_df),]

#因为需要GID列一致，所以需要选择都有的GID才行

go_and_pub <- intersect(go_df$GID,pub_df$GID)
go_pub_sumbol <- intersect(go_and_pub,symbol_df$GID)
all_jiaoji <- intersect(go_pub_sumbol,func_df$GID)

new_go_df <- data.frame()
for (i in all_jiaoji){new_go_df <- rbind(new_go_df,subset(go_df,go_df$GID==i))}


new_pub_df <- data.frame()
for (i in all_jiaoji){new_pub_df <- rbind(new_pub_df,subset(pub_df,pub_df$GID==i))}


new_symbol_df <- data.frame()
for (i in all_jiaoji){new_symbol_df <- rbind(new_symbol_df,subset(symbol_df,symbol_df$GID==i))}

new_func_df <- data.frame()
for (i in all_jiaoji){new_func_df <- rbind(new_func_df,subset(func_df,func_df$GID==i))}


file_path <- file.path(getwd())
makeOrgPackage(go = new_go_df,
               pub_info = new_pub_df,
               symbol_info = new_symbol_df,
               function_info = new_func_df,
               version = "0.1",
               maintainer = "daizao <dzbioinformatics@qq.com>",
               author="daizao <dzbioinformatics@qq.com>",
               outputDir = file_path,
               tax_id = "3702",
               genus = "At",
               species = "tair10",
               goTable = "go"
)


#测试
install.packages("./org.Atair10.eg.db", repos = NULL,
                 type = "source")
library(org.Atair10.eg.db)
library(clusterProfiler)
library(org.Hs.eg.db)
library(hash)
org <- org.Atair10.eg.db
result <- enrichGO(gene=all_jiaoji[1:100],OrgDb=org,keyType="GID",minGSSize=1,pvalue=1)
bb <- bitr(unlist(strsplit(result$geneID,"\\/")),fromType="GID",toType="SYMBOL",OrgDb=org)
h=hash()
for (i in 1:nrow(bb)){.set(h,keys=bb[i,1],values=bb[i,2])}
dz <- as.data.frame(result)
DT::datatable(dz)
DT::datatable(bb)

其实更好的一种注释方式是：CellMarker enrichment analysis (CellMarker 富集分析)
参考连接：Bioconductor的注释包太旧怎么办？自己做呀

此条目发表在R分类目录。将固定链接加入收藏夹。

《构建自定义注释包(make annotation package)》有2条回应

柯莱特说：

2022年12月8日下午4:13

我在最后一步构建的时候一直报错
Error in .makeOrgPackage(data, version = version, maintainer = maintainer, :
The 1st column must always be the gene ID ‘GID’

回复
- daizao说：
  
  2022年12月17日下午4:47
  
  第一列的数据可能有问题
  
  回复

发表评论取消回复

搜索：
链接表
- 陈连福的生信博客
- dzwgylab
近期文章
近期评论
- daizao发表在《TCGA与dbGaP账号申请（dbGaP申请指南）》
- zzz发表在《TCGA与dbGaP账号申请（dbGaP申请指南）》
- daizao发表在《TCGA与dbGaP账号申请（dbGaP申请指南）》
- liyuqing发表在《TCGA与dbGaP账号申请（dbGaP申请指南）》
- daizao发表在《TCGA与dbGaP账号申请（dbGaP申请指南）》
分类目录
- 3D_genome
- C
- C++
- Computer
- Container
- Endnote
- Linux
- Molecular Dynamics
- NGS_analysis
- Perl
- Python
- Quantum Chemical Calculation
- R
- TCGA
- 实验操作
- 日常记录

分类目录
- 3D_genome
- C
- C++
- Computer
- Container
- Endnote
- Linux
- Molecular Dynamics
- NGS_analysis
- Perl
- Python
- Quantum Chemical Calculation
- R
- TCGA
- 实验操作
- 日常记录