构建自定义注释包(make annotation package)

#以拟南芥为例
library(RSQLite)
library(AnnotationForge)
options(stringsAsFactors=F)

setwd("C:/Users/daizao/Desktop/do_R_annotation/TAIR_DATA_20190711/TAIR_DATA_20190711")
#修改错误行,保存为ATH_GO_GOSLIM_new_dz.txt
test <- read.table("ATH_GO_GOSLIM_new_dz.txt",sep="\t",header=F,check.names=F,quote="",stringsAsFactors=F)
nrow(test)
go_df <- test[,c(1,6,8,10)]
go_df[,3] <- ifelse(go_df[,3]=="C","CC",go_df[,3])
go_df[,3] <- ifelse(go_df[,3]=="P","BP",go_df[,3])
go_df[,3] <- ifelse(go_df[,3]=="F","MF",go_df[,3])
go_df <- go_df[grepl("^AT\\d+",go_df[,1]),]
colnames(go_df) <- c("GID","GO","ONTOLOGY","EVIDENCE")
pub_df <- read.table("Locus_Published_20170630.txt",sep="\t",header=F,quote="",stringsAsFactors=F)
colnames(pub_df) <- c("name","reference_id","pubmed_id","publication_year")
pub_df[,3] <- ifelse((is.na(pub_df[,3]) | is.null(pub_df[,3]) | pub_df[,3] == "NULL"),"",pub_df[,3])
pub_df <- pub_df[grepl("^AT\\d+",pub_df$name),]

#循环好慢
# b <- data.frame()
# for (i in (1:nrow(pub_df))){
    # if((length(unlist(strsplit(pub_df$name, split = "\\.")[i])) == 2)){(b <- rbind(b,cbind(unlist(strsplit(pub_df$name, split = "\\.")[i])[1],paste(unlist(strsplit(pub_df$name, split = "\\.")[i])[1],unlist(strsplit(pub_df$name, split = "\\.")[i])[2],sep="."))))}else{b<- rbind(b,(cbind(unlist(strsplit(pub_df$name, split = "\\.")[i])[1],"")))}
# }
# b <- cbind(b,pub_df[,2:4])

pub_df <- cbind(GID=do.call(rbind,strsplit(pub_df$name,"\\."))[,1],pub_df[,1:4])
colnames(pub_df) <- c("GID","GENEID","REFID","PMID","PUBYEAR")

symbol_df <- read.table("gene_aliases_20170630.txt",sep="\t",header=F,quote="",stringsAsFactors=F,check.names=F,comment.char="")
symbol_df <- symbol_df[grepl("^AT\\d+",symbol_df[,1]),]
colnames(symbol_df) <- c("GID","SYMBOL","SYMBOL_NAME")

func_df <- read.table("Araport11_functional_descriptions_20170630.txt",sep="\t",quote="",header=F,stringsAsFactors=F,check.names=F,comment.char="")
func_df <- func_df[grepl("^AT\\d+",func_df[,1]),]
func_df <- cbind(GID=do.call(rbind,strsplit(func_df[,1],"\\."))[,1],func_df[,1:5])
colnames(func_df) <- c("GID","TXID","GENE_MODEL_TYPE","SHORT_DESCRIPTION","CURATED_DESCRIPTION","DESCRIPTION")
func_df[,4] <- ifelse((is.na(func_df[,4]) | is.null(func_df[,4]) | func_df[,4] == "NULL"),"",func_df[,4])
func_df$DESCRIPTION <- gsub("\\(source\\:Araport11\\)","",func_df$DESCRIPTION)
func_df <- func_df[order(func_df$GID),]


go_df <- go_df[!duplicated(go_df),]
go_df <- go_df[,c(1,2,4)]
pub_df <- pub_df[!duplicated(pub_df),]
symbol_df <- symbol_df[!duplicated(symbol_df),]
func_df <- func_df[!duplicated(func_df),]

#因为需要GID列一致,所以需要选择都有的GID才行

go_and_pub <- intersect(go_df$GID,pub_df$GID)
go_pub_sumbol <- intersect(go_and_pub,symbol_df$GID)
all_jiaoji <- intersect(go_pub_sumbol,func_df$GID)

new_go_df <- data.frame()
for (i in all_jiaoji){new_go_df <- rbind(new_go_df,subset(go_df,go_df$GID==i))}


new_pub_df <- data.frame()
for (i in all_jiaoji){new_pub_df <- rbind(new_pub_df,subset(pub_df,pub_df$GID==i))}


new_symbol_df <- data.frame()
for (i in all_jiaoji){new_symbol_df <- rbind(new_symbol_df,subset(symbol_df,symbol_df$GID==i))}

new_func_df <- data.frame()
for (i in all_jiaoji){new_func_df <- rbind(new_func_df,subset(func_df,func_df$GID==i))}


file_path <- file.path(getwd())
makeOrgPackage(go = new_go_df,
               pub_info = new_pub_df,
               symbol_info = new_symbol_df,
               function_info = new_func_df,
               version = "0.1",
               maintainer = "daizao <dzbioinformatics@qq.com>",
               author="daizao <dzbioinformatics@qq.com>",
               outputDir = file_path,
               tax_id = "3702",
               genus = "At",
               species = "tair10",
               goTable = "go"
)


#测试
install.packages("./org.Atair10.eg.db", repos = NULL,
                 type = "source")
library(org.Atair10.eg.db)
library(clusterProfiler)
library(org.Hs.eg.db)
library(hash)
org <- org.Atair10.eg.db
result <- enrichGO(gene=all_jiaoji[1:100],OrgDb=org,keyType="GID",minGSSize=1,pvalue=1)
bb <- bitr(unlist(strsplit(result$geneID,"\\/")),fromType="GID",toType="SYMBOL",OrgDb=org)
h=hash()
for (i in 1:nrow(bb)){.set(h,keys=bb[i,1],values=bb[i,2])}
dz <- as.data.frame(result)
DT::datatable(dz)
DT::datatable(bb)

其实更好的一种注释方式是:CellMarker enrichment analysis (CellMarker 富集分析)
参考连接:Bioconductor的注释包太旧怎么办?自己做呀

此条目发表在R分类目录。将固定链接加入收藏夹。

发表评论

电子邮件地址不会被公开。 必填项已用*标注

To create code blocks or other preformatted text, indent by four spaces:

    This will be displayed in a monospaced font. The first four 
    spaces will be stripped off, but all other whitespace
    will be preserved.
    
    Markdown is turned off in code blocks:
     [This is not a link](http://example.com)

To create not a block, but an inline code span, use backticks:

Here is some inline `code`.

For more help see http://daringfireball.net/projects/markdown/syntax

Protected with IP Blacklist CloudIP Blacklist Cloud