#以拟南芥为例
library(RSQLite)
library(AnnotationForge)
options(stringsAsFactors=F)
setwd("C:/Users/daizao/Desktop/do_R_annotation/TAIR_DATA_20190711/TAIR_DATA_20190711")
#修改错误行,保存为ATH_GO_GOSLIM_new_dz.txt
test <- read.table("ATH_GO_GOSLIM_new_dz.txt",sep="\t",header=F,check.names=F,quote="",stringsAsFactors=F)
nrow(test)
go_df <- test[,c(1,6,8,10)]
go_df[,3] <- ifelse(go_df[,3]=="C","CC",go_df[,3])
go_df[,3] <- ifelse(go_df[,3]=="P","BP",go_df[,3])
go_df[,3] <- ifelse(go_df[,3]=="F","MF",go_df[,3])
go_df <- go_df[grepl("^AT\\d+",go_df[,1]),]
colnames(go_df) <- c("GID","GO","ONTOLOGY","EVIDENCE")
pub_df <- read.table("Locus_Published_20170630.txt",sep="\t",header=F,quote="",stringsAsFactors=F)
colnames(pub_df) <- c("name","reference_id","pubmed_id","publication_year")
pub_df[,3] <- ifelse((is.na(pub_df[,3]) | is.null(pub_df[,3]) | pub_df[,3] == "NULL"),"",pub_df[,3])
pub_df <- pub_df[grepl("^AT\\d+",pub_df$name),]
#循环好慢
# b <- data.frame()
# for (i in (1:nrow(pub_df))){
# if((length(unlist(strsplit(pub_df$name, split = "\\.")[i])) == 2)){(b <- rbind(b,cbind(unlist(strsplit(pub_df$name, split = "\\.")[i])[1],paste(unlist(strsplit(pub_df$name, split = "\\.")[i])[1],unlist(strsplit(pub_df$name, split = "\\.")[i])[2],sep="."))))}else{b<- rbind(b,(cbind(unlist(strsplit(pub_df$name, split = "\\.")[i])[1],"")))}
# }
# b <- cbind(b,pub_df[,2:4])
pub_df <- cbind(GID=do.call(rbind,strsplit(pub_df$name,"\\."))[,1],pub_df[,1:4])
colnames(pub_df) <- c("GID","GENEID","REFID","PMID","PUBYEAR")
symbol_df <- read.table("gene_aliases_20170630.txt",sep="\t",header=F,quote="",stringsAsFactors=F,check.names=F,comment.char="")
symbol_df <- symbol_df[grepl("^AT\\d+",symbol_df[,1]),]
colnames(symbol_df) <- c("GID","SYMBOL","SYMBOL_NAME")
func_df <- read.table("Araport11_functional_descriptions_20170630.txt",sep="\t",quote="",header=F,stringsAsFactors=F,check.names=F,comment.char="")
func_df <- func_df[grepl("^AT\\d+",func_df[,1]),]
func_df <- cbind(GID=do.call(rbind,strsplit(func_df[,1],"\\."))[,1],func_df[,1:5])
colnames(func_df) <- c("GID","TXID","GENE_MODEL_TYPE","SHORT_DESCRIPTION","CURATED_DESCRIPTION","DESCRIPTION")
func_df[,4] <- ifelse((is.na(func_df[,4]) | is.null(func_df[,4]) | func_df[,4] == "NULL"),"",func_df[,4])
func_df$DESCRIPTION <- gsub("\\(source\\:Araport11\\)","",func_df$DESCRIPTION)
func_df <- func_df[order(func_df$GID),]
go_df <- go_df[!duplicated(go_df),]
go_df <- go_df[,c(1,2,4)]
pub_df <- pub_df[!duplicated(pub_df),]
symbol_df <- symbol_df[!duplicated(symbol_df),]
func_df <- func_df[!duplicated(func_df),]
#因为需要GID列一致,所以需要选择都有的GID才行
go_and_pub <- intersect(go_df$GID,pub_df$GID)
go_pub_sumbol <- intersect(go_and_pub,symbol_df$GID)
all_jiaoji <- intersect(go_pub_sumbol,func_df$GID)
new_go_df <- data.frame()
for (i in all_jiaoji){new_go_df <- rbind(new_go_df,subset(go_df,go_df$GID==i))}
new_pub_df <- data.frame()
for (i in all_jiaoji){new_pub_df <- rbind(new_pub_df,subset(pub_df,pub_df$GID==i))}
new_symbol_df <- data.frame()
for (i in all_jiaoji){new_symbol_df <- rbind(new_symbol_df,subset(symbol_df,symbol_df$GID==i))}
new_func_df <- data.frame()
for (i in all_jiaoji){new_func_df <- rbind(new_func_df,subset(func_df,func_df$GID==i))}
file_path <- file.path(getwd())
makeOrgPackage(go = new_go_df,
pub_info = new_pub_df,
symbol_info = new_symbol_df,
function_info = new_func_df,
version = "0.1",
maintainer = "daizao <dzbioinformatics@qq.com>",
author="daizao <dzbioinformatics@qq.com>",
outputDir = file_path,
tax_id = "3702",
genus = "At",
species = "tair10",
goTable = "go"
)
#测试
install.packages("./org.Atair10.eg.db", repos = NULL,
type = "source")
library(org.Atair10.eg.db)
library(clusterProfiler)
library(org.Hs.eg.db)
library(hash)
org <- org.Atair10.eg.db
result <- enrichGO(gene=all_jiaoji[1:100],OrgDb=org,keyType="GID",minGSSize=1,pvalue=1)
bb <- bitr(unlist(strsplit(result$geneID,"\\/")),fromType="GID",toType="SYMBOL",OrgDb=org)
h=hash()
for (i in 1:nrow(bb)){.set(h,keys=bb[i,1],values=bb[i,2])}
dz <- as.data.frame(result)
DT::datatable(dz)
DT::datatable(bb)
其实更好的一种注释方式是:CellMarker enrichment analysis (CellMarker 富集分析)
参考连接:Bioconductor的注释包太旧怎么办?自己做呀
我在最后一步构建的时候一直报错
Error in .makeOrgPackage(data, version = version, maintainer = maintainer, :
The 1st column must always be the gene ID ‘GID’
第一列的数据可能有问题