library(pdftools)
# Start from an empty global environment.
# NOTE(review): this deletes every object in the workspace of whoever sources
# the script; running it in a fresh R session would make this unnecessary.
rm(list = ls())
# Keep strings as character vectors when data frames are built.
# `FALSE` is spelled out because `F` is an ordinary, rebindable variable.
options(stringsAsFactors = FALSE)
# NOTE(review): machine-specific absolute path; both the input PDF and the
# output file written later in the script are resolved relative to it.
setwd("C:/Users/daizao/Desktop/pdf_table_read")
# Extract the text of the PDF with pdftools; the result is handed to
# pdf2table() further down.
dz <- pdf_text("mmc3.pdf")
#' Parse PDF text into a three-column table
#'
#' Splits the text into lines, drops the very first line, and reduces every
#' remaining line to the tokens that start with an ASCII letter. The first
#' such token becomes column 1, the last becomes column 3, and everything in
#' between is joined with single spaces into column 2.
#'
#' @param x Character vector of text (the script passes the result of
#'   `pdf_text()`). All elements are concatenated into one stream of lines,
#'   so only the first line of the first element is skipped.
#' @return A data frame with character columns `V1`, `temp1` and `V3`, one row
#'   per line that contains at least one letter-initial token. Lines with
#'   fewer than three such tokens get an empty string in `temp1`; lines with
#'   none (blank or purely numeric lines) are skipped. A zero-row data frame
#'   is returned when nothing remains after the first line.
pdf2table <- function(x) {
  # Accept both Windows ("\r\n") and Unix ("\n") line endings.
  lines <- unlist(strsplit(x, "\\r?\\n"))

  # Rewrite any segment of the form: double quote, whitespace, five
  # whitespace-separated fields, double quote -- into tab-delimited text.
  # Lines without such a quoted segment pass through unchanged.
  lines <- gsub(
    "\"\\s+(.*?)\\s+(.*?)\\s+(.*?)\\s+(.*?)\\s+(.*?)\"",
    "\\1\t\\2\\3\\4\t\\5",
    lines,
    perl = TRUE
  )

  # The first line is not part of the table. Negative indexing is safe for
  # zero- and one-line input, unlike `2:n`, which counts backwards.
  body <- lines[-1L]

  # Keep only the tokens that begin with a letter; this also discards the
  # empty strings produced by runs of consecutive spaces.
  tokens <- lapply(
    strsplit(body, " "),
    function(tok) tok[grepl("^[a-zA-Z]", tok)]
  )
  # A line without any usable token carries no row.
  tokens <- tokens[lengths(tokens) > 0L]

  first <- vapply(tokens, function(tok) tok[1L], character(1))
  last <- vapply(tokens, function(tok) tok[length(tok)], character(1))
  # Dropping the first and last positions yields character(0) for one- or
  # two-token lines, which paste() collapses to "".
  middle <- vapply(
    tokens,
    function(tok) paste(tok[-c(1L, length(tok))], collapse = " "),
    character(1)
  )

  # Columns are assembled in one step instead of rbind()-ing inside a loop.
  data.frame(V1 = first, temp1 = middle, V3 = last, stringsAsFactors = FALSE)
}
# Parse the extracted PDF text into a three-column data frame.
data <- pdf2table(dz)
# The first parsed row holds the column headings: promote it to the column
# names (unlisted explicitly so the names are a plain vector rather than a
# one-row data frame), then drop it from the data.
colnames(data) <- unlist(data[1, ], use.names = FALSE)
data <- data[-1, ]
# Write a tab-separated file without row names or quoting.
write.table(
  data,
  file = "gene.txt",
  sep = "\t",
  row.names = FALSE,
  quote = FALSE
)