# 下载 package
# Install abbyyR from CRAN only when it is not already available, so that
# re-running the script does not reinstall the package every time.
if (!requireNamespace("abbyyR", quietly = TRUE)) {
  install.packages("abbyyR")
}
## devtools::install_github('soodoku/abbyyR')
library(abbyyR)
# Set the project directory (note the trailing slash).
# The workspace is deliberately NOT cleared with rm(list = ls()) here: doing so
# immediately after this assignment would delete project.dir, and the steps
# further down the script that read project.dir would then fail.
project.dir <- "E:/Temp/"
# 抓取图片
library(rvest)
library(dplyr)
# Page whose embedded images are scraped.
url <- "http://www.c-gec.cn/a/zuixinhuodong/2017/0803/2531.html"
# Collect the `src` attribute of every <img> located under
# div#entrybody/div on that page.
imageUrl <- read_html(url) %>%
  html_nodes(xpath = "//div[@id='entrybody']/div/img/@src") %>%
  html_text()
# Images are stored in <working directory>/raw; create the folder first so
# the downloads do not fail when it does not exist yet.
raw_dir <- file.path(getwd(), "raw")
dir.create(raw_dir, recursive = TRUE, showWarnings = FALSE)
for (item in imageUrl) {
  # The curl package is not attached at this point in the script, so the
  # function is called through its namespace.
  curl::curl_download(item, destfile = file.path(raw_dir, basename(item)))
}
# 建立 App
# 首先要在 http://ocrsdk.com/ 建立 App 应用，获得 App ID 和 password
# Register the OCR application credentials (application id, password) and
# print the application info.
# The credentials can be supplied through the ABBYY_APP_ID and
# ABBYY_APP_PASSWORD environment variables so that real secrets do not have to
# be committed with the script; the literal values below are only used as a
# fallback when those variables are not set.
app_id <- Sys.getenv("ABBYY_APP_ID", unset = "ROcrApp1")
app_password <- Sys.getenv("ABBYY_APP_PASSWORD", unset = "63WTSkZa8OZu2fQGqNh*****")
setapp(c(app_id, app_password))
getAppInfo()
# 清空 App 空间
# Delete every task returned by listTasks(), one task id at a time.
all_tasks <- listTasks()
# seq_len(NROW(...)) yields an empty sequence when there are no tasks (or the
# result is NULL), whereas 1:nrow(...) would iterate over c(1, 0) and call
# deleteTask() with an invalid id.
for (i in seq_len(NROW(all_tasks))) {
  deleteTask(as.character(all_tasks$id[i]))
}
# 监控提交任务
library(progress)
# Full paths of every file found (recursively) under the project directory.
# list.files(..., full.names = TRUE) returns character(0) for an empty
# directory; pasting project.dir onto an empty listing would instead produce
# the directory path itself and submit it as if it were an image.
filename <- list.files(project.dir, recursive = TRUE, full.names = TRUE)
pb <- progress_bar$new(
  format = " downloading [:bar] :percent\n",
  total = length(filename),
  clear = FALSE, width = 60
)
# One row per submitted file: the file's base name and the task id read from
# the `id` field of submitImage()'s result. Preallocated so the data frame is
# not grown row by row and contains no placeholder NA row.
tracker <- data.frame(
  filename = basename(filename),
  taskid = rep(NA_character_, length(filename)),
  stringsAsFactors = FALSE
)
# Loop
for (j in seq_along(filename)) {
  print(filename[j])
  tracker$taskid[j] <- as.character(abbyyR::submitImage(file_path = filename[j])$id)
  # Prg. bar
  pb$tick()
  Sys.sleep(1 / 100)
}
# 执行 OCR
# Start OCR processing for every tracked task: Simplified-Chinese recognition,
# "documentConversion" profile, result exported as xlsx.
for (i in seq_len(NROW(tracker))) {
  # Skip rows without a task id (e.g. a placeholder row that was never filled
  # in) instead of sending NA to the service.
  if (is.na(tracker$taskid[i])) {
    next
  }
  processDocument(
    tracker$taskid[i],
    language = "ChinesePRC",
    profile = "documentConversion",
    exportFormat = "xlsx"
  )
}
# 任务状态
# Poll the service every 2 seconds until every task id recorded in `tracker`
# shows up in the list of finished tasks.
# Matching on task ids (rather than comparing the total number of finished
# tasks with the number of files using `==`) keeps the loop from spinning
# forever when the account also holds finished tasks that were not submitted
# by this run, and avoids a zero-length `if` condition when the finished list
# is empty.
# NOTE(review): there is still no timeout -- a task that never reaches the
# finished list keeps this loop waiting.
submitted_ids <- tracker$taskid[!is.na(tracker$taskid)]
while (TRUE) {
  finished_ids <- as.character(listFinishedTasks()$id)
  if (all(submitted_ids %in% finished_ids)) {
    print("All Done!")
    break
  }
  Sys.sleep(2)
}
# 下载文件
library(curl)
# Keep only the finished tasks whose status is exactly "Completed".
finishedlist <- listFinishedTasks() %>%
  mutate(status = as.character(status)) %>%
  filter(status == "Completed")
# Attach the original file name to each completed task (tracker$taskid is
# matched against the finished list's id column).
results <- merge(tracker, finishedlist, by.x = "taskid", by.y = "id")
# The working directory is switched to the project directory, as before;
# results are written to its "res" sub-folder, which is created if missing.
setwd(project.dir)
res_dir <- file.path(getwd(), "res")
dir.create(res_dir, recursive = TRUE, showWarnings = FALSE)
# seq_len() makes the loop a no-op when nothing completed.
for (i in seq_len(nrow(results))) {
  print(i)
  # Strip the real file extension. The previous sub(".png", "", ...) was an
  # unanchored regex in which "." matches any character, so it could cut text
  # out of the middle of a name and left non-png extensions in place.
  out_name <- paste0(
    tools::file_path_sans_ext(as.character(results$filename[i])),
    ".xlsx"
  )
  curl_download(
    as.character(results$resultUrl[i]),
    destfile = file.path(res_dir, out_name)
  )
}