安装RSelenium/Rwebdriver/rdom包
# Install the three scraping packages used in this walkthrough.
# RSelenium is installed directly from CRAN.
install.packages("RSelenium")
# devtools supplies install_github() for the two GitHub-hosted packages.
install.packages("devtools")
library(devtools)
# Install Rwebdriver from GitHub. The "owner/repo" form is used because the
# separate `username` argument of install_github() is deprecated.
install_github("crubba/Rwebdriver")
# Install rdom from GitHub.
install_github("cpsievert/rdom")
启动Selenium服务器
用RSelenium、Rwebdriver包抓取表格数据时,需在控制台输入
java -jar D:\R\library\Rwebdriver\selenium-server-standalone-3.7.1.jar
以启动Selenium服务器,配合phantomjs或其他浏览器,有建立-断开连接、打开-结束会话的过程。而用rdom包时,其通过调用phantomjs浏览器,模拟真实浏览器渲染和解析DOM,因此只需要下载phantomjs无头浏览器,并将其存放路径添加到系统环境变量(PATH)。
案例一:北京天气数据
- 页面准备
# dplyr provides the %>% pipe used by the later steps; xml2 provides
# url_escape() for percent-encoding the URL.
library(dplyr)
library(xml2)
# Percent-encode the query URL. `reserved` lists the characters that must be
# left untouched, so every URL punctuation mark is kept and only the city
# name "北京" is escaped.
# An online encoder (http://tool.oschina.net/encode?type=4) can be used
# instead, pasting the already-encoded URL here.
weather.url <- url_escape(
  "https://www.aqistudy.cn/historydata/monthdata.php?city=北京",
  reserved = "][!$&'()*+,;=:/?@#"
)
- RSelenium包
library(RSelenium)
# Create a remoteDriver object for the headless phantomjs browser and open it.
remDr <- remoteDriver(browserName = "phantomjs")
remDr$open()
# Visit the target page.
remDr$navigate(weather.url)
library(XML)
# Method 1 (XML package): get the page source, parse it as UTF-8 HTML, and
# read the first table with no header row.
table <- remDr$getPageSource()[[1]] %>%
  htmlParse(encoding = "UTF-8") %>%
  readHTMLTable(header = FALSE, which = 1)
library(rvest)
# Method 2 (rvest package): the same get-parse-extract sequence; this
# assignment replaces the result of method 1.
table <- remDr$getPageSource()[[1]] %>%
  read_html(encoding = "UTF-8") %>%
  html_table(header = FALSE) %>%
  .[[1]]
# Inspect the data, then export it locally as comma-separated text without
# row or column names.
View(table)
write.table(
  table,
  file = "weather.csv",
  sep = ",",
  row.names = FALSE,
  col.names = FALSE
)
# Close the remoteDriver object.
remDr$close()
- Rwebdriver包
library(Rwebdriver)
# Start a new session against the hub URL below, using phantomjs.
start_session(root = "http://localhost:4444/wd/hub/", browser = "phantomjs")
# Visit the target page.
post.url(weather.url)
library(XML)
# Method 1 (XML package): get the page source, parse it as UTF-8 HTML, and
# read the first table with no header row.
table <- page_source() %>%
  htmlParse(encoding = "UTF-8") %>%
  readHTMLTable(header = FALSE, which = 1)
library(stringi)
library(rvest)
# Method 2 (rvest package): stri_conv() from stringi converts the page source
# from UTF-8 before it is parsed; this assignment replaces method 1's result.
table <- page_source() %>%
  stri_conv(from = "UTF-8") %>%
  read_html() %>%
  html_table() %>%
  .[[1]]
# Inspect the data, then export it locally as comma-separated text.
View(table)
write.table(table, file = "weather.csv", sep = ",", row.names = FALSE)
# End the session.
quit_session()
- rdom包
library(rdom)
library(XML)
# rdom(url) yields the same kind of object as htmlParse(url) (class
# "HTMLInternalDocument" "HTMLInternalDocument" "XMLInternalDocument"
# "XMLAbstractDocument"), so readHTMLTable() can extract the table from it.
# `which = 1` selects the first table as a single data frame; without it
# readHTMLTable() returns a list of tables, which is not what write.table()
# below expects.
table <- rdom(weather.url) %>%
  readHTMLTable(header = FALSE, which = 1)
# Inspect the data, then export it locally as comma-separated text without
# row or column names.
View(table)
write.table(
  table,
  file = "weather.csv",
  sep = ",",
  row.names = FALSE,
  col.names = FALSE
)
- 查看数据
案例二:lol选手排名表
- 页面准备
lol.url <- "http://www.wanplus.com/lol/playerstats"
# The URL contains only ASCII characters, so no percent-encoding is needed.
- RSelenium
library(RSelenium)
# Create a remoteDriver object for the headless phantomjs browser and open it.
remDr <- remoteDriver(browserName = "phantomjs")
remDr$open()
# Visit the target page.
remDr$navigate(lol.url)
library(XML)
library(dplyr)
# Method 1 (XML package): get the page source, parse it as UTF-8 HTML, and
# read the first table with no header row.
table <- remDr$getPageSource()[[1]] %>%
  htmlParse(encoding = "UTF-8") %>%
  readHTMLTable(header = FALSE, which = 1)
library(rvest)
# Method 2 (rvest package): the same get-parse-extract sequence, treating the
# first row as the header; this assignment replaces method 1's result.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
table <- remDr$getPageSource()[[1]] %>%
  read_html(encoding = "UTF-8") %>%
  html_table(header = TRUE) %>%
  .[[1]]
# Assign the column names; names(table) <- c(...) would work equally well.
colnames(table) <- c(
  "名次", "选手", "战队", "位置", "出场次数", "KDA",
  "参团率", "场均击杀", "单场最高击杀", "场均死亡", "单场最高死亡", "场均助攻"
)
# Inspect the data, then export it locally as comma-separated text.
View(table)
write.table(table, file = "lolplayer.csv", sep = ",", row.names = FALSE)
# Close the remoteDriver object.
remDr$close()
- Rwebdriver
library(Rwebdriver)
# Start a new session against the hub URL below, using phantomjs.
start_session(root = "http://localhost:4444/wd/hub/", browser = "phantomjs")
# Visit the target page.
post.url(lol.url)
library(XML)
library(dplyr)
# Method 1 (XML package): get the page source, parse it as UTF-8 HTML, and
# read the first table with no header row.
table <- page_source() %>%
  htmlParse(encoding = "UTF-8") %>%
  readHTMLTable(header = FALSE, which = 1)
library(stringi)
library(rvest)
# Method 2 (rvest package): stri_conv() from stringi converts the page source
# from UTF-8 before it is parsed; this assignment replaces method 1's result.
table <- page_source() %>%
  stri_conv(from = "UTF-8") %>%
  read_html() %>%
  html_table() %>%
  .[[1]]
# Assign the column names; names(table) <- c(...) would work equally well.
colnames(table) <- c(
  "名次", "选手", "战队", "位置", "出场次数", "KDA",
  "参团率", "场均击杀", "单场最高击杀", "场均死亡", "单场最高死亡", "场均助攻"
)
# Inspect the data, then export it locally as comma-separated text.
View(table)
write.table(table, file = "lolplayer.csv", sep = ",", row.names = FALSE)
# End the session.
quit_session()
- rdom包
library(rdom)
library(XML)
# rdom(url) yields the same kind of object as htmlParse(url) (class
# "HTMLInternalDocument" "HTMLInternalDocument" "XMLInternalDocument"
# "XMLAbstractDocument"), so readHTMLTable() can extract the first table
# from it directly.
table <- rdom(lol.url) %>%
  readHTMLTable(header = FALSE, which = 1)
# Assign the column names; names(table) <- c(...) would work equally well.
colnames(table) <- c(
  "名次", "选手", "战队", "位置", "出场次数", "KDA",
  "参团率", "场均击杀", "单场最高击杀", "场均死亡", "单场最高死亡", "场均助攻"
)
# Inspect the data, then export it locally as comma-separated text.
View(table)
write.table(table, file = "lolplayer.csv", sep = ",", row.names = FALSE)
- 查看数据