# Python version
import csv
import time
import codecs
import random
import requests
from bs4 import BeautifulSoup
# HTTP headers sent with every request; the User-Agent is a desktop Chrome string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
    ),
}

# Output CSV, opened for writing with the 'utf-8_sig' codec.
f = codecs.open('D://Spyder/WD/xiaozhu.csv', 'w', 'utf-8_sig')
writer = csv.writer(f)
# Header row; get_info() writes its rows in this same column order.
writer.writerow(['title', 'address', 'price', 'img', 'name', 'gender'])
def judgement_gender(class_name):
    """Map the host icon's CSS class to a gender label.

    Args:
        class_name: The ``class`` attribute of the icon element. The caller
            passes the result of ``Tag.get('class')``, which is normally a
            list of class names; ``None`` (attribute absent) and a plain
            space-separated string are also accepted.

    Returns:
        '女' when 'member_ico1' is one of the classes, otherwise '男'.
    """
    if not class_name:
        return '男'
    if isinstance(class_name, str):
        # Split so the check below is a whole-name match, not a substring match.
        class_name = class_name.split()
    # Membership instead of exact list equality, so an element carrying an
    # additional class next to 'member_ico1' is still recognised.
    return '女' if 'member_ico1' in class_name else '男'
def get_links(url):
    """Fetch one search-result page and scrape every listing it links to.

    Args:
        url: URL of a search-result page.

    Every anchor matched by '#page_list ul li > a' is followed by passing
    its href to get_info().
    """
    # Without a timeout a single unresponsive connection blocks indefinitely.
    destination = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(destination.text, 'lxml')
    for link in soup.select('#page_list ul li > a'):
        href = link.get('href')
        # An anchor without an href has nothing to fetch; skip it.
        if href:
            get_info(href)
def _first_or_none(soup, selector):
    """Return the first element matching ``selector``, or None if none match."""
    matches = soup.select(selector)
    return matches[0] if matches else None


def get_info(url):
    """Scrape one listing detail page and write it to the CSV as one row.

    Args:
        url: URL of a listing detail page.

    The row written is (title, address, price, img, name, gender). A field
    whose element is not found on the page is written as 'NA', so one
    unexpected page layout does not abort the run with an IndexError.
    """
    destination = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(destination.text, 'lxml')

    def text_of(selector):
        node = _first_or_none(soup, selector)
        return node.text if node is not None else 'NA'

    title = text_of('div.pho_info h4 em')
    address = text_of('span.pr5')
    price = text_of('div.day_l span')
    name = text_of('div.w_240 h6 a')

    img_node = _first_or_none(soup, 'div.member_pic a img')
    img = img_node.get('src') if img_node is not None else 'NA'

    icon_node = _first_or_none(soup, 'div.member_pic div')
    if icon_node is not None:
        gender = judgement_gender(icon_node.get('class'))
    else:
        # No icon element at all: record the gender as unknown.
        gender = 'NA'

    writer.writerow((title, address, price, img, name, gender))
if __name__ == '__main__':
    # Search-result pages 1 through 13.
    urls = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(number)
            for number in range(1, 14)]
    try:
        for single_url in urls:
            get_links(single_url)
            # Pause a random 1-5 seconds before requesting the next page.
            time.sleep(random.randint(1, 5))
    finally:
        # Close the CSV even when a request or parse step raises, so the
        # rows written so far are flushed to disk.
        f.close()
# R version
# 加载包
library(rvest)
library(stringr)
# Define judgeFunc: decide the host's gender from the icon's CSS class.
#
# class_name: character vector of class attribute values (e.g. the result of
#   html_attr('class')); only the first element is inspected.
# Returns '女' when the first class is 'member_ico1', '男' for any other
# class, and NA when there is no class to inspect (empty input or NA).
judgeFunc <- function(class_name) {
  # if () needs a single non-missing TRUE/FALSE: an empty vector or NA
  # raises an error, and so does a longer vector since R 4.2. Guard the
  # missing cases and reduce the input to its first element.
  if (length(class_name) == 0 || is.na(class_name[1])) {
    return(NA_character_)
  }
  if (class_name[1] == 'member_ico1') {
    return('女')
  } else {
    return('男')
  }
}
# Define GetlinkFunc: collect the detail-page links from search-result pages.
#
# url: character vector of search-result page URLs.
# Returns a list with one element per page, each a character vector of the
# href values found on that page; unlist() it for a flat vector of links.
GetlinkFunc <- function(url) {
  # Pages may carry different numbers of links, so each page is stored as
  # its own list element. rbind()-ing the pages into one matrix would
  # recycle or truncate any page whose link count differs from the first.
  result <- vector('list', length(url))
  for (i in seq_along(url)) {
    destination <- read_html(url[i], encoding = 'UTF-8')
    data <- destination %>%
      html_nodes('#page_list ul li > a') %>%
      html_attr('href')
    # html_attr() yields NA for an anchor without href; drop those entries.
    result[[i]] <- data[!is.na(data)]
    cat(sprintf('第【%d】页抓取成功', i), sep = '\n')
    Sys.sleep(runif(1, 1, 5))
  }
  return(result)
}
# Define GetinfoFunc: scrape the listing detail pages.
#
# url: character vector of listing detail-page URLs.
# Returns a data frame with one row per URL and the columns title, address,
# price, img (host photo URL), name (host nickname) and gender. A field
# whose element is not found on the page is NA. An empty `url` gives an
# empty data frame.
GetinfoFunc <- function(url) {
  # First match of a scraped field, or NA when the page has no such element.
  # This keeps every field at length one, so data.frame() below always
  # builds exactly one row.
  first_or_na <- function(x) {
    if (length(x) == 0) NA_character_ else x[[1]]
  }

  rows <- vector('list', length(url))
  for (i in seq_along(url)) {
    destination <- read_html(url[i], encoding = 'UTF-8')
    title <- destination %>%
      html_nodes('div.pho_info h4 em') %>%
      html_text() %>%
      first_or_na()
    address <- destination %>%
      html_nodes('span.pr5') %>%
      html_text() %>%
      str_trim() %>%
      first_or_na()
    price <- destination %>%
      html_nodes('div.day_l span') %>%
      html_text() %>%
      first_or_na()
    img <- destination %>%
      html_nodes('div.member_pic a img') %>%
      html_attr('src') %>%
      first_or_na()
    name <- destination %>%
      html_nodes('div.w_240 h6 a') %>%
      html_text() %>%
      first_or_na()
    icon_class <- destination %>%
      html_nodes('div.member_pic div') %>%
      html_attr('class') %>%
      first_or_na()
    # judgeFunc() is only handed a single, non-missing class value.
    gender <- if (is.na(icon_class)) NA_character_ else judgeFunc(icon_class)

    rows[[i]] <- data.frame(title, address, price, img, name, gender)
    cat(sprintf('第【%d】条房屋链接抓取成功', i), sep = '\n')
  }

  if (length(rows) == 0) {
    return(data.frame())
  }
  # Bind once at the end instead of growing the data frame on every pass.
  result <- do.call(rbind, rows)
  return(result)
}
# Run the scrape: build the 13 search-page URLs, gather the listing links,
# then fetch the details of every listing.
base <- 'http://bj.xiaozhu.com/search-duanzufang-p'
url <- sprintf('%s%d-0/', base, 1:13)
link <- unlist(GetlinkFunc(url))
xiaozhu <- GetinfoFunc(link)
# Export the results to a CSV file
write.table(xiaozhu, file = 'xiaozhu.csv', sep = ',', row.names = FALSE)