【CSS Selector】小猪网短租房（Python & R）

最新推荐文章于 2021-06-23 21:00:13 发布

原创最新推荐文章于 2021-06-23 21:00:13 发布 · 545 阅读

CC 4.0 BY-SA版权

Python

# 加载模块
import csv
import time
import codecs
import random
import requests
from bs4 import BeautifulSoup

# Request headers that present the scraper as a desktop Chrome browser.
# The value is built with implicit string concatenation: a backslash
# continuation inside the quotes would embed the next line's indentation
# in the User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/63.0.3239.84 Safari/537.36'
    )
}

# Create the CSV file at the fixed path below and write the header row;
# the scraping functions append one data row per listing through `writer`.
f = codecs.open('D://Spyder/WD/xiaozhu.csv', mode='w', encoding='utf-8_sig')
writer = csv.writer(f)
writer.writerow(['title', 'address', 'price', 'img', 'name', 'gender'])

# Decide the host's gender from the CSS classes of the avatar badge.
def judgement_gender(class_name):
    """Return '女' when the badge carries class ``member_ico1``, else '男'.

    Args:
        class_name: The value of the badge element's ``class`` attribute.
            May be a list of class names, a whitespace-separated string,
            or ``None`` when the attribute is absent.

    Returns:
        '女' if ``member_ico1`` is one of the classes, otherwise '男'.
    """
    if class_name is None:
        return '男'
    # Accept a raw attribute string as well as an already-split list.
    if isinstance(class_name, str):
        class_name = class_name.split()
    # Membership (not list equality) so extra classes on the element
    # do not flip the result.
    return '女' if 'member_ico1' in class_name else '男'
    
# Collect the detail-page links from one search-result page and scrape each.
def get_links(url):
    """Fetch a listing page and pass every detail-page link to ``get_info``.

    Args:
        url: Address of one search-result page.

    Raises:
        requests.exceptions.RequestException: if the request fails or
            does not complete within the timeout.
    """
    # A timeout keeps a stalled connection from hanging the whole crawl.
    destination = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(destination.text, 'lxml')
    for link in soup.select('#page_list ul li > a'):
        href = link.get('href')
        # An anchor without an href cannot be followed; skip it.
        if href:
            get_info(href)
        
# Scrape one detail page: title, address, price, host photo, host name, gender.
def get_info(url):
    """Fetch a listing's detail page and append one row to the CSV writer.

    Every field falls back to the string 'NA' when its element is not
    found on the page, so one incomplete page does not abort the crawl.

    Args:
        url: Address of the listing's detail page.

    Raises:
        requests.exceptions.RequestException: if the request fails or
            does not complete within the timeout.
    """
    missing = 'NA'

    # A timeout keeps a stalled connection from hanging the whole crawl.
    destination = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(destination.text, 'lxml')

    def first_node(selector):
        # First element matching the selector, or None when nothing matches.
        found = soup.select(selector)
        return found[0] if found else None

    def first_text(selector):
        # Text of the first matching element, or the 'NA' placeholder.
        node = first_node(selector)
        return missing if node is None else node.text

    title = first_text('div.pho_info h4 em')
    # Trim surrounding whitespace, matching what the R version of this
    # scraper does with str_trim().
    address = first_text('span.pr5').strip()
    price = first_text('div.day_l span')
    img_node = first_node('div.member_pic a img')
    img = missing if img_node is None else img_node.get('src')
    name = first_text('div.w_240 h6 a')
    badge = first_node('div.member_pic div')
    # Without a badge element the gender is unknown, not a default value.
    gender = missing if badge is None else judgement_gender(badge.get('class'))
    writer.writerow((title, address, price, img, name, gender))
    
# Script entry point: crawl search-result pages 1-13, pausing between pages.
# The try/finally closes the CSV file on every path, so buffered rows are
# flushed even when a request or parse step raises part-way through the crawl.
# As before, the file is also closed when the module is merely imported.
try:
    if __name__ == '__main__':
        urls = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(number)
                for number in range(1, 14)]
        for single_url in urls:
            get_links(single_url)
            # Random 1-5 second pause between listing pages.
            time.sleep(random.randint(1, 5))
finally:
    # Stop writing.
    f.close()

R

# 加载包
library(rvest)
library(stringr)

# Map the host's avatar-badge CSS class to a gender label.
#
# class_name: character vector of class attribute values; only the first
#   element is inspected.
# Returns '女' when the first class is 'member_ico1', '男' for any other
#   class, and NA_character_ when class_name is empty or NA.
judgeFunc <- function(class_name) {
  # `if` needs exactly one non-NA logical, so guard against zero-length
  # and NA input instead of letting the comparison raise an error.
  if (length(class_name) == 0 || is.na(class_name[1])) {
    return(NA_character_)
  }
  if (class_name[1] == 'member_ico1') '女' else '男'
}

# Collect the detail-page links from each search-result page.
#
# url: character vector of search-result page addresses.
# Returns a list with one character vector of hrefs per input page, in page
#   order; unlist() it to get a single vector of links.
GetlinkFunc <- function(url) {
  # Preallocate one slot per page. Accumulating with rbind() would recycle
  # or truncate the links whenever two pages yield different link counts.
  result <- vector('list', length(url))
  for (i in seq_along(url)) {
    destination <- read_html(url[i], encoding = 'UTF-8')
    hrefs <- destination %>%
      html_nodes('#page_list ul li > a') %>%
      html_attr('href')
    # html_attr() yields NA for anchors without an href; drop those.
    result[[i]] <- hrefs[!is.na(hrefs)]
    cat(sprintf('第【%d】页抓取成功', i), sep = '\n')
    # Random 1-5 second pause between pages.
    Sys.sleep(runif(1, 1, 5))
  }
  return(result)
}

# Scrape each detail page: title, address, price, host photo, host name, gender.
#
# url: character vector of detail-page addresses.
# Returns a data frame with one row per input address and the columns title,
#   address, price, img, name and gender. A field is NA when its element is
#   not found on the page. An empty input gives an empty data frame.
GetinfoFunc <- function(url) {
  # First matched value, or NA when the selector matched nothing. This keeps
  # every field at length one, so data.frame() neither fails on a missing
  # element nor emits extra rows when a selector matches several nodes.
  first_or_na <- function(x) {
    if (length(x) == 0) NA_character_ else x[[1]]
  }

  # Collect rows in a preallocated list and bind once after the loop.
  rows <- vector('list', length(url))
  for (i in seq_along(url)) {
    destination <- read_html(url[i], encoding = 'UTF-8')
    title <- destination %>%
      html_nodes('div.pho_info h4 em') %>%
      html_text() %>%
      first_or_na()
    address <- destination %>%
      html_nodes('span.pr5') %>%
      html_text() %>%
      str_trim() %>%
      first_or_na()
    price <- destination %>%
      html_nodes('div.day_l span') %>%
      html_text() %>%
      first_or_na()
    img <- destination %>%
      html_nodes('div.member_pic a img') %>%
      html_attr('src') %>%
      first_or_na()
    name <- destination %>%
      html_nodes('div.w_240 h6 a') %>%
      html_text() %>%
      first_or_na()
    badge <- destination %>%
      html_nodes('div.member_pic div') %>%
      html_attr('class') %>%
      first_or_na()
    # Only classify when a badge class was actually found.
    gender <- if (is.na(badge)) NA_character_ else judgeFunc(badge)
    rows[[i]] <- data.frame(
      title = title, address = address, price = price,
      img = img, name = name, gender = gender,
      stringsAsFactors = FALSE
    )
    cat(sprintf('第【%d】条房屋链接抓取成功', i), sep = '\n')
  }
  if (length(rows) == 0) {
    return(data.frame())
  }
  result <- do.call(rbind, rows)
  return(result)
}

# Run the crawl: build the 13 search-result page addresses, gather every
# detail-page link from them, then scrape each detail page.
base    <- 'http://bj.xiaozhu.com/search-duanzufang-p'
url     <- paste0(base, seq_len(13L), '-0/')
link    <- unlist(GetlinkFunc(url))
xiaozhu <- GetinfoFunc(link)

# Export the scraped table as CSV.
# write.csv() doubles embedded double quotes, which is what CSV readers
# expect; write.table() by default backslash-escapes them instead.
# The file is written as UTF-8 so the Chinese text does not depend on the
# platform's native encoding.
write.csv(xiaozhu, file = 'xiaozhu.csv', row.names = FALSE, fileEncoding = 'UTF-8')