import requests
from lxml import etree
import time
import csv
import random
import re
import os
print(os.getpid())  # print the current process id
#Define a few global variables
time_begin = time.time()
#Record the start time
#Define the function that crawls and parses the desktop (web) listing pages
def web_crawl(url_single):
    # Define some basic variables that rarely change
    header_1 = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'}
    # Function that saves the data to a file; disabled for now to see how long crawling takes
    '''
    def data_writer(item):
        with open('lianjia_fangjiaxinxi.csv', 'a', encoding='utf-8', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(item)
    '''
    # Function that downloads a page
    def download(url_single):
        response_1 = requests.get(url_single, headers=header_1)
        time.sleep(3)  # slow down to avoid anti-crawling measures
        return etree.HTML(response_1.text)  # return an Element object

    # The code below parses each listing's detail-page URL out of the listing index page,
    # then opens that URL and extracts the remaining details of the listing
    seletor_1 = download(url_single)  # call the download function defined above to fetch the index page
    house_list = seletor_1.xpath('//*[@id="content"]/div[1]/ul/li')  # list of listings on this page
    for house in house_list:
        house_layout = house.xpath('div[1]/div[3]/div/text()')[0]
        # path relative to the current <li>, so each listing's own price is read
        total_price = house.xpath('div[1]/div[6]/div[1]/span/text()')[0]
        apartment = house.xpath('div[1]/div[1]/a/text()')[0]
        # Then enter each listing's detail page: div[1]/div[1]/a/@href
        deep_url = house.xpath('div[1]/div[1]/a/@href')[0]
        # Download the detail page and extract data from it
        seletor_2 = download(deep_url)
        time.sleep(3)
        housekeeper = seletor_2.xpath('//*[@id="zuanzhan"]/div[2]/div/div[1]/a/text()')[0]
        item = [housekeeper, house_layout, apartment, total_price]
        print('Crawling ' + apartment)
        print(item)
        # data_writer(item)
#Crawl the agent-list URLs of the mobile Lianjia site (XHR-style API URLs)
#https://m.lianjia.com/liverpool/api/jingjiren/getList?cityId=110000&condition=%252Fao12pg99&from=lianjia_m&searchFrom=jingjiren
#The page number where 99 sits varies: lower bound 1, upper bound 100
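# A minimal sketch (assumption: this helper is not part of the original script) of how the
# mobile API URLs listed in main() below could be generated instead of written out by hand,
# using the 1..100 page range mentioned in the comment above.
def build_mobile_urls(first_page=1, last_page=100):
    base = ('https://m.lianjia.com/liverpool/api/jingjiren/getList'
            '?cityId=110000&condition=%252Fao12pg{page}&from=lianjia_m&searchFrom=jingjiren')
    return [base.format(page=page) for page in range(first_page, last_page + 1)]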
def mobile_crawl(url_single):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}
    # response = requests.get(url_single, proxies=random.choice(proxies), headers=headers)  # would need a `proxies` list, which is not defined in this script
    response = requests.get(url_single, headers=headers)
    time.sleep(3)
    sel = etree.HTML(response.text)
    print(sel)
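# A hedged alternative sketch (assumption: the XHR endpoint above returns a JSON body rather
# than an HTML page), reading the response with response.json() instead of etree.HTML.
# The name mobile_crawl_json is hypothetical and is not called anywhere below.
def mobile_crawl_json(url_single):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}
    response = requests.get(url_single, headers=headers)
    time.sleep(3)
    data = response.json()  # raises a JSON decoding error if the body is not valid JSON
    print(data)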
#Define all the URLs: the mobile-API and desktop-page URLs combined (five of each in the list below)
def main():
    all_url_list = [
        'https://m.lianjia.com/liverpool/api/jingjiren/getList?cityId=110000&condition=%252Fao12pg1&from=lianjia_m&searchFrom=jingjiren',
        'https://m.lianjia.com/liverpool/api/jingjiren/getList?cityId=110000&condition=%252Fao12pg2&from=lianjia_m&searchFrom=jingjiren',
        'https://m.lianjia.com/liverpool/api/jingjiren/getList?cityId=110000&condition=%252Fao12pg3&from=lianjia_m&searchFrom=jingjiren',
        'https://m.lianjia.com/liverpool/api/jingjiren/getList?cityId=110000&condition=%252Fao12pg4&from=lianjia_m&searchFrom=jingjiren',
        'https://m.lianjia.com/liverpool/api/jingjiren/getList?cityId=110000&condition=%252Fao12pg5&from=lianjia_m&searchFrom=jingjiren',
        # the URLs above match the generic regex ^https://m.*$
        'https://gz.lianjia.com/ershoufang/pg16',
        'https://gz.lianjia.com/ershoufang/pg17',
        'https://gz.lianjia.com/ershoufang/pg18',
        'https://gz.lianjia.com/ershoufang/pg19',
        'https://gz.lianjia.com/ershoufang/pg20'
        # the URLs above match https://gz\W\w{7}\W\w{3}\W\w{10}\W\w{4}, or the generic ^https://gz.*$
    ]
    for content in all_url_list:
        if re.match('^https://m.*$', content):
            print(content, 'running the mobile-side function')
            mobile_crawl(content)
        else:
            print(content, 'running the web-side function')
            web_crawl(content)
main()
#Record the time when the program finishes
time_end = time.time()
time_consuming = time_end - time_begin
print('Total time consumed by the crawler that uses regex-based URL classification: ' + str(time_consuming))