使用高效代理抓取58同城巴州二手房信息并保存至Excel - CSDN博客

声明：此程序旨在技术学习交流，促进网络安全，不作任何商业用途，违者责任自负

此程序就是使用代理IP来反爬的一个小案例，使用的是高效代理，通过API每次请求提取一个代理IP。一个代理IP（必须是高匿代理，隐藏真实IP）相当于一台主机，只要主机足够多，不停地更换主机也就实现了反爬。经过反复测试，程序已经可以正常爬取50页的内容：因为使用了代理，并且异常处理的代码也做了多次修改完善，达到了反爬的目的，只是需要多消耗几个代理IP。按3元1000个代理IP计算，50页至少需要50个，理想状态下1毛5分钱就能爬50页数据，一般就按2毛钱算。大家除了可以像我这样统计抓取成功率，还可以统计每次爬完50页消耗了多少个代理IP，再以3元1000个代理来算一下本次花了多少钱；只要程序中途不停止就没问题。

1.程序运行效果演示:

在这里插入图片描述

2.保存至Excel里的数据示例，50页基本是3000多条，每次都会有一些不一样的

在这里插入图片描述

3.代码示例

import requests
from lxml import html
import random
import xlwt
import time
from requests.adapters import HTTPAdapter
from retry import retry

# NOTE: a `global` statement at module scope has no effect in Python.
# `list_title` is first assigned inside the page loop further down the script.
global list_title


def get_ip():
    """Fetch proxy addresses from the proxy provider's API.

    The API URL below is a placeholder and must be replaced with a real
    endpoint.  While the API answers with HTTP 200 and a body starting with
    ``{`` the request is repeated every 2 seconds (presumably the provider
    returns a JSON error object when no proxy is available yet -- confirm
    against the provider's documentation).

    Returns:
        list[str]: one stripped, non-empty entry per line of the response
        body.  An empty list is returned when the request fails or the API
        answers with a non-200 status, so callers can test the result for
        truthiness before indexing it.
    """
    url1 = '填写IP代理的API接口网址'
    try:
        # A timeout keeps a stalled API from blocking the scraper forever.
        response = requests.get(url1, timeout=10)
        # startswith() is safe on an empty body, unlike indexing text[0].
        # The status is re-checked on every poll, not only on the first one.
        while response.status_code == 200 and response.text.startswith('{'):
            time.sleep(2)
            response = requests.get(url1, timeout=10)
        if response.status_code != 200:
            print('请求失败')
            return []
        # splitlines() copes with \r, \n and \r\n alike, so no stray newline
        # characters end up inside a proxy address.
        return [line.strip() for line in response.text.splitlines() if line.strip()]
    except requests.RequestException as e:
        print(e)
        return []


# Initialised here, but the page loop's own `for i in ...` rebinds the name
# before anything reads it, so this initial value is never used by the script.
i = 0


@retry(tries=3, delay=1, backoff=1, jitter=0, max_delay=1)
def my_request(url):
    """Fetch *url* through a freshly obtained proxy and return the response.

    A new proxy is requested from ``get_ip()`` for every attempt.  If the
    first attempt fails, one immediate second attempt is made with another
    proxy; if that fails too, the exception propagates to the ``@retry``
    decorator (configured above with tries=3, delay=1).

    The response is also stored in the module-level name ``r`` because the
    main script reads it from there.

    Args:
        url: The page URL to download.

    Returns:
        The ``requests`` response object with its encoding forced to UTF-8.

    Raises:
        RuntimeError: If ``get_ip()`` yields no proxy on the second attempt.
        requests.RequestException: If the second download attempt fails.
    """
    global r
    # NOTE(review): kept from the original; it is unclear whether assigning
    # this module attribute affects plain requests.get() calls -- confirm.
    requests.adapters.DEFAULT_RETRIES = 15

    def fetch_with_new_proxy():
        """Obtain one proxy and download *url* through it."""
        ips = get_ip()
        if not ips:
            # Covers both None and an empty list from get_ip().
            raise RuntimeError("未获取到可用的代理IP")
        # Only the https scheme is routed through the proxy.
        proxy = {'https': ips[0]}
        print(proxy)
        response = requests.get(url, headers=head, proxies=proxy, verify=False, timeout=5)
        response.encoding = 'utf-8'
        return response

    try:
        r = fetch_with_new_proxy()
    except Exception:
        # Exception (not BaseException) so that Ctrl-C / SystemExit still
        # stop the script instead of triggering another request.
        print(url, "请求失败，开始重试")
        r = fetch_with_new_proxy()
    return r


# `global` at module scope is a no-op; `r` is assigned by my_request().
global r

# Browser-like HTTP headers passed to requests.get() by my_request().
# NOTE(review): advertising "br" presumably requires a brotli decoder to be
# installed for the response body to be decoded -- confirm.
head = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
    ),
    "accept": "application/json, text/plain, */*",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9",
}

# Workbook with a single sheet; row 0 holds the column captions.
work_book = xlwt.Workbook(encoding="utf-8")
sheet = work_book.add_sheet("巴州二手房信息")

# (column index, caption) pairs, written in the same order as before.
for _col, _caption in (
    (3, "小区名称"),
    (4, "区域1"),
    (5, "区域2"),
    (6, "地址"),
    (7, "总价(万元)"),
    (8, "单价(元/㎡)"),
    (2, "房子大小(㎡)"),
    (1, "房型"),
    (0, "标题"),
):
    sheet.write(0, _col, _caption)

row_num = 1  # next sheet row to fill; row 0 is the header
z = v = 0    # z: pages scraped successfully, v: pages attempted

# XPath union that selects every text node making up a listing: title,
# layout spans, size, community name, address spans, total price and unit
# price.  The script relies on lxml returning them in document order.
_LISTING_XPATH = (
    "//div[@class='property-content-title']/h3/text()"
    "|//p[@class='property-content-info-comm-name']/text()"
    "|//p[ @class='property-content-info-comm-address']//span/text()"
    "|//span[ @class='property-price-total-num']/text()"
    "|//p[@class='property-price-average']/text()"
    "|//p[@class='property-content-info-text'][1]/text()"
    "|//p[@class='property-content-info-text property-content-info-attribute']//span//text()"
)
# Number of consecutive text nodes the script treats as one listing.
# NOTE(review): a listing with a missing field shifts every later record on
# the page; fixing that needs per-listing parsing of the real page structure.
_FIELDS_PER_LISTING = 14


def _extract_listing_fields(response):
    """Parse the response body and return the matched text nodes as strings."""
    return [str(x) for x in html.fromstring(response.text).xpath(_LISTING_XPATH)]


file_name = r"F:\巴州二手房爬取.xls"
try:
    for i in range(1, 51):
        v += 1
        url = "https://bygl.58.com/ershoufang/p" + str(i) + "/"
        print(url)
        try:
            list_title = _extract_listing_fields(my_request(url))
            if not list_title:
                print("列表为空,重新获取代理IP:")
                # Parse the NEW response; re-querying the old document could
                # never produce a different result.
                list_title = _extract_listing_fields(my_request(url))
                time.sleep(random.random() * 2)
        except Exception as e:
            # Covers download failures and unparsable (e.g. empty) bodies.
            print(e)
            list_title = []

        if not list_title:
            # A page that is still empty is a failure, not a success.
            print("第" + str(i) + "页出错!")
            print("--------------------------")
            continue

        print("成功爬取第" + str(i) + "页数据")
        z += 1
        print("抓取成功率:{:.2%}\n--------------------------".format(z / v))

        # Only walk complete 14-field records so indexing cannot run past
        # the end of the list.
        complete = len(list_title) - len(list_title) % _FIELDS_PER_LISTING
        if complete != len(list_title):
            print("第" + str(i) + "页字段数不是14的整数倍,忽略末尾不完整的记录")

        for j in range(0, complete, _FIELDS_PER_LISTING):
            record = list_title[j:j + _FIELDS_PER_LISTING]
            # Sheet columns 0-8: title, layout (six fragments joined), size,
            # community name, area 1, area 2, address, total price, unit price.
            row = [record[0], "".join(record[1:7]), record[7].strip()] + record[8:14]
            for col, value in enumerate(row):
                sheet.write(row_num, col, value)
            row_num += 1
        time.sleep(3)
finally:
    # Save whatever has been collected, even after a crash or Ctrl-C.
    work_book.save(file_name)