python爬虫#实战练习1

3和4加五

已于 2024-10-17 09:28:26 修改

阅读量332

点赞数 9

文章标签： python 爬虫 开发语言

于 2024-10-17 09:27:57 首次发布

本文链接：https://blog.youkuaiyun.com/RX0117/article/details/142996966

版权

案例需求：

- 在这里插入图片描述

进入房产信息详情页，获取房产的户型、建筑面积、朝向、楼层、装修等等基本信息

代码实现：

导入需要用到的库

import os
import re
import threading
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from lxml import etree
import pandas as pd

获取信息的方法：房产大致分为楼房、电梯房和独栋三类，不同类型的详情页结构不完全相同，因此部分信息仅用 XPath 无法完整获取，改用正则表达式匹配可以得到完整信息。对于网页上未提供的信息，我们统一用“未知”等占位值填充，保证输出的数据整齐。

# get_house_info runs in several threads at once and every thread appends to
# the same CSV file, so all writes are serialized through this lock.
_CSV_LOCK = threading.Lock()
_CSV_PATH = '房天下.csv'


def _first_match(pattern, html, default='未知'):
    """Return the first captured group of ``pattern`` in ``html``, else ``default``."""
    match = re.search(pattern, html)
    return match.group(1) if match else default


def _parse_house_info(html):
    """Extract one listing's fields from the HTML of its detail page.

    Returns a dict whose keys are the CSV column names, in column order.
    Fields that cannot be found in the page fall back to a placeholder
    ('未知', '其他' or '无', depending on the field).
    """
    # Layout: both the room count and the hall count are required.
    room = _first_match(r"pageConfig.room='(.*?)';", html, None)
    hall = _first_match(r"pageConfig.hall='(.*?)';", html, None)
    if room is None or hall is None:
        house_type = '未知'
    else:
        house_type = room + '室' + hall + '厅'

    building_type = _first_match(r'"vwe.buildcategory": "(.*?)",', html, '其他')

    # Detached houses are recorded as having no lift without looking at the page.
    if building_type == '独栋':
        lift = '无'
    else:
        lift = _first_match(
            r'<span class="lab">有无电梯</span><span class="rcont">(.*?) </span>', html, '无')

    # The structure is published under one of two labels; prefer "厅结构".
    structure = _first_match(
        r'<span class="lab">厅结构</span><span class="rcont"><a href=.*? target="_blank" class="link_rk">(.*?)</a></span>',
        html, None)
    if structure is None:
        structure = _first_match(
            r'<span class="lab">建筑结构</span><span class="rcont"><a href=.*? target="_blank" class="link_rk">(.*?)</a></span>',
            html, '其他')

    # District: the only field read through XPath instead of a regex.
    area = '未知'
    tree = etree.HTML(html)
    if tree is not None:
        nodes = tree.xpath('//div[@id="address"]')
        if nodes:
            # If several nodes match, the last one wins (same as before).
            area = nodes[-1].xpath('string(.)').replace('\n', '').replace(' ', '')

    return {
        '户型': house_type,
        '建筑面积': _first_match(r"area: '(.*?)'", html),
        '朝向': _first_match(r"pageConfig.forward='(.*?)';", html),
        '楼层': _first_match(r'"vwe.housefloor": "(.*?)"', html),
        '装修': _first_match(r'"vwe.fixstatus": "(.*?)",', html),
        '建筑年代': _first_match(r'"vwe.createtime": "(.*?)"', html),
        '电梯': lift,
        '产权性质': _first_match(
            r'产权性质</span><span class="rcont"><a href=.*? target="_blank" class="link_rk">(.*?)</a>', html),
        '住宅类别': _first_match(r'"vwe.purpose": "(.*?)",', html, '其他'),
        '建筑结构': structure,
        '建筑类别': building_type,
        '区域': area,
        '总价': _first_match(r'"vwe.totalprice": (.*?),', html),
        '单价': _first_match(r'"vwe.unitprice": "(.*?)",', html),
    }


def _append_row(row):
    """Append one record to the CSV, writing the header only when the file is new."""
    frame = pd.DataFrame([row])
    with _CSV_LOCK:
        # The existence check and the write must happen under the same lock,
        # otherwise two threads could both decide to write the header.
        is_new_file = not os.path.exists(_CSV_PATH)
        frame.to_csv(_CSV_PATH, mode='a', header=is_new_file, index=False)


def _close_extra_windows(driver, keep_handle):
    """Close every window except ``keep_handle`` and switch back to it."""
    for handle in driver.window_handles:
        if handle != keep_handle:
            driver.switch_to.window(handle)
            driver.close()
    driver.switch_to.window(keep_handle)


def get_house_info(url, max_pages=10):
    """Scrape listing details starting at ``url`` and append them to the CSV.

    Opens an Edge browser on ``url``, clicks every listing title on the page,
    parses the detail page that opens, and appends one row per listing to
    '房天下.csv'. It then follows the '下一页' link and repeats, for at most
    ``max_pages`` result pages or until there is no next-page link.

    Args:
        url: Address of the first result page to scrape.
        max_pages: Maximum number of consecutive result pages to process.

    Returns:
        None. Results are written to the CSV file; errors are printed.
    """
    driver = webdriver.Edge()
    try:
        # Set the implicit wait once; it applies to every later element lookup.
        driver.implicitly_wait(20)
        driver.get(url)
        for page in range(max_pages):
            listing_window = driver.current_window_handle
            for link in driver.find_elements(By.XPATH, '//dd/h4/a/span'):
                try:
                    link.click()
                    driver.switch_to.window(driver.window_handles[-1])
                    time.sleep(1)
                    _append_row(_parse_house_info(driver.page_source))
                    print('---保存成功---')
                except Exception as e:
                    # Best effort: a broken listing must not abort the page.
                    print(e)
                finally:
                    # Always return to the result list, even after a failure,
                    # so the next click targets the right window.
                    _close_extra_windows(driver, listing_window)
            if page + 1 >= max_pages:
                break
            next_links = driver.find_elements(By.LINK_TEXT, '下一页')
            if not next_links:
                # Last page reached; looping again would duplicate rows.
                break
            next_links[0].click()
    except Exception as e:
        # Top-level boundary of the worker thread: report and stop.
        print(e)
    finally:
        # Release the browser process no matter how the scrape ended.
        driver.quit()

由于数据量较大，我们采用多线程方式爬取。
下面分别是启动线程和等待线程结束的方法：

def start_thread(urls):
    """Start one scraper thread per URL and return the started threads.

    Each thread runs ``get_house_info`` with a single URL as its argument.
    A start-up message is printed for every thread.
    """
    workers = []
    for page_url in urls:
        worker = threading.Thread(target=get_house_info, args=(page_url,))
        worker.start()
        workers.append(worker)
        print(f'{worker.name}启动成功')
    return workers


def stop_thread(thread_list):
    """Wait for every thread in ``thread_list`` to finish.

    Prints a message naming each thread before blocking on it.
    """
    for worker in thread_list:
        label = worker.name
        print(f'{label}等待结束')
        worker.join()

房产信息总共 100 页。观察网页 URL 可以发现，其末尾以 i3 加页码的形式结尾（例如第 1 页为 /house/i31/），因此我们设置十个线程，每个线程爬取 10 页。

if __name__ == '__main__':
    # One start URL per thread: result pages 1, 11, 21, ..., 91.
    urls = [f'https://cs.esf.fang.com/house/i3{1 + 10 * index}/' for index in range(10)]
    # Launch all scraper threads, then block until each one has finished.
    running = start_thread(urls)
    stop_thread(running)
    print('---全部爬取完成---')