Python web scraper: Lianjia (链家)

This post presents a Python script that scrapes residential community (xiaoqu) information for second-hand housing on Lianjia Shanghai. It uses the requests library to fetch pages and the re library to extract data with regular expressions. Starting from the listings index page, it collects community links step by step, then follows each one to pull details such as the community name, address, unit price, and year built.
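Before the full script: Lianjia tends to throttle rapid repeated requests, so in practice the page-fetching step benefits from a short pause between attempts and a couple of retries. The sketch below is not part of the original script; the function name fetch, the timeout, the retry count, and the delay are illustrative assumptions.

import time

import requests
from requests.exceptions import RequestException

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
}

def fetch(url, retries=3, delay=1.0):
    """Fetch a page with a small delay and a few retries (illustrative values)."""
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=HEADERS, timeout=10)
            if response.status_code == 200:
                return response.text
        except RequestException:
            pass
        time.sleep(delay * (attempt + 1))  # back off a little more after each failed attempt
    return None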


# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""


import re

import requests
from requests.exceptions import RequestException

url = 'https://sh.lianjia.com/ershoufang/'
def get_one_page(url):
    """Fetch a page and return its HTML text, or None on failure."""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
    
html = get_one_page(url)

def get_re(res):
    """Run a regex pattern against the module-level `html` and return all matches."""
    pattern = re.compile(res, re.S)
    return re.findall(pattern, html)

# Step 1: from the ershoufang listings page, collect district slugs and
# build the matching xiaoqu (community) section URLs.
url2 = []
res = r'<a href="/ershoufang/\w.+?"\s\stitle'
titles = get_re(res)
for title in titles:
    district = title.split('/')[2]
    url2.append('https://sh.lianjia.com/xiaoqu/' + district)

# Step 2: from each district page, collect sub-district slugs and build
# their community-list base URLs.
url3 = []
for url in url2:
    html = get_one_page(url)
    if html is None:
        continue
    res = r'<a href="/xiaoqu/\w+/"\s>'
    titles = get_re(res)
    for title in titles:
        sub = title.split('/')[2]
        url3.append('https://sh.lianjia.com/xiaoqu/' + sub)
# Step 3: for each sub-district page, read "totalPage" from the page source
# and build one URL per result page (/pgN/).
url4 = []
count = 1
for url in url3:
    try:
        html = get_one_page(url)
        if html is None:
            continue
        total = get_re(r'"totalPage":\w+,"')[0]
        pages = int(re.findall(r'\d+', total)[0])
        for num in range(1, pages + 1):
            url4.append(url + '/pg' + str(num) + '/')
        print('percent: {:.0%}'.format(count / len(url3)))
        count = count + 1
    except IndexError:
        # No "totalPage" marker on this page; skip it.
        continue
    
# Step 4: from each result page, collect the community detail URLs and names.
url5 = []
xiaoqu1 = []
count = 1
for url in url4:
    try:
        html = get_one_page(url)
        if html is None:
            continue
        res = r'href="https://sh.lianjia.com/xiaoqu/\d+/" target="_blank">.+?</a>'
        titles = get_re(res)
        for title in titles:
            x = title.split('"')
            # x[1] is the detail URL; the anchor text between '>' and '</a>' is the name.
            xiaoqu1.append(x[4].replace('</a>', '').lstrip('>'))
            url5.append(x[1])
        print('percent: {:.0%}'.format(count / len(url4)))
        count = count + 1
    except IndexError:
        continue

# Step 5: visit each community detail page and extract the name, address,
# unit price, and year built.
count = 1
xiaoqu_list = []
for url in url5:
    try:
        x = []
        html = get_one_page(url)

        xiaoqumingcheng = get_re(r'class="detailTitle">.+?<')[0]
        x.append(xiaoqumingcheng.split('>')[1].replace('<', ''))

        dizhi = get_re(r'class="detailDesc">.+?<')[0]
        x.append(dizhi.split('>')[1].replace('<', ''))

        price = get_re(r'class="xiaoquUnitPrice">\w+<')[0]
        x.append(price.split('>')[1].replace('<', ''))

        year = get_re(r'class="xiaoquInfoContent">\w+年建成\s<')[0]
        x.append(year.split('>')[1].replace('<', ''))

        xiaoqu_list.append(x)
        print(count)
        print('percent: {:.0%}'.format(count / len(url5)))
        count = count + 1
    except Exception:
        # Skip communities whose detail page failed to load or parse.
        continue
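To keep the results, xiaoqu_list can be written out once the loop finishes. Below is a minimal sketch using pandas, assuming the four-field order used above (name, address, unit price, year built); the column labels and output file name are my own choices, not from the original post.

import pandas as pd

# Column order matches how each x is filled above: name, address, unit price, year built.
df = pd.DataFrame(xiaoqu_list, columns=['xiaoqu', 'dizhi', 'unit_price', 'year_built'])
df.to_csv('lianjia_xiaoqu_sh.csv', index=False, encoding='utf-8-sig')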


 
