Python web scraper: Lianjia (链家)

This post presents a Python script that scrapes residential community (xiaoqu) information for second-hand housing on Lianjia Shanghai. It uses the requests library to fetch pages and the re library to extract data with regular expressions. Starting from the listings index page, it collects community links step by step, then follows each one to pull details such as the community name, address, unit price, and year built.
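Before the full script: Lianjia tends to throttle rapid repeated requests, so in practice the page-fetching step benefits from a short pause between attempts and a couple of retries. The sketch below is not part of the original script; the function name fetch, the timeout, the retry count, and the delay are illustrative assumptions.

import time

import requests
from requests.exceptions import RequestException

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
}

def fetch(url, retries=3, delay=1.0):
    """Fetch a page with a small delay and a few retries (illustrative values)."""
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=HEADERS, timeout=10)
            if response.status_code == 200:
                return response.text
        except RequestException:
            pass
        time.sleep(delay * (attempt + 1))  # back off a little more after each failed attempt
    return None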


# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""


import re

import requests
from requests.exceptions import RequestException

url = 'https://sh.lianjia.com/ershoufang/'
def get_one_page(url):
    """Fetch a page and return its HTML text, or None on failure."""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
    
html = get_one_page(url)

def get_re(res):
    """Run a regex pattern against the module-level `html` and return all matches."""
    pattern = re.compile(res, re.S)
    return re.findall(pattern, html)

# Step 1: from the ershoufang listings page, collect district slugs and
# build the matching xiaoqu (community) section URLs.
url2 = []
res = r'<a href="/ershoufang/\w.+?"\s\stitle'
titles = get_re(res)
for title in titles:
    district = title.split('/')[2]
    url2.append('https://sh.lianjia.com/xiaoqu/' + district)

# Step 2: from each district page, collect sub-district slugs and build
# their community-list base URLs.
url3 = []
for url in url2:
    html = get_one_page(url)
    if html is None:
        continue
    res = r'<a href="/xiaoqu/\w+/"\s>'
    titles = get_re(res)
    for title in titles:
        sub = title.split('/')[2]
        url3.append('https://sh.lianjia.com/xiaoqu/' + sub)
# Step 3: for each sub-district page, read "totalPage" from the page source
# and build one URL per result page (/pgN/).
url4 = []
count = 1
for url in url3:
    try:
        html = get_one_page(url)
        if html is None:
            continue
        total = get_re(r'"totalPage":\w+,"')[0]
        pages = int(re.findall(r'\d+', total)[0])
        for num in range(1, pages + 1):
            url4.append(url + '/pg' + str(num) + '/')
        print('percent: {:.0%}'.format(count / len(url3)))
        count = count + 1
    except IndexError:
        # No "totalPage" marker on this page; skip it.
        continue
    
# Step 4: from each result page, collect the community detail URLs and names.
url5 = []
xiaoqu1 = []
count = 1
for url in url4:
    try:
        html = get_one_page(url)
        if html is None:
            continue
        res = r'href="https://sh.lianjia.com/xiaoqu/\d+/" target="_blank">.+?</a>'
        titles = get_re(res)
        for title in titles:
            x = title.split('"')
            # x[1] is the detail URL; the anchor text between '>' and '</a>' is the name.
            xiaoqu1.append(x[4].replace('</a>', '').lstrip('>'))
            url5.append(x[1])
        print('percent: {:.0%}'.format(count / len(url4)))
        count = count + 1
    except IndexError:
        continue

# Step 5: visit each community detail page and extract the name, address,
# unit price, and year built.
count = 1
xiaoqu_list = []
for url in url5:
    try:
        x = []
        html = get_one_page(url)

        xiaoqumingcheng = get_re(r'class="detailTitle">.+?<')[0]
        x.append(xiaoqumingcheng.split('>')[1].replace('<', ''))

        dizhi = get_re(r'class="detailDesc">.+?<')[0]
        x.append(dizhi.split('>')[1].replace('<', ''))

        price = get_re(r'class="xiaoquUnitPrice">\w+<')[0]
        x.append(price.split('>')[1].replace('<', ''))

        year = get_re(r'class="xiaoquInfoContent">\w+年建成\s<')[0]
        x.append(year.split('>')[1].replace('<', ''))

        xiaoqu_list.append(x)
        print(count)
        print('percent: {:.0%}'.format(count / len(url5)))
        count = count + 1
    except Exception:
        # Skip communities whose detail page failed to load or parse.
        continue
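To keep the results, xiaoqu_list can be written out once the loop finishes. Below is a minimal sketch using pandas, assuming the four-field order used above (name, address, unit price, year built); the column labels and output file name are my own choices, not from the original post.

import pandas as pd

# Column order matches how each x is filled above: name, address, unit price, year built.
df = pd.DataFrame(xiaoqu_list, columns=['xiaoqu', 'dizhi', 'unit_price', 'year_built'])
df.to_csv('lianjia_xiaoqu_sh.csv', index=False, encoding='utf-8-sig')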


 
