Assignment: Organizing Listings by Administrative District
import requests
import os
import csv
from bs4 import BeautifulSoup
from ujson import loads
from tqdm import tqdm
def make_dir(path):
    if not os.path.exists(path):
        os.mkdir(path)
def proxy_get():
    # Fetch one fresh proxy from the JG HTTP API and build a requests-style proxy dict.
    api_url = 'http://d.jghttp.alicloudecs.com/getip?num=1&type=2&pro=&city=0&yys=0&port=1&time=2&ts=0&ys=0&cs=0&lb=1&sb=0&pb=45&mr=1&regions='
    origin_data = loads(requests.get(url=api_url).text)['data'][0]
    return {
        'http': f'http://{origin_data["ip"]}:{origin_data["port"]}',
        'https': f'http://{origin_data["ip"]}:{origin_data["port"]}'
    }
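For reference, proxy_get only consumes the ip and port fields of data[0]; the API response is assumed to look roughly like the sample below (field names inferred from the code above, not from the API's documentation):

# Hypothetical sample of the proxy API's JSON response; only data[0]["ip"]
# and data[0]["port"] are actually used by proxy_get.
sample = {"code": 0, "data": [{"ip": "123.45.67.89", "port": 8888}]}
entry = sample["data"][0]
print({scheme: f'http://{entry["ip"]}:{entry["port"]}' for scheme in ('http', 'https')})
# -> {'http': 'http://123.45.67.89:8888', 'https': 'http://123.45.67.89:8888'}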
def requests_get(href, proxy):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39'
    }
    resp = requests.get(url=href, headers=headers, proxies=proxy)
    if resp.status_code == 200:
        return resp
    print(resp.status_code)  # non-200: log the status; caller receives None
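Because requests_get returns None on any non-200 status, the caller's .text access will raise AttributeError if a proxy dies mid-run. A minimal defensive variant (a sketch with a hypothetical name, not part of the original assignment) rotates to a fresh proxy and retries:

import time

def requests_get_with_retry(href, proxy, retries=3):
    # Sketch only: retry with a fresh proxy on HTTP errors or connection failures.
    for _ in range(retries):
        try:
            resp = requests.get(url=href, headers={'User-Agent': 'Mozilla/5.0'},
                                proxies=proxy, timeout=10)
            if resp.status_code == 200:
                return resp
        except requests.RequestException:
            pass
        proxy = proxy_get()  # rotate to a new proxy before the next attempt
        time.sleep(1)
    raise RuntimeError(f'failed to fetch {href} after {retries} attempts')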
def area_get(soup):
    # Collect (district name, absolute URL) pairs from the district navigation bar.
    area_list = []
    for x in soup.select(
            'body > div:nth-child(12) > div > div.position > dl:nth-child(2) > dd > div:nth-child(1) > div:nth-child(1) > a'):
        area_list += [(x.text, 'https://cd.lianjia.com' + x.attrs['href'])]
    return area_list
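The list returned by area_get has this shape (the jinjiang entry matches the URL used in __main__; the second entry is illustrative only):

# Illustrative return value of area_get:
sample_area_list = [
    ('锦江', 'https://cd.lianjia.com/ershoufang/jinjiang/'),
    ('青羊', 'https://cd.lianjia.com/ershoufang/qingyang/'),  # example only
]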
def page_get(soup):
    # The page-box element stores pagination state as JSON in its page-data attribute.
    page_data = soup.select_one(
        '#content > div.leftContent > div.contentBottom.clear > div.page-box.fr > div').attrs['page-data']
    return loads(page_data)['totalPage']  # safer than eval() on the raw attribute
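The page-data attribute is a small JSON blob, and page_get only needs its totalPage key. A standalone illustration (the attribute value here is made up):

# The element carries something like: <div class="page-box fr" page-data='{"totalPage":58,"curPage":1}'>
attr = '{"totalPage":58,"curPage":1}'
print(loads(attr)['totalPage'])  # 58; loads comes from ujson, imported above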
def target_info_get(single_house):
    # Pull the fields we keep from one listing card.
    house_name = single_house.select_one('div.title > a').text
    house_position1 = single_house.select_one('div.flood > div.positionInfo > a:nth-child(2)').text
    house_position2 = single_house.select_one('div.flood > div.positionInfo > a:nth-child(3)').text
    house_position = house_position1 + ' - ' + house_position2
    house_address0 = single_house.select_one('div.address > div.houseInfo').text
    house_address = '|'.join(house_address0.split('|')[0:4])  # keep the first four fields
    house_price = (single_house.select_one('div.priceInfo > div > span').text
                   + single_house.select_one('div.priceInfo > div > i:nth-child(3)').text)
    house_unit_price = single_house.select_one('div.unitPrice > span').text
    return [house_name, house_position, house_address, house_price, house_unit_price]
def skip(area_list, path):
    # Resume support: districts whose CSV already exists are skipped. Files are
    # sorted by mtime and the newest one is excluded from the "finished" set,
    # so a run that was interrupted mid-district re-crawls that district.
    exist_list = os.listdir(path)
    exist_list.sort(key=lambda x: os.path.getmtime(os.path.join(path, x)))
    result = []
    for x in area_list:
        if x[0] + '.csv' not in exist_list[:-1]:
            result += [x]
    return result
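A pure-logic illustration of the resume rule (file and district names hypothetical): the newest CSV is treated as possibly incomplete, so its district is crawled again:

# exist_list is sorted oldest -> newest by mtime; the newest file is not "finished".
exist_list = ['锦江.csv', '青羊.csv']
areas = [('锦江', '...'), ('青羊', '...'), ('武侯', '...')]
todo = [a for a in areas if a[0] + '.csv' not in exist_list[:-1]]
print([a[0] for a in todo])  # ['青羊', '武侯']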
if __name__ == '__main__':
    make_dir('成都房源信息')
    proxy = proxy_get()
    resp0 = requests_get('https://cd.lianjia.com/ershoufang/jinjiang/', proxy)
    soup_area = BeautifulSoup(resp0.text, 'lxml')
    area_list0 = area_get(soup_area)
    area_list1 = skip(area_list0, '成都房源信息')
    for i in tqdm(area_list1):
        area_name, area_link = i[0], i[1]
        resp1 = requests_get(area_link, proxy).text
        page = page_get(BeautifulSoup(resp1, 'lxml'))
        with open(f'成都房源信息/{area_name}.csv', 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            # Write the header once per file, not once per page.
            writer.writerow(['房源名', '房源地址', '房源信息', '房源价格', '房源每平方米价格'])
            for x in tqdm(range(1, page + 1)):
                resp = requests_get(f'{area_link}pg{x}/', proxy)
                house_origin_data = BeautifulSoup(resp.text, 'lxml')
                house_data = house_origin_data.select('div#content > div.leftContent > ul > li > div:nth-child(2)')
                for single_house in house_data:
                    writer.writerow(target_info_get(single_house))
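To sanity-check one output file after a run (a usage sketch, assuming the 锦江 district was crawled; adjust the filename as needed):

import csv
with open('成都房源信息/锦江.csv', encoding='utf-8') as f:
    for row in csv.reader(f):
        print(row)  # first the header row, then one row per listing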
This post demonstrates scraping second-hand housing listings from Lianjia (cd.lianjia.com) for Chengdu's administrative districts with Python, using the requests and BeautifulSoup libraries. Key fields such as listing name, location, and price are extracted and written to one CSV file per district; the emphasis is on organizing the crawl and the output by administrative district.