按美食、购物、娱乐、服务、医疗、教育、酒店、金融、汽车等行业分类,爬取各行业的连锁品牌数据。
以下是部分案例数据及对应的爬取代码:
import csv
import random
import time
import requests
import chardet
from lxml import etree
# Cookies sent with every listing request: a captured ASP session id plus the
# site's city-selection cookie (cityEn=nanning, cityCode=0771). The
# percent-encoded fields are kept exactly as captured.
cookies = dict(
    ASPSESSIONIDASRRRCQD="GHPABLOBBLAAINDDHPEEDPMO",
    iecity=(
        "cityEn=nanning"
        "&cityProvince=%B9%E3%CE%F7"
        "&citySimple=%C4%CF%C4%FE"
        "&cityCode=0771"
        "&cityName=%C4%CF%C4%FE"
    ),
)
# Browser-like request headers (captured from an Edge 115 session) sent with
# every listing request. Long values are split across adjacent string literals
# purely for readability; the resulting header values are unchanged.
headers = {
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3;q=0.7"
    ),
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cache-Control": "no-cache",
    "Pragma": "no-cache",
    "Proxy-Connection": "keep-alive",
    "Referer": "http://brand.iecity.com/",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.203"
    ),
}
# Seconds to wait for the server before giving up on a request; without a
# timeout a stalled connection would block the crawl indefinitely.
REQUEST_TIMEOUT = 15


def build_page_url(category_id, page):
    """Return the listing URL for *page* of brand category *category_id*."""
    return f'http://brand.iecity.com/list-{category_id}----{page}.html'


def first_or_none(nodes):
    """Return the first item of an XPath result list, or None if it is empty."""
    return nodes[0] if nodes else None


def normalize_encoding(detected):
    """Map a chardet verdict to the codec actually used for decoding.

    chardet reports ``GB2312`` for many Chinese pages that also contain
    characters outside that set. ``gb18030`` is a superset of both GB2312 and
    GBK, so decoding with it avoids replacement characters. Any other value
    (including ``None``) is returned unchanged.
    """
    if detected and detected.lower() in ('gb2312', 'gbk'):
        return 'gb18030'
    return detected


def parse_brand_rows(content, page):
    """Extract the CSV rows from one parsed listing page.

    Args:
        content: lxml element tree of the listing page.
        page: page number, stored in the first column of every row.

    Returns:
        A list of ``[page, index, name, img_src, uid, lx]`` rows, where
        *index* is the 1-based position of the ``<li>`` in the brand list.
        Items without a ``data-original`` image attribute are skipped.
    """
    # The category label belongs to the page, not the item: query it once.
    lx = first_or_none(content.xpath('//*[@id="Head"]/div/div/div/ul/li[1]/a/text()'))
    rows = []
    items = content.xpath("//ul[@class='brand-main clearfix']/li")
    for index, item in enumerate(items, start=1):
        img_src = first_or_none(item.xpath('a/img/@data-original'))
        if img_src is None:
            continue
        name = first_or_none(item.xpath('a/h6/text()'))
        uid = first_or_none(item.xpath('a/@href'))
        rows.append([page, index, name, img_src, uid, lx])
    return rows


def crawl(category_id=828, first_page=1, last_page=599,
          output_file='全国连锁品牌数据清单.csv', items_per_page=60):
    """Crawl one brand category page by page and append the rows to a CSV file.

    Args:
        category_id: numeric category in the listing URL. The original script
            noted 1001, 739 and 828 as ids it had been run with.
        first_page: first page number to request.
        last_page: last page number to request (inclusive).
        output_file: CSV path, opened in append mode; the header row is
            written only when the file is empty.
        items_per_page: number of items on a full listing page. A page that
            yields fewer rows is treated as the last one and stops the crawl.

    The crawl also stops, keeping everything written so far, when a request
    fails or a page cannot be parsed.
    """
    with open(output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if file.tell() == 0:  # empty file: write the header row first
            writer.writerow(['page', 'index', 'name', 'img_src', 'uid', 'lx'])

        for page in range(first_page, last_page + 1):
            if page != first_page:
                # Polite random delay between consecutive requests.
                time.sleep(random.uniform(3, 5))

            try:
                # NOTE: verify=False is kept from the original script. It only
                # matters if the plain-http URL redirects to https, where it
                # disables certificate checks.
                response = requests.get(
                    build_page_url(category_id, page),
                    cookies=cookies,
                    headers=headers,
                    verify=False,
                    timeout=REQUEST_TIMEOUT,
                )
                response.raise_for_status()
            except requests.RequestException as exc:
                print(f'Request for page {page} failed, stopping: {exc}')
                break

            # The site does not reliably declare its charset, so sniff it.
            detected = chardet.detect(response.content)['encoding']
            response.encoding = normalize_encoding(detected)

            try:
                content = etree.HTML(response.text)
            except (etree.LxmlError, ValueError) as exc:
                print(f'Could not parse page {page}: {exc}')
                content = None
            if content is None:
                # Empty or unparsable body: nothing more to extract.
                break

            rows = parse_brand_rows(content, page)
            writer.writerows(rows)
            for row in rows:
                print(*row)

            if len(rows) < items_per_page:
                # A short page means the listing has been exhausted.
                break


if __name__ == '__main__':
    crawl()