from urllib.parse import quote  # escape non-ASCII company names in the URL

import openpyxl
import pandas as pd  # data cleaning / export
import requests  # HTTP requests
from bs4 import BeautifulSoup
# Open the source workbook (note: openpyxl can only read '.xlsx' files).
workbook1 = openpyxl.load_workbook('xxx.xlsx')
# Use the currently active sheet.
worksheet1 = workbook1.active

# Request headers: replace every "xxxx" with the values copied from your own
# browser session — qcc.com rejects requests without a valid cookie/UA.
headers = {
    "user-agent": "xxxxxx",
    "referer": "xxxx",
    "sec-fetch-mode": "xxxx",
    "sec-fetch-site": "xxxx",
    "cookie": "xxxx",
}

# Collected rows: [company name, company link].
ko = []

# Iterate over every cell of column B (one company name per row).
for cell in worksheet1['B']:
    # Skip empty cells — concatenating None into the URL would raise TypeError.
    if cell.value is None:
        continue
    # Build the search URL; quote() percent-escapes non-ASCII company names.
    url = 'https://www.qcc.com/web/search?key=' + quote(str(cell.value))
    # Fetch the search-result page; timeout prevents a dead connection
    # from hanging the whole run.
    html_text = requests.get(url, headers=headers, timeout=10).text
    soup = BeautifulSoup(html_text, 'html.parser')
    # Keep only the FIRST result block for each company (hence the break).
    for ks in soup.find_all('div', {'class': 'maininfo'}):
        ko.append([ks.span.text, ks.a['href']])
        break

print(ko)

# Column headers for the output spreadsheet.
title = ['公司名称', '公司链接']
# Assemble the result table.
table = pd.DataFrame(ko, columns=title)
# Set the save path, e.g. E:/xxxxx.xlsx; index=False drops the
# DataFrame's meaningless row-number column from the output.
table.to_excel('xxxx.xlsx', sheet_name='sheet1', index=False)
# Qichacha (qcc.com) scraper — look up company links by company name.
# (Adapted from a blog post; latest recommended revision published 2025-03-20 10:42:24.)