from urllib.parse import quote  # escape non-ASCII company names in the URL

import openpyxl
import pandas as pd  # data cleaning / export
import requests  # HTTP requests
from bs4 import BeautifulSoup
# Open the source workbook (note: openpyxl can only read '.xlsx' files).
workbook1 = openpyxl.load_workbook('xxx.xlsx')
# Use the currently active sheet.
worksheet1 = workbook1.active

# Request headers: replace every "xxxx" with the values copied from your own
# browser session — qcc.com rejects requests without a valid cookie/UA.
headers = {
    "user-agent": "xxxxxx",
    "referer": "xxxx",
    "sec-fetch-mode": "xxxx",
    "sec-fetch-site": "xxxx",
    "cookie": "xxxx",
}

# Collected rows: [company name, company link].
ko = []

# Iterate over every cell of column B (one company name per row).
for cell in worksheet1['B']:
    # Skip empty cells — concatenating None into the URL would raise TypeError.
    if cell.value is None:
        continue
    # Build the search URL; quote() percent-escapes non-ASCII company names.
    url = 'https://www.qcc.com/web/search?key=' + quote(str(cell.value))
    # Fetch the search-result page; timeout prevents a dead connection
    # from hanging the whole run.
    html_text = requests.get(url, headers=headers, timeout=10).text
    soup = BeautifulSoup(html_text, 'html.parser')
    # Keep only the FIRST result block for each company (hence the break).
    for ks in soup.find_all('div', {'class': 'maininfo'}):
        ko.append([ks.span.text, ks.a['href']])
        break

print(ko)

# Column headers for the output spreadsheet.
title = ['公司名称', '公司链接']
# Assemble the result table.
table = pd.DataFrame(ko, columns=title)
# Set the save path, e.g. E:/xxxxx.xlsx; index=False drops the
# DataFrame's meaningless row-number column from the output.
table.to_excel('xxxx.xlsx', sheet_name='sheet1', index=False)
# Qichacha (qcc.com) scraper — look up company links by company name.
# (Adapted from a blog post; latest recommended revision published 2025-03-20 10:42:24.)