Python 爬虫抓取页面信息案例_python窗口程序爬取网页信息-CSDN博客

本文链接：https://blog.csdn.net/wasdcsdn2017/article/details/115697035

本文档展示了如何使用Python爬虫技术抓取律所名称、电话和详情信息：通过requests库请求页面、用lxml的XPath解析数据，并通过pymysql将数据存储到MySQL数据库中。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

import requests
from lxml import etree
import time
import pymysql

# 定义函数抓取
def _fetch_tree(url, headers, timeout):
    """Download *url* and return the lxml tree parsed from its HTML text."""
    response = requests.get(url, headers=headers, timeout=timeout)
    # Force the encoding; otherwise the Chinese text comes out garbled.
    response.encoding = 'utf-8'
    return etree.HTML(response.text)


def _first_text(tree, xpath, label, a):
    """Return the first node matched by *xpath*.

    Raises IndexError (the same type the bare ``[0]`` lookup would raise) with
    a message naming the missing field and the office id, so the caller's log
    line is actionable instead of "list index out of range".
    """
    matches = tree.xpath(xpath)
    if not matches:
        raise IndexError('no %s found for office id %s' % (label, a))
    return matches[0]


def _save_record(a, law_name, phone_str, details_str):
    """Insert one scraped row into ``test.data``.

    Insert failures are rolled back and printed rather than raised. A failure
    to connect propagates to the caller. The cursor and the connection are
    always closed.
    """
    config = {
        'host': '127.0.0.1',
        'user': 'root',
        'password': 'root',
        'database': 'test',
        'charset': 'utf8',
        'port': 3306,  # the port must be an int, not a str
    }

    db = pymysql.connect(**config)
    try:
        cursor = db.cursor()
        try:
            db.select_db("test")
            sql = "INSERT INTO test.data(id,law_firm_name,phone,details)VALUES(%s,%s,%s,%s)"
            cursor.execute(sql, (a, law_name, phone_str, details_str))
            db.commit()
            print('插入数据成功')
        except Exception as e:
            db.rollback()
            print("插入数据失败")
            print('Failed:', e)
        finally:
            cursor.close()
    finally:
        db.close()


def crow_first(a, timeout=10):
    """Scrape one law-office page and store its name, phone and details.

    Fetches the office page and its ``intro.aspx`` details page for id *a*,
    prints the extracted values, then inserts them into MySQL.

    Args:
        a: Numeric office id used to build both page URLs and as the row id.
        timeout: Seconds to wait on each HTTP request before giving up.

    Raises:
        IndexError: If the name, phone or details cannot be found on the page.
        requests.RequestException: If a request fails or times out.
    """
    # Build the per-office URL.
    url = 'https://www.*****.cn/lawyeroffice/' + str(a) + '/'
    # Request headers, presumably copied from a browser session (TODO confirm).
    head = {
        'authority': 'www.66law.cn',
        'method': 'GET',
        'path': '/lawyeroffice/' + str(a) + '/',
        'scheme': 'https',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        # 'br' is deliberately not advertised: requests only decodes brotli
        # when an optional brotli package is installed, and an undecoded body
        # would make every XPath lookup come back empty.
        'accept-encoding': 'gzip, deflate',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'cookie': 'hl.guid=14d0dc68-ba11-4105-9346-cba9b8bb3914; a=5110102; Hm_lvt_ac3abb01a9ad71f2dc9f7344e138c993=1617092779; fp=0ba02182-db7d-12d2-668e-f2643cc6d0df; hm.sid=e0822e3f-3e10-e06a-c6e4-5a91d01cc049; UM_distinctid=178823e0c2f564-00889831a07fa4-651a107e-100200-178823e0c305a1; CNZZDATA1260982728=24884891-1617088221-https%253A%252F%252Fwww.66law.cn%252F%7C1617088221; Hm_lvt_b6aaa133bc44aab3150966581c93ace1=1617092857; Hm_lpvt_b6aaa133bc44aab3150966581c93ace1=1617092857; route=18180d9b011cb62de860e08ffecb194a; Hm_lpvt_ac3abb01a9ad71f2dc9f7344e138c993=1617093792',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    }

    html1 = _fetch_tree(url, head, timeout)
    law_name = _first_text(
        html1, '//body//div//p[@class="nav-f24"]/text()', 'name', a)
    phone_str = _first_text(
        html1, '//body//div//ul[@class="contact-list fr"]//li[2]/text()',
        'phone', a)

    # The office details live on the same URL with intro.aspx appended.
    url1 = 'https://www.*****.cn/lawyeroffice/' + str(a) + '/intro.aspx'
    html2 = _fetch_tree(url1, head, timeout)
    details_str = _first_text(
        html2, '//body//div//p[@class="t2"]/text()', 'details', a)

    print(law_name + "------" + phone_str + "------" + details_str)
    print("------当前数据ID为" + str(a))

    _save_record(a, law_name, phone_str, details_str)

if __name__ == '__main__':
    # Per the original author's note, the first 100 ids on the site have no
    # data, so every loop index is shifted by 100 to get the office id.
    first_office_id = 100 + 2992
    stop_office_id = 100 + 40000

    for office_id in range(first_office_id, stop_office_id):
        # Visual separator between offices in the console output.
        print('***************************************************')
        # Wait 4 seconds before each scrape.
        time.sleep(4)
        try:
            crow_first(office_id)
        except Exception as err:
            # Report the failure and move on to the next office id.
            print(err)
            print('------------------')