Python2.7 浙江新华网爬虫

最新推荐文章于 2025-04-15 23:11:14 发布

原创

最新推荐文章于 2025-04-15 23:11:14 发布 · 826 阅读

1 ·

CC 4.0 BY-SA版权

本文展示了使用 Python 2.7 编写网络爬虫的实践，抓取目标是浙江新华网上的图书信息。程序通过 requests 获取页面、用 PyQuery 解析 HTML，提取书籍的 ISBN、作者、出版社、价格等关键信息，并下载封面图片；抓取到的数据存储到 MySQL 数据库中，同时处理了页面编码问题和图片下载。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

# encoding=utf8

import pymysql
import time
import sys
import requests
import os
import types

# HTML entity escaping

import cgi
import urllib2
import warnings

# Regular expressions

import re
# Python 2 only: sys.setdefaultencoding is removed from the sys module at
# interpreter start-up, so the module is reloaded to get it back. The default
# codec for implicit str <-> unicode conversions is then switched from ASCII
# to UTF-8. This must run before any code that relies on those conversions.
reload(sys)
sys.setdefaultencoding('utf-8')
from pyquery import PyQuery as pq
from lxml import etree

# Suppress all warnings

warnings.filterwarnings("ignore")

# Download an image

def dowloadPic(imageUrl, filePath):
    """Fetch ``imageUrl`` and save the response body to ``filePath``.

    Args:
        imageUrl: URL of the image to download.
        filePath: Destination path; opened in binary mode and overwritten.

    Returns:
        None when the response body was written to ``filePath``.
        The HTTP status code (for example 404) when the server answered
        with an error status (>= 400); nothing is written in that case.

    Raises:
        requests.exceptions.RequestException: if the request fails or
            exceeds the 60-second timeout passed to ``requests.get``.
        IOError: if ``filePath`` cannot be opened for writing.
    """
    r = requests.get(imageUrl, timeout=60)
    status = r.status_code
    # An error response carries an error page rather than image data, so it
    # must not be saved as the picture. 404 is still reported as 404, as
    # before; other 4xx/5xx codes are now reported instead of being written
    # to disk.
    if status >= 400:
        return status
    with open(filePath, "wb") as code:
        code.write(r.content)
def getData(final_url):
print final_url
file_open=open('./url.txt', 'w')
file_open.write(final_url)
file_open.close()
#链接数据库
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='root', db='bookinfo', charset='utf8')
#设置浮标
cursor = conn.cursor(cursor=pymysql.cursors.DictCursor)
#插入数据库来源地址
detail_url=final_url
r=requests.get(final_url)
d=pq(r.content.decode('gbk',"ignore"))
#获取大图地址
none_img='http://images.zxhsd.com/photo/book_m//nologo.gif'
#大图地址前缀
bigtop='http://images.zxhsd.com/photo/book_m'
#小图地址前缀
smalltop='http://www.zxhsd.com/photo/book_m'
big_path=d('.pic').find('a').attr('href')
if big_path is None:
return 'back'
elif big_path == none_img:
big_path=''
small_path=''
else:
small_path=big_path.replace(bigtop,smalltop)
#获取a标签html
ahtml=d('.top').html()
#解析a标签HTML
cate=pq(ahtml)
#获取分类的最后一个分类
#获取最后一个标签必须先获取a标签的HTML解析后才能