#encoding=utf8
import pymysql
import time
import sys
import requests
import os
import types
# HTML entity escaping
import cgi
import urllib2
import warnings
# Regular expressions
import re
# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
# the process-wide default string encoding can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')
from pyquery import PyQuery as pq
from lxml import etree
# Silence all warnings (the original note says "errors", but this call only
# filters warnings).
warnings.filterwarnings("ignore")
# Download an image
def dowloadPic(imageUrl,filePath):
    """Download the resource at ``imageUrl`` and save it to ``filePath``.

    Args:
        imageUrl: URL of the image to fetch.
        filePath: Destination path; the file is opened in binary write mode,
            so an existing file is overwritten.

    Returns:
        ``None`` when the response was written to ``filePath``; otherwise the
        HTTP status code of the failed request (e.g. ``404``). Nothing is
        written in the failure case.

    Raises:
        requests.exceptions.RequestException: propagated from
            ``requests.get`` (connection errors, the 60-second timeout, ...).
    """
    r = requests.get(imageUrl, timeout=60)
    status = r.status_code
    # Any 4xx/5xx body is an error page, not image data: report the status
    # instead of saving it. A 404 still yields 404, as callers expect.
    if status >= 400:
        return status
    with open(filePath, "wb") as out:
        out.write(r.content)
# Fetch the page at final_url and start extracting its image URLs and the
# markup of its `.top` element.
#
# Side effects visible here: prints final_url, overwrites ./url.txt with it,
# opens a MySQL connection and issues an HTTP GET.
# Returns 'back' when the page has no `.pic a` link.
# NOTE(review): the remainder of this function is not visible in this excerpt
# (conn, cursor, detail_url and cate are not used below), so its final return
# value and any database writes cannot be documented from here.
def getData(final_url):
print final_url
# Record the URL being processed; mode 'w' replaces the file content each call.
file_open=open('./url.txt', 'w')
file_open.write(final_url)
file_open.close()
# Connect to the database
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='root', db='bookinfo', charset='utf8')
# Create the cursor (DictCursor class)
cursor = conn.cursor(cursor=pymysql.cursors.DictCursor)
# Source URL, kept for insertion into the database
detail_url=final_url
r=requests.get(final_url)
# Decode the response body as GBK, ignoring undecodable bytes, then parse it.
d=pq(r.content.decode('gbk',"ignore"))
# Get the large image URL
# URL that is compared against below to detect a page without a real image
none_img='http://images.zxhsd.com/photo/book_m//nologo.gif'
# Large image URL prefix
bigtop='http://images.zxhsd.com/photo/book_m'
# Small image URL prefix
smalltop='http://www.zxhsd.com/photo/book_m'
big_path=d('.pic').find('a').attr('href')
# No link under `.pic`: stop and return the 'back' marker to the caller.
if big_path is None:
return 'back'
# Link equals none_img: use empty strings for both image paths.
elif big_path == none_img:
big_path=''
small_path=''
else:
# Derive the small image URL by swapping the large prefix for the small one.
small_path=big_path.replace(bigtop,smalltop)
# Get the HTML of the `.top` element (the original note calls it the a-tag HTML)
ahtml=d('.top').html()
# Parse that HTML
cate=pq(ahtml)
# Get the last category among the categories
# To get the last tag, the a-tag HTML must first be fetched and parsed
# (the original note breaks off here)
# Python 2.7 crawler for the Zhejiang Xinhua website
# (Source-page footer: latest recommended article published 2025-04-15 23:11:14)