Basic approach:
1. Define the spider class and initialize it
2. Build the page URL and paginate
3. Request the page (a short sketch of steps 2 and 3 follows this outline)
4. Parse the data: use bs4 to match the information we want, then follow each book's own page to get its download link
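A minimal sketch of steps 2 and 3, assuming the same base_url and User-Agent string as the complete code at the end:

import requests

base_url = "http://www.allitebooks.org/page/"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}
for page in range(1, 3):  # step 2: paginate by appending the page number to base_url
    url = base_url + str(page)
    html = requests.get(url, headers=headers).content.decode('utf-8')  # step 3: request the page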
Each book's information sits inside a div tag with the 'entry-body' class, so first grab every book block with select('.entry-body')
4.1 Match the book title and link
The book's link is in an a tag; read its href attribute: a['href']
The title is in the h2 tag with the entry-title class; use get_text() to read it: select('.entry-title')[0].get_text()
4.2 Match the author
select('.entry-author')[0].get_text()
4.3 Match the book's summary
select('.entry-summary')[0].get_text()
4.4 Get each book's download link
The download links sit in span tags with the download-links class. Some books come in both PDF and EPUB; prefer the EPUB version, which is always listed second: download_url[1]
res = requests.get(book_url, headers={'User-Agent': self.User_Agent}).content.decode('utf-8')
soup_1 = BeautifulSoup(res, 'lxml')
download_url = soup_1.select('.download-links')
if len(download_url) == 2:  # some books have both PDF and EPUB formats
    download_url_1 = download_url[1].a['href']  # prefer the EPUB version, i.e. download download_url_1
else:
    download_url_1 = download_url[0].a['href']
4.5 Aggregate the data: collect the four pieces of information above into one list
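Putting 4.1 to 4.5 together, the per-book loop looks roughly like the sketch below (it mirrors the ansys_data method in the complete code; soup is assumed to be the parsed listing page and UA a browser User-Agent string):

import requests
from bs4 import BeautifulSoup

info_list = [[], [], [], [], []]  # book_name, auther, introduction, book_url, download_url
for one in soup.select('.entry-body'):  # one '.entry-body' block per book
    book_url = one.a['href']                                    # 4.1 link
    book_name = one.select('.entry-title')[0].get_text()        # 4.1 title
    auther = one.select('.entry-author')[0].get_text()          # 4.2 author
    introduction = one.select('.entry-summary')[0].get_text()   # 4.3 summary
    # 4.4 follow the book's page and pick the EPUB link when both formats exist
    res = requests.get(book_url, headers={'User-Agent': UA}).content.decode('utf-8')
    download_url = BeautifulSoup(res, 'lxml').select('.download-links')
    download_url_1 = download_url[1].a['href'] if len(download_url) == 2 else download_url[0].a['href']
    # 4.5 collect everything column by column
    info_list[0].append(book_name)
    info_list[1].append(auther)
    info_list[2].append(introduction)
    info_list[3].append(book_url)
    info_list[4].append(download_url_1)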
5. Save the data as CSV
5.1 A CSV written the way shown below ends up with every character separated by commas, and it looks a mess when opened in Excel
# Define the CSV writer (needs import csv)
writer = csv.writer(self.save_csv_fp)
# Write the header row
writer.writerow(['book_name', 'book_auther', 'book_introduction', 'book_url'])
# Write the data rows
writer.writerows(info_list)
5.2 The code below writes the CSV properly: build a pandas.DataFrame() and then convert it to CSV
title = ['book_name', 'book_auther', 'book_introduction', 'book_url']
saver = pandas.DataFrame(data=info_list, columns=title)
saver.to_csv(self.save_csv_fp, header=False, index=False, sep=',', mode='a', encoding='utf-8')
# header=False keeps the column names out of the output, so a header is not written on every loop iteration
5.3 Save directly as an Excel sheet; this part follows https://blog.youkuaiyun.com/u013250071/article/details/81911434
book = xlwt.Workbook(encoding='utf-8')  # create the Workbook, i.e. the Excel file
# Create a sheet; 'Sheet1' is the sheet name, cell_overwrite_ok controls whether a cell may be overwritten
sheet1 = book.add_sheet(u'Sheet1', cell_overwrite_ok=False)
title = ['book_name', 'book_auther', 'book_introduction', 'book_url']
# Write the data into the sheet; i is the running row index (one row per book)
i = i + 1
for j in range(0, 4):
    sheet1.write(i, j, info_list[0][j])
if i == 9:
    book.save(self.save_csv_fp)
When saving the Excel file, open the output file in wb mode, i.e. self.save_csv_fp = open('IT_ebooks_save.csv', 'wb'); otherwise encoding errors are likely.
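For reference, a self-contained sketch of this xlwt variant could look like the following (it assumes the column-wise info_list built in step 4.5; the output file name is only an example):

import xlwt

book = xlwt.Workbook(encoding='utf-8')  # the Workbook is the Excel file
sheet1 = book.add_sheet(u'Sheet1', cell_overwrite_ok=False)
title = ['book_name', 'book_auther', 'book_introduction', 'book_url']
for j in range(4):
    sheet1.write(0, j, title[j])  # header row
for i in range(len(info_list[0])):  # one row per book
    for j in range(4):
        sheet1.write(i + 1, j, info_list[j][i])
book.save('IT_ebooks_save.xls')  # xlwt writes the old .xls format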
6. Download the books
6.1 Save the download links to a txt file, then feed the links to Thunder (迅雷) to do the downloading
For books that have both PDF and EPUB formats, save both download links. Each book's title and its download links go on one line, separated by tabs.
res = requests.get(book_url, headers={'User-Agent': self.User_Agent}).content.decode('utf-8')
soup_1 = BeautifulSoup(res, 'lxml')
download_url = soup_1.select('.download-links')
with open('save_txt.txt', 'a') as f:
f.write(book_name+'\t')
if len(download_url) == 2:
download_url_1 = download_url[0].a['href']
download_url_2 = download_url[1].a['href']
f.write(download_url_1 + '\t'+download_url_2+'\n')
else:
download_url_1 = download_url[0].a['href']
f.write(download_url_1 + '\n')
Using the TXT file of links saved above, call Thunder's API from code to start the downloads automatically. Set Thunder to one-click download (I use Thunder X: Settings Center -> Download Management) so nobody has to click the "download now" button for each task. The Thunder part follows https://blog.youkuaiyun.com/yanhuangzhinu0/article/details/82750177
o = Dispatch("ThunderAgent.Agent64.1") # 64位的
# o = Dispatch("ThunderAgent.Agent.1") # 32位的
# #AddTask("下载地址", "另存文件名", "保存目录","任务注释","引用地址","开始模式", "只从原始地址下载","从原始地址下载线程数")
course_path = "F:\\迅雷下载"
o.AddTask(download_url, '', course_path, "", "", 1, 0, 5)
o.CommitTasks()
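Combining the two snippets above, a rough sketch that walks the saved save_txt.txt and queues every link in Thunder might look like this (win32com needs to be installed; the save path is only an example):

from win32com.client import Dispatch

o = Dispatch("ThunderAgent.Agent64.1")  # 64-bit Thunder COM interface
course_path = "F:\\迅雷下载"
with open('save_txt.txt', 'r') as f:
    for line in f:
        fields = line.strip().split('\t')  # book name, then one or two download links
        for download_url in fields[1:]:
            o.AddTask(download_url, '', course_path, "", "", 1, 0, 5)
o.CommitTasks()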
6.2 Download directly
At first I followed https://www.cnblogs.com/tv151579/p/4470140.html and downloaded with a plain requests.get(), but every file came back as a 404 error, even after adding headers. I finally found https://www.jianshu.com/p/abbf8d331c06, which uses requests.get() in stream mode and downloads successfully; stream mode is designed for downloading large files.
def download_books(self, info_list):
download_list = info_list[4]
for url in download_list:
# pick a random browser User-Agent
headers = {"User-Agent": random.choice(self.USER_AGENTS)}
file_name = re.findall(r'\d{8}/(.*)', url)
with open('E:\迅雷下载\电子书\英语IT书\\' + str(file_name[0]), 'wb') as f:
print("正在下载", file_name[0])
response = requests.get(url, stream=True, headers=headers)
# get the file size from the response headers
total_length = response.headers.get('content-length')
# if the size is unknown, just write the whole body at once
if total_length is None:
f.write(response.content)
else:
# download progress bar
dl = 0
total_length = int(total_length)  # file size in bytes
fsize = total_length / 1024
print("File size: {}k, downloading...".format(fsize))
for data in response.iter_content(chunk_size=4096):  # read 4096 bytes per chunk
dl += len(data)
f.write(data)
done = int(100 * dl / total_length)
sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (100 - done))) # 打印进度条
sys.stdout.write("已下载:{}k".format(dl / 1024))
sys.stdout.flush()
print('\n' + str(file_name[0]) + ' downloaded!')
Complete code
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time: 2019/5/12 13:23
import requests
from bs4 import BeautifulSoup
import pandas
import re
import sys
import random
class All_itebooks_Spider():
def __init__(self):
self.base_url = "http://www.allitebooks.org/page/"
self.User_Agent = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
self.save_csv_fp = open('IT_ebooks_save.csv', 'w', newline='')
self.USER_AGENTS = [
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
"Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
]
# 1. Request the page
def get_responce(self, url):
res = requests.get(url, headers={'User-Agent': self.User_Agent}).content.decode('utf-8')  # the site serves UTF-8 pages
# print(res)
return res
# 2. Build the URL and paginate
def get_newUrl(self, page):
url = self.base_url+str(page)
print('Fetching page {}: '.format(page) + url)
data = self.get_responce(url)
return data
# 3. Parse the data: use bs4 to match the information we want, then fetch each book's page for its download link
def ansys_data(self, data):
soup = BeautifulSoup(data, 'lxml')
coment = soup.select('.entry-body')
info_list = [[], [], [], [], []]
for one in coment:
# 3.1 Match the book title and link
book_url = one.a['href']
book_name = one.select('.entry-title')[0].get_text()
# 3.2 Author
auther = one.select('.entry-author')[0].get_text()
# 3.3 Summary
introduction = one.select('.entry-summary')[0].get_text()
introduction = re.findall(r'\n(.*)\n', introduction)
# 3.4 Get the download link
res = requests.get(book_url, headers={'User-Agent': self.User_Agent}).content.decode('utf-8')
soup_1 = BeautifulSoup(res, 'lxml')
download_url = soup_1.select('.download-links')
if len(download_url) == 2:  # some books have both PDF and EPUB formats
download_url_1 = download_url[1].a['href']  # prefer the EPUB version, i.e. download_url_1
else:
download_url_1 = download_url[0].a['href']
# 3.5 Aggregate the data
info_list[0].append(book_name)
info_list[1].append(auther)
info_list[2].append(introduction)
info_list[3].append(book_url)
info_list[4].append(download_url_1)
return info_list
# 4. Save the data as CSV
def save_csv(self, info_list, page):
saver = pandas.DataFrame({'book_name': info_list[0],
'book_auther': info_list[1],
'book_introduction': info_list[2],
'book_url': info_list[3]
})
if page == 1:
saver.to_csv(self.save_csv_fp, index=False, sep=',', mode='a', encoding='utf-8')
else:
# header=False: don't write the column names again
saver.to_csv(self.save_csv_fp, header=False, index=False, sep=',', mode='a', encoding='utf-8')
# 5. Download the books
def download_books(self, info_list):
download_list = info_list[4]
for url in download_list:
# pick a random browser User-Agent
headers = {"User-Agent": random.choice(self.USER_AGENTS)}
file_name = re.findall(r'\d{8}/(.*)', url)
with open('E:\迅雷下载\电子书\英语IT书\\' + str(file_name[0]), 'wb') as f:
print("正在下载", file_name[0])
response = requests.get(url, stream=True, headers=headers)
# get the file size from the response headers
total_length = response.headers.get('content-length')
# if the size is unknown, just write the whole body at once
if total_length is None:
f.write(response.content)
else:
# download progress bar
dl = 0
total_length = int(total_length)  # file size in bytes
fsize = total_length / 1024
print("File size: {}k, downloading...".format(fsize))
for data in response.iter_content(chunk_size=4096):  # read 4096 bytes per chunk
dl += len(data)
f.write(data)
done = int(100 * dl / total_length)
sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (100 - done))) # 打印进度条
sys.stdout.write("已下载:{}k".format(dl / 1024))
sys.stdout.flush()
print('\n' + str(file_name[0]) + ' downloaded!')
# 6. Run the program
def start(self):
for page in range(3, 4):
data = self.get_newUrl(page)
info_list = self.ansys_data(data)
self.save_csv(info_list, page)
self.download_books(info_list)
All_itebooks_Spider().start()