1、任选一小说网站,爬取任意一部小说,以记事本的形式保存。
第一种情况(网址可能已失效):
import requests
from lxml import etree
def get_url(url='http://www.yuetutu.com/cbook_24378/', timeout=10):
    """Download the novel's chapter-index page and parse it.

    Args:
        url: Address of the chapter-index page. Defaults to the page this
            script was originally written for.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The element tree produced by ``etree.HTML`` from the response body.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout, requests waits indefinitely on a silent server.
    r = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently parsing an error page.
    r.raise_for_status()
    # Decode as UTF-8, the same assumption get_content makes for the
    # chapter pages of this site.
    r.encoding = 'utf8'
    return etree.HTML(r.text)
def get_book_url(tree, skip=8):
    """Collect ``(title, href)`` pairs for the chapters on the index page.

    Args:
        tree: Parsed index page; anything exposing ``xpath`` the way an
            lxml element does.
        skip: Number of leading ``<dd>`` entries to ignore. The default of 8
            keeps the original behaviour of starting at the ninth entry.
            (Presumably the leading entries are a "latest chapters" block
            that duplicates later links -- confirm against the live page.)

    Returns:
        A list of ``(title, href)`` tuples in page order. Entries that have
        no ``<a>`` child, or whose link has no ``href``, are left out.
    """
    book_list = []
    dd_list = tree.xpath('//*[@id="list"]/dl/dd')
    # A slice replaces the hand-maintained counter; max() guards against a
    # negative skip silently turning into "take the last N entries".
    for dd in dd_list[max(skip, 0):]:
        links = dd.xpath('a')
        if not links:
            # A <dd> without a link used to raise IndexError; skip it.
            continue
        link = links[0]
        href = link.attrib.get('href')
        if href is None:
            # No target to download, so there is nothing useful to record.
            continue
        book_list.append((link.text, href))
    return book_list
def get_content(book, save_dir=r'C:\Users\dell\Desktop\小说', timeout=10):
    """Download every chapter in *book* and save each one as a ``.txt`` file.

    Args:
        book: Iterable of ``(title, href)`` entries as produced by
            ``get_book_url``; ``href`` is resolved against the site domain.
        save_dir: Folder that receives the text files. It is created when
            it does not exist yet.
        timeout: Seconds to wait for each chapter request.

    Raises:
        requests.RequestException: If a chapter request fails, times out,
            or returns an HTTP error status.
        OSError: If the folder or a chapter file cannot be written.
    """
    import os
    import re
    from urllib.parse import urljoin

    domain = 'http://www.yuetutu.com'
    # The original path literal ended in "\ " (a raw string cannot end in a
    # backslash), which put a leading space in every file name. Joining the
    # folder and file name with os.path.join avoids that workaround.
    os.makedirs(save_dir, exist_ok=True)
    for entry in book:
        r = requests.get(urljoin(domain, entry[1]), timeout=timeout)
        r.raise_for_status()
        r.encoding = 'utf8'
        tree = etree.HTML(r.text)
        heading = tree.xpath('//*[@id="wrapper"]/div[3]/div/div[2]/h1')
        # Fall back to the title from the index page when the chapter page
        # has no matching <h1> (this used to raise IndexError).
        title = (heading[0].text if heading else None) or entry[0] or ''
        text = tree.xpath('//*[@id="content"]/text()')
        # Replace characters that are not allowed in Windows file names.
        file_name = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', title).strip() + '.txt'
        with open(os.path.join(save_dir, file_name), 'w', encoding='utf8') as f:
            f.writelines(text)
        print('******{} 下载完成! ******'.format(title))
if __name__ == '__main__':
    # Script entry point: fetch the index page, collect the chapter links,
    # then download each chapter in turn.
    index_tree = get_url()
    chapters = get_book_url(index_tree)
    get_content(chapters)
第二种情况:
import requests
from lxml import etree
def get_url(url='https://www.51biquge.com/book_12066/', timeout=10):
    """Download the novel's chapter-index page and parse it.

    Args:
        url: Address of the chapter-index page. Defaults to the page this
            script was originally written for.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The element tree produced by ``etree.HTML`` from the response body.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout, requests waits indefinitely on a silent server.
    r = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently parsing an error page.
    r.raise_for_status()
    # Decode as UTF-8, the same assumption get_content makes for the
    # chapter pages of this site.
    r.encoding = 'utf8'
    return etree.HTML(r.text)
def get_book_url(tree, skip=12):
    """Collect ``(title, href)`` pairs for the chapters on the index page.

    Args:
        tree: Parsed index page; anything exposing ``xpath`` the way an
            lxml element does.
        skip: Number of leading ``<dd>`` entries to ignore. The default of
            12 keeps the original behaviour of starting at the thirteenth
            entry. (Presumably the leading entries are a "latest chapters"
            block that duplicates later links -- confirm against the page.)

    Returns:
        A list of ``(title, href)`` tuples in page order, with surrounding
        whitespace removed from each title. Entries that have no ``<a>``
        child, or whose link has no ``href``, are left out.
    """
    book_list = []
    dd_list = tree.xpath('//*[@id="list"]/dl/dd')
    # A slice replaces the hand-maintained counter; max() guards against a
    # negative skip silently turning into "take the last N entries".
    for dd in dd_list[max(skip, 0):]:
        links = dd.xpath('a')
        if not links:
            # A <dd> without a link used to raise IndexError; skip it.
            continue
        link = links[0]
        href = link.attrib.get('href')
        if href is None:
            # No target to download, so there is nothing useful to record.
            continue
        # link.text is None for an empty <a>; calling .strip() on it used
        # to raise AttributeError.
        book_list.append(((link.text or '').strip(), href))
    return book_list
def get_content(book, save_dir=r'C:\Users\dell\Desktop\小说', timeout=10):
    """Download every chapter in *book* and save each one as a ``.txt`` file.

    Args:
        book: Iterable of ``(title, href)`` entries as produced by
            ``get_book_url``; ``href`` is resolved against the site domain.
        save_dir: Folder that receives the text files. It is created when
            it does not exist yet.
        timeout: Seconds to wait for each chapter request.

    Raises:
        requests.RequestException: If a chapter request fails, times out,
            or returns an HTTP error status.
        OSError: If the folder or a chapter file cannot be written.
    """
    import os
    import re
    from urllib.parse import urljoin

    domain = 'https://www.51biquge.com'
    # The original path literal ended in "\ " (a raw string cannot end in a
    # backslash), which put a leading space in every file name. Joining the
    # folder and file name with os.path.join avoids that workaround.
    os.makedirs(save_dir, exist_ok=True)
    for entry in book:
        r = requests.get(urljoin(domain, entry[1]), timeout=timeout)
        r.raise_for_status()
        r.encoding = 'utf8'
        tree = etree.HTML(r.text)
        heading = tree.xpath('//*[@id="main"]/div/div/div[2]/h1')
        # Fall back to the title from the index page when the chapter page
        # has no matching <h1> (this used to raise IndexError).
        title = (heading[0].text if heading else None) or entry[0] or ''
        text = tree.xpath('//*[@id="content"]//p/text()')
        # Replace characters that are not allowed in Windows file names.
        file_name = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', title).strip() + '.txt'
        with open(os.path.join(save_dir, file_name), 'w', encoding='utf8') as f:
            f.writelines(text)
        print('******{} 下载完成! ******'.format(title))
if __name__ == '__main__':
    # Run the three stages in order: index page -> chapter list -> files.
    parsed_index = get_url()
    chapter_links = get_book_url(parsed_index)
    get_content(chapter_links)