A web-scraping beginner's first practice project, shared for learning and discussion.
Tools
- PyCharm
- requests (HTTP request library)
- BeautifulSoup (HTML parsing library)
- Google Chrome
Code
import requests
import bs4
from bs4 import BeautifulSoup

# Pretend to be a browser and fetch the page source
def getHtml(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'}  # spoof a browser User-Agent
    response = requests.get(url, headers=headers)  # issue the GET request
    if response.status_code == 200:
        return response.text  # on a normal response, return the page source
    return None
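
# A minimal hardened sketch of the same fetch (the 10-second timeout and the
# exception handling below are my own assumptions, not part of the original):
def getHtmlSafe(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'}
    try:
        response = requests.get(url, headers=headers, timeout=10)  # don't hang forever
        response.raise_for_status()  # raise on 4xx/5xx instead of silently returning None
        return response.text
    except requests.RequestException:
        return None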
# Collect every novel listed on the page
def get_chinese_novel(url):
    html = getHtml(url)
    soup = BeautifulSoup(html, 'lxml')  # parse with the lxml parser
    novels_dir = []  # result list
    for all_li in soup.find_all('li', class_='media'):  # every <li> node whose class is "media"
        if isinstance(all_li, bs4.element.Tag):  # make sure the node is a Tag
            for all_novels in all_li.find_all('a', class_='text-white'):
                novels_info = {}
                novels_info['name'] = all_novels.get_text().strip('\n')  # put the node's text into the dict
                novels_info['url'] = 'https://www.wuxiaworld.com' + all_novels.get('href')  # prepend the site root to the relative href
                novels_dir.append(novels_info['name'])  # append the name to the list
                novels_dir.append(novels_info['url'])   # append the URL right after it
    return novels_dir  # a flat name, url, name, url, ... list
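
# For illustration only: the extraction above assumes listing markup roughly
# like the hypothetical snippet below (the real wuxiaworld page may differ).
def demo_parse():
    sample = '''
    <li class="media">
      <a class="text-white" href="/novel/example-novel">Example Novel</a>
    </li>
    '''
    demo = BeautifulSoup(sample, 'lxml')
    link = demo.find('a', class_='text-white')
    print(link.get_text().strip(), '->', 'https://www.wuxiaworld.com' + link.get('href'))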
# Fetch one novel's table of contents; same flow as above, only the url is now that novel's page
def get_novel_dir(url):
    html = getHtml(url)
    soup = BeautifulSoup(html, 'lxml')
    books_dir = []
    for all_title in soup.find('div', class_='panel-group').children:
        if isinstance(all_title, bs4.element.Tag):
            books_info = {}
            for all_a in all_title.find_all('li', class_='chapter-item'):
                books_info['name'] = all_a.get_text()
                books_info['url'] = 'https://www.wuxiaworld.com' + all_a.a.get('href')  # the chapter link lives in the nested <a>
                books_dir.append(books_info['name'])
                books_dir.append(books_info['url'])
    return books_dir
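
# Note: .children above yields only direct children, hence the isinstance
# check. An equivalent sketch using a CSS selector (same hypothetical markup
# assumed; this variant collects only the chapter URLs):
def get_novel_dir_urls(url):
    soup = BeautifulSoup(getHtml(url), 'lxml')
    return ['https://www.wuxiaworld.com' + li.a.get('href')
            for li in soup.select('div.panel-group li.chapter-item')]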
# Turn the flat list returned by get_chinese_novel() into a dict
def novels_dict(url):
    novels = {}  # renamed from "dict" to avoid shadowing the built-in
    list_novels = get_chinese_novel(url)
    for i in range(len(list_novels) - 1):
        if i % 2 == 0:
            novels[list_novels[i]] = list_novels[i + 1]  # pair the list up two by two, so links can be looked up by title
    return novels
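
# An equivalent, more idiomatic sketch of the same pairing: zip the
# even-index names with the odd-index URLs instead of walking indices by hand.
def novels_dict_zip(url):
    list_novels = get_chinese_novel(url)
    return dict(zip(list_novels[0::2], list_novels[1::2]))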
url = 'https://www.wuxiaworld.com/language/chinese'
novel_name = input("Enter a novel title: ")
for i in get_novel_dir(novels_dict(url)[novel_name]):
    print(i)
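
One caveat: if the title entered is not on the listing page, the dictionary lookup raises a KeyError. A minimal sketch of a friendlier lookup (the fallback message is my own addition):

    novel_url = novels_dict(url).get(novel_name)
    if novel_url is None:
        print('Title not found on the listing page.')
    else:
        for chapter in get_novel_dir(novel_url):
            print(chapter)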
Results as shown in the original screenshot (figure omitted).

This post walks through how a scraping beginner uses Python with PyCharm, requests, and BeautifulSoup to collect Chinese-novel information from wuxiaworld.com, including each novel's title and URL, and shows how to parse the page source to retrieve a novel's chapter list.