Python 爬虫：糗事百科段子

最新推荐文章于 2024-05-03 14:57:49 发布

LJXZDN

最新推荐文章于 2024-05-03 14:57:49 发布

阅读量319

点赞数 1

分类专栏： Python

本文链接：https://blog.youkuaiyun.com/LJXZDN/article/details/80939697

版权

Python 专栏收录该内容

15 篇文章

订阅专栏

from bs4 import BeautifulSoup
import requests
import threading
import math
import lxml

# Browser-like User-Agent; main() hands this mapping to get_single_page().
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) '
        'Gecko/20100101 Firefox/61.0'
    ),
}


def get_single_page(url, headers):
    """Fetch a single page and return its body as text.

    Args:
        url: Address of the page to download.
        headers: Mapping of HTTP request headers (for example User-Agent).

    Returns:
        The ``text`` attribute of the response returned by ``requests.get``.
    """
    # The headers must be passed by keyword: the second positional parameter
    # of requests.get() is ``params``, so a positional dict would end up in
    # the URL query string and never be sent as request headers.
    response = requests.get(url, headers=headers)
    return response.text


def get_content(pages):
    """Extract author/content pairs from downloaded HTML pages.

    Args:
        pages: Iterable of HTML document strings.

    Returns:
        A dict mapping the text of each ``<h2>`` tag (treated as the author)
        to the text of the element with class ``content`` found at the same
        position on the same page. Because the author text is the key, a
        later entry with identical author text replaces the earlier one.
    """
    dict_authors_content = {}
    for page in pages:
        soup = BeautifulSoup(page, 'lxml')
        author_tags = soup.find_all('h2')
        content_tags = soup.find_all(class_='content')
        # Pair tags page by page so that a count mismatch on one page cannot
        # shift the author/content alignment of every following page.
        for author_tag, content_tag in zip(author_tags, content_tags):
            # get_text() always yields a str; ``.string`` is None when the
            # tag has more than one child, which would produce a None key.
            dict_authors_content[author_tag.get_text()] = content_tag.get_text()

    return dict_authors_content


def download(i, tname, directory='D://meizi/'):
    """Append one author/content pair to ``<directory>/<tname>.txt``.

    Args:
        i: Pair ``(author, content)`` of strings; both are written back to
            back with no separator added.
        tname: Base name of the output file; surrounding whitespace is
            stripped before use.
        directory: Folder that receives the file. It is created when it does
            not exist yet. Defaults to the location that used to be
            hard-coded.
    """
    import os  # local import: keeps this function usable on its own

    tname = tname.strip()
    if directory:
        # exist_ok makes this safe when several threads reach it at once.
        os.makedirs(directory, exist_ok=True)
    path = os.path.join(directory, tname + '.txt')
    with open(path, 'a', encoding='utf-8') as f:
        # write() emits each string in one call; writelines() on a str would
        # iterate it character by character.
        f.write(i[0])
        f.write(i[1])


class Spider(threading.Thread):
    """Worker thread: parses its share of pages and saves the results.

    Each instance receives a list of HTML pages. When started, it extracts
    the author/content pairs with get_content() and appends them through
    download() to a file named after the thread.
    """

    def __init__(self, pages):
        super().__init__()  # sets up the thread, including self.name
        self.pages = pages
        print(self.name)

    def run(self):
        """Parse the assigned pages and store every extracted pair."""
        # items() yields (author, content) tuples, the shape download() expects.
        for pair in get_content(self.pages).items():
            download(pair, self.name)
        print('下载成功')


def _split_pages(pages, parts):
    """Cut pages into `parts` contiguous chunks whose sizes differ by at most one."""
    base, extra = divmod(len(pages), parts)
    chunks = []
    start = 0
    for index in range(parts):
        # The first `extra` chunks take one additional page each, so the
        # remainder is handed out instead of being dropped.
        stop = start + base + (1 if index < extra else 0)
        chunks.append(pages[start:stop])
        start = stop
    return chunks


def main(page_num):
    """Download `page_num` listing pages and process them with worker threads.

    Pages 1..page_num are fetched one after another with get_single_page().
    The resulting HTML documents are then divided among at most ten Spider
    threads, which are started before this function returns; it does not wait
    for them to finish.

    Args:
        page_num: Number of listing pages to fetch. Values below one fetch
            nothing and start no threads.
    """
    # Collect the HTML of every requested page.
    pages = []
    for page_no in range(1, page_num + 1):
        url = 'https://www.qiushibaike.com/8hr/page/%d/' % page_no
        pages.append(get_single_page(url, headers=headers))

    # Use at most ten workers, and never more workers than pages.
    t_count = min(10, len(pages))
    if t_count == 0:
        return

    # Every page is assigned to exactly one worker. The earlier floor-based
    # slicing left the trailing len(pages) % t_count pages unprocessed.
    threads = [Spider(chunk) for chunk in _split_pages(pages, t_count)]
    for worker in threads:
        worker.start()


if __name__ == '__main__':
    # Ask how many listing pages to download, then start the crawl.
    main(int(input('请输入下载的页数:')))