[Python Crawler] Simple Case Study, Part 4

This post continues from my previous one, [Python Crawler] Simple Case Study, Part 3, on my CSDN blog.

Contents

3.4 Complete code


3.4 Complete code

This subsection gives the complete code for the case described above:

# encoding=utf-8
import json
import re

import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
 
"""
爬取科普中国-图文
"""
 
class MySpider(object):
    """科普中国-图文"""
    def __init__(self):
        self.base_url = 'https://cloud.kepuchina.cn/newSearch/imageText?s=&start_time=&end_time=&type=1&keyword=&can_down=0&category_id=0&size=21&sort_rule=0&industry_category=0&subject_category=0&kp_category=0&crowd_category=0&spread_category=0&page='
        self.url = self.base_url + str(0)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67'
        }
        self.index_list = []
        self.index_article = {}
 
 
    def get(self, url):
        """请求并返回网页源代码"""
        try:
            response = requests.get(url, headers=self.headers)
            if response.status_code == 200:
                return response.text
        except Exception as err:
            print('get():', err)
 
 
    def parse(self, start_page, pages_num):
        """
        解析科普中国网站地址url
        :param start_page: 开始页面
        :param pages_num: 想要爬取的页面数量
        :return: 
        """
        for page in range(start_page, start_page+pages_num):
            # Load the fetched page source into a BeautifulSoup object
            soup = BeautifulSoup(self.get(self.base_url + str(page)), 'html.parser')

            # Iterate over the article blocks in the listing
            for i in soup.findAll('div', class_="list-block _blockParma"):
                # Create a BeautifulSoup object for this block
                soup_i = BeautifulSoup(str(i), 'html.parser')
                # Extract the article title and URL, subtitle, tags, publisher and publish time
                title = soup_i.find('a', class_='_title').text
                title_url = soup_i.find('a', class_='_title')['href']
                subtitle = soup_i.find('p', class_='info').find('a').text
                tags = [a.text for a in soup_i.find_all('a', class_='typeColor')]
                publisher = soup_i.find('a', class_='source-txt').text.strip()
                publish_time = soup_i.find('span', class_='_time').text
                self.index_article = {"title": title, "title_url": title_url, "subtitle": self.clean(subtitle), "tag": tags, "publisher": publisher, "publish_time": publish_time}
                
                # Fetch the article body text, image count and image URLs
                self.parse_page(title_url)  

                if self.index_article not in self.index_list:   # avoid storing duplicates
                    self.index_list.append(self.index_article)

            print("已完成" + str(page+1) + "页的存储")

        # self.get_json(self.index_list, "1.json")
        self.save_excel(self.index_list,  "result_" + str(start_page) + "_" + str(pages_num) + ".xlsx")
 
 
    def get_json(self, datas_list, filename):
        """
        Save the article list to a JSON file.
        :param datas_list: list of article dicts
        :param filename: output JSON file name
        :return:
        """
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(datas_list, f, ensure_ascii=False, indent=2)
 
 
    def save_excel(self, inputData, outPutFile):
        '''
        Write a list of dicts to an Excel file.
        inputData: list of dicts, e.g. [{'key_a': '123'}, {'key_b': '456'}]
        outPutFile: output file name, e.g. 'data.xlsx'
        '''
        labels = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L']
        wb = Workbook()
        sheet = wb.active
        sheet.title = "Sheet1"
        # Write the header row from the keys of the first record
        item_0 = inputData[0]
        i = 0
        for key in item_0.keys():
            sheet[labels[i] + str(1)].value = key
            i = i + 1
        # Write one row per record; join list values into a single cell
        j = 1
        for item in inputData:
            k = 0
            for key in item:
                value = item[key]
                if isinstance(value, (list, tuple)):
                    value = ' '.join(str(w) for w in value)
                sheet[labels[k] + str(j + 1)].value = value
                k = k + 1
            j = j + 1
        wb.save(outPutFile)
        print('Data written to Excel.')
 
 
    def parse_page(self, title_url):
        """
        进一步解析页面,得到页面的文本content、图片数量以及地址
        :param title_url: 文章标题的网页地址
        :return:
        """
        try:
            response = requests.get(title_url, headers=self.headers)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                # Get the article body; the selector matches the page's current HTML structure
                content_div = soup.find('div', class_='content-box __imgtext-content')
                if content_div:
                    content = self.clean(content_div.text)
                else:
                    content = ""
                
                # Collect image URLs, filtering out unwanted sources (logos, share icons, etc.)
                img_url = []
                all_imgs = soup.find_all('img')
                for img in all_imgs:
                    src = img.get('src')
                    if src and 'kepuyun' in src and 'logo' not in src and 'wechat' not in src and 'weibo' not in src:
                        img_url.append(src)
                img_num = len(img_url)

                self.index_article["content"] = content
                self.index_article["img_num"] = img_num
                self.index_article["img_url"] = img_url
            else:
                print(f"请求失败,状态码: {response.status_code}")
        except Exception as err:
            print('parse_page:', err)
 
 
    def clean(self, text):
        """清理文本"""
        text = re.sub(r'\n|\r', '', text).strip().replace(r"\n", "")
        text = text.split('\ue62b')[0]
        return text
 
 
    def main(self):
        """
        主函数
        :return: 
        """
        self.parse(0, 1)
 
 
 
if __name__ == "__main__":
    spider = MySpider()
    spider.main()
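
Running the script as-is calls self.parse(0, 1), which crawls a single listing page and writes result_0_1.xlsx to the working directory. As a quick sanity check (a minimal sketch, assuming the crawl succeeded and the file name above; this snippet is not part of the spider itself), you can read the workbook back with openpyxl:

# Hypothetical verification snippet, assuming result_0_1.xlsx was produced by the run above.
from openpyxl import load_workbook

wb = load_workbook("result_0_1.xlsx")
sheet = wb["Sheet1"]
header = [cell.value for cell in sheet[1]]      # column names written by save_excel()
print(header)                                   # e.g. ['title', 'title_url', 'subtitle', ...]
print("articles saved:", sheet.max_row - 1)     # number of data rows, excluding the header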

OK.
