Python网络爬虫从入门到实践, Chapter 6: MongoDB practice notes

This post walks through a crawler for the Hupu forum: fetching pages with Python's requests and BeautifulSoup, parsing each post to extract fields such as the title, link, and author, and storing the results in MongoDB. It also shows how to loop over multiple pages to collect more data.

Target page: https://bbs.hupu.com/bxj

Fields to extract from each post:

Post title
Post link
Author
Author page link
Post date
Reply count
View count
Last reply user
Last reply time

A sample post <li> from the page source:
<html>
 <head></head>
 <body>
  <li> 
   <div class="titlelink box" style="width:645px;"> 
    <a href="/29125141.html" class="truetit" target="_blank">被古人的哪句诗词惊艳到了,汉字之美!</a> 
    <span class="light_r  "> <a title="有17个亮了的回帖">&nbsp;</a> </span> [&nbsp;
    <span class="multipage"> <a href="/29125141-2.html" target="_blank">2</a> <a href="/29125141-3.html" target="_blank">3</a>...<a href="/29125141-10.html" target="_blank">10</a> </span> &nbsp;]&nbsp;
   </div> 
   <div class="author box"> 
    <a class="aulink" target="_blank" href="https://my.hupu.com/204678157619831">广工老男孩</a> 
    <br /> 
    <a style="color:#808080;cursor: initial; ">2019-08-26</a> 
   </div> <span class="ansour box">197&nbsp;/&nbsp;69889</span> 
   <div class="endreply box"> 
    <a href="29125141-10.html#o">15:44</a> 
    <br /> 
    <span class="endauthor ">yx771018</span> 
   </div> </li>
 </body>
</html>
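
Before writing the full crawler, the CSS classes can be sanity-checked against the static snippet above, with no network access needed. A minimal sketch of my own (sample_html is my trimmed copy of the <li> shown above, not code from the book):

from bs4 import BeautifulSoup

# sample_html is a trimmed copy of the <li> element shown above
sample_html = '''
<li>
 <div class="titlelink box"><a href="/29125141.html" class="truetit">被古人的哪句诗词惊艳到了,汉字之美!</a></div>
 <div class="author box">
  <a class="aulink" href="https://my.hupu.com/204678157619831">广工老男孩</a><br/>
  <a style="color:#808080;">2019-08-26</a>
 </div>
 <span class="ansour box">197&nbsp;/&nbsp;69889</span>
 <div class="endreply box"><a href="29125141-10.html#o">15:44</a><br/><span class="endauthor">yx771018</span></div>
</li>
'''

post = BeautifulSoup(sample_html, 'lxml').find('li')
print(post.find('div', class_='titlelink box').a.text)    # post title
print(post.find('div', class_='author box').a.text)       # author
print(post.find('span', class_='ansour box').text)        # "replies / views"
print(post.find('div', class_='endreply box').span.text)  # last reply user

If the selectors are right, this prints the title, author, the "197 / 69889" reply/view string, and the last-reply user from the snippet.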

Working code:

import requests
from bs4 import BeautifulSoup
import datetime

# Fetch a page and return a parsed BeautifulSoup object
def get_page(link):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    r = requests.get(link, headers=headers)
    html = r.content.decode('utf-8')  # decode the raw response bytes from UTF-8
    soup = BeautifulSoup(html, 'lxml')
    return soup

# Parse the list of <li> post elements into rows of fields
def get_data(post_list):
    data_list = []
    for post in post_list:
        title = post.find('div', class_='titlelink box').text.strip()
        post_link = post.find('div', class_='titlelink box').a['href']
        post_link = "https://bbs.hupu.com" + post_link

        author = post.find('div', class_='author box').a.text.strip()
        author_page = post.find('div', class_='author box').a['href']
        # the post date is the second <a> inside the author box (contents[5])
        start_date = post.find('div', class_='author box').contents[5].text.strip()

        reply_view = post.find('span', class_='ansour box').text.strip()
        reply = reply_view.split('/')[0].strip()
        view = reply_view.split('/')[1].strip()

        reply_time = post.find('div', class_='endreply box').a.text.strip()
        last_reply = post.find('div', class_='endreply box').span.text.strip()
        if ':' in reply_time:  # time like 11:27 means the post was replied to today
            date_time = str(datetime.date.today()) + ' ' + reply_time
            date_time = datetime.datetime.strptime(date_time, '%Y-%m-%d %H:%M')
        elif reply_time.find("-") == 4:  # full date like 2017-02-27
            date_time = datetime.datetime.strptime(reply_time, '%Y-%m-%d').date()
        else:  # month-day like 11-27 means the current year
            year = str(datetime.date.today().year)
            date_time = datetime.datetime.strptime(year + '-' + reply_time, '%Y-%m-%d').date()
        data_list.append([title, post_link, author, author_page, start_date, reply, view, last_reply, date_time])
    return data_list

link = "https://bbs.hupu.com/bxj"
soup = get_page(link)
post_all = soup.find('ul', class_="for-list")
post_list = post_all.find_all('li')
data_list = get_data(post_list)
for each in data_list:
    print(each)

A thin wrapper around pymongo for reading and writing the collection:

from pymongo import MongoClient

class MongoAPI(object):
    def __init__(self, db_ip, db_port, db_name, table_name):
        self.db_ip = db_ip
        self.db_port = db_port
        self.db_name = db_name
        self.table_name = table_name
        self.conn = MongoClient(host=self.db_ip, port=self.db_port)
        self.db = self.conn[self.db_name]
        self.table = self.db[self.table_name]
    def get_one(self, query):
        return self.table.find_one(query, projection={"_id": False})
    def get_all(self, query):
        return self.table.find(query)
    def add(self, kv_dict):
        return self.table.insert_one(kv_dict)
    def delete(self, query):
        return self.table.delete_many(query)
    def check_exist(self, query):
        ret = self.table.find_one(query)
        return ret is not None
    # upsert: update the matching document, or insert it if it does not exist
    def update(self, query, kv_dict):
        self.table.update_one(query, {'$set': kv_dict}, upsert=True)


hupu_post = MongoAPI("localhost", 27017, "hupu", "post")
for each in data_list:
    hupu_post.add({"title": each[0],
                   "post_link": each[1],
                   "author": each[2],
                   "author_page": each[3],
                   "start_date": str(each[4]),
                   "reply": each[5],
                   "view": each[6],
                   "last_reply": each[7],
                   "last_reply_time": str(each[8])})


The multi-page version below also works. Around page 11 the site starts asking you to register/log in, so the script errors out when it reaches that page. It reuses the get_page, get_data, and MongoAPI definitions from above.

import requests
from bs4 import BeautifulSoup
import datetime
from pymongo import MongoClient
import time

hupu_post = MongoAPI("localhost", 27017, "hupu", "post")
for i in range(1, 100):
    link = "https://bbs.hupu.com/bxj-" + str(i)
    print(link)
    soup = get_page(link)

    post_all = soup.find('ul', class_="for-list")
    post_list = post_all.find_all('li')
    data_list = get_data(post_list)
    for each in data_list:
        # upsert keyed on post_link, so re-crawled posts are updated rather than duplicated
        hupu_post.update({"post_link": each[1]}, {"title": each[0],
                                                  "post_link": each[1],
                                                  "author": each[2],
                                                  "author_page": each[3],
                                                  "start_date": str(each[4]),
                                                  "reply": each[5],
                                                  "view": each[6],
                                                  "last_reply": each[7],
                                                  "last_reply_time": str(each[8])})
    time.sleep(3)
    print('Page', i, 'done, sleeping for 3 seconds')
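
As noted above, once the site starts demanding a login (around page 11), soup.find('ul', class_="for-list") returns None and the loop crashes. A defensive variant of the same loop (my own sketch, not from the book) that stops cleanly in that case:

# Same crawl loop, but it stops when the post list is missing
# (which is what happens once the site shows a login/registration prompt instead).
fields = ["title", "post_link", "author", "author_page", "start_date",
          "reply", "view", "last_reply", "last_reply_time"]
for i in range(1, 100):
    link = "https://bbs.hupu.com/bxj-" + str(i)
    soup = get_page(link)
    post_all = soup.find('ul', class_="for-list")
    if post_all is None:
        print('Page', i, 'has no post list (login wall?), stopping')
        break
    for each in get_data(post_all.find_all('li')):
        record = dict(zip(fields, each))
        record["start_date"] = str(record["start_date"])
        record["last_reply_time"] = str(record["last_reply_time"])
        hupu_post.update({"post_link": record["post_link"]}, record)
    time.sleep(3)
    print('Page', i, 'done, sleeping for 3 seconds')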




(more to follow)
