Target board: https://bbs.hupu.com/bxj

Fields to collect for each post:
- Post title
- Post link
- Author
- Author profile link
- Creation date
- Reply count
- View count
- Last reply user
- Last reply time
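The crawler below collects these nine fields for every post and stores each post as one MongoDB document. Shaped after the sample <li> further down, a stored record looks roughly like this (values are taken from the sample; last_reply_time is hypothetical, the crawl date combined with the 15:44 stamp):

{
    "title": "被古人的哪句诗词惊艳到了,汉字之美!",  # note: .text of the title div also picks up the "[2 3...10]" page links
    "post_link": "https://bbs.hupu.com/29125141.html",
    "author": "广工老男孩",
    "author_page": "https://my.hupu.com/204678157619831",
    "start_date": "2019-08-26",
    "reply": "197",
    "view": "69889",
    "last_reply": "yx771018",
    "last_reply_time": "2019-08-26 15:44:00"
}

Each post on the board page is one <li> element. A sample: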
<html>
<head></head>
<body>
  <li>
    <div class="titlelink box" style="width:645px;">
      <a href="/29125141.html" class="truetit" target="_blank">被古人的哪句诗词惊艳到了,汉字之美!</a>
      <span class="light_r"> <a title="有17个亮了的回帖"> </a> </span>
      [ <span class="multipage"> <a href="/29125141-2.html" target="_blank">2</a> <a href="/29125141-3.html" target="_blank">3</a>...<a href="/29125141-10.html" target="_blank">10</a> </span> ]
    </div>
    <div class="author box">
      <a class="aulink" target="_blank" href="https://my.hupu.com/204678157619831">广工老男孩</a>
      <br />
      <a style="color:#808080;cursor: initial;">2019-08-26</a>
    </div>
    <span class="ansour box">197 / 69889</span>
    <div class="endreply box">
      <a href="29125141-10.html#o">15:44</a>
      <br />
      <span class="endauthor">yx771018</span>
    </div>
  </li>
</body>
</html>
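Before writing the full crawler it is worth checking the selectors against this snippet. A minimal sketch (the HTML string is abridged from the sample above):

from bs4 import BeautifulSoup

snippet = '''<li>
  <div class="titlelink box"><a href="/29125141.html" class="truetit">被古人的哪句诗词惊艳到了,汉字之美!</a></div>
  <div class="author box">
    <a class="aulink" href="https://my.hupu.com/204678157619831">广工老男孩</a><br/>
    <a>2019-08-26</a>
  </div>
  <span class="ansour box">197 / 69889</span>
  <div class="endreply box"><a href="29125141-10.html#o">15:44</a><br/><span class="endauthor">yx771018</span></div>
</li>'''

li = BeautifulSoup(snippet, 'lxml').li
print(li.find('div', class_='titlelink box').a['href'])    # /29125141.html
print(li.find('div', class_='author box').a.text)          # 广工老男孩
print(li.find('span', class_='ansour box').text)           # 197 / 69889
print(li.find('div', class_='endreply box').span.text)     # yx771018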
Working code for a single board page:
import requests
from bs4 import BeautifulSoup
import datetime

# Fetch a page and return it as a parsed BeautifulSoup object
def get_page(link):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    r = requests.get(link, headers=headers)
    html = r.content             # raw response body as bytes
    html = html.decode('utf-8')  # decode UTF-8 bytes to text
    soup = BeautifulSoup(html, 'lxml')
    return soup

# Parse one board page: extract the nine fields from every post <li>
def get_data(post_list):
    data_list = []
    for post in post_list:
        title = post.find('div', class_='titlelink box').text.strip()
        post_link = post.find('div', class_='titlelink box').a['href']
        post_link = "https://bbs.hupu.com" + post_link
        author = post.find('div', class_='author box').a.text.strip()
        author_page = post.find('div', class_='author box').a['href']
        # the creation date is the second <a> in the author div;
        # contents[5] relies on the exact whitespace layout of the page
        start_date = post.find('div', class_='author box').contents[5].text.strip()
        reply_view = post.find('span', class_='ansour box').text.strip()
        reply = reply_view.split('/')[0].strip()  # "197 / 69889" -> replies / views
        view = reply_view.split('/')[1].strip()
        reply_time = post.find('div', class_='endreply box').a.text.strip()
        last_reply = post.find('div', class_='endreply box').span.text.strip()
        if ':' in reply_time:            # e.g. 11:27 -> a time today
            date_time = str(datetime.date.today()) + ' ' + reply_time
            date_time = datetime.datetime.strptime(date_time, '%Y-%m-%d %H:%M')
        elif reply_time.find("-") == 4:  # e.g. 2017-02-27 -> a full date
            date_time = datetime.datetime.strptime(reply_time, '%Y-%m-%d').date()
        else:                            # e.g. 11-27 -> month-day of the current year
            date_time = datetime.datetime.strptime(str(datetime.date.today().year) + '-' + reply_time, '%Y-%m-%d').date()
        data_list.append([title, post_link, author, author_page, start_date,
                          reply, view, last_reply, date_time])
    return data_list

link = "https://bbs.hupu.com/bxj"
soup = get_page(link)
post_all = soup.find('ul', class_="for-list")
post_list = post_all.find_all('li')
data_list = get_data(post_list)
for each in data_list:
    print(each)
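Hupu occasionally times out or serves an error page under crawling, in which case get_page above raises or hands back unusable HTML. A more defensive variant, as a sketch (get_page_safe and its retry/timeout parameters are my own additions, not part of the original script):

import requests
from bs4 import BeautifulSoup

def get_page_safe(link, retries=3, timeout=10):
    # Hypothetical hardened variant of get_page: adds a timeout, retries,
    # and raises for non-200 responses instead of parsing an error page.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    for attempt in range(retries):
        try:
            r = requests.get(link, headers=headers, timeout=timeout)
            r.raise_for_status()
            return BeautifulSoup(r.content.decode('utf-8'), 'lxml')
        except requests.RequestException as e:
            print('Attempt', attempt + 1, 'failed for', link, ':', e)
    return None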
Next, store the results in MongoDB through a thin wrapper around pymongo:

from pymongo import MongoClient

class MongoAPI(object):
    def __init__(self, db_ip, db_port, db_name, table_name):
        self.db_ip = db_ip
        self.db_port = db_port
        self.db_name = db_name
        self.table_name = table_name
        self.conn = MongoClient(host=self.db_ip, port=self.db_port)
        self.db = self.conn[self.db_name]
        self.table = self.db[self.table_name]

    def get_one(self, query):
        return self.table.find_one(query, projection={"_id": False})

    def get_all(self, query):
        return self.table.find(query)

    def add(self, kv_dict):
        return self.table.insert_one(kv_dict)

    def delete(self, query):
        return self.table.delete_many(query)

    def check_exist(self, query):
        ret = self.table.find_one(query)
        return ret is not None

    # Upsert: creates the document if no match exists
    def update(self, query, kv_dict):
        self.table.update_one(query, {'$set': kv_dict}, upsert=True)
hupu_post = MongoAPI("localhost", 27017, "hupu", "post")
for each in data_list:
    hupu_post.add({"title": each[0],
                   "post_link": each[1],
                   "author": each[2],
                   "author_page": each[3],
                   "start_date": str(each[4]),
                   "reply": each[5],
                   "view": each[6],
                   "last_reply": each[7],
                   "last_reply_time": str(each[8])})
The same pieces also work across multiple pages: reuse get_page(), get_data(), and the MongoAPI class defined above, and drive them with the loop below. Note that around page 11 the site demands registration, so the crawl errors out at that point.
import requests
from bs4 import BeautifulSoup
import datetime
from pymongo import MongoClient
import time

hupu_post = MongoAPI("localhost", 27017, "hupu", "post")
for i in range(1, 100):
    link = "https://bbs.hupu.com/bxj-" + str(i)
    print(link)
    soup = get_page(link)
    post_all = soup.find('ul', class_="for-list")
    post_list = post_all.find_all('li')
    data_list = get_data(post_list)
    for each in data_list:
        # update() upserts on post_link, so re-crawled posts are refreshed, not duplicated
        hupu_post.update({"post_link": each[1]}, {"title": each[0],
                                                  "post_link": each[1],
                                                  "author": each[2],
                                                  "author_page": each[3],
                                                  "start_date": str(each[4]),
                                                  "reply": each[5],
                                                  "view": each[6],
                                                  "last_reply": each[7],
                                                  "last_reply_time": str(each[8])})
    time.sleep(3)
    print('Page', i, 'done, sleeping 3 seconds')
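When the registration wall appears (around page 11), soup.find('ul', class_="for-list") returns None and post_all.find_all raises AttributeError. A guarded variant of the loop above stops cleanly instead (a sketch reusing the imports and definitions above; field_names is my own helper):

field_names = ["title", "post_link", "author", "author_page", "start_date",
               "reply", "view", "last_reply", "last_reply_time"]
for i in range(1, 100):
    soup = get_page("https://bbs.hupu.com/bxj-" + str(i))
    post_all = soup.find('ul', class_="for-list")
    if post_all is None:  # registration/login wall instead of the post list
        print('Stopped at page', i, ': post list not found.')
        break
    for each in get_data(post_all.find_all('li')):
        row = dict(zip(field_names, [str(v) for v in each]))
        hupu_post.update({"post_link": row["post_link"]}, row)
    time.sleep(3)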
(To be added later.)