# 更多爬虫内容请关注博主的专栏 python3 网络爬虫
# (More crawler articles in the author's "python3 web scraping" column.)
import requests
from bs4 import BeautifulSoup
import re
'''
@author: Face_to_sun
@modify:2018-12-01
'''
def getHTMLText(url, page):
    """Fetch one page of the CUG OJ problem-set listing and return its HTML.

    Args:
        url: Base listing URL ending in 'page=' — the page number is appended.
        page: 1-based page number to fetch.

    Returns:
        The decoded HTML text on success, or the sentinel string
        '产生异常' ("an exception occurred") on any request failure.
    """
    headers = {
        'Host': 'acm.cug.edu.cn',
        'Referer': 'http://acm.cug.edu.cn/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'
    }
    try:
        full_url = url + str(page)
        # BUG FIX: the original passed data={'page': str(page)} to a GET
        # request, which sends a request body — the page number is already
        # encoded in the URL. A timeout prevents hanging forever on a stall.
        response = requests.get(full_url, headers=headers, timeout=10)
        response.raise_for_status()
        response.encoding = 'utf8'
        print(full_url)
        return response.text
    except requests.RequestException:
        # Narrowed from a bare `except:` so programming errors still surface;
        # raise_for_status() raises HTTPError, a RequestException subclass.
        return "产生异常"
# def getHead(soup):
# thead = soup.find_all(name='tr', class_='toprow')
# for tr in thead:
# ths = tr.find_all('th')
# for th in ths:
# print(th.string)
def getBody(soup):
    """Print one dict per problem row found in the #problemset table.

    For every <tr> inside each <tbody> of the element with id="problemset",
    the six <td> cells are mapped to: solved flag, problem ID, title,
    source/category, AC count, and submit count, plus a derived
    'Accuracy rate' (AC / Submit, or 0 when there are no submissions).

    Args:
        soup: A BeautifulSoup document (or any object exposing the same
              find_all interface) for one listing page.
    """
    for table in soup.find_all(id="problemset"):
        for body in table.find_all('tbody'):
            for tr in body.find_all('tr'):
                tds = tr.find_all('td')
                # BUG FIX: skip header/malformed rows with fewer than the
                # six expected cells — the original crashed with IndexError.
                if len(tds) < 6:
                    continue
                cells = [td.get_text() for td in tds[:6]]
                problem = {
                    'Bool': cells[0],
                    'problem ID': cells[1],
                    'Title': cells[2],
                    'Source/Category': cells[3],
                    'AC': cells[4],
                    'Submit': cells[5],
                }
                submit = int(cells[5])
                # Guard against division by zero on unattempted problems.
                problem['Accuracy rate'] = int(cells[4]) / submit if submit else 0
                print(problem)
if __name__ == "__main__":
url = 'http://acm.cug.edu.cn/problemset.php?page='
for i in range(1,13):
html = getHTMLText(url, i)
# with open('problem.html', 'w', encoding='utf8') as f:
# f.write(html)
#error: python编码问题——解决python3 UnicodeEncodeError: 'gbk' codec can't encode character '\xXX' in position XX
soup = BeautifulSoup(html,'lxml')
# getHead(soup)
getBody(soup)
# 运行结果 (sample run output omitted)
#
# 本文介绍了一个使用Python进行网络爬虫的实战案例, 通过requests和BeautifulSoup库
# 从指定网站抓取竞赛题目的详细信息, 包括题目ID、标题、来源、提交次数等, 并计算准确率。
# (Summary: a practical Python scraping example that uses requests and
# BeautifulSoup to collect contest-problem details — ID, title, source,
# submission counts — from a target site and compute the accuracy rate.)