解析 HTML、XML、HTML5 【beautifulsoup4】
pip install beautifulsoup4
pip install lxml
from bs4 import BeautifulSoup
解析数据
def parse_result(soup):
    """Parse a Douban Top250 list page and yield one dict per movie.

    Args:
        soup: BeautifulSoup document of a Top250 page; entries live in
            ``ol.grid_view > li``.

    Yields:
        dict with keys ranking / name / img / score / author / desc.
    """
    entries = soup.find('ol', class_='grid_view').find_all('li')
    for entry in entries:
        rank = entry.find('em').get_text()
        title = entry.find(class_='title').string
        cover_url = entry.find('a').find('img').get('src')
        rating = entry.find(class_='rating_num').string
        # The credits paragraph has an empty class attribute on this page.
        credits_text = entry.find('p', class_='').get_text(strip=True)
        # Not every movie has a one-line quote; fall back to a placeholder.
        quote_tag = entry.find(class_='inq')
        quote = quote_tag.string if quote_tag is not None else '暂无'
        print(rank, title, cover_url, rating, credits_text, quote)
        yield {
            'ranking': rank,
            'name': title,
            'img': cover_url,
            'score': rating,
            'author': credits_text,
            'desc': quote,
        }
http请求
requests
response = requests.get(url)
# 请求获取网页数据
def request_data(url):
    """Fetch *url* and return the response body text.

    Returns None when the request raises a ``requests.RequestException``
    or the server answers with a non-200 status code.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36',
        'cookie': 'bid=dT8Z3OE5_cY; _pk_id.100001.4cf6=97307cec25d927ab.1727414571.; __utmz=30149280.1727414571.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmz=223695111.1727414571.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __yadk_uid=kz8b5hlKFkxH8Y9DITjOMuxWgBYikz0h; ll="108296"; _vwo_uuid_v2=D5055D26948C52E0832B26F1769798A7F|836a6dfe85637bb8c39462e0dadf8747; __utmc=30149280; __utmc=223695111; _pk_ses.100001.4cf6=1; ap_v=0,6.0; __utma=30149280.1074059804.1727414571.1728178377.1728193731.4; __utmb=30149280.0.10.1728193731; __utma=223695111.688962959.1727414571.1728178377.1728193731.4; __utmb=223695111.0.10.1728193731'
    }
    # Keep the try body to just the call that can raise.
    try:
        response = requests.get(url=url, headers=headers)
    except requests.RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
json处理
import json
对象转化为 json:json.dumps()
json转化为对象:json.loads()
文件写入
with open('dangdang_top_500_book.txt', 'a', encoding='UTF-8') as f:
    f.write(json.dumps(item, ensure_ascii=False) + '\n')
# 注意：with 语句结束时会自动关闭文件，无需再调用 f.close()
Excel写入
import xlwt
# 数据写入Excel
def save_to_excel(result_list):
    """Write the parsed movie records to 豆瓣电影Top250.xls.

    Args:
        result_list: iterable of dicts as produced by ``parse_result``;
            a value of None is silently ignored.
    """
    if result_list is None:
        return
    workbook = xlwt.Workbook(encoding='utf-8')
    sheet = workbook.add_sheet('豆瓣电影Top250', cell_overwrite_ok=True)
    # Header row (row 0), one caption per column.
    captions = ('排名', '电影名称', '图片', '评分', '作者', '简介')
    for col, caption in enumerate(captions):
        sheet.write(0, col, caption)
    # Data rows start at row 1; columns mirror the caption order.
    fields = ('ranking', 'name', 'img', 'score', 'author', 'desc')
    for offset, record in enumerate(result_list):
        for col, field in enumerate(fields):
            sheet.write(offset + 1, col, record[field])
    workbook.save('豆瓣电影Top250.xls')
Redis
pip install redis