嗨喽,大家好呀~这里是爱看美女的茜茜呐
开发环境:
- 版本:Python 3.8
- 编辑器:PyCharm
第三方库:
- requests >>> pip install requests
- parsel >>> pip install parsel
模块安装:
按住键盘 Win + R,输入 cmd 并回车,打开命令行窗口,然后在里面输入 pip install 模块名
👇 👇 👇 更多精彩机密、教程,尽在下方,赶紧点击了解吧~
python源码、视频教程、插件安装教程、资料我都准备好了,直接在文末名片自取就可
需求分析
确定需要的数据
找数据真实来源
https://travel.qunar.com/travelbook/list.htm?order=hot_heat
静态数据
代码实现步骤
- 发送请求
- 获取数据
- 提取数据
- 保存数据
数据获取
导入模块
import requests # 发送请求 代码当中用来访问网站的模块
import parsel # 解析数据的模块
import csv
# Crawl pages 1-200 of the "hot" travel-guide listing on travel.qunar.com and
# append one CSV row per guide to 攻略.csv.
#
# The output file is opened once for the whole crawl instead of being reopened
# for every single row.
with open('攻略.csv', mode='a', encoding='utf-8', newline='') as f:
    csv_writer = csv.writer(f)
    # Append mode positions the stream at the end of the file, so an offset of
    # 0 means the file is empty. Writing the header only in that case keeps
    # repeated runs from inserting extra header rows in the middle of the data.
    if f.tell() == 0:
        csv_writer.writerow(['title', 'date', 'days', 'photo_nums', 'fee', 'people', 'trip', 'view', 'love', 'comment', 'href'])
    for page in range(1, 201):
        # The replacement field has to stay on one line: a line break inside
        # a single-quoted f-string is a syntax error on Python 3.8.
        url = f'https://travel.qunar.com/travelbook/list.htm?page={page}&order=hot_heat'
        # Step 1: send the request.
        response = requests.get(url=url)
        # Step 2: get the page source.
        html_data = response.text
        # Step 3: extract the data (re / css / xpath are all options).
        #   css:   ul.b_strategy_list > li
        #   xpath: //ul[@class="b_strategy_list "]/li
        # NOTE(review): the trailing space inside the class value is part of
        # the exact-match XPath; presumably it mirrors the page markup.
        select = parsel.Selector(html_data)
        lis = select.xpath('//ul[@class="b_strategy_list "]/li')
        # Second-level extraction: one <li> per travel guide.
        for li in lis:
            # CSS equivalent for the title: li.css('h2 > a::text').get()
            title = li.xpath('./h2/a/text()').get()
            # .get("") yields an empty string when the node is absent.
            date = li.xpath('./p[@class="user_info"]//span[@class="date"]/text()').get("")
            days = li.xpath('./p[@class="user_info"]//span[@class="days"]/text()').get("")
            photo_nums = li.xpath('./p[@class="user_info"]//span[@class="photo_nums"]/text()').get("")
            fee = li.xpath('./p[@class="user_info"]//span[@class="fee"]/text()').get("")
            people = li.xpath('./p[@class="user_info"]//span[@class="people"]/text()').get("")
            trip = li.xpath('./p[@class="user_info"]//span[@class="trip"]/text()').get("")
            view = li.xpath('./p[@class="user_info"]//span[@class="icon_view"]/span/text()').get("")
            love = li.xpath('./p[@class="user_info"]//span[@class="icon_love"]/span/text()').get("")
            comment = li.xpath('./p[@class="user_info"]//span[@class="icon_comment"]/span/text()').get("")
            href = li.xpath('./h2/a/@href').get()
            print(title, date, days, photo_nums, fee, people, trip, view, love, comment, href)
            # Step 4: save the data.
            csv_writer.writerow([title, date, days, photo_nums, fee, people, trip, view, love, comment, href])
数据可视化
import pandas as pd
from pyecharts.commons.utils import JsCode
from pyecharts.charts import *
from pyecharts import options as opts
# Load the scraped guide data and clean it before charting.
# The bare `data` expressions are kept from the original; they only have a
# visible effect as notebook cell output.
data = pd.read_csv('去哪儿_数分.csv')
data
data.info()
# Drop rows whose 地点 is the literal '攻略' and rows whose 天数 is '99+'
# (the latter cannot be converted to int below).
data = data[~data['地点'].isin(['攻略'])]
data = data[~data['天数'].isin(['99+'])]
data
# Rebind to the returned frames instead of mutating in place: calling
# `data[col].fillna(..., inplace=True)` is chained assignment, which recent
# pandas warns about and which does not modify `data` under Copy-on-Write.
data = data.drop_duplicates()
data = data.fillna({'人均费用': 0, '人物': '独自一人', '玩法': '没有'})
data = data.astype({'天数': int})
# Keep only guides with a per-person cost above 200 and at most 15 days.
data = data[data['人均费用'] > 200]
data = data[data['天数'] <= 15]
data
# Renumber the rows 0..n-1 after all the filtering.
data = data.reset_index(drop=True)
data
def Month(e):
m = str(e).split('/')[2]
if m=='01':
return '一月'