# -*- coding: utf-8 -*-
# @Time : 2019/7/15 15:59
import requests
import pandas
# get和post请求
# request的get请求
# resp = requests.get('http://www.neuedu.com')
# resp = requests.get('http://www.baidu.com')
# html_data = resp.content  # page source as bytes (resp.text gives the decoded str)
# with open('baidu.html','wb') as f:
# f.write(html_data)
# print(html_data)
# html_data2 = resp.content #获取byte类型的源代码
# print(html_data2)
# print(resp.status_code) #获取请求响应状态码
# 200 OK; 3xx redirect; 404 not found; 5xx server error
# print(resp.headers)  # response headers
# print(resp.encoding) #编码方式
# # resp.encoding = 'gbk' 修改编码方式
# print(resp.request)
# 添加请求头
# headers = {'User-Agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
# User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1
# resp = requests.get('https://www.zhihu.com',headers = headers)
# print(resp.status_code)
# 爬取当当网‘python从入门到实践’书的书名、价格、购买链接、商家
from lxml import html
def spider(isbn):
    """Scrape Dangdang search results for a book and save them to a CSV file.

    Fetches the Dangdang search page for ``isbn``, extracts the title, price,
    purchase link and store name of every result, prints the results sorted
    by price from highest to lowest, and writes them to ``当当.csv`` in the
    current working directory.

    Args:
        isbn: Search keyword interpolated into the search URL (e.g. an ISBN
            string).

    Returns:
        None. The results are printed and written to disk.

    Raises:
        requests.exceptions.RequestException: If the request fails or no
            response arrives within the timeout.
    """
    url = 'http://search.dangdang.com/?key={}&act=input'.format(isbn)
    # A timeout keeps the script from hanging forever on a stalled connection.
    html_data = requests.get(url, timeout=10).text

    # Parse the page so the fields can be pulled out with XPath.
    selector = html.fromstring(html_data)
    ul_list = selector.xpath('//div[@id="search_nature_rg"]/ul/li')
    print(len(ul_list))

    book_list = []
    for li in ul_list:
        titles = li.xpath('p[@class="name"]/a/@title')
        prices = li.xpath(
            'p[@class="price"]/span[@class="search_now_price"]/text()'
        )
        links = li.xpath('a/@href')
        # Skip entries lacking a title, price or link instead of crashing
        # with IndexError on the first incomplete <li>.
        if not (titles and prices and links):
            continue

        price = prices[0].replace('¥', '')
        try:
            # Validate now: the sort below converts every price to float.
            float(price)
        except ValueError:
            continue

        # Listings without a seller element are sold by Dangdang itself.
        stores = li.xpath('p[@class="search_shangjia"]/a/text()')
        store = stores[0] if stores else '当当自营'

        book_list.append({
            'title': titles[0].strip(),
            'price': price,
            'link': links[0].strip(),
            'store': store,
        })

    # Most expensive first.
    book_list.sort(key=lambda book: float(book['price']), reverse=True)
    for book in book_list:
        print(book)

    # Save as a comma-separated file. The module is imported as ``pandas``
    # (there is no ``pd`` alias), so reference it by its full name.
    df = pandas.DataFrame(book_list)
    df.to_csv('当当.csv')
# Kick off the crawl for one ISBN.
spider(isbn='9787115428028')
# Python: scraping website data
# Latest recommended article published on 2024-12-29 10:13:54