python爬取网站数据

最新推荐文章于 2024-12-29 10:13:54 发布

原创 · 最新推荐文章于 2024-12-29 10:13:54 发布 · 318 阅读

CC 4.0 BY-SA版权

文章标签：

# -*- coding: utf-8 -*-
# @Time : 2019/7/15 15:59
import requests
import pandas
from lxml import html

# --- Tutorial notes: GET requests with `requests` (kept for reference) ---
# resp = requests.get('http://www.neuedu.com')
# resp = requests.get('http://www.baidu.com')
# html_data = resp.content  # page source as bytes (resp.text gives the str form)
# with open('baidu.html', 'wb') as f:
#     f.write(html_data)
# print(html_data)
# html_data2 = resp.content  # page source as bytes
# print(html_data2)
# print(resp.status_code)  # HTTP status code of the response (200 OK, 3xx, 404, 5xx)
# print(resp.headers)  # response headers
# print(resp.encoding)  # encoding used to decode resp.text
# resp.encoding = 'gbk'  # override the encoding
# print(resp.request)
#
# Adding request headers:
# headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
# User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1
# resp = requests.get('https://www.zhihu.com', headers=headers)
# print(resp.status_code)

# Scrape the title, price, purchase link and seller of a book
# (the tutorial example is "Python from Beginner to Practice") from Dangdang.

# Seconds to wait for the search page before giving up; without a timeout
# requests.get() can block forever on a stalled connection.
_REQUEST_TIMEOUT_SECONDS = 10


def _price_sort_key(book):
    """Return the numeric price of *book*, or -inf when it cannot be parsed.

    Using -inf keeps entries with a malformed price at the end of the
    descending sort instead of aborting the whole run.
    """
    try:
        return float(book['price'])
    except ValueError:
        return float('-inf')


def spider(isbn):
    """Scrape Dangdang search results for *isbn* and save them as CSV.

    The search page is fetched, every result item under the
    ``search_nature_rg`` container is parsed with XPath, the records are
    sorted by price (highest first), printed, and written to ``当当.csv``
    in the current working directory.

    Args:
        isbn: Search keyword sent as the ``key`` query parameter
            (an ISBN in the example call).

    Returns:
        The list of scraped records, sorted by descending price. Each
        record is a dict with the keys ``title``, ``price``, ``link``
        and ``store``.

    Raises:
        requests.RequestException: If the page cannot be fetched
            (including a timeout).
    """
    # Let requests build the query string so the key is URL-encoded.
    resp = requests.get(
        'http://search.dangdang.com/',
        params={'key': isbn, 'act': 'input'},
        timeout=_REQUEST_TIMEOUT_SECONDS,
    )

    # Extract the result items with XPath.
    selector = html.fromstring(resp.text)
    items = selector.xpath('//div[@id="search_nature_rg"]/ul/li')
    print(len(items))

    book_list = []
    for li in items:
        titles = li.xpath('p[@class="name"]/a/@title')
        prices = li.xpath(
            'p[@class="price"]/span[@class="search_now_price"]/text()'
        )
        links = li.xpath('a/@href')
        # Skip entries that lack a title, price or link instead of
        # crashing with IndexError on the [0] lookups.
        if not (titles and prices and links):
            continue

        # Items without a seller node get the default seller label.
        stores = li.xpath('p[@class="search_shangjia"]/a/text()')
        book_list.append({
            'title': titles[0].strip(),
            'price': prices[0].replace('¥', ''),
            'link': links[0].strip(),
            'store': stores[0] if stores else '当当自营',
        })

    # Most expensive first.
    book_list.sort(key=_price_sort_key, reverse=True)
    for book in book_list:
        print(book)

    # Store as a comma-separated file. The module is imported as `pandas`
    # (there is no `pd` alias in this file), so use the full name.
    df = pandas.DataFrame(book_list)
    df.to_csv('当当.csv')
    return book_list


spider('9787115428028')