# 京东数据(滚动爬取+翻页) — JD.com data (scroll-scrape + pagination)
"""
Time:2021/5/28 14:38
Author:Spectre
"""
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from lxml import etree
from bs4 import BeautifulSoup
import csv
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
# Launch Chrome, open jd.com and search for "电脑" (computers).
# `b` is deliberately module-level: the helper functions below drive it.
# (The former `global b` statement was a no-op at module scope and is removed.)
b = webdriver.Chrome()
b.get('https://www.jd.com')
# Renamed from `input`, which shadowed the builtin; also switched to the
# Selenium 4 API — find_element_by_css_selector was removed in Selenium 4.
search_box = b.find_element(By.CSS_SELECTOR, '#key')
search_box.send_keys('电脑')
search_box.send_keys(Keys.ENTER)
time.sleep(1)  # give the result page a moment to load
def get_net_data(max_height=10000, step=200, delay=0.5):
    """Scroll the current JD result page down in increments so lazily loaded
    items render, then return the fully rendered page source.

    Args:
        max_height: stop once the scroll offset exceeds this many pixels
            (default 10000, the original hard-coded value).
        step: pixels scrolled per iteration (default 200).
        delay: seconds to wait between scrolls so content can load.

    Returns:
        str: the page HTML after scrolling.
    """
    y = 0
    while True:
        y += step
        b.execute_script(f'window.scrollTo(0,{y})')
        if y > max_height:
            break
        time.sleep(delay)
    return b.page_source
def fanye():
    """Click the "next page" button on the JD search-result page."""
    time.sleep(1)  # let the current page settle before paging
    # find_element_by_css_selector was removed in Selenium 4; use By instead.
    next_btn = b.find_element(By.CSS_SELECTOR, '.pn-next')
    next_btn.click()
def an_data(data):
    """Parse one JD search-result page into a list of product dicts.

    Args:
        data: page HTML (str), as returned by get_net_data().

    Returns:
        list[dict]: one dict per product with keys
        name, img, link, price, comments, shop, icon.
    """
    def _text(node):
        # Missing nodes yield '' instead of raising AttributeError.
        return node.get_text() if node else ''

    def _abs(url):
        # JD uses protocol-relative URLs ('//item.jd.com/...'); the old
        # 'https://' + url code produced malformed 'https:////...' links.
        if not url:
            return ''
        if url.startswith('//'):
            return 'https:' + url
        if url.startswith('http'):
            return url
        return 'https://' + url

    soup = BeautifulSoup(data, 'lxml')
    li_list = soup.select('#J_goodsList>ul>li')
    all_item = []
    for li in li_list:
        item = {}
        item['name'] = _text(li.select_one('.p-name.p-name-type-2>a>em')).replace('\n', '')
        img = li.select_one('.p-img img')
        item['img'] = _abs(img.attrs.get('src', '')) if img else ''
        link = li.select_one('a')
        item['link'] = _abs(link.attrs.get('href', '')) if link else ''
        item['price'] = _text(li.select_one('.p-price i')).strip()
        item['comments'] = _text(li.select_one('.p-commit strong')).replace('\n', '')
        item['shop'] = _text(li.select_one('.p-shop a'))
        item['icon'] = _text(li.select_one('.p-icons')).strip().replace('\n', '')
        all_item.append(item)
    print(all_item, len(all_item))
    return all_item
if __name__ == '__main__':
    # `with` guarantees the CSV is closed even if scraping raises mid-run.
    with open('files/jd_computer.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, ['name', 'img', 'link', 'price', 'comments', 'shop', 'icon'])
        # Chinese column headers, written as an ordinary row (not writeheader()).
        writer.writerow(
            {'name': '电脑名称', 'img': '电脑图片', 'link': '电脑链接', 'price': '价格', 'comments': '评论数', 'shop': '店铺名称',
             'icon': '图标名称'})
        for i in range(10):  # scrape 10 result pages
            result = an_data(get_net_data())
            # Each item dict already carries exactly the fieldnames above,
            # so write them directly instead of rebuilding dicts per row.
            writer.writerows(result)
            fanye()
# 51job数据爬取(“数据分析”所有岗位) — 51job scrape (all "data analysis" jobs)
"""
Time:2021/5/30 12:49
Author:Spectre
"""
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from bs4 import BeautifulSoup
import csv
# Launch a fresh Chrome session, open 51job.com and search for "数据分析".
# `b` is deliberately module-level: the helper functions below drive it.
# (The former `global b` statement was a no-op at module scope and is removed.)
b = webdriver.Chrome()
b.get('https://www.51job.com')
# Renamed from `input`, which shadowed the builtin; also switched to the
# Selenium 4 API — find_element_by_css_selector was removed in Selenium 4.
search_box = b.find_element(By.CSS_SELECTOR, '#kwdselectid')
search_box.send_keys('数据分析')
search_box.send_keys(Keys.ENTER)
time.sleep(1)  # give the result page a moment to load
def get_net_data():
    """Return the current 51job page's HTML.

    (The unused `max_height` local left over from the JD scroll version
    was dead code and has been removed.)
    """
    return b.page_source
def fanye():
    """Click the "next page" link on the 51job result list."""
    time.sleep(5)  # 51job renders pagination slowly; wait before clicking
    # find_element_by_css_selector was removed in Selenium 4; use By instead.
    next_btn = b.find_element(By.CSS_SELECTOR, '.p_in>ul>.next>a')
    next_btn.click()
def an_data(data):
    """Parse one 51job result page into a list of job dicts.

    Args:
        data: page HTML (str).

    Returns:
        list[dict]: one dict per listing with keys name, link, time, price,
        info, tags, co_name, co_link, co_type, co_intro.
    """
    def _text(node):
        # Missing nodes yield '' instead of raising AttributeError.
        return node.get_text() if node else ''

    def _attr(node, key):
        # Missing nodes or attributes yield '' instead of raising.
        return node.attrs.get(key, '') if node else ''

    soup = BeautifulSoup(data, 'lxml')
    li_list = soup.select('.j_joblist>.e')
    all_item = []
    for li in li_list:
        item = {}
        item['name'] = _attr(li.select_one('.jname.at'), 'title')
        item['link'] = _attr(li.select_one('a'), 'href')
        item['time'] = _text(li.select_one('.time'))
        item['price'] = _text(li.select_one('.sal'))
        item['info'] = _text(li.select_one('.d.at')).replace(' ', '')
        # Reuse the node already selected instead of re-running
        # select_one('.tags'); also tolerate a missing title attribute.
        item['tags'] = _attr(li.select_one('.tags'), 'title')
        item['co_name'] = _text(li.select_one('.er>a'))
        item['co_link'] = _attr(li.select_one('.er>a'), 'href')
        item['co_type'] = _text(li.select_one('.er>.dc.at'))
        item['co_intro'] = _text(li.select_one('.er>.int.at'))
        all_item.append(item)
    print(all_item)
    return all_item
if __name__ == '__main__':
    # `with` guarantees the CSV is closed even if scraping raises mid-run.
    with open('files/51_job.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, ['name', 'link', 'time', 'price', 'info', 'tags', 'co_name', 'co_link', 'co_type', 'co_intro'])
        # Chinese column headers, written as an ordinary row (not writeheader()).
        writer.writerow(
            {'name': '职位名称', 'link': '职位链接', 'time': '发布时间', 'price': '薪资', 'info': '岗位要求', 'tags': '公司福利', 'co_name': '公司名称', 'co_link': '公司链接', 'co_type': '公司类型', 'co_intro': '公司介绍'})
        for i in range(171):  # the query showed ~171 result pages at写-time
            result = an_data(get_net_data())
            # Each item dict already carries exactly the fieldnames above,
            # so write them directly instead of rebuilding dicts per row.
            writer.writerows(result)
            fanye()
            time.sleep(3)  # pause between pages to avoid hammering the site