点击跳转总目录
本篇只记录程序点击跳转项目
直接上代码
- 此项目主要为了学习 RFM 模型:R(近度,Recency)、F(频度,Frequency)、M(额度,Monetary)。我把目标盯上了淘宝、京东这种购物网站,但是水平太菜
- 前几天学了selenium,参考了网上的代码写了这个爬虫
- 写了这个爬虫之后发现获取到的数据仍然并不理想,故放弃
import csv
import os
import random
import time
import urllib
import urllib.parse

import pandas as pd
from lxml import etree
from selenium import webdriver
def get_page(depth):
    """Crawl JD.com search result pages for the keyword and append items to CSV.

    Pages 1 .. depth-1 are fetched (``depth`` is an exclusive upper bound).
    Each page is rendered with Selenium, parsed by ``parse_page`` and written
    out by ``write_excel``. One failing page is logged and skipped so the
    crawl continues.
    """
    keyword = "糖果"
    base_url = 'https://search.jd.com/Search?keyword={}&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&stock=1&page={}'
    for page_num in range(1, depth):
        driver = None
        try:
            # JD numbers its result pages oddly: URL page param = 2*n - 1.
            start_url = base_url.format(urllib.parse.quote(keyword), page_num * 2 - 1)
            # Raw string so the Windows path backslashes are taken literally.
            driver = webdriver.Chrome(executable_path=r"E:\Google\Chrome\Application\chromedriver.exe")
            driver.get(start_url)
            # Scroll to the bottom to trigger lazy-loading of the lower results.
            driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
            time.sleep(10)
            html = etree.HTML(driver.page_source)
            write_excel(parse_page(html))
            print('爬取第' + str(page_num) + '页时成功!')
            # Random pause between pages to look less like a bot.
            time.sleep(random.randint(2, 6))
        except Exception:
            # Narrowed from a bare except (which also swallowed KeyboardInterrupt);
            # one bad page should not abort the whole crawl.
            print('爬取第' + str(page_num) + '页时出错!')
            continue
        finally:
            # The original leaked one Chrome process per page; always release it.
            if driver is not None:
                driver.quit()
def parse_page(html):
    """Yield one dict per product ``<li>`` in a rendered JD search page.

    ``html`` is an lxml element tree of the page source. Each yielded dict
    has the keys price / title / comment_num / shop / goods_url.

    Missing fields now fall back to '未知' instead of raising IndexError,
    which previously made the caller's except drop the entire page.
    """
    for node in html.xpath('//*[@id="J_goodsList"]/ul/li'):
        price = node.xpath('div/div[2]/strong/i/text()')
        comments = node.xpath('div/div[4]/strong/a/text()')
        href = node.xpath('div/div[1]/a/@href')
        yield {
            'price': price[0] if price else '未知',
            'title': get_title(node),
            'comment_num': comments[0] if comments else '未知',
            'shop': get_shop(node),
            # hrefs in the page source are protocol-relative ("//item.jd.com/..."),
            # hence the explicit scheme prefix.
            'goods_url': ('http://' + href[0]) if href else '未知',
        }
def get_title(item):
    """Join the title text fragments of a product node into one string.

    JD highlights the search keyword with <em> markup, which splits the
    title into several text nodes; they are re-joined with single spaces.
    """
    fragments = item.xpath('div/div[3]/a/em/text()')
    return ' '.join(fragments)
def get_shop(item):
    """Return the shop name of a product node, or '未知' (unknown) when absent.

    Third-party listings carry a shop link; first-party ones may not, so an
    empty xpath result is expected and mapped to the placeholder.
    """
    shop = item.xpath('div/div[5]/span/a/text()')
    # Truthiness replaces the non-idiomatic len(shop) == 0 check.
    return shop[0] if shop else '未知'
def write_excel(item):
    """Append parsed items to ./JongDong.csv.

    ``item`` is any iterable of row dicts (e.g. the parse_page generator).
    The header row is written only when the file does not yet exist — the
    original re-wrote it on every append, scattering header rows through
    the CSV — and the meaningless integer index column is omitted.
    """
    good_df = pd.DataFrame(item)
    path = './JongDong.csv'
    write_header = not os.path.exists(path)
    good_df.to_csv(path, mode='a', header=write_header, index=False, encoding='utf-8-sig')
def main():
    """Script entry point: crawl up to 49 search result pages."""
    get_page(50)


if __name__ == "__main__":
    main()
