爬取京东（JD.com）搜索结果中商品的标题、店铺、价格、评价数以及链接，并存储为 Excel 文件。
静态页面解析起来比较简单，有时间再补上分析过程。
效果如下:
附上代码:
import requests, re, datetime
from bs4 import BeautifulSoup
import urllib
import xlsxwriter
import threading
# class myThread(threading.Thread):
# def __init__(self, html):
# threading.Thread.__init__(self)
# self.html = html
# data = []
# def run(self):
# self.data = parsePage(self.html)
#
# def get_data(self):
# return self.data
def generateURL(good, pages=5):
    """Build the JD search URLs for a keyword.

    Args:
        good: Search keyword; it is percent-encoded before being inserted
            into the ``keyword`` query parameter.
        pages: Number of URLs to produce.

    Returns:
        A generator yielding one URL string per requested page. The ``page``
        query parameter takes the odd values 1, 3, 5, ... (``pages`` of them).
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` is loaded, so ``urllib.parse.quote``
    # only works when some other module happens to have imported it first.
    from urllib.parse import quote

    url_str = quote(good)
    urls = (
        "https://search.jd.com/Search?keyword={}&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&offset=4&page={}&s=1&click=0".format(url_str, i)
        for i in range(1, pages * 2, 2)
    )
    return urls
def get_html(url):
try:
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
'ContentType': 'text/html; charset=utf-8',
'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Language': 'zh-CN,zh;q=0.8',
'Connection': 'keep-alive'
}
html = requests.get(url, headers