## Scraping JD.com Data

```python
import re
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
# from config import *
import pymongo
import json
import time
from multiprocessing import Pool, Process
from threading import Thread

# MONGO_URL = 'localhost'
# MONGO_DB = 'taobao'
# MONGO_TABLE = 'product'

# PhantomJS options: skip images, enable the disk cache, and relax SSL checks.
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true', '--ignore-ssl-errors=true', '--ssl-protocol=TLSv1']

# KEYWORD = '美食'

# client = pymongo.MongoClient(MONGO_URL)
# db = client[MONGO_DB]

browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
# browser.implicitly_wait(15)
# browser = webdriver.PhantomJS()
wait = WebDriverWait(browser, 15)

browser.set_window_size(1400, 3000)

x = 1

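# Rate drives its own PhantomJS instance: get_good_rate() opens a product's
# comment page and returns the positive-review percentage text from the
# comment block.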
class Rate:
    def __init__(self):
        self.browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
        self.wait = WebDriverWait(self.browser, 15)
    def get_good_rate(self, url):
        # Skip unusually long comment URLs (likely promotional items rather
        # than regular product pages).
        if len(url) > 52:
            return -1
        self.browser.get(url)
        doc = pq(self.browser.page_source)
        if not doc('#comment > div.mc > div.comment-info.J-comment-info > div.comment-percent > div'):
            print('no')
            # The comment block is lazy-loaded: scroll down to trigger it,
            # then fall through to the explicit wait below.
            self.browser.execute_script("window.scrollBy(0,6000)")
            time.sleep(2)
        else:
            print('yes')
            return doc('#comment > div.mc > div.comment-info.J-comment-info > div.comment-percent > div').text()
            
        rate = self.wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#comment > div.mc > div.comment-info.J-comment-info > div.comment-percent > div'))
            )
        return rate.text

r = Rate()


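# search() opens jd.com, submits the search keyword, waits for the result list
# to render, extracts each product (rank, price, title, comment count/URL,
# brand, positive-review rate) and appends the batch to data.txt as JSON.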
def search():
    global x
    global r
    print('searching...')
    try:
        browser.get('https://www.jd.com')
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#key'))
        )
        print('input')
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#search > div > div.form > button')))
        # submit = wait.until(
        #     EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.s')))
        input.send_keys('空气净化器')  # search keyword: "air purifier"
        submit.click()
        wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#J_goodsList > ul > li:nth-child(30)'))
        )
        print('..')
        doc = pq(browser.page_source)
        # with open('page.txt', 'w', encoding='utf-8') as f:
        #     f.write(doc.text())
        items = doc('.gl-item')
        print(len(items))
        data = []
        for rank, item in enumerate(items):
            item = pq(item)
            print(x)
            product = {
                'rank': x,
                'price': item('.p-price i').text(),
                'title': item('.p-name em').text(),
                'comment_cnt': item('.p-commit>strong a').text(),
                'comment_url': 'https:' + item('.p-commit>strong a').attr.href
            }
            product['brand'] = product['title'].split('\n')[0]
            good_rate = r.get_good_rate(product['comment_url'])
            product['good_rate'] = good_rate
            data.append(product)
            x += 1
        with open('data.txt', 'a', encoding='utf-8') as f:
            f.write(json.dumps(data, indent=2, ensure_ascii=False))
        
        # for i in range(2,101):
        #     next_page(i)
    except TimeoutException:
        return False


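# next_page() jumps to the given result page via the page-number box at the
# bottom of the listing, then extracts and saves products the same way as
# search().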
def next_page(page_number):
    global x
    global r
    print('jumping to page', page_number)
    try:
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > input'))
        )
        submit = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > a')))
        input.clear()
        input.send_keys(page_number)
        submit.click()
        wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#J_goodsList > ul > li:nth-child(30)'))
        )
        print('..')
        # browser.execute_script("window.scrollBy(0,10000)")
        # time.sleep(2)
        # wait.until(
        #     EC.presence_of_element_located((By.CSS_SELECTOR, '#J_goodsList > ul > li:nth-child(60)'))
        # )
        doc = pq(browser.page_source)
        items = doc('.gl-item')
        print(len(items))
        data = []
        for rank, item in enumerate(items):
            item = pq(item)
            print(x)
            product = {
                'rank': x,
                'price': item('.p-price i').text(),
                'title': item('.p-name em').text(),
                'comment_cnt': item('.p-commit>strong a').text(),
                'comment_url': 'https:' + item('.p-commit>strong a').attr.href
            }
            product['brand'] = product['title'].split('\n')[0]
            good_rate = r.get_good_rate(product['comment_url'])
            product['good_rate'] = good_rate
            data.append(product)
            x += 1
        with open('data.txt', 'a', encoding='utf-8') as f:
            f.write(json.dumps(data, indent=2, ensure_ascii=False))
    except Exception as e:
        print(e)
        # On any failure, retry the same page (note: no retry limit).
        next_page(page_number)


# def save_to_mongo(result):
#     try:
#         if db[MONGO_TABLE].insert(result):
#             print('saved to MongoDB:', result)
#     except Exception:
#         print('failed to save to MongoDB:', result)


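# main() expects search() to return the total page count as text, but the
# current search() returns nothing on success, so main() would fail at the
# regex; the __main__ block below therefore calls search() and next_page()
# directly instead.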
def main():      
    try:
        total = search()
        total = int(re.compile(r'(\d+)').search(total).group(1))
        for i in range(2, total + 1):
            next_page(i)
    except Exception as e:
        print('something went wrong')
        print(e)
    finally:
        browser.close()

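# Driver: scrape the first result page, then pages 2-4. Each next_page() call
# runs in a Thread that is joined immediately, so pages are still processed
# one at a time (the Pool/Process variants are left commented out).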
if __name__ == '__main__':
    # main()
    search()
    for i in range(2, 5):
        # time.sleep(1)
        t = Thread(target=next_page, args=(i,))
        t.start()
        t.join()
        # next_page(i)
        # p = Process(target=next_page, args=(i,))
        # p.start()
        # p.join()
    # pool = Pool()
    # pool.map(next_page, [i for i in range(2,101)])
    # pool.close()
    # pool.join()
```
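One practical note on the setup above: PhantomJS support was deprecated in Selenium and removed in Selenium 4, so on a current installation the script needs a different headless browser. Below is a minimal sketch of the swap, assuming headless Chrome with a working chromedriver; only the browser construction changes, the rest of the script stays the same.

```python
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

# Hypothetical replacement for the PhantomJS setup: headless Chrome.
options = webdriver.ChromeOptions()
options.add_argument('--headless')                            # no visible window
options.add_argument('--blink-settings=imagesEnabled=false')  # roughly PhantomJS --load-images=false

browser = webdriver.Chrome(options=options)  # assumes chromedriver is on PATH or managed by Selenium
wait = WebDriverWait(browser, 15)
browser.set_window_size(1400, 3000)
```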
### Methods for Scraping JD.com Data in Real Time

Real-time scraping of JD.com data can be implemented in several ways, most of them built on Python crawling techniques and the libraries that support them. The two approaches below use Scrapy and BeautifulSoup respectively.

#### Method 1: The Scrapy framework

Scrapy is a powerful crawling framework suited to complex scraping jobs: it automates request scheduling, response parsing, and data persistence [^3]. A basic workflow for scraping JD.com with Scrapy looks like this:

1. **Install the required libraries**

   Install Scrapy and its dependencies first:

   ```bash
   pip install scrapy requests beautifulsoup4 lxml
   ```

2. **Create the Scrapy project and spider**

   Create a new Scrapy project and define a Spider class:

   ```python
   import scrapy
   from bs4 import BeautifulSoup


   class JDProductSpider(scrapy.Spider):
       name = 'jd_products'
       allowed_domains = ['www.jd.com']

       def __init__(self, *args, **kwargs):
           super(JDProductSpider, self).__init__(*args, **kwargs)
           self.start_urls = [f'https://search.jd.com/Search?keyword={self.keyword}&enc=utf-8&page=1']

       def parse(self, response):
           soup = BeautifulSoup(response.text, 'lxml')
           products = soup.find_all('div', {'class': 'gl-i-wrap'})
           for product in products:
               title_element = product.find('div', {'class': 'p-name'}).find('em')
               price_element = product.find('div', {'class': 'p-price'}).find('i')
               if title_element and price_element:
                   yield {
                       'title': title_element.get_text(strip=True),
                       'price': price_element.get_text(strip=True),
                   }
           next_page_url = response.css('.pn-next::attr(href)').get()
           if next_page_url:
               yield response.follow(next_page_url, callback=self.parse)
   ```

   In this code, `start_urls` is generated dynamically from the search keyword, and `parse` extracts the product information from each result page in turn [^4].

3. **Run the spider**

   Start the crawl to begin collecting data:

   ```bash
   scrapy crawl jd_products -a keyword="笔记本电脑"
   ```

#### Method 2: Requests plus BeautifulSoup

If a full crawling framework is not needed, a lighter-weight option is to combine Requests and BeautifulSoup and fetch the pages by hand [^2].

1. **Send the HTTP request**

   Use Requests to issue a GET request to the target site and obtain the HTML source.

   ```python
   import requests
   from bs4 import BeautifulSoup

   headers = {
       'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
   }
   search_keyword = "手机壳"
   url = f"https://search.jd.com/Search?keyword={search_keyword}"
   res = requests.get(url, headers=headers)
   ```

2. **Parse the HTML document**

   Hand the returned content to BeautifulSoup to extract the useful fields.

   ```python
   soup = BeautifulSoup(res.content.decode(), 'lxml')
   items = soup.select(".gl-item")
   results = []
   for item in items[:10]:
       try:
           title_tag = item.select_one("div.p-name em").text.strip().replace("\n", "")
           price_tag = item.select_one("div.p-price i").text.strip()
           link_tag = item.a['href']
           result = {"title": title_tag, "price": price_tag, "link": link_tag}
           results.append(result)
       except Exception as e:
           continue
   print(results)
   ```

Both approaches can meet the requirement of scraping JD.com product data in real time; which one to pick depends on the actual use case and personal preference.

---

### Caveats

To keep the crawler legal and compliant, pay attention to the following points during design (a throttling sketch follows this list):

- Do not violate the target site's terms of service.
- Limit the request rate so the crawl does not put excessive load on the server.
- Store the collected data securely so that no private information is leaked.
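To make the throttling point concrete, here is a minimal sketch, assuming the Requests-based approach from Method 2; the `fetch_page` helper and the one-to-three-second delay range are illustrative choices, not part of the original article.

```python
import random
import time

import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # use a realistic User-Agent in practice
MIN_DELAY, MAX_DELAY = 1.0, 3.0          # assumed polite delay range, in seconds

session = requests.Session()

def fetch_page(url):
    """Fetch one page, then pause for a random interval to throttle the crawl."""
    res = session.get(url, headers=HEADERS, timeout=10)
    time.sleep(random.uniform(MIN_DELAY, MAX_DELAY))
    return res.text

# Example: fetch the first three result pages for one keyword, politely.
for page in range(1, 4):
    html = fetch_page(f'https://search.jd.com/Search?keyword=手机壳&page={page}')
    print(len(html))
```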