Deploying selenium+chromedriver to a cloud host and scraping stock comments from Eastmoney (Part 3)

This is the third part of the series on scraping Eastmoney stock comments with selenium+chromedriver. It covers the basic deployment on a cloud host and adds a multithreading optimization to improve throughput; part of the selenium code has also been replaced with requests to speed up crawling. Run results on Windows and Linux are shown, and the full source code is provided.

For the basic deployment, see part one of this series:

selenium+chromedriver deployed to a cloud host to scrape stock comments from Eastmoney (Part 1)

https://blog.youkuaiyun.com/qq_41375702/article/details/95978618

What's new in this part:

  • Added multithreading; six threads are currently used.
  • Replaced part of the selenium code with requests, which improves runtime (see the sketch after this list).
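
Fetching a static list page does not need a rendered browser; a plain HTTP GET returns the same HTML far faster. A minimal sketch of the swap, using one illustrative Guba list URL (both patterns appear in the full source below):

import requests
from selenium import webdriver

url = 'http://guba.eastmoney.com/list,hk00700.html'  # illustrative list page

# Before: render the page in headless Chrome just to read its HTML
option = webdriver.ChromeOptions()
option.add_argument('--headless')
driver = webdriver.Chrome(options=option)
html = None
try:
    driver.get(url)
    html = driver.page_source
finally:
    driver.quit()

# After: one HTTP request fetches the same static HTML
html = requests.get(url).text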

Run result on Windows:

[screenshot]

Run result on Linux:

[screenshot]

Downloaded files:

[screenshot]

Source code:

import re
import requests
import os
import time
from selenium import webdriver
import platform
from threading import Thread

class Win():
    def __init__(self):
        pass

    # Fetch the HTML of a comment list page. The page is static, so a plain
    # requests GET is enough; the old selenium version is kept for reference.
    def get_general_page_html(self, url):
        '''
        option = webdriver.ChromeOptions()
        option.add_argument('--headless')
        driver = webdriver.Chrome(options=option)
        driver.get(url)
        '''
        html = requests.get(url).text
        return html

    # Number of comment pages; headless Chrome is still used for this page
    def get_num(self, url):
        option = webdriver.ChromeOptions()
        option.add_argument('--headless')
        driver = webdriver.Chrome(options=option)
        driver.get(url)
        html = driver.page_source
        driver.quit()
        page = re.findall(r'sumpage.*?>(\d+)</span>', html, re.S)
        try:
            num = int(page[0])
        except (IndexError, ValueError):
            print('get_num: failed to parse the page count, defaulting to 1')
            num = 1
        return num

    # Extract the relative URLs of the comment threads from a list page
    def get_short_url_to_construct(self, html):
        pattern = re.compile(r'articleh.*?l3 a3.*?href="/(.*?)" title', re.S)
        url_list = re.findall(pattern, html)
        return url_list

    # Write the scraped comments to a txt file
    def write_to_file(self, comment, path, filename):
        tot_name = os.path.join(path, filename)
        with open(tot_name + '.txt', 'w+', encoding='utf-8') as f:
            f.write(comment.strip())
        print(tot_name + '.txt: written!')

    # Visit every comment thread on a list page and collect the short comments
    def get_comment(self, shorturl, base_url_list):
        url_list = [shorturl + one for one in base_url_list]
        comm_list = []
        for url in url_list:
            html = requests.get(url).text
            pattern = re.compile('short_text">(.*?)<', re.S)
            results = re.findall(pattern, html)
            for one in results:
                comm_list.append(one.strip())
        return comm_list

    # Scrape one list page of comments and save them to a file
    def per_run(self, location, path, filename):
        url = 'http://guba.eastmoney.com/list,{location}{filename}.html'.format(location=location, filename=filename)
        html = self.get_general_page_html(url)
        print('Got the html of page:', url)
        base_url_list = self.get_short_url_to_construct(html)
        comment = self.get_comment(shorturl='http://guba.eastmoney.com/', base_url_list=base_url_list)
        strcom = '\n'.join(comment)
        self.write_to_file(comment=strcom, path=path, filename=filename)

    # Scrape all comments for every configured stock
    def Run(self):
        start = time.time()
        print('----------info------------')
        shareDict = {}
        '''
           Stocks that can be configured:
           hk:
               00700 Tencent
               02331 Li-Ning
               01810 Xiaomi
               03690 Meituan-Dianping

           us:
               BIDU  Baidu
               baba  Alibaba
               NKE   Nike
               googl Google
        '''

        shareDict['hk'] = ['00700']
        shareDict['us'] = ['baba', 'BIDU']
        print('About to scrape:')
        for i in shareDict.items():
            print(i)
        for location in shareDict.keys():
            print('stock exchange:', location)
            for name in shareDict[location]:
                if os.path.exists(name):
                    print('Directory', name, 'already exists')
                else:
                    print('Creating directory:', name)
                    os.mkdir(name)
                print('stock:', name)
                pageNum = self.get_num(url='http://guba.eastmoney.com/list,' + location + name + '.html')

                print('Total comment pages:', pageNum)
                # Split pages 1..pageNum into six contiguous chunks, one per
                # thread. max(1, ...) keeps every boundary at page 1 or later,
                # and the final pageNum + 1 makes sure the last page (which the
                # original boundary arithmetic dropped) is included.
                bounds = [max(1, pageNum * k // 6) for k in range(6)] + [pageNum + 1]

                # Each worker scrapes the half-open page range [lo, hi)
                def thread_run(lo, hi):
                    for i in range(lo, hi):
                        synx = name + '_' + str(i)
                        self.per_run(location=location, path=name, filename=synx)

                threads = [Thread(target=thread_run, args=(bounds[k], bounds[k + 1]))
                           for k in range(6)]
                for t in threads:
                    t.start()
                for t in threads:
                    t.join()
            

        print('Total time:', time.time() - start, 's')


class Linux(Win):
    def __init__(self, path):
        self.path = path

    # Fetch the HTML of a comment list page
    def get_general_page_html(self, url):
        html = requests.get(url).text
        return html

    # Number of comment pages (--no-sandbox is required when Chrome runs as root)
    def get_num(self, url):
        option = webdriver.ChromeOptions()
        option.add_argument('--no-sandbox')
        option.add_argument('--headless')
        driver = webdriver.Chrome(executable_path=self.path, options=option)
        driver.get(url)
        html = driver.page_source
        driver.quit()
        page = re.findall(r'sumpage.*?>(\d+)</span>', html, re.S)
        try:
            num = int(page[0])
        except (IndexError, ValueError):
            print('get_num: failed to parse the page count, defaulting to 1')
            num = 1
        return num



if __name__ == '__main__':
    platf = platform.platform()
    if 'Windows' in platf:
        crawl = Win()
        crawl.Run()
    elif 'Linux' in platf:
        path = input('Path to chromedriver (default: /root/sel/chromedriver): ').strip()
        if len(path) == 0:
            path = '/root/sel/chromedriver'
        crawl = Linux(path=path)
        crawl.Run()
        # Optional post-processing: pack the results into all.zip
        # try:
        #     os.popen('mkdir all')
        #     os.popen('mv *[^all] all')
        #     os.popen('mv all/df.py .')
        #     os.popen('zip all.zip all -r')
        #     os.popen('rm all -rf')
        # except:
        #     print('os.popen error')
    else:
        print(platf)

# Measured runtimes: 3 threads: 6.875 s, 6 threads: 3.229 s
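
The hand-rolled six-thread split above can also be written with the standard library's thread pool, which avoids computing page boundaries by hand and balances the load automatically. A minimal alternative sketch, not the code used in this article; `scrape_all_pages` and its arguments are illustrative:

from concurrent.futures import ThreadPoolExecutor

def scrape_all_pages(crawler, location, name, page_num, workers=6):
    # Submit one task per page; the pool keeps `workers` threads busy.
    with ThreadPoolExecutor(max_workers=workers) as pool:
        for i in range(1, page_num + 1):
            pool.submit(crawler.per_run,
                        location=location,
                        path=name,
                        filename=name + '_' + str(i))

# Hypothetical usage:
# scrape_all_pages(Win(), location='hk', name='00700', page_num=12)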