爬虫——配合多线程的思路

本文介绍了一个使用 Python 实现的小说解析器:该解析器从指定文件夹读取 HTML 格式的小说章节,通过 PyQuery 提取标题和正文内容,然后将正文保存为文本文件。解析器采用多线程方式提高处理效率。
from pyquery import PyQuery as pq
import os
from queue import Queue
from threading import Thread
class txtparser(Thread):
    """Worker thread that turns saved HTML chapter files into text files.

    Each worker repeatedly takes a file path from the shared queue, reads the
    HTML, extracts the chapter title and body with PyQuery, and writes the
    body to ``<folder>/<title>.txt``. ``<folder>`` is the name of the
    directory that contained the HTML file and is created relative to the
    current working directory.
    """

    def __init__(self, queue):
        """Store the queue of HTML file paths this worker consumes."""
        Thread.__init__(self)
        self.queue = queue

    def run(self):
        """Process queued paths forever; intended to run as a daemon thread."""
        while True:
            content = self.queue.get()
            try:
                self._process(content)
            except Exception as exc:
                # One unreadable or unparsable file must not kill the worker.
                print("skipped {}: {!r}".format(content, exc))
            finally:
                # Every get() needs a matching task_done(); without it a
                # Queue.join() on this queue never returns.
                self.queue.task_done()

    @staticmethod
    def _read_html(path):
        """Return the text of *path*, trying UTF-8 first, then the locale default."""
        try:
            with open(path, "r", encoding="utf-8") as reader:
                return reader.read()
        except UnicodeDecodeError:
            # Not UTF-8: retry with the platform's default encoding.
            with open(path, "r") as reader:
                return reader.read()

    def _process(self, content):
        """Parse one HTML file and write its body text to the output folder."""
        html = self._read_html(content)
        doc = pq(html)
        title = doc("#main .content_read .box_con .bookname h1").text()
        print("标题=====", title)
        # Name of the directory holding the file; accept either separator.
        clipname = os.path.basename(os.path.dirname(content.replace("\\", "/")))
        passage = doc("#content").text()
        print("正文======", passage.replace("<br/>", ""))

        # exist_ok avoids the check-then-create race between worker threads.
        os.makedirs(clipname, exist_ok=True)
        target = os.path.join(clipname, title + ".txt")
        try:
            with open(target, "w", encoding="gbk") as writer:
                writer.write(passage)
            print("完成{}的写入".format(target))
        except (OSError, ValueError):
            # Bad file name or text not representable in GBK: record the
            # target, one per line, and carry on.
            with open("errorecorder.log", "a") as writer:
                writer.write(target + "\n")
        print("文件夹名称======", clipname)

def launchtxtparser(parentdir, num_workers=10):
    """Queue every file found one level below *parentdir* and parse them.

    Each immediate sub-directory of *parentdir* is scanned, and the full path
    of every regular file in it is put on a queue. ``num_workers`` daemon
    ``txtparser`` threads are then started to consume the queue.

    The call blocks on ``Queue.join()``, which returns only once
    ``task_done()`` has been called for every queued path.

    Args:
        parentdir: Root directory whose sub-directories hold the HTML files.
        num_workers: Number of worker threads to start (default 10).
    """
    rootdir = parentdir
    queue = Queue()
    print(rootdir)
    for i in os.listdir(rootdir):
        print(i)
        subdir = os.path.join(rootdir, i)
        if not os.path.isdir(subdir):
            continue
        print(subdir)
        # Iterate the listing directly so the first entry is queued too and
        # an empty folder is simply skipped.
        for filename in os.listdir(subdir):
            fullfilename = os.path.join(subdir, filename)
            if not os.path.isfile(fullfilename):
                # Nested folders cannot be opened as chapter files.
                continue
            queue.put(fullfilename)
            print(fullfilename)
        print("ooooophs~处理完毕")
    for _ in range(num_workers):
        cpc = txtparser(queue)
        cpc.daemon = True
        cpc.start()
    queue.join()
# Script entry: parse every book folder under the hard-coded root directory.
# A raw string keeps the backslash literal without relying on an unrecognised
# escape sequence.
launchtxtparser(r"E:\月关")

 

转载于:https://www.cnblogs.com/saintdingspage/p/10582296.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值