As the title says: a quick-and-dirty scraper thrown together with Python + pyppeteer, fairly simple and recorded here purely as a memo to myself.
I won't be answering any questions about it.
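Judging from the imports, you'd need pyppeteer, pyquery, beautifulsoup4 and openpyxl (e.g. pip install pyppeteer pyquery beautifulsoup4 openpyxl), plus a local Chromium build for executable_path to point at.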
import asyncio, datetime, io, os, re, sys
import openpyxl
import pyppeteer
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
# Force stdout to UTF-8 so Chinese text prints cleanly on the Windows console
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
# Local Chromium build for pyppeteer to drive
executable_path = "C:\\chrome-win32-chromium-588429\\chrome.exe"
noveltype = '华人'  # category prefix for the output folder, e.g. 华人作家列表
def appeartimes(string, s):
    """Count non-overlapping occurrences of s in string."""
    return string.count(s)
def complete(s):
    """Zero-pad a number to two digits."""
    return f"{s:02d}"
def now():
    """Current time as 'YYYY-MM-DD HH:MM:SS'."""
    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
def filter(s):
    """Strip HTML tags from s; pass None through unchanged. (Shadows the builtin filter.)"""
    try:
        return re.sub(r'</?\w+[^>]*>', '', s)
    except TypeError:  # s is None when a selector matched nothing
        return s
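# Stage 1: scrape the author index page, create a folder per index group and
# per author, and record index / author / homepage URL rows in 任务总列表.xlsx.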
async def get_liebiao(url):
    # Launch the browser
browser = await pyppeteer.launch(
executablePath=executable_path,
headless=True,
userDataDir='D:\\temporary',
args=['--no-sandbox'])
page = await browser.newPage()
await page.goto(url)
content = await page.content()
    _ = pq(content)
liebiao = os.getcwd() + "\\" + _('#main font').eq(0).html()
print("开始爬取:"+liebiao)
recorder = os.getcwd() + "\\任务总列表.xlsx"
    if not os.path.exists(recorder):
        openpyxl.Workbook().save(recorder)  # create an empty workbook on first run
wb = openpyxl.load_workbook(recorder)
ws = wb.active
if not os.path.exists(liebiao):
print("创建"+liebiao+"文件夹...")
os.mkdir(liebiao)
trslen = len(_("#main tr"))
row = 1
for i in range(1,trslen):
suoyin = _("#main tr:eq("+str(i)+") td:eq(0)").html()
if not os.path.exists(liebiao + "\\" + suoyin):
print("在" + liebiao + " 下创建索引文件夹: " + suoyin + " ...")
os.mkdir(liebiao + "\\" + suoyin)
all_a = _("#main tr:eq("+str(i)+") a")
for a in all_a.items():
zuojia = a.html()
href = 'https://www.kanunu8.com/' + a.attr('href')
if not os.path.exists(liebiao + "\\" + suoyin + "\\" + zuojia):
print("在" + suoyin + " 下创建作家文件夹: " + zuojia + " ...")
os.mkdir(liebiao + "\\" + suoyin + "\\" + zuojia)
ws.cell(row,1,suoyin)
ws.cell(row,2,zuojia)
ws.cell(row,3,href)
row += 1
wb.save(recorder)
    # Close the browser
await browser.close()
return 1
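# Stage 2: visit each author URL from 任务总列表.xlsx, save the author's 简介
# (bio) to 简介.txt, and record the novel list (section / title / URL) in a
# per-author workbook under the 任务总列表 folder.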
async def get_zuojia():
browser = await pyppeteer.launch(
executablePath=executable_path,
headless=True,
userDataDir='D:\\temporary',
args=['--no-sandbox'])
page = await browser.newPage()
recorder = os.getcwd() + "\\任务总列表.xlsx"
wb = openpyxl.load_workbook(recorder)
ws = wb.active
liebiao = os.getcwd() + "\\" + '任务总列表'
if not os.path.exists(liebiao):
print("创建 " + liebiao + " 文件夹...")
os.mkdir(liebiao)
for row in range(1,ws.max_row+1):
url = ws.cell(row,3).value
await page.goto(url)
content = await page.content()
        _ = pq(content)
        jianjie = filter(_("table:eq(8) table:eq(1) p").html()) or ''
txt = os.getcwd() + "\\"+noveltype+"作家列表\\" + ws.cell(row,1).value + "\\" + ws.cell(row,2).value + "\\简介.txt"
if not os.path.exists(txt):
with open(txt,"w",encoding='utf-8') as f:
f.write(jianjie)
print(ws.cell(row,1).value+" "+ws.cell(row,2).value+" 简介记录完成...",flush=True)
target = _("table:eq(8) table:eq(1) table tr")
recorder = liebiao + "\\" + ws.cell(row,1).value + " " + ws.cell(row,2).value + ".xlsx"
        if not os.path.exists(recorder):
            openpyxl.Workbook().save(recorder)  # create an empty workbook on first run
wbi = openpyxl.load_workbook(recorder)
wsi = wbi.active
fenlei = ''
i = 1
for t in target.items():
            if t.find("a").html() is None:
                # Header row: the section name lives in <strong>, <span>, or a bare <td>
                if t.find("strong"):
                    newfenlei = filter(t.find("strong").html().replace("[TXT全文]","").replace("在线阅读","").replace(" ","").strip())
                else:
                    try:
                        newfenlei = filter(t.find("span").html().replace("[TXT全文]","").replace("在线阅读","").replace(" ","").strip())
                    except AttributeError:
                        try:
                            newfenlei = filter(t.find("td").html().replace("[TXT全文]","").replace("在线阅读","").replace(" ","").strip())
                        except AttributeError:
                            newfenlei = ""  # some <tr> rows on the site are simply empty; nothing to extract
if newfenlei != "":
fenlei = newfenlei
else:
all_a = t.find("a")
for a in all_a.items():
if a.find("img"):
continue
xiaoshuo = filter(a.html()).replace("《","").replace("》","").replace("[TXT全文]","").replace("在线阅读","").strip()
href = "https://www.kanunu8.com" + a.attr('href')
wsi.cell(i,1,fenlei)
wsi.cell(i,2,xiaoshuo)
wsi.cell(i,3,href)
i += 1
wbi.save(recorder)
print(ws.cell(row,1).value + " " + ws.cell(row,2).value + " 小说列表记录完成...",flush=True)
await browser.close()
print('完成',flush=True)
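# Stage 3: walk the per-author workbooks from stage 2 and download the works.
# A single-page novel becomes one .txt; a multi-chapter novel gets a folder
# with one numbered .txt per chapter, grouped by section.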
async def get_zuopin():
browser = await pyppeteer.launch(
executablePath=executable_path,
headless=True,
userDataDir='D:\\temporary',
args=['--no-sandbox'])
page = await browser.newPage()
page2 = await browser.newPage()
missions = os.getcwd() + "\\任务总列表"
paqu = os.getcwd() + "\\"+noveltype+"作家列表"
if not os.path.exists(paqu):
print(now()+" "+ "创建"+paqu+"文件夹...",flush=True)
os.mkdir(paqu)
    def fuckfile(f):
        # Crude skip list: ignore task files whose name contains '0'
        # or the author 匪我思存
        if '0' in f or "匪我思存" in f:
            return False
        return True
print(now()+" "+ "任务开始...",flush=True)
for root, dirs, files in os.walk(missions):
for file in files:
if fuckfile(file):
wb = openpyxl.load_workbook(missions+"\\"+file)
ws = wb.active
name = file.split(".")[0]
print(now()+" "+ '开始下载作家:'+" "+name + " 的作品...",flush=True)
paqupath = paqu + "\\" + name.split(" ")[0]
if not os.path.exists(paqupath):
os.mkdir(paqupath)
paqupath = paqupath + "\\" + name.split(" ")[1]
if not os.path.exists(paqupath):
os.mkdir(paqupath)
                #if ws.max_row < 2:  # even when the sheet is empty, openpyxl reports max_row as 1
                #    continue
for i in range(1,ws.max_row+1):
print(now()+" "+ file + "共"+str(ws.max_row)+"部,开始下载第"+str(i)+"部,"+re.sub(r"[\/\\\:\*\?\"\<\>\|]","_",ws.cell(i,2).value)+"...",flush=True)
if ws.cell(i,1).value=='' or ws.cell(i,1).value is None:
ws.cell(i,1,'作品')
zimulu = paqupath + "\\" + re.sub(r"[\/\\\:\*\?\"\<\>\|]","_",ws.cell(i,1).value)
if not os.path.exists(zimulu):
os.mkdir(zimulu)
txt = zimulu + "\\" + re.sub(r"[\/\\\:\*\?\"\<\>\|]","_",ws.cell(i,2).value) + ".txt"
                    if (not os.path.exists(txt)) and (appeartimes(ws.cell(i,3).value,'http')<2):  # skip rows whose URL field contains more than one link
await page.goto(ws.cell(i,3).value,{'timeout': 1000*60*10})
content = await page.content()
                        _ = pq(content)
if _("p").parents('table').html() is not None:
neirong = BeautifulSoup(_("p").parents('table').html(),"html.parser").text
with open(txt,"w",encoding='utf-8') as f:
f.write(neirong)
else:
zuopinmulu = zimulu + "\\" + re.sub(r"[\/\\\:\*\?\"\<\>\|]","_",ws.cell(i,2).value)
if not os.path.exists(zuopinmulu):
os.mkdir(zuopinmulu)
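                            # The chapter index comes in several page layouts;
                            # try each selector in turn until one matches.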
target = _("table:eq(8) tr:eq(3) table:eq(1) tr")
if (target.html() is None):
target = _("table:eq(8) table:eq(3) table:eq(1) tr")
if (target.html() is None):
target = _(".book dl dd")
if (target.html() is None):
target = _(".book table:eq(1) tr")
inner = ''
fenlei = '正文'
newfenlei = "正文"
count = 1
for t in target.items():
                                if t.find("a").html() is None:
                                    newfenlei = ""
                                    if t.find("strong"):
                                        newfenlei = filter(t.find("strong").html().replace("[TXT全文]","").replace("在线阅读","").replace(" ","").strip())
                                    elif t.find("span"):
                                        newfenlei = filter(t.find("span").html().replace("[TXT全文]","").replace("在线阅读","").replace(" ","").strip())
                                    elif t.find("td"):
                                        newfenlei = filter(t.find("td").html().replace("[TXT全文]","").replace("在线阅读","").replace(" ","").strip())
                                    else:
                                        newfenlei = "正文"  # some <tr> rows are simply empty; fall back to the default section name
                                    if newfenlei != "":
                                        fenlei = newfenlei
                                        inner = zuopinmulu + "\\" + re.sub(r"[\/\\\:\*\?\"\<\>\|]","_",fenlei)
                                        if not os.path.exists(inner):
                                            os.mkdir(inner)
                                        print(now()+" "+ "开始下载 " + file + ' ' + ws.cell(i,1).value + ' ' + ws.cell(i,2).value +fenlei+" ...",flush=True)
else:
all_a = t.find("a")
for a in all_a.items():
mingzi = re.sub(r"[\/\\\:\*\?\"\<\>\|]","_",filter(a.html().strip()))
innertxt = inner + "\\" + str(count) + " " + mingzi + ".txt"
if not os.path.exists(innertxt):
print(now()+" "+ "正在下载 " + file + ' ' + ws.cell(i,1).value + ' ' + ws.cell(i,2).value +' '+ fenlei + " " + mingzi + "...",flush=True)
href = ws.cell(i,3).value
                                            if '.html' in href:
                                                # chapter links are relative; keep only the directory part of the novel URL
                                                href = href[:href.rfind('/') + 1]
href = href + a.attr("href")
#print(href)
await page2.goto(href,{'timeout': 1000*60*10})
content2 = await page2.content()
                                            _2 = pq(content2)
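                                            # Chapter text also varies by layout: a <table> wrapping <p> tags,
                                            # a #content div, #Article .text, or a positional table cell.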
if _2("p").parents('table').html() is not None:
innerneirong = BeautifulSoup(_2("p").parents('table').html(),"html.parser").text
elif _2("#content").html() is not None:
innerneirong = BeautifulSoup(_2("#content").html(),"html.parser").text
elif _2("#Article").html() is not None:
                                                fuck = _2("#Article .text").html()  # no idea why this can't go straight into BeautifulSoup; maybe a timing issue?
innerneirong = BeautifulSoup(fuck,"html.parser").text
else:
fuck = _2("body table:eq(4) td:eq(1)").html()
innerneirong = BeautifulSoup(fuck,"html.parser").text
with open(innertxt,"w",encoding='utf-8') as f:
f.write(innerneirong)
count +=1
print(now()+" " + file + ' ' + "第"+str(i)+"部下载完毕...",flush=True)
print(now()+" "+ "作家 "+file+" 全部作品下载完毕...",flush=True)
await browser.close()
# Driver: only stage 3 (get_zuopin) is launched here; stages 1 and 2 were
# presumably run the same way beforehand. Note url is not actually passed in.
url_list = ["https://www.kanunu8.com/author1.html"]
task = (get_zuopin() for url in url_list)
loop = asyncio.get_event_loop()
results = loop.run_until_complete(asyncio.gather(*task))
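For completeness, a minimal sketch of chaining the three stages end to end (my assumed order; run_all is a hypothetical helper, not part of the original script):

# Hypothetical end-to-end driver (assumed order: list -> authors -> works)
async def run_all(url):
    await get_liebiao(url)   # stage 1: build 任务总列表.xlsx
    await get_zuojia()       # stage 2: record each author's novel list
    await get_zuopin()       # stage 3: download every novel

# asyncio.get_event_loop().run_until_complete(run_all("https://www.kanunu8.com/author1.html"))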