As the title says: a quick-and-dirty scraper thrown together with Python + pyppeteer, fairly simple and recorded here purely as a memo to myself.
I won't be answering any questions about it.
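Judging from the imports, you'd need pyppeteer, pyquery, beautifulsoup4 and openpyxl (e.g. pip install pyppeteer pyquery beautifulsoup4 openpyxl), plus a local Chromium build for executable_path to point at.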
import asyncio, datetime, io, os, re, sys
import openpyxl
import pyppeteer
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
# Force stdout to UTF-8 so Chinese text prints cleanly on the Windows console
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
# Local Chromium build for pyppeteer to drive
executable_path = "C:\\chrome-win32-chromium-588429\\chrome.exe"
noveltype = '华人'  # category prefix for the output folder, e.g. 华人作家列表
def appeartimes(string, s):
    """Count non-overlapping occurrences of s in string."""
    return string.count(s)
def complete(s):
    """Zero-pad a number to two digits."""
    return f"{s:02d}"
def now():
    """Current time as 'YYYY-MM-DD HH:MM:SS'."""
    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
def filter(s):
    """Strip HTML tags from s; pass None through unchanged. (Shadows the builtin filter.)"""
    try:
        return re.sub(r'</?\w+[^>]*>', '', s)
    except TypeError:  # s is None when a selector matched nothing
        return s
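# Stage 1: scrape the author index page, create a folder per index group and
# per author, and record index / author / homepage URL rows in 任务总列表.xlsx.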
async def get_liebiao(url):
    # Launch the browser
browser = await pyppeteer.launch(
executablePath=executable_path,
headless=True,
userDataDir='D:\\temporary',
args=['--no-sandbox'])
page = await browser.newPage()
await page.goto(url)
content = await page.content()
    _ = pq(content)
liebiao = os.getcwd() + "\\" + _('#main font').eq(0).html()
print("开始爬取:"+liebiao)
recorder = os.getcwd() + "\\任务总列表.xlsx"
    if not os.path.exists(recorder):
        openpyxl.Workbook().save(recorder)  # create an empty workbook on first run
wb = openpyxl.load_workbook(recorder)
ws = wb.active
if not os.path.exists(liebiao):
print("创建"+liebiao+"文件夹...")
os.mkdir(liebiao)
trslen = len(_("#main tr"))
row = 1
for i in range(1,trslen):
suoyin = _("#main tr:eq("+str(i)+") td:eq(0)").html()
if not os.path.exists(liebiao + "\\" + suoyin):
print("在" + liebiao + " 下创建索引文件夹: " + suoyin + " ...")
os.mkdir(liebiao + "\\" + suoyin)
all_a = _("#main tr:eq("+str(i)+") a")
for a in all_a.items():
zuojia = a.html()
href = 'https://www.kanunu8.com/' + a.attr('href')
if not os.path.exists(liebiao + "\\" + suoyin + "\\" + zuojia):
print("在" + suoyin + " 下创建作家文件夹: " + zuojia + " ...")
os.mkdir(liebiao + "\\" + suoyin + "\\" + zuojia)
ws.cell(row,1,suoyin)
ws.cell(row,2,zuojia)
ws.cell(row,3,href)
row += 1
wb.save(recorder)
    # Close the browser
await browser.close()
return 1
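# Stage 2: visit each author URL from 任务总列表.xlsx, save the author's 简介
# (bio) to 简介.txt, and record the novel list (section / title / URL) in a
# per-author workbook under the 任务总列表 folder.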
async def get_zuojia():
browser = await pyppeteer.launch(
executablePath=executable_path,
headless=True,
userDataDir='D:\\temporary',
args=['--no-sandbox'])
page = await browser.newPage()
recorder = os.getcwd() + "\\任务总列表.xlsx"
wb = openpyxl.load_workbook(recorder)
ws = wb.active
liebiao = os.getcwd() + "\\" + '任务总列表'
if not os.path.exists(liebiao):
print("创建 " + liebiao + " 文件夹...")
os.mkdir(liebiao)
for row in range(1,ws.max_row+1):
url = ws.cell(row,3).value
await page.goto(url)
content = await page.content()
        _ = pq(content)
        jianjie = filter(_("table:eq(8) table:eq(1) p").html()) or ''
txt = os.getcwd() + "\\"+noveltype+"作家列表\\" + ws.cell(row,1).value + "\\" + ws.cell(row,2).value + "\\简介.txt"
if not os.path.exists(txt):
with open(txt,"w",encoding='utf-8') as f:
f.write(jianjie)
print(ws.cell(row,1).value+" "+ws.cell(row,2).value+" 简介记录完成...",flush=True)
target = _("table:eq(8) table:eq(1) table tr")
recorder = liebiao + "\\" + ws.cell(row,1).value + " " + ws.cell(row,2).value + ".xlsx"
        if not os.path.exists(recorder):
            openpyxl.Workbook().save(recorder)  # create an empty workbook on first run
wbi = openpyxl.load_workbook(recorder)
wsi = wbi.active
fenlei = ''
i = 1
for t in target.items():
            if t.find("a").html() is None:
                # Header row: the section name lives in <strong>, <span>, or a bare <td>
                if t.find("strong"):
                    newfenlei = filter(t.find("strong").html().replace("[TXT全文]","").replace("在线阅读","").replace(" ","").strip())
                else:
                    try:
                        newfenlei = filter(t.find("span").html().replace("[TXT全文]","").replace("在线阅读","").replace(" ","").strip())
                    except AttributeError:
                        try:
                            newfenlei = filter(t.find("td").html().replace("[TXT全文]","").replace("在线阅读","").replace(" ","").strip())
                        except AttributeError:
                            newfenlei = ""  # some <tr> rows on the site are simply empty; nothing to extract
if newfenlei != "":
fenlei = newfenlei
else:
all_a = t.find("a")
for a in all_a.items():
if a.find("img"):
continue
xiaoshuo = filter(a.html()).replace("《","").replace("》","").replace("[TXT全文]","").replace("在线阅读","").strip()
href = "https://www.kanunu8.com" + a.attr('href')
wsi.cell(i,1,fenlei)
wsi.cell(i,2,xiaoshuo)
wsi.cell(i,3,href)
i += 1
wbi.save(recorder)
print(ws.cell(row,1).value + " " + ws.cell(row,2).value + " 小说列表记录完成...",flush=True)
await browser.close()
print('完成',flush=True)
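# Stage 3: walk the per-author workbooks from stage 2 and download the works.
# A single-page novel becomes one .txt; a multi-chapter novel gets a folder
# with one numbered .txt per chapter, grouped by section.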
async def get_zuopin():
browser = await pyppeteer.launch(
executablePath=executable_path,
headless=True,
userDataDir='D:\\temporary',
args=['--no-sandbox'])
page = await browser.newPage()
page2 = await browser.newPage()
missions = os.getcwd() + "\\任务总列表"
paqu = os.getcwd() + "\\"+noveltype+"作家列表"
if not os.path.exists(paqu):
print(now()+" "+ "创建"+paqu+"文件夹...",flush=True)
os.mkdir(paqu)
    def fuckfile(f):
        # Crude skip list: ignore task files whose name contains '0'
        # or the author 匪我思存
        if '0' in f or "匪我思存" in f:
            return False
        return True
print(now()+" "+ "任务开始...",flush=True)
for root, dirs, files in os.walk(missions):
for file in files:
if fuckfile(file):
wb = openpyxl.load_workbook(missions+"\\"+file)
ws = wb.active
name = file.split(".")[0]
print(now()+" "+ '开始下载作家:'+" "+name + " 的作品...",flush=True)
paqupath = paqu + "\\" + name.split(" ")[0]
if not os.path.exists(paqupath):
os.mkdir(paqupath)
paqupath = paqupath + "\\" + name.split(" ")[1]
if not os.path.exists(paqupath):
os.mkdir(paqupath)
                #if ws.max_row < 2:  # even when the sheet is empty, openpyxl reports max_row as 1
                #    continue
for i in range(1,ws.max_row+1):
print(now()+" "+ file + "共"+str(ws.max_row)+"部,开始下载第"+str(i)+"部,"+re.sub(r"[\/\\\:\*\?\"\<\>\|]","_",ws.cell(i,2).value)+"...",flush=True)
if ws.cell(i,1).value=='' or ws.cell(i,1).value is None:
ws.cell(i,1,'作品')
zimulu = paqupath + "\\" + re.sub(r"[\/\\\:\*\?\"\<\>\|]","_",ws.cell(i,1).value)
if not os.path.exists(zimulu):
os.mkdir(zimulu)
txt = zimulu + "\\" + re.sub(r"[\/\\\:\*\?\"\<\>\|]","_",ws.cell(i,2).value) + ".txt"
                    if (not os.path.exists(txt)) and (appeartimes(ws.cell(i,3).value,'http')<2):  # skip rows whose URL field contains more than one link
await page.goto(ws.cell(i,3).value,{'timeout': 1000*60*10})
content = await page.content()
                        _ = pq(content)
if _("p").parents('table').html() is not None:
neirong = BeautifulSoup(_("p").parents('table').html(),"html.parser").text
with open(txt,"w",encoding='utf-8') as f:
f.write(neirong)
else:
zuopinmulu = zimulu + "\\" + re.sub(r"[\/\\\:\*\?\"\<\>\|]","_",ws.cell(i,2).value)
if not os.path.exists(zuopinmulu):
os.mkdir(zuopinmulu)
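                            # The chapter index comes in several page layouts;
                            # try each selector in turn until one matches.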
target = _("table:eq(8) tr:eq(3) table:eq(1) tr")
if (target.html() is None):
target = _("table:eq(8) table:eq(3) table:eq(1) tr")
if (target.html() is None):
target = _(".book dl dd")
if (target.html() is None):
target = _(".book table:eq(1) tr")
inner = ''
fenlei = '正文'
newfenlei = "正文"
count = 1
for t in target.items():
                                if t.find("a").html() is None:
                                    newfenlei = ""
                                    if t.find("strong"):
                                        newfenlei = filter(t.find("strong").html().replace("[TXT全文]","").replace("在线阅读","").replace(" ","").strip())
                                    elif t.find("span"):
                                        newfenlei = filter(t.find("span").html().replace("[TXT全文]","").replace("在线阅读","").replace(" ","").strip())
                                    elif t.find("td"):
                                        newfenlei = filter(t.find("td").html().replace("[TXT全文]","").replace("在线阅读","").replace(" ","").strip())
                                    else:
                                        newfenlei = "正文"  # some <tr> rows are simply empty; fall back to the default section name
                                    if newfenlei != "":
                                        fenlei = newfenlei
                                        inner = zuopinmulu + "\\" + re.sub(r"[\/\\\:\*\?\"\<\>\|]","_",fenlei)
                                        if not os.path.exists(inner):
                                            os.mkdir(inner)
                                        print(now()+" "+ "开始下载 " + file + ' ' + ws.cell(i,1).value + ' ' + ws.cell(i,2).value +fenlei+" ...",flush=True)
else:
all_a = t.find("a")
for a in all_a.items():
mingzi = re.sub(r"[\/\\\:\*\?\"\<\>\|]","_",filter(a.html().strip()))
innertxt = inner + "\\" + str(count) + " " + mingzi + ".txt"
if not os.path.exists(innertxt):
print(now()+" "+ "正在下载 " + file + ' ' + ws.cell(i,1).value + ' ' + ws.cell(i,2).value +' '+ fenlei + " " + mingzi + "...",flush=True)
href = ws.cell(i,3).value
                                            if '.html' in href:
                                                # chapter links are relative; keep only the directory part of the novel URL
                                                href = href[:href.rfind('/') + 1]
href = href + a.attr("href")
#print(href)
await page2.goto(href,{'timeout': 1000*60*10})
content2 = await page2.content()
                                            _2 = pq(content2)
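                                            # Chapter text also varies by layout: a <table> wrapping <p> tags,
                                            # a #content div, #Article .text, or a positional table cell.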
if _2("p").parents('table').html() is not None:
innerneirong = BeautifulSoup(_2("p").parents('table').html(),"html.parser").text
elif _2("#content").html() is not None:
innerneirong = BeautifulSoup(_2("#content").html(),"html.parser").text
elif _2("#Article").html() is not None:
                                                fuck = _2("#Article .text").html()  # no idea why this can't go straight into BeautifulSoup; maybe a timing issue?
innerneirong = BeautifulSoup(fuck,"html.parser").text
else:
fuck = _2("body table:eq(4) td:eq(1)").html()
innerneirong = BeautifulSoup(fuck,"html.parser").text
with open(innertxt,"w",encoding='utf-8') as f:
f.write(innerneirong)
count +=1
print(now()+" " + file + ' ' + "第"+str(i)+"部下载完毕...",flush=True)
print(now()+" "+ "作家 "+file+" 全部作品下载完毕...",flush=True)
await browser.close()
# Driver: only stage 3 (get_zuopin) is launched here; stages 1 and 2 were
# presumably run the same way beforehand. Note url is not actually passed in.
url_list = ["https://www.kanunu8.com/author1.html"]
task = (get_zuopin() for url in url_list)
loop = asyncio.get_event_loop()
results = loop.run_until_complete(asyncio.gather(*task))
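For completeness, a minimal sketch of chaining the three stages end to end (my assumed order; run_all is a hypothetical helper, not part of the original script):

# Hypothetical end-to-end driver (assumed order: list -> authors -> works)
async def run_all(url):
    await get_liebiao(url)   # stage 1: build 任务总列表.xlsx
    await get_zuojia()       # stage 2: record each author's novel list
    await get_zuopin()       # stage 3: download every novel

# asyncio.get_event_loop().run_until_complete(run_all("https://www.kanunu8.com/author1.html"))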