Added timing statistics, plus writing results directly to Excel. A few problems still need solving:
While scraping with XPath I found that some author names sit under a span and others under an a tag, and I couldn't figure out how to capture both in one query, so I'm posting here to ask. (One possible approach is sketched right below.)
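A sketch of one way to do it: XPath's union operator | merges the node sets of both branches and returns them in document order, so a-based and span-based authors come out interleaved in page order. Note the span[2] path is my guess at the anonymous-user markup, not something verified against the live page:

from lxml import etree

def extract_authors(html):
    tree = etree.HTML(html)
    # The union (|) keeps document order, so authors under <a> and
    # authors under <span> stay correctly interleaved.
    return [a.strip() for a in tree.xpath(
        '//div[@class]/div[1]/a[2]/h2/text()'
        ' | //div[@class]/div[1]/span[2]/h2/text()')]

Since the name ends up inside an h2 either way, matching the h2 directly (for example //div[@class]//h2/text()) also sidesteps the parent-tag problem. The full script so far: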
import requests
import re
import time
import random
import xlwt
from bs4 import BeautifulSoup
from lxml import etree
def get_text(url, code):
    # Fetch a page and return its text, or '' on any request failure.
    try:
        r = requests.get(url, timeout=30)
        r.encoding = code
        r.raise_for_status()
        return r.text
    except requests.RequestException:
        return ''
def re_crawl(html, ls):
    start = time.time()
    # Raw strings avoid the invalid-escape warning for \d in the pattern.
    authors = re.findall(r'<h2>(.*?)</h2>', html, re.S)
    contents = re.findall(r'<span>(.*?)</span>', html, re.S)
    laughs = re.findall(r'<i class="number">(\d{1,5})</i>', html, re.S)
    for author, content, laugh in zip(authors, contents, laughs):
        ls.append([author.strip(), laugh,
                   content.strip().replace('<br/>', '').replace('\xa0', '')])
    end = time.time()
    print("re-time:{}s".format(end - start))
def BS_crawl(html, ls):
    start = time.time()
    soup = BeautifulSoup(html, 'html.parser')
    authors = soup.find_all('h2')
    contents = soup.find_all('div', class_='content')
    laughs = soup.find_all('i', class_='number')
    for author, content, laugh in zip(authors, contents, laughs):
        author = author.get_text().strip()
        content = content.get_text().strip().replace('\xa0', '')
        laugh = laugh.get_text()
        ls.append([author, laugh, content])
    end = time.time()
    print("bs-time:{}s".format(end - start))
def lxml_crawl(html, ls):
    start = time.time()
    ehtml = etree.HTML(html)
    # NOTE: this path only finds authors whose h2 sits under an <a>;
    # span-based authors are missed (see the union sketch above).
    authors = ehtml.xpath('//div[@class]/div[1]/a[2]/h2/text()')
    contents = ehtml.xpath('//div[@class]/a[1]/div[@class="content"]')
    laughs = ehtml.xpath('//div[@class]/div[2]/span[1]/i/text()')
    for author, content, laugh in zip(authors, contents, laughs):
        # string(.) flattens the content div, child tags included.
        content = content.xpath('string(.)')
        ls.append([author.strip(), laugh.strip(),
                   content.strip().replace(' ', '').replace('查看全文', '')])
    end = time.time()
    print("lxml-time:{}s".format(end - start))
def write_to_excel(ls, filename):
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('informations', cell_overwrite_ok=True)
    for i, row in enumerate(ls):
        for j in range(3):
            sheet.write(i, j, row[j])
    workbook.save(filename)
if __name__ == '__main__':
    depth = 1
    ls_re, ls_bs, ls_lxml = [], [], []
    url_page_list = ['https://www.qiushibaike.com/text/page/{}/'.format(i)
                     for i in range(1, depth + 1)]
    for url in url_page_list:
        time.sleep(random.random())  # polite random delay between requests
        html = get_text(url, code='utf-8')
        if not html:
            continue
        re_crawl(html, ls_re)
        BS_crawl(html, ls_bs)
        lxml_crawl(html, ls_lxml)
    # double_check(ls_bs, ls_re, ls_lxml)
    write_to_excel(ls_bs, "bs.xls")
    write_to_excel(ls_re, "re.xls")
    write_to_excel(ls_lxml, "lxml.xls")
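For the double_check call left commented out above, here is a minimal sketch of what it could look like. This is my assumption about its job, not the original helper: since each parser cleans the content column differently, only the row counts and the laugh numbers are compared:

def double_check(ls_bs, ls_re, ls_lxml):
    # Row counts should match if every parser saw the same posts.
    same_len = len(ls_bs) == len(ls_re) == len(ls_lxml)
    print("row counts equal:", same_len)
    if same_len:
        # The laugh count (column 1) is untouched by cleaning, so it
        # should be identical across all three parsers.
        laughs_agree = all(a[1] == b[1] == c[1]
                           for a, b, c in zip(ls_bs, ls_re, ls_lxml))
        print("laugh counts agree:", laughs_agree)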