import random
import time
from urllib.parse import urljoin

import requests
from docx import Document
from docx.shared import Inches
from lxml import etree

# The target site is fetched with verify=False (untrusted TLS chain), so
# silence urllib3's InsecureRequestWarning noise up front.
requests.packages.urllib3.disable_warnings()
# Index page listing every "daily sentence" article; all article links
# are collected from it before the per-article loop below.
url = "https://bj.wendu.com/zixun/yingyu/6697.html"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36'
}
# verify=False matches the disabled urllib3 warnings (site has an untrusted
# certificate); the long timeout guards against a hung connection.
r = requests.get(url, headers=headers, verify=False, timeout=120)
r.raise_for_status()  # fail fast rather than scraping links out of an error page
html = etree.HTML(r.text)
# Article links sit inside table rows on the index page — TODO confirm the
# layout still matches if the site is redesigned.
html_urls = html.xpath("//tr//a/@href")
num_count = len(html_urls)
print("总共发现: {}句".format(num_count))
# Output document. Document(path) requires the file to already exist
# (python-docx raises PackageNotFoundError otherwise) — TODO confirm the
# .docx is pre-created at this path.
file_name = 'E:\\新建文件夹\\docs\\' + '何凯文每日一句.docx'
document = Document(file_name)
# Set the base font once, not on every loop iteration.
document.styles['Normal'].font.name = u'Times New Roman'

for link in html_urls:
    # hrefs scraped from the index may be relative; resolve them against the
    # index URL (urljoin is a no-op for already-absolute links).
    link = urljoin(url, link)
    r = requests.get(link, headers=headers, verify=False, timeout=120)
    # Parse the raw bytes with an explicit utf-8 parser to sidestep
    # requests' charset guessing on r.text.
    result_html = etree.HTML(r.content, parser=etree.HTMLParser(encoding='utf8'))
    html_data = result_html.xpath('//div[@class="article-body"]/p//text()')
    if len(html_data) < 2:
        # Missing/changed article body — skip instead of crashing on [1].
        print("跳过(内容缺失): " + link)
        continue
    head = html_data[1]  # [1] is presumed to be the title line — TODO confirm
    print("正在处理===>" + head + " " + link + " 请稍等...")
    # Slice boundaries assume a fixed article layout: [2:4] the sentence,
    # [4:10] the options, [10:-4] the analysis — TODO verify on live pages.
    juzi = '\n'.join(html_data[2:4])
    xuanxiang = '\n'.join(html_data[4:10])
    fengxi = '\n'.join(html_data[10:-4])
    content = "\n\n\n".join((juzi, xuanxiang, fengxi))
    document.add_heading(head, level=1)
    document.add_paragraph(content)
    document.add_page_break()
    # Save after every article so progress survives a mid-run crash.
    document.save(file_name)
    # Random pause to be polite to the server / avoid rate limiting.
    time.sleep(random.randint(3, 10))