import os
import re
import docx
from docx.shared import RGBColor#设置字体
from docx import Document
from docx.shared import Pt#设置字体
from docx.oxml.ns import qn#设置中文字体
import urllib
import urllib.parse
import urllib.request
import sys
import time
from bs4 import BeautifulSoup
def getHtlm(url):
    """Download *url* and return its parsed HTML, or 0 for a "404" page.

    The page is fetched with ``urllib.request.urlopen`` and parsed with
    BeautifulSoup. A page is treated as a 404 page when the text of its
    first ``<title>`` element starts with ``'404'``.

    Args:
        url: Address of the page to download.

    Returns:
        The parsed ``BeautifulSoup`` document, or the integer ``0`` when the
        title marks the page as a 404 page. A page without a ``<title>``,
        or whose title has no single text string, is returned as parsed.

    Errors raised by ``urlopen`` are not caught here and propagate to the
    caller.
    """
    # Use the response as a context manager so the connection is closed as
    # soon as the document has been read and parsed.
    with urllib.request.urlopen(url) as page:
        # Name the parser explicitly so the parse result does not depend on
        # which optional parsers happen to be installed.
        soup = BeautifulSoup(page, 'html.parser')

    title = soup.find('title')
    # ``.string`` is None when the tag is empty or has several children.
    title_text = title.string if title is not None else None
    if title_text is not None and title_text[0:3] == '404':
        # Callers compare the result against 0 to detect a missing page.
        return 0
    return soup
def getHtlmcode(url1):
htlm=getHtlm(url1)
if htlm!=0:
#soup=BeautifulSoup(htlm)
e=htlm.select('div[class=titArea]')
ee=e[0]
eee=ee.select('h2')
eeee=eee[0]
#print(eeee.string,end='/n')
a=htlm.select('div[class=detArea]')
aa=a[0]
aaa=aa.select('dd')
bb=aaa[1].string+':'#.sring
cc=aaa[3].string #.sring
gen=eeee.string+':'
#print(bb.string,end='
爬取目标网页内容并保存在word
最新推荐文章于 2024-06-03 09:25:06 发布