import urllib.request, urllib.error
import gzip
def ungzip(data):
    """Return *data* gzip-decompressed, or unchanged if it is not valid gzip.

    This is a best-effort helper: a payload that cannot be decompressed is
    handed back as-is so the caller can still try to use it.

    Args:
        data: Raw bytes, possibly gzip-compressed.

    Returns:
        The decompressed bytes on success, otherwise the original *data*.
    """
    # Imported locally so this function does not rely on a file-level import.
    import zlib

    try:
        return gzip.decompress(data)
    except (OSError, EOFError, zlib.error):
        # OSError covers gzip.BadGzipFile (not a gzip stream), EOFError a
        # truncated stream, zlib.error a corrupt deflate payload. Anything
        # else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return data
def obtainHtml(url):
    """Fetch *url* with browser-like request headers and return the body text.

    The body is passed through ``ungzip`` and decoded as UTF-8.

    Args:
        url: The address to request.

    Returns:
        The decoded response body, or ``""`` when the request fails with a
        ``urllib.error.URLError`` (whose code/reason are printed).

    Raises:
        UnicodeDecodeError: If the (decompressed) body is not valid UTF-8.
    """
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
        'Cookie': ' ',
        'Accept': 'image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
        # Only advertise gzip: ungzip() is the sole decoder applied to the
        # body, so a deflate- or brotli-encoded reply could not be read.
        'Accept-Encoding': 'gzip',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(request) as response:
            body = response.read()
        html = ungzip(body).decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError carries a status code; plain URLError only a reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
from bs4 import BeautifulSoup
import re
# Captures the value of the src attribute of each <balanceWeightImg ...> tag.
# DOTALL lets the lazy wildcards run across line breaks inside a tag.
# NOTE(review): "balanceWeightImg" is not a standard HTML tag name; confirm
# this is the intended tag and not the result of a stray rename of "img".
findImg = re.compile(
    r'<balanceWeightImg .*?src="(.*?)".*?>',
    flags=re.DOTALL,
)
def getData(html):
    """Extract image sources from the ``<div id="wrapper">`` elements of *html*.

    For each matching div, the first ``findImg`` match is taken. If that
    value contains a ``//...'`` segment, only the text between ``//`` and the
    following ``'`` is kept; otherwise the value is kept unchanged.

    Args:
        html: Page markup to parse.

    Returns:
        A list with one string per wrapper div that contained at least one
        image match. Earlier versions computed these values and discarded
        them (implicitly returning ``None``).
    """
    soup = BeautifulSoup(html, "html.parser")
    items = soup.find_all("div", {'id': 'wrapper'})
    print(items)

    strChange = re.compile(r".*?//(.*?)'.*?")
    results = []
    for item in items:
        matches = findImg.findall(str(item))
        if not matches:
            continue
        src = str(matches[0])
        # The pattern needs a literal "'" after "//", which a bare attribute
        # value usually lacks; indexing the findall() result used to raise
        # IndexError in that case, so fall back to the raw src instead.
        stripped = strChange.search(src)
        results.append(stripped.group(1) if stripped else src)
    return results
if __name__ == "__main__":
    # Script entry point: download one Baidu image-search results page and
    # hand its markup to getData(). The "word" query value is percent-encoded
    # (the URL itself declares ie=gb18030).
    url = "https://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gb18030&word=%C6%BD%BA%E2%C5%E4%D6%D8%BF%E9%B1%EA%CA%B6&fr=ala&ala=1&alatpl=normal&pos=0&dyTabStr=MCwzLDUsMiwxLDYsNCw3LDgsOQ%3D%3D"
    getData(obtainHtml(url))