import requests
import re
import os
def getHTMLtext(url):
    """Fetch *url* and return the response body as text.

    The request is sent with a browser-like ``user-agent`` header and a
    30 second timeout.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or ``""`` if the request fails (connection
        error, timeout, invalid URL or a non-success HTTP status).
    """
    headers = {'user-agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, timeout=30, headers=headers)
        r.raise_for_status()
        # Decode with the encoding requests reports as "apparent" for this
        # response instead of the one it picked by default.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request failures are treated as "page unavailable"; a bare
        # except would also hide Ctrl-C and genuine programming errors.
        print("cannot scrapy the url")
        return ""
def HTMLparse(link, html):
    """Parse the page text and collect the ``.jpg`` thumbnail links.

    Every ``"thumbUrl":"http://..."`` entry in *html* is examined; those
    whose URL ends in ``jpg`` are appended to *link* as full ``http://``
    URLs, in the order they appear in the page.

    Args:
        link: List that receives the URLs (mutated in place).
        html: Page source text to scan.

    Returns:
        None. If *html* is not text, ``"error"`` is printed and *link* is
        left unchanged.
    """
    try:
        # The group captures everything after "http://" up to the closing
        # quote, so it can never contain a quote character itself.
        remainders = re.findall(r'"thumbUrl":"http://(.*?)"', html)
    except TypeError:
        print("error")
        return

    # The captured text is used verbatim: stripping ':' from it would
    # corrupt URLs that carry a port (host:8080/...).  endswith() is also
    # safe for captures shorter than three characters, so one odd entry no
    # longer aborts the processing of the remaining matches.
    for remainder in remainders:
        if remainder.endswith("jpg"):
            link.append("http://" + remainder)
def main():
    """Ask for a keyword, search Sogou images and download the ``.jpg`` hits.

    The keyword is read from standard input, the Sogou picture search page
    for it is fetched with ``getHTMLtext`` and the thumbnail links are
    extracted with ``HTMLparse``.  Each link is then downloaded into the
    hard-coded ``root`` directory under the last path component of its URL;
    files that already exist are skipped.  Progress and the number of newly
    saved files are printed.
    """
    # Imported locally because only this function needs it.
    from urllib.parse import quote

    source = input("请输入要查找的图片:")
    link = []
    try:
        # Percent-encode the keyword so characters such as '&', '#' or
        # spaces cannot truncate or alter the query string.
        url = (
            "http://pic.sogou.com/pics?pid=sogou-site-3b24156ad560a696&query="
            + quote(source, safe="")
        )
        print(url)
        html = getHTMLtext(url)
        print(html)
        HTMLparse(link, html)
    except Exception:
        # Best-effort boundary: a failed search simply yields no links.
        print("error2")

    root = "d://ai//sogou//打架//"
    headers = {'user-agent': 'Mozilla/5.0'}
    count = 0
    for url in link:
        filename = url.split('/')[-1]
        path = root + filename
        try:
            # makedirs also creates missing parent directories, which a
            # plain mkdir on this nested path would fail on.
            os.makedirs(root, exist_ok=True)
            if os.path.exists(path):
                print(filename + "has already existed")
                continue
            r = requests.get(url, headers=headers, timeout=30)
            # Do not store an HTTP error page under an image file name.
            r.raise_for_status()
            with open(path, 'wb') as file:
                file.write(r.content)
            print("successful safed:" + filename)
            count = count + 1
        except (requests.RequestException, OSError):
            # One failed download or write must not stop the remaining ones.
            print("cannot safed:" + filename)
    print("total count = ", count)
# Run the scraper only when this file is executed as a script, so that
# importing the module does not prompt for input or touch the network.
if __name__ == "__main__":
    main()
# python根据关键字爬取搜狗图片
# 最新推荐文章于 2021-03-12 09:34:55 发布