import requests
import re
import os
def getHTMLtext(url):
    """Fetch ``url`` and return the response body as text.

    The request is sent with a browser-like ``user-agent`` header and a
    30-second timeout. The body is decoded with the encoding that requests
    detects from the content (``apparent_encoding``) instead of the one
    declared by the response headers.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response body, or ``""`` when the request fails or the
        server answers with an HTTP error status.
    """
    headers = {'user-agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, timeout=30, headers=headers)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: report the failure and hand back an empty
        # document so the caller can carry on. Only request-related errors
        # are handled here; Ctrl-C and programming errors still propagate.
        print("cannot scrapy the url")
        return ""
# Parse the HTML text and pick out the image links.
def HTMLparse(link, html):
    """Collect the JPG thumbnail URLs found in ``html`` into ``link``.

    Every ``"thumbUrl":"http://..."`` entry in the text is examined; those
    whose address ends in ``jpg`` are appended to ``link`` with the
    ``http://`` prefix put back in front.

    Args:
        link: List that receives the matching URLs; it is modified in place.
        html: Page text to search.

    Returns:
        None. Results are delivered through ``link``.
    """
    try:
        tails = re.findall(r'"thumbUrl":"http://(.*?)"', html)
    except TypeError:
        # ``html`` was not text; stay best-effort and just report it.
        print("error")
        return
    # The captured text is used as-is: it cannot contain a double quote, and
    # stripping ":" from it would break addresses that carry a port number.
    # ``endswith`` is safe on captures shorter than three characters, so one
    # odd entry no longer aborts the processing of the remaining ones.
    link.extend("http://" + tail for tail in tails if tail.endswith("jpg"))
def main():
    """Prompt for a search term and gather Sogou image links for it.

    The term read from standard input is percent-encoded and appended to the
    Sogou picture-search URL; the page is downloaded with ``getHTMLtext`` and
    its thumbnail links are collected into a local list by ``HTMLparse``.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import quote

    source = input("请输入要查找的图片:")
    link = []
    try:
        # Escape the user's text so characters such as "&", "#" or spaces
        # cannot alter the structure of the query string.
        url = (
            "http://pic.sogou.com/pics?pid=sogou-site-3b24156ad560a696&query="
            + quote(source)
        )
        html = getHTMLtext(url)
        HTMLparse(link, html)
    except Exception:
        print("error2")
    # NOTE(review): ``root`` and ``headers`` are not used in the visible part
    # of this function; presumably a download step that consumed them is
    # missing from the file -- confirm before removing them.
    root = "E://爬虫//requests项目//source//"
    headers = {'user-agent':'Mozilla/5.0'}
# (Beginner) Batch-scraping Sogou images with requests
# Latest recommended article published on 2025-07-20 23:16:52