import requests,re,time,math
import os
from io import BytesIO
from PIL import Image
# Accumulators for the step-1 listing HTML (detail-page links).
# NOTE(review): these three lists are referenced only by the disabled
# local-file variant kept as a string literal further down; the live
# code path never touches them.
all_bigimg_html = []
real_url_html = []
# After entering a detail page: collected big-image addresses.
new_big_img = []
# 得到页面url
def get_pageurl(root, start, counts):
    """Build the list of listing-page URLs to crawl.

    Args:
        root: URL template containing a '%d' placeholder for the page
              number, e.g. 'http://host/dir/index_%d.html'.
        start: first page number to fetch; values <= 1 are clamped to 1.
        counts: total number of pages to collect.

    Returns:
        A list of ``counts`` fully-formed page URLs.  On this site page 1
        has no '_%d' suffix, so it is special-cased to 'index.html'.
    """
    if start <= 1:
        # Clamp so the range arithmetic below stays valid.
        start = 1
        # Page 1 is plain 'index.html': rewrite the last path segment.
        fragment_root = root.split('/')
        fragment_root[-1] = 'index.html'
        all_page = ['/'.join(fragment_root)]
        # The remaining pages follow the numbered template.
        all_page.extend(root.replace('%d', str(i))
                        for i in range(start + 1, start + counts))
    else:
        all_page = [root.replace('%d', str(i))
                    for i in range(start, start + counts)]
    return all_page
# 通过得到页面 得到大图的链接
def get_html(all_page):
    """Fetch each listing page and extract the relative detail-page links.

    Args:
        all_page: list of listing-page URLs (from ``get_pageurl``).

    Returns:
        List of absolute detail-page URLs (``linkpre`` + '/tupian/<id>.html').

    Relies on the module-level globals ``headers`` and ``linkpre``.
    """
    part_url = []
    # Compile once, outside the loop.  The dot is escaped so only a literal
    # '.html' suffix matches (the original pattern's bare '.' matched any char).
    gethtml = re.compile(r"^/tupian/\d*\.html$")
    for index, pageurl in enumerate(all_page, start=1):
        # Be polite: pause one second between requests.
        time.sleep(1)
        # Progress indicator.
        print('小图html完成进度' + str(math.floor(index / len(all_page) * 100)) + '%')
        try:
            res = requests.get(pageurl, headers=headers, timeout=5)
            res.encoding = 'gbk'
            # Raise on HTTP error status.
            res.raise_for_status()
        except Exception:
            # Skip this page instead of dereferencing an undefined/stale
            # response (the original fell through and crashed or reused it).
            print('http error')
            continue
        # Split the HTML on '"' so attribute values become list items,
        # then keep the ones that look like detail-page paths.
        for fragment in re.split("\"", res.text):
            found = re.findall(gethtml, fragment)
            if found:
                part_url.append(found[0])
    # Prefix every relative path with the site root (built once, after the loop).
    return [linkpre + i for i in part_url]
# NOTE(review): the following is dead code kept as a bare module-level string
# literal — it is parsed but never executed.  It is a local-file read/write
# variant of get_html the author kept "for reference"; consider deleting it.
'''
本地文件读写 根据需要取舍
# 获得的文件写入到本地文件夹中 a+追加写入的方式一次性写完
with open("C:/Users/30818/Desktop/pagefile.txt",'a+',encoding='ISO-8859-1') as pagefile:
pagefile.write(r.text)
# 打开文件做正则匹配并在屏幕输出
with open("C:/Users/30818/Desktop/pagefile.txt",'r',encoding="gbk") as readfile:
# 文件内容读取到内存中
str = readfile.read();
# " 号分隔文件
mid = re.split("\"", str);
#正则匹配
for i in mid:
readhtml = re.findall("^/tupian/\d*.html$", i);
# 匹配到了就放入数组中
if len(readhtml) != 0:
# readhtml是一个匹配到的数组 匹配到几次就会有几次 所以将第1个放入
all_bigimg_html.append(readhtml[0]);
real_url_html = [ linkpre + all_bigimg_html[i] for i in range(len(all_bigimg_html)) ]
# 清空文件
with open("C:/Users/30818/Desktop/realurl.txt",'w',encoding='ISO-8859-1') as realurl:
realurl.write('');
# 将真实地址存放在文件夹中
with open("C:/Users/30818/Desktop/realurl.txt",'a+',encoding='ISO-8859-1') as realulr:
for i in real_url_html:
realulr.write(i);
realulr.write('\r');
'''
#得到大图的真实链接地址
def big_imgget(real_url):
    """Fetch each detail page and extract the full-size image path.

    Args:
        real_url: list of detail-page URLs (from ``get_html``).

    Returns:
        List of absolute image URLs (``linkpre`` + '/uploads/allimg/…jpg').

    Relies on the module-level globals ``headers`` and ``linkpre``.
    """
    part_big_url = []
    # Compile once outside the loop; non-greedy '.*?' stops at the first 'jpg'.
    re_downurl = re.compile(r'/uploads/allimg/.*?jpg')
    for index, html in enumerate(real_url, start=1):
        # Progress indicator.
        print('大图html当前进度' + str(math.floor(index / len(real_url) * 100)) + '%')
        # Throttle between requests.
        time.sleep(0.4)
        try:
            r = requests.get(html, headers=headers, timeout=5)
            r.encoding = 'gbk'
            r.raise_for_status()
        except Exception:
            # Skip on failure instead of parsing an undefined/stale response
            # (the original fell through and crashed or reused it).
            print('http error')
            continue
        # search() returns the first match or None.
        first = re.search(re_downurl, r.text)
        if first is not None:
            part_big_url.append(first[0])
    # Build absolute download URLs once, after the loop.
    return [linkpre + i for i in part_big_url]
def downloadIMG(url, path):
    '''
    Download a single image and save it to disk.

    url  -- direct image URL to fetch
    path -- destination file path for the saved image

    On any failure (network or decode) prints "save error" and returns.
    Relies on the module-level global ``headers``.
    '''
    try:
        response = requests.get(url, headers=headers, timeout=5)
        response.raise_for_status()
        # Decode the payload with Pillow, then re-encode it to `path`.
        picture = Image.open(BytesIO(response.content))
        picture.save(path)
        print('save successful')
    except Exception:
        print("save error")
#当前窗口是主函数窗口
if __name__ == "__main__":
    ############################ configuration
    # Request headers, copied from chrome://version.
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'}
    # Listing-page URL template; '%d' is replaced with the page number.
    root = 'http://pic.netbian.com/4kdongman/index_%d.html'
    # Prefix for relative detail-page and image links.
    linkpre = 'http://pic.netbian.com'
    # Directory where downloaded images are saved.
    path = 'C:\\Users\\30818\\Desktop\\img\\'
    ############################ crawl
    # Step 1: listing-page URLs (start page, page count).
    all_page = get_pageurl(root, 2, 5)
    # Step 2: detail-page URLs scraped from the listings.
    two_step = get_html(all_page)
    # Step 3: direct image download URLs.
    three_step = big_imgget(two_step)
    # Step 4: download every image.
    for index, img_url in enumerate(three_step, start=1):
        # Progress indicator.
        print('图片下载进度' + str(math.floor(index / len(three_step) * 100)) + '%')
        # Build a unique filename from the last two URL segments,
        # e.g. '.../200922/230835-xyz.jpg' -> '200922-230835-xyz.jpg'.
        realpath = path + '-'.join(img_url.split('/')[-2:])
        downloadIMG(img_url, realpath)