A few blog posts written on a whim:
"While we teach, we learn."
Questions are welcome!
Just change the file directory and it can be run directly ---- 2021.12.09
'''
2021-12-09
1. Framework structure
.0 Request headers use a user-agent pool (no IP proxy is used)
.1 Scrape the home page for the top-level image category names and write them to 5.天堂图片网url爬取(目录).txt
.2 For each top-level category name, fetch its sub-category names and URLs and append them to 6.天堂图片网url爬取(分目录).txt
.3
'''
import time
from urllib import request
import re
import random
uapools = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763',
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
]
path = 'C:\\Users\\Administrator\\Desktop\\Python基础与应用\\爬虫\\爬到的东西\\'
def ua(uapool):  # user-agent pool: pick one UA at random and install it globally
    ua_string = random.choice(uapool)
    head = ('User-Agent', ua_string)
    opener = request.build_opener()
    opener.addheaders = [head]
    request.install_opener(opener)
    print('Global user agent installed; current User-Agent:', head)
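
# Illustrative usage note (not part of the original snippet): once ua(uapools) has been
# called, request.install_opener() makes the chosen User-Agent apply to every later
# request.urlopen() call in this script, so the scraping functions below send it
# automatically without setting headers themselves. Typical use: call ua(uapools) once
# at the start of the run.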
# Fetch the top-level ivsky category names and URLs and write them to a txt file
def ivskyoneurl():
    url = 'https://www.ivsky.com/tupian/'
    pet = '"><a href="/(.*?)" title=".*?">(.*?)</a></li><li '
    data1 = request.urlopen(url).read().decode('utf-8', 'ignore')
    imagurl = re.compile(pet).findall(data1)  # list of (relative url, category name) tuples
    print('Number of category names:', len(imagurl))
    with open(path + '5.天堂图片网url爬取(目录).txt', 'w+', encoding='utf-8') as f:
        for page in range(len(imagurl)):
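            # The snippet is cut off at this point. A minimal completion sketch, assuming
            # (per the header notes) that each top-level category name plus its full URL is
            # written as one line of 5.天堂图片网url爬取(目录).txt; the exact output format in
            # the original code is unknown.
            f.write(imagurl[page][1] + ',https://www.ivsky.com/' + imagurl[page][0] + '\n')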