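# Zhihu mind-map semi-crawler: parses locally saved HTML pages and downloads
# their images (with URL de-duplication and illegal filename character filtering).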
import requests
import re
import os
import time
# HTTP request headers sent with every image download; the User-Agent
# presents the script as a desktop Chrome browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
}
def filename_filter(title):
    r"""Return *title* with the characters Windows forbids in file names removed.

    The removed characters are ``\ / : * ? " < > |``. Every occurrence is
    deleted (not replaced), so the result may be shorter than the input or
    even empty.

    Args:
        title: The raw title text to be used as a file or directory name.

    Returns:
        The title with all forbidden characters stripped out.
    """
    # Raw string with an escaped backslash so that "\" itself is part of the
    # character class; the previous pattern only escaped "/" and therefore
    # let backslashes through.
    return re.sub(r'[\\/:*?"<>|]', '', title)
def duplicate_removal(old_arr):
    """Return a new list with duplicates removed, keeping first-seen order.

    The counts before and after de-duplication and the resulting list are
    printed as progress output. The input list is not modified.

    Args:
        old_arr: A list of hashable items (here: image URLs).

    Returns:
        A new list containing each distinct item of *old_arr* once, in the
        order of its first occurrence.
    """
    print('去重前的个数为:', len(old_arr))
    # dict keys are unique and keep insertion order, so this de-duplicates in
    # a single O(n) pass instead of sorting a set by list.index (O(n^2)).
    new_arr = list(dict.fromkeys(old_arr))
    print('去重后的个数为:', len(new_arr))
    print('去重后的列表为', new_arr)
    return new_arr
def download(url, title, timeout=30):
    """Fetch *url* and write the response body to the file at path *title*.

    Progress is printed before the request and a separator line afterwards.
    A failed request (connection error, timeout, or HTTP error status) is
    reported and skipped, so no file is written for it and the caller's
    batch can continue.

    Args:
        url: Address of the image to fetch.
        title: Destination file path; only the part after the last ``/`` is
            shown in the progress message.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True if the file was written, False if the request failed.
    """
    print('正在下载:{}\n图片网址为:{}'.format(title.split('/')[-1], url))
    try:
        # Without a timeout requests may block forever on a stalled server.
        response = requests.get(url, headers=headers, timeout=timeout)
        # Reject 4xx/5xx answers so an error page is never saved as an image.
        response.raise_for_status()
    except requests.RequestException as err:
        print('下载失败:{}'.format(err))
        print('-' * 65)
        return False
    with open(title, 'wb') as f:
        f.write(response.content)
    print('-' * 65)
    return True
def _process_album(num):
    """Download all images referenced by the saved page for album *num*.

    Reads ``html代码A<num>.txt`` from the current working directory, extracts
    the album title and the image URLs, creates a ``<NN>.<title>`` directory
    next to it and saves the images there as ``1.jpg``, ``2.jpg``, ...

    Args:
        num: Album number; selects the input file and prefixes the directory.
    """
    with open('html代码A{}.txt'.format(num), 'r', encoding='utf-8') as file:
        txt = file.read()

    # The album title is the text between <blockquote> and the first <br>.
    # NOTE(review): the original script kept a commented-out alternative,
    # r'<li>(.*?)</li>', presumably for pages with a different layout.
    match = re.search(r'<blockquote>(.*?)<br>', txt)
    if match is None:
        # Previously this surfaced as a bare IndexError; skip the page instead.
        print('未找到专辑标题,跳过:html代码A{}.txt'.format(num))
        return
    title = match.group(1)
    print(title)
    title = filename_filter(title)
    print('本专辑的名字为:', str(num) + title)
    print('=' * 70)

    # Joined with '/' on purpose: download() splits on '/' to display the name.
    path = os.getcwd() + '/' + str(num).rjust(2, '0') + '.' + title
    os.makedirs(path, exist_ok=True)

    # Full-size image URLs live in data-original attributes ending in "_r.jpg".
    # NOTE(review): a commented-out variant matched data-actualsrc together
    # with <figcaption> text to name the files; it is not used here.
    pat_jpg = r'data-original="(https.*?_r\.jpg)"'
    new_jpg_htmls = duplicate_removal(re.findall(pat_jpg, txt))

    # Files are numbered from 1 in page order and keep the URL's extension.
    for index, url_jpg in enumerate(new_jpg_htmls, start=1):
        jpg_tail = url_jpg.split('.')[-1]
        jpg_name = path + '/' + str(index) + '.' + jpg_tail
        download(url_jpg, jpg_name)


if __name__ == '__main__':
    # Saved pages are numbered 9 through 16 (html代码A9.txt ... html代码A16.txt).
    for num in range(9, 17):
        _process_album(num)
    print('下载结束')