import json
import random
import re
import time
import traceback
from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor
from lxml import etree
from utils import r_sismember, r_sadd, get_image_content, up_data_img_file, print_write, r_spop, get_image_or_v_content, \
get_file_type, get_requests, send_message, up_prompt_to_redis, r_scard, decode_headers, r_sadd_pip, r_sdiffstore, \
get_oss_media_path_prefix, r_delete, get_correct_proxies, r_sdiff
class Canva:
    """Crawler for Canva template pages.

    Template URLs are popped from the Redis key ``redis_prompt_key`` (through
    the ``utils`` helpers), each page is parsed for metadata and preview
    images, the images are uploaded with ``up_data_img_file`` and related
    template URLs are pushed back onto ``redis_prompt_key``.

    Memory note: every worker started by ``run_url_start`` holds one page
    (HTML, parsed tree, bootstrap JSON) plus that page's image bytes while it
    works, so peak memory grows with ``pool_author_data_loop`` multiplied by
    the number of crawler processes.  ``get_url_info`` therefore releases the
    page data before it downloads images, and the image bytes right after the
    upload.  Lowering ``pool_author_data_loop`` or the process count is the
    remaining lever.
    """

    # Number of concurrent ``run_url_info`` loops per process.
    pool_author_data_loop = 50
    pool_author_data = ThreadPoolExecutor(max_workers=pool_author_data_loop)
    pool_like = ThreadPoolExecutor(max_workers=10)
    # Used by get_category_url for the category listing requests.
    pool = ThreadPoolExecutor(max_workers=30)
    pool_dingshi = ThreadPoolExecutor(max_workers=1)

    # Fall back to a local proxy when the helper returns an empty mapping.
    proxies = get_correct_proxies()
    if proxies == {}:
        proxies = {
            'http': "http://127.0.0.1:7890",
            'https': "http://127.0.0.1:7890",
        }

    # NOTE(review): the header sets below embed hard-coded session tokens;
    # consider loading them from configuration instead of source control.
    headers1 = {
        'authority': 'www.canva.com',
        'sec-ch-ua-platform-version': '"14.6.1"',
        'x-canva-app': 'marketplace',
        'x-canva-locale': 'en',
        'x-csrf-token': 'C3wwsvdezvY8yQQY87fpS4QdRzuf2X2q7qA0sJrcqwfoPNLMWTxaiWtcH8wz_7gtquPE3hSmnRs0V-Ipvld4jgVAETqnIR_636Q9emUAas5CwerY',
        'User-Agent': 'Apifox/1.0.0 (https://apifox.com)',
        'content-type': 'application/json;charset=UTF-8',
        'Accept': '*/*',
        'Host': 'www.canva.com',
        'Connection': 'keep-alive'
    }
    headers = {
        "Host": "www.canva.com",
        "sec-ch-ua-platform": "\"macOS\"",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
        "sec-ch-ua": "\"Chromium\";v=\"136\", \"Google Chrome\";v=\"136\", \"Not.A/Brand\";v=\"99\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform-version": "\"15.3.1\"",
        "accept": "*/*",
        "sec-fetch-site": "same-origin",
        "sec-fetch-mode": "cors",
        "sec-fetch-dest": "empty",
        "referer": "https://www.canva.com/es_es/plantillas/EAFHcWBiJLc-logo-marca-emprendimiento-negocio-floristeria-acuarela-flores-rosa/",
        "accept-language": "zh-CN,zh;q=0.9",
        "priority": "u=1, i"
    }
    image_headers = {
        'User-Agent': 'Apifox/1.0.0 (https://apifox.com)',
        'Accept': '*/*',
        'Host': 'marketplace.canva.com',
        'Connection': 'keep-alive',
    }

    auth_token = ''
    source_net = 'canva'
    source = 'canva_templates'
    # Redis key names, all derived from ``source`` / ``source_net``.
    redis_key = source + '_user_id'
    redis_key_running = source + '_user_id_running'
    redis_prompt_key = source + '_prompt'
    redis_prompt_key_beifen = redis_prompt_key + '_beifen'
    redis_author_image_data = source + '_author_image_data'
    redis_author_key = source + '_author'
    redis_cookie = source_net + '_redis_cookie'
    start_time = 0

    # Compiled once; the patterns are the same ones the inline calls used.
    _BOOTSTRAP_RE = re.compile(r"window\['bootstrap'\] = JSON.parse\('(.*?)'\);")
    _IMAGE_ID_RE = re.compile(r'/(canva.*?)\.')

    def __init__(self):
        """Re-queue URLs left in the "running" key, then delete that key.

        Both steps are best effort: a failure is reported and start-up
        continues.
        """
        # Move entries that are in "running" but not in the prompt queue back
        # onto the prompt queue.
        try:
            no_over = r_sdiff(self.redis_key_running, self.redis_prompt_key)
            no_over_list = [data.decode('utf-8') for data in no_over]
            r_sadd_pip(self.redis_prompt_key, no_over_list)
        except Exception:
            print('转移正在跑的队列失败')
        try:
            r_delete(self.redis_key_running)
        except Exception:
            print('删除正在跑的队列失败')

    def _collect_main_image_urls(self, tree, bootstrap):
        """Return the preview image URLs of a template page as plain ``str``.

        The ``<img>`` tags are tried first, then the bootstrap JSON.  XPath
        string results are converted with ``str()`` because lxml's "smart
        strings" keep a reference to the parsed tree, which would otherwise
        stay alive for as long as the URL lists do.

        Raises:
            KeyError / IndexError: when the bootstrap fallback has no usable
                entry either.
        """
        found = tree.xpath('//img[@class="yHccVw"]/@src')
        if not found:
            found = tree.xpath('//img[@class="hOlLgQ"]/@src')
        if found:
            return [str(src) for src in found]

        collected = []
        try:
            for entry in bootstrap['page']['A']['A']['H1']['D']:
                if 'V' in entry:
                    collected.append(entry['V'][0]['B'])
                elif 'X' in entry:
                    collected.append(entry['X'][0]['B'])
                # Entries with neither key carry no URL and are skipped
                # (previously the preceding URL was appended a second time).
        except Exception:
            need_fallback = True
        else:
            need_fallback = not collected
        if need_fallback:
            last = bootstrap['page']['A']['A']['Da']['B'][-1]
            if 'P' in last:
                collected.append(last['P'][-1]['A'][0]['B'])
            elif 'A' in last:
                collected.append(last['A'][0]['B'])
            else:
                raise KeyError('no preview image entry in bootstrap data')
        return collected

    def get_url_info(self, url):
        """Scrape one template page, upload its images and queue related URLs.

        Args:
            url: template page URL; its second-to-last ``/`` segment is used
                as the template id.

        Returns:
            The metadata dict passed to ``up_data_img_file`` on success, or
            ``None`` when any step failed (the traceback is printed).
        """
        try:
            template_id = url.split('/')[-2]
            response = get_requests(url, proxies=self.proxies, headers=self.headers, impersonate='chrome133a',)
            # Decode the body once and drop the response object straight away.
            page_html = response.text
            del response

            match = self._BOOTSTRAP_RE.search(page_html)
            if match is None:
                raise ValueError('bootstrap data not found: %s' % url)
            bootstrap = json.loads(match.group(1).replace('\\', '\\\\').replace('\\\\"', '\\"'))
            tree = etree.HTML(page_html)
            del page_html, match

            # Related templates linked directly in the page; more are fetched
            # by get_like_url further down.
            like_image_urls = [
                'https://www.canva.com' + href
                for href in tree.xpath('//a[@class="Ej7lEg ngt4GQ pTsp_A _3bC2IQ"]/@href')
            ]
            r_sadd_pip(self.redis_prompt_key, like_image_urls)

            # Author link and id.
            author_name = tree.xpath('//a[@class="Ej7lEg _200NbQ _3bC2IQ"]/@href')[0].split('/')[-2]
            author_id = bootstrap['page']['A']['A']['Da']['C'][0]['C']
            if author_id == 'www.pexels.com':
                try:
                    author_id = bootstrap['page']['A']['A']['H2']['C']['A']
                except (KeyError, IndexError, TypeError):
                    # No alternative id available: keep the host name.
                    pass
            r_sadd(self.redis_author_key, author_id)

            image_urls = self._collect_main_image_urls(tree, bootstrap)

            # Metadata is read before the downloads so the parsed page can be
            # released first.
            try:
                logos = bootstrap['page']['A']['A']['H7']['B'][0]['A']['G']['D']['A']
            except Exception:
                print('没有logo',url)
                logos = ''
            try:
                fonts = bootstrap['page']['A']['A']['IJ']
            except Exception:
                print('没有fonts',url)
                fonts = ''
            try:
                size = bootstrap['page']['Bc']['B'][0]['N']
            except Exception:
                print('没有size',url)
                size = ''
            try:
                # str() detaches the title from the lxml tree (smart string).
                title = str(tree.xpath('//h1/text()')[0])
            except Exception:
                title = bootstrap['page']['A']['A']['H1']['A']
            start_data = {
                'title': title,
                'author_name': author_name,
                'author_id': author_id,
                'fonts': fonts,
                'size': size,
                'logos': logos,
                'url': url,
            }
            # The tree and the bootstrap JSON are the largest objects here;
            # free them before holding image bytes.
            del tree, bootstrap

            url_up, id_up, content_up, oss_up = [], [], [], []
            is_test = False
            for main_url in image_urls:
                response_image = get_requests(main_url, proxies=self.proxies, headers=self.image_headers, impersonate='chrome133a')
                start_oss_path = get_oss_media_path_prefix(is_test)
                if start_oss_path and not start_oss_path.endswith('/'):
                    start_oss_path += '/'
                content = response_image.content
                del response_image
                extension, mime = get_file_type(content)
                if mime == '':
                    continue
                # The id may point at an external site; keep only the last
                # path segment in that case.
                if 'www.pexels.com' in author_id:
                    path_id = author_id.split('/')[-1]
                else:
                    path_id = author_id
                # NOTE(review): the path is keyed by the template id, so all
                # images of one template share the same OSS path - confirm
                # this is intended.
                oss_path = start_oss_path + '%s/%s/%s.%s' % (self.source, path_id, template_id, extension)
                oss_up.append(oss_path)
                url_up.append(main_url)
                id_up.append(self._IMAGE_ID_RE.findall(main_url)[0])
                content_up.append(content)
                del content

            uploaded = up_data_img_file(url_up, self.source, id_up, content_up, start_data, is_test=is_test, group_id=template_id, oss_paths=oss_up)
            # Drop our reference to the image bytes before the follow-up
            # requests below.
            del content_up
            if not uploaded:
                raise Exception('上传失败')
            r_sadd(self.redis_key, url)
            print_write('图片上传成功,图片个数:', len(url_up), '\t', url)
            # Fetch additional related template links.
            self.get_like_url(template_id)
            print_write('获取成功:', '\t', url)
            return start_data
        except Exception:
            print(traceback.format_exc())
            print()

    def get_like_url(self, name):
        """Queue the related templates of template ``name``.

        Requests the template page in JSON form to obtain a continuation
        token, requests the masonry card listing with that token and adds the
        card URLs to ``redis_prompt_key``.  Failures are printed, not raised.
        """
        try:
            url_like_id = 'https://www.canva.com/templates/%s/?runtime=BROWSER'%name
            headers = {
                'pragma': 'no-cache',
                'priority': 'u=1, i',
                'sec-ch-ua-platform-version': '"14.6.1"',
                'x-canva-accept': 'application/json',
                'x-canva-active-user': 'eyJBIjoiVUFHb2gwVHZ4aUkiLCJCIjoiQkFHb2gtSDZoVVEifQ==',
                'x-canva-analytics': 'AAQAA1dFQgABABowMUpXOERRNUJGWkJFQ05QNDE0UUU5R1g2Mg==',
                'x-canva-authz': 'cnvan4O-a6MVL9UjNRPJ1er8MYmes3TiCe_n6AgD2vFEQB1BBTQlW8UmyTRWRLVzrhz-FrYKQh05FHE2Ry62G_sRS-yyDWvHS71hIwUJBy040mXEVafgEKj0kk1GrTIJ47pMkspzqsH1WZooG1EQ8nZT0WFhp0YSdfi6eYH2EqnpyMezrcaO0JjfC9kgVHYnymCOVDSld485d17mnXTStGoMCjsGIIO2If3quzShmwsfVoqmpQUEEa5a3c99c',
                'x-canva-brand': 'BAGoh-H6hUQ',
                'x-canva-build-name': '20250525-22',
                'x-canva-build-sha': 'b016092',
                'x-canva-user': 'UAGoh0TvxiI',
                'User-Agent': 'Apifox/1.0.0 (https://apifox.com)',
                'content-type': 'application/json;charset=UTF-8',
                'Accept': '*/*',
                'Host': 'www.canva.com',
                'Connection': 'keep-alive',
            }
            response_like_id = get_requests(url_like_id, proxies=self.proxies, headers=headers, impersonate='chrome133a')
            # The body is split at the first '//' and the remainder parsed as
            # JSON; its 'J' member is itself a JSON document.  Parse each
            # layer once.
            outer = json.loads(response_like_id.text.split('//', 1)[1])
            del response_like_id
            inner = json.loads(outer['J'])
            del outer
            # Token path: /A/A/H0/B/0/A/T
            like_id = inner['A']['A']['H0']['B'][0]['A']['T']
            del inner

            like_url = 'https://www.canva.com/_ajax/marketplace2/card/masonry?continuationToken=%s&limit=50&hasPrimaryAction=false'%like_id
            response_like = get_requests(like_url, proxies=self.proxies, headers=self.headers,
                                         impersonate='chrome133a')
            like_result_json = json.loads(response_like.text.split('//', 1)[1])
            like_image_urls = [
                'https://www.canva.com' + card['N']['K']['A']
                for card in like_result_json['cards']
            ]
            r_sadd_pip(self.redis_prompt_key, like_image_urls)
        except Exception:
            print(traceback.format_exc())

    def get_category(self):
        """Return the fixed list of Canva category landing pages to crawl."""
        return [
            'https://www.canva.com/visual-suite/', 'https://www.canva.com/docs/',
            'https://www.canva.com/presentations/', 'https://www.canva.com/online-whiteboard/',
            'https://www.canva.com/pdf-editor/', 'https://www.canva.com/graphs/',
            'https://www.canva.com/sheets/', 'https://www.canva.com/ai-assistant/',
            'https://www.canva.com/video-editor/', 'https://www.canva.com/video-editor/youtube/',
            'https://www.canva.com/photo-editor/', 'https://www.canva.com/photo-collages/',
            'https://www.canva.com/features/background-remover/', 'https://www.canva.com/business-cards/',
            'https://www.canva.com/cards/', 'https://www.canva.com/invitations/',
            'https://www.canva.com/custom-mugs/', 'https://www.canva.com/t-shirts/',
            'https://www.canva.com/hoodies-sweatshirts/', 'https://www.canva.com/calendars/',
            'https://www.canva.com/labels/', 'https://www.canva.com/logos/',
            'https://www.canva.com/posters/', 'https://www.canva.com/flyers/',
            'https://www.canva.com/brochures/', 'https://www.canva.com/social-media/',
            'https://www.canva.com/website-builder/', 'https://www.canva.com/stickers/',
            'https://www.canva.com/mockups/', 'https://www.canva.com/qr-code-generator/',
        ]

    def get_page(self):
        """Placeholder; does nothing."""
        pass

    def run_category(self, page_url, i):
        """Queue the template links of one result page of a category.

        Args:
            page_url: category landing page; its first path segment is used
                as the search query.
            i: page index; the request uses ``continuation = 50 * i``.

        Failures are printed, not raised.
        """
        try:
            category = re.findall('https://www.canva.com/(.*?)/', page_url)[0]
            _url = 'https://www.canva.com/templates/?query=%s&continuation=%s' % (category, 50 * i)
            response = get_requests(_url, proxies=self.proxies, headers=self.headers, impersonate='chrome133a')
            tree = etree.HTML(response.content)
            all_urls = [
                'https://www.canva.com' + href
                for href in tree.xpath('//a[@class="Ej7lEg ngt4GQ pTsp_A _3bC2IQ"]/@href')
            ]
            r_sadd_pip(self.redis_prompt_key, all_urls)
            print(_url)
        except Exception:
            print(traceback.format_exc())

    def get_category_url(self):
        """Seed ``redis_prompt_key`` from 21 result pages of every category.

        The requests run on ``self.pool``; this method blocks until all of
        them have finished, printing the number still pending once a second,
        and finally prints the size of the prompt queue.
        """
        self.all_urls = []
        futures = [
            self.pool.submit(self.run_category, page_url, i)
            for page_url in self.get_category()
            for i in range(21)
        ]
        # Track our own futures rather than the executor's private work queue:
        # an empty queue does not mean the in-flight tasks are done.
        while True:
            time.sleep(1)
            que_num = sum(1 for future in futures if not future.done())
            print('剩余队列 :%s' % que_num)
            if que_num == 0:
                break
        print(r_scard(self.redis_prompt_key))

    def run_url_info(self):
        """Worker loop: pop URLs from the prompt queue and scrape them forever.

        URLs already present in ``redis_key`` are skipped.  When popping
        raises, the queue is re-seeded through ``get_category_url``.
        """
        while True:
            try:
                try:
                    # NOTE(review): if r_spop returns None for an empty queue
                    # instead of raising, the re-seed branch below is never
                    # reached - confirm against utils.r_spop.
                    url = r_spop(self.redis_prompt_key)
                except Exception:
                    print('队列无信息,尝试获取初始url')
                    self.get_category_url()
                    time.sleep(10)
                    continue
                if not r_sismember(self.redis_key, url):
                    r_sadd(self.redis_key_running, url)
                    self.get_url_info(url)
            except Exception:
                print(traceback.format_exc())
                # Back off briefly so a persistent failure does not turn this
                # loop into a busy spin.
                time.sleep(1)

    def run_url_start(self):
        """Start ``pool_author_data_loop`` worker loops, one per second."""
        for _ in range(self.pool_author_data_loop):
            self.pool_author_data.submit(self.run_url_info)
            time.sleep(1)

    def up_prompt(self):
        """Load the listed JSONL files into the prompt queue."""
        _path = [
            'sora_explore_20250505_deduplicate.jsonl',
            'sora_explore_20250506_deduplicate.jsonl',
        ]
        for path in _path:
            up_prompt_to_redis(path, self.redis_prompt_key)

    def up_author(self):
        """Import entries from ``canva_explore_v2_prompt`` into the prompt queue.

        Both printed counts are read before the import runs.
        """
        redis_prompt_key_num = r_scard(self.redis_prompt_key)
        source_author = r_scard('canva_explore_v2_prompt')
        # Original note: "moves the latter into the former"; the exact
        # semantics of r_sdiffstore are not visible here - TODO confirm.
        r_sdiffstore(self.redis_prompt_key, 'canva_explore_v2_prompt')
        print(f'导入作者成功,prompt数量:{redis_prompt_key_num},source_author数量:{source_author}')
def main():
    """Entry point of one crawler process.

    Builds a ``Canva`` crawler and starts its URL worker loops.
    """
    crawler = Canva()
    crawler.run_url_start()
def run_bendi():
    """Seed the URL queue when ``get_correct_proxies()`` returns an empty dict.

    Presumably this marks a local ("bendi") run; in every other case the
    function does nothing.
    """
    if get_correct_proxies() == {}:
        Canva().get_category_url()
if __name__ == '__main__':
    # Seed the queue first when no proxy is configured (see run_bendi).
    run_bendi()

    # Number of crawler processes.  Each one starts its own thread pools, so
    # total memory grows with loop * Canva.pool_author_data_loop workers;
    # lower either value to reduce the footprint.
    loop = 25
    pool_author = ProcessPoolExecutor(loop)
    futures = []
    for _ in range(loop):
        futures.append(pool_author.submit(main))
        # Stagger process start-up.
        time.sleep(3)

    # main() returns once its worker loops have been submitted.  Collect the
    # results so that a start-up failure in a child process is reported
    # instead of being silently dropped with the future.
    for future in futures:
        try:
            future.result()
        except Exception:
            print(traceback.format_exc())

    # Keep the parent process alive while the children's worker threads run.
    while True:
        time.sleep(60)
# 这段代码占用内存过高,有没有优化方案?