import copy
import requests
import json
import traceback
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
import threading
import warnings
import os
from lxml import html
warnings.filterwarnings("ignore")
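# Scrapes Off-White product detail pages with a thread pool, turns each page's
# embedded JSON into WooCommerce-style rows ('variation' children plus one
# 'variable' parent per product SKU), writes them to ../o_data/<category>.csv,
# and records URLs that still fail after retries in ../json_data/fail.json.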
def process(url):
    print(f'Start crawling {url}')
    response = crawl(url)
    if response:
        data = struct_data(response)
        with lock:
            all_data.extend(data)
        return True
    else:
        return False
def crawl(url):
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "zh-CN,zh;q=0.9",
        "cache-control": "no-cache",
        "pragma": "no-cache",
        "priority": "u=0, i",
        "sec-ch-ua": "\"Not)A;Brand\";v=\"8\", \"Chromium\";v=\"138\", \"Google Chrome\";v=\"138\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "none",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"
    }
    params = {}
    retries = 3
    delay = 3
    for attempt in range(retries):
        try:
            # timeout keeps a worker thread from hanging forever on a stalled request
            response = requests.get(url, headers=headers, params=params, timeout=30)
            # safe to close here: with stream=False the body is already fully read
            response.close()
            if response.status_code == 200:
                return response
            else:
                time.sleep(delay)
                continue
        except Exception as e:
            print(f"Request failed: {e}. Attempt {attempt + 1}/{retries}")
            traceback.print_exc()
            time.sleep(delay)
    return None
def struct_data(response):  # takes a requests Response object
    product_list = []
    data_format = {
        "Type": "variation",
        "SKU": "",
        "Name": "",
        "Description": "",
        "Stock": 1000,
        "Sale price": "",
        "Regular price": "",
        "Categories": f"{category}",
        "Tags": "",
        "Images": "",
        "Parent": "",
        # "Attribute 1 name": "",
        # "Attribute 1 value(s)": "",
        # "Attribute 2 name": "",
        # "Attribute 2 value(s)": "",
        # "Attribute 3 name": "",
        # "Attribute 3 value(s)": "",
        "is_upload": 0,
        "brand": f"{brand}"
    }
    '''
    Resulting product_list, e.g.:
    [
        {'Type': 'variation', 'SKU': 'xxx', 'Name': 'xxx', 'Description': 'xxx', ...},
        {'Type': 'variation', 'SKU': 'xxx', 'Name': 'xxx', 'Description': 'xxx', ...},
    ]
    or
    [
        {'Type': 'simple', 'SKU': 'xxx', 'Name': 'xxx', 'Description': 'xxx', ...},
    ]
    '''
    result = response.text
    etree = html.fromstring(result)
    # The page embeds its preloaded state as JSON inside two <script> tags
    json_str = etree.xpath('//script[@id="mobify-data"]')[0].text
    json_data = json.loads(json_str)
    json_str1 = etree.xpath('//script[@data-react-helmet="true"]')[0].text
    json_data1 = json.loads(json_str1)
    parent = json_data1.get('sku')
    products = json_data.get('__PRELOADED_STATE__').get('__STATE_MANAGEMENT_LIBRARY').get('reduxStoreState').get('products').get(parent)
    name = products.get('name')
    description1 = products.get('shortDescription')
    description2 = products.get('longDescription')
    description = description1 if description1 else ""
    description += description2 if description2 else ""
    price = json_data1.get('offers').get('price')
    images = products.get('imageGroups')[0].get('images')
    images_list = []
    for item in images:
        link = item.get('link')
        images_list.append(link)
    images_str = ','.join(images_list)
    color = json_data1.get('color')
    variants = products.get('variants')
    # One 'variation' row per size variant; SKUs become '<parent>_1', '<parent>_2', ...
    for index, item in enumerate(variants, start=1):
        size = item.get('variationValues').get('size')
        data_format['SKU'] = f'{parent}_{index}'
        data_format['Name'] = name
        data_format['Description'] = description
        data_format['Sale price'] = price
        data_format['Regular price'] = price
        data_format['Images'] = images_str
        data_format['Parent'] = parent
        data_format['Attribute 1 name'] = 'Color'
        data_format['Attribute 1 value(s)'] = color
        data_format['Attribute 2 name'] = 'Size'
        data_format['Attribute 2 value(s)'] = size
        data = copy.deepcopy(data_format)
        product_list.append(data)
    return product_list
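# Assumed layout of ../json_data/url.json (inferred from read_detail_url below):
#     { "<category>": ["https://www.off---white.com/en-us/...html", ...], ... }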
def read_detail_url(category):
    pwd = '../json_data/url.json'
    with open(pwd, 'r', encoding='utf-8') as f:
        data = json.load(f)
    detail_url_list = data.get(category)
    return detail_url_list
def write_to_csv(data):
    # Create the output directory first; pandas does not create missing
    # directories and would otherwise raise a "path not found" error here.
    os.makedirs('../o_data', exist_ok=True)
    pwd = f'../o_data/{category}.csv'
    with lock:
        try:
            df = pd.DataFrame(data)
            df.to_csv(pwd, mode='w', index=False, header=True, encoding='utf-8')  # write all at once, including header
            print('Data successfully written to CSV')
        except Exception as e:
            print(f'Error writing to CSV: {e}')
def struct_parent():
    # Output directory for the parent-row data
    dir_path = '../o_data'
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    # Read the child-row CSV
    try:
        df = pd.read_csv(f'../o_data/{category}.csv')
        parent_rows = []
        for index, row in df.iterrows():
            if row['Type'] == 'simple':
                dict_row = row.to_dict()
                parent_rows.append(dict_row)
        # Build one 'variable' parent row per Parent SKU from its child rows
        grouped = df.groupby('Parent')
        for parent_value, group in grouped:
            attribute_1_values = group['Attribute 1 value(s)'].dropna().unique()
            has_attr2 = 'Attribute 2 name' in group.columns and not group['Attribute 2 name'].isna().all()
            if has_attr2:
                attribute_2_values = group['Attribute 2 value(s)'].dropna().unique()
                attr2_name = group['Attribute 2 name'].iloc[0]
                attr2_value_str = ','.join(map(str, attribute_2_values)) if len(attribute_2_values) > 0 else ''
            else:
                attr2_name = ''
                attr2_value_str = ''
            has_attr3 = 'Attribute 3 name' in group.columns and not group['Attribute 3 name'].isna().all()
            if has_attr3:
                attribute_3_values = group['Attribute 3 value(s)'].dropna().unique()
                attr3_name = group['Attribute 3 name'].iloc[0]
                attr3_value_str = ','.join(map(str, attribute_3_values)) if len(attribute_3_values) > 0 else ''
            else:
                attr3_name = ''
                attr3_value_str = ''
            has_attr4 = 'Attribute 4 name' in group.columns and not group['Attribute 4 name'].isna().all()
            if has_attr4:
                attribute_4_values = group['Attribute 4 value(s)'].dropna().unique()
                attr4_name = group['Attribute 4 name'].iloc[0]
                attr4_value_str = ','.join(map(str, attribute_4_values)) if len(attribute_4_values) > 0 else ''
            else:
                attr4_name = ''
                attr4_value_str = ''
            first_image = group['Images'].iloc[0] if not group['Images'].empty else ''
            # Create the parent row
            parent_row = {
                'Type': 'variable',
                'SKU': parent_value,
                'Name': group['Name'].iloc[0],
                'Description': group['Description'].iloc[0],
                'Sale price': group['Sale price'].iloc[0],
                'Regular price': group['Regular price'].iloc[0],
                'Categories': group['Categories'].iloc[0],
                'Tags': group['Tags'].iloc[0],
                'Images': first_image.split(',')[0] if first_image else '',
                'Parent': '',
                'Attribute 1 name': group['Attribute 1 name'].iloc[0],
                'Attribute 1 value(s)': ','.join(map(str, attribute_1_values)),
                'Attribute 2 name': attr2_name,
                'Attribute 2 value(s)': attr2_value_str,
                'Attribute 3 name': attr3_name,
                'Attribute 3 value(s)': attr3_value_str,
                'Attribute 4 name': attr4_name,
                'Attribute 4 value(s)': attr4_value_str,
                'is_upload': 0,
                'brand': f'{brand}'
            }
            parent_rows.append(parent_row)
            # Append the child rows after their parent
            for _, row in group.iterrows():
                parent_rows.append(row.to_dict())
        # Save the combined parent + child rows back to CSV
        new_df = pd.DataFrame(parent_rows)
        new_df.to_csv(f'{dir_path}/{category}.csv', index=False)
    except Exception as e:
        traceback.print_exc()
        print(e)
def write_fail(category, detail_url_list):
    pwd = '../json_data/fail.json'
    if not os.path.exists(pwd):
        data = {}
    else:
        with open(pwd, 'r', encoding='utf-8') as file:
            data = json.load(file)
    data[category] = detail_url_list
    # Write back to the file
    with open(pwd, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4, ensure_ascii=False)
if __name__ == '__main__':
    brand = 'Off-White'
    lock = threading.Lock()
    fail_list = []
    with open('../json_data/collections.json', 'r', encoding='utf-8') as f:
        json_dict = json.load(f)
    for category in json_dict.keys():
        print(f'Start crawling category: {category}')
        detail_url_list = read_detail_url(category)
        # detail_url_list = ['https://www.off---white.com/en-us/women/clothing/dresses/black-stamp-bandana-dress-OWDF001S25FLE0011027.html']
        all_data = []
        for _ in range(3):  # initial pass plus up to two retry passes
            with ThreadPoolExecutor(max_workers=10) as executor:
                futures = {executor.submit(process, url): url for url in detail_url_list}
                for future in as_completed(futures):
                    url = futures[future]
                    try:
                        if not future.result():
                            fail_list.append(url)
                    except Exception as e:
                        traceback.print_exc()
                        print(f"Error while crawling {url}: {e}")
                        fail_list.append(url)
            print('Failure count:', len(fail_list))
            if not fail_list:
                detail_url_list = fail_list.copy()
                break
            print('Retrying failed requests')
            detail_url_list = fail_list.copy()
            fail_list.clear()
        write_to_csv(all_data)
        # URLs still failing after all retry passes are recorded in fail.json
        if len(detail_url_list) != 0:
            write_fail(category, detail_url_list)
        # Build the parent ('variable') rows
        struct_parent()

How do I change this if the path cannot be found?
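The usual cause of a "path not found" error here is that every path ('../json_data/url.json', '../o_data/...') is relative, so it resolves against whatever directory you launch Python from, not against the script file. A minimal sketch of one way to make the paths robust, resolving them against the script's own location (the BASE_DIR / JSON_DATA_DIR / O_DATA_DIR names are illustrative, not from the original code):

import os

# Folder containing this script; __file__ is set when running a .py file.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
JSON_DATA_DIR = os.path.join(BASE_DIR, '..', 'json_data')  # replaces '../json_data'
O_DATA_DIR = os.path.join(BASE_DIR, '..', 'o_data')        # replaces '../o_data'

# Create the output directory up front so later writes cannot fail on it.
os.makedirs(O_DATA_DIR, exist_ok=True)

# Then build each path from these constants, e.g. in read_detail_url:
#     pwd = os.path.join(JSON_DATA_DIR, 'url.json')
# and in write_to_csv / struct_parent:
#     pwd = os.path.join(O_DATA_DIR, f'{category}.csv')

With this in place the script behaves the same regardless of the working directory; the only remaining requirement is that json_data/collections.json and json_data/url.json actually exist next to the script's parent folder.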