import os
import requests
from openpyxl import Workbook
from openpyxl.drawing.image import Image
from concurrent.futures import ThreadPoolExecutor
# 图片链接列表
image_urls = [
"https://uploads/file/20230205/f85Lpcv8PXrLAdmNUDE1Hh6xqkp0NHi2gSXeqyOb.png",
"https://uploads/file/20230205/geG4FOpthrsUX0LkmWvDH2veFtw6yj8JLDMYBaQ1.png",
"https://uploads/file/20230205/mjVAx4jsbke6uj0e2Qz66f8KDceL1P5tanKQkNoy.png"
]
output_dir = "C:/Users/win-10/Desktop/发票图片/" # 指定Excel文件的输出目录
# 保存图片的本地目录
save_folder = "C:/Users/win-10/Desktop/发票图片/downloaded_images/"
# Excel文件名
excel_filename = "images_with_links.xlsx"
# 最大下载尝试次数
max_download_attempts = 3
def download_image(url, filename, attempts=0):
"""
下载图片到指定文件名
:param url: 图片的URL链接
:param filename: 保存图片的本地文件名
:param attempts: 当前下载尝试次数,默认为0
:return: 成功保存的文件名,下载失败返回None
"""
try:
response = requests.get(url, stream=True)
if response.status_code == 200:
with open(filename, 'wb') as f:
for chunk in response.iter_content(1024):
f.write(chunk)
return url, filename # 返回包含URL和文件名的元组
else:
raise Exception(f"HTTP错误码:{response.status_code}")
except Exception as e:
if attempts < max_download_attempts - 1:
print(f"下载尝试失败:{e},重试...")
return download_image(url, filename, attempts + 1)
else:
print(f"下载失败:{url},{e}")
return url, None # 返回包含URL和None(表示下载失败)的元组
def create_excel_file(image_data, output_dir, excel_filename):
"""
创建Excel文件并添加图片信息
:param image_data: 包含图片URL和本地路径的元组列表
:param output_dir: 目标Excel文件的输出目录
:param excel_filename: Excel文件名(不含目录路径)
"""
global cm_to_px_ratio
workbook = Workbook()
sheet = workbook.active
for idx, (img_url, img_path) in enumerate(image_data, start=1):
sheet[f"A{idx}"] = img_url
img = Image(img_path[1]) # 使用元组的第二个元素(文件名)
# 设置图片大小为6厘米 × 6厘米
cm_to_px_ratio = 20 # 假设1厘米等于96像素
img.width = 6 * cm_to_px_ratio
img.height = 6 * cm_to_px_ratio
# 将图片放置在与链接同一行的第二列(B列)
img.anchor = f"B{idx}"
img.left = idx # 或者 img.left = idx * 250 如果需要图片间有一定的间距
img.top = idx
sheet.add_image(img)
# 调整列宽以适应内容
sheet.column_dimensions['A'].width = 6 * cm_to_px_ratio
sheet.column_dimensions['B'].width = 6 * cm_to_px_ratio
sheet.row_dimensions[idx].height = 6 * cm_to_px_ratio
excel_full_path = os.path.join(output_dir, excel_filename)
workbook.save(excel_full_path)
print(f"图片及其链接已保存至Excel文件:{excel_full_path}")
if __name__ == "__main__":
if not image_urls:
print("图片链接列表为空,程序退出。")
exit(1)
# 创建保存目录(如果不存在)
if not os.path.exists(save_folder):
os.makedirs(save_folder)
with ThreadPoolExecutor(max_workers=5) as executor:
# 使用线程池并发下载图片
image_futures = [
executor.submit(download_image, url, os.path.join(save_folder, f"image{idx}.{url.split('.')[-1]}")) for
idx, url in enumerate(image_urls, start=1)]
# 收集下载结果
image_data = [(url, future.result()) for idx, (url, future) in
enumerate(zip(image_urls, image_futures), start=1)]
# 使用下载的图片信息创建Excel文件
create_excel_file(image_data, output_dir, excel_filename)
二 根据EXCEL中指定的链接下载图片并按照指定格式存储到WORD中
该Python脚本主要实现以下功能:
- 读取Excel文件:将指定Sheet的数据读取为字典列表,填充空值并返回。
- 按月份筛选数据:筛选2024年11月和12月的订单,支持手动指定订单号过滤,确保发票与验收单链接有效且订单不重复。
- 下载图片:根据URL下载图片并保存到本地路径。
- 写入Word文档:将订单信息写入Word文档,并插入对应的发票和验收单图片。
import pandas as pd
import requests
from docx import Document
from docx.shared import Inches
import os
import uuid
from datetime import datetime
from pdf2image import convert_from_path
import tempfile
def read_excel_to_dict(file_path, sheet_name=0):
"""
读取Excel文件并返回字典列表
"""
try:
df = pd.read_excel(file_path, sheet_name=sheet_name)
# 打印列名用于调试
print("Excel 列名:", list(df.columns))
# 填充 NaN 值为空字符串,防止后续出错
df = df.fillna('')
return df.to_dict('records')
except FileNotFoundError:
raise FileNotFoundError(f"文件 {file_path} 不存在")
except Exception as e:
raise RuntimeError(f"读取Excel文件失败: {str(e)}")
def filter_data_by_month(data, target_order_ids=None):
"""
筛选下单时间为 2024年11月 和 12月 的数据,每组取35个有效条目(不重)
要求:发票和验收单链接不为空或无效标记
"""
from collections import defaultdict
# 每月存储订单且使用集合记录已处理的订单号
monthly_orders = defaultdict(list)
seen_order_ids = set() # 全局记录已经处理过的订单号(防止跨月重复)
result = []
if target_order_ids:
# 💡 用户主动指定了订单号,匹配这些订单
target_order_ids = set(target_order_ids) # 去重 & 提高查找效率
for item in data:
order_id = item.get('订单号', '').strip()
if not order_id or order_id not in target_order_ids or order_id in seen_order_ids:
continue
# 复用原有字段有效性判断逻辑
invoice_link = str(item.get('发票', '')).strip()
receipt_link = str(item.get('验收单', '')).strip()
invalid_values = {'nan', '#n/a', '#na', 'n/a', 'na', '', 'none', '<null>'}
if invoice_link.lower() in invalid_values or receipt_link.lower() in invalid_values:
continue
try:
# 尝试解析时间字段构造订单月份
order_time_str = str(item['下单时间']).strip()
order_time = datetime.strptime(order_time_str, "%Y-%m-%d %H:%M:%S")
month = order_time.month
item['订单月份'] = f"2024年{month}月"
except Exception:
item['订单月份'] = "未知月份"
seen_order_ids.add(order_id)
result.append(item)
else:
for item in data:
order_time_str = str(item.get('下单时间', '')).strip()
invoice_link = str(item.get('发票', '')).strip()
receipt_link = str(item.get('验收单', '')).strip()
if not order_time_str:
continue
try:
# 解析时间字符串并获取年月
order_time = datetime.strptime(order_time_str, "%Y-%m-%d %H:%M:%S")
year, month = order_time.year, order_time.month
# 只保留 2024年11月 和 12月,并且订单号不能重复
if year == 2024 and month in [11, 12]:
order_id = item.get('订单号', '')
# 更全面判断无效值(包括 nan、#N/A 等)
invalid_values = {'nan', '#n/a', '#na', 'n/a', 'na', '', 'none', '<null>'}
if (
invoice_link.lower() in invalid_values or
receipt_link.lower() in invalid_values or
order_id in seen_order_ids
):
continue
# 构造“订单月份”字段供后续使用
item['订单月份'] = f"2024年{month}月"
monthly_orders[f"2024年{month}月"].append(item)
seen_order_ids.add(order_id) # 标记为已处理
except ValueError:
continue
for month, items in monthly_orders.items():
count = len(items)
print(f"【{month}】共找到 {count} 条不重复的有效数据")
result.extend(items[:35]) # 每月最多取前35条,确保不重复
return result
def download_image(url, save_path):
"""下载图片并保存到指定路径"""
try:
response = requests.get(url, timeout=10)
if response.status_code == 200:
with open(save_path, 'wb') as f:
f.write(response.content)
return True
else:
print(f"无法下载图片:{url}")
return False
except Exception as e:
print(f"下载图片出错: {e}")
return False
def write_data_to_word(data, output_file):
"""
将数据写入Word文档,并插入相关图片(支持多张发票/验收单图片)
"""
document = Document()
if not data:
print("⚠️ 没有数据可写入文档!")
return
for index, item in enumerate(data):
print(f"正在处理第订单号为 {item.get('订单号')}的数据")
# 从下单时间提取月份
order_time_raw = item.get('下单时间')
try:
if isinstance(order_time_raw, pd.Timestamp):
order_time_str = order_time_raw.strftime("%Y-%m-%d %H:%M:%S")
else:
order_time_str = str(order_time_raw).strip()
order_time = datetime.strptime(order_time_str, "%Y-%m-%d %H:%M:%S")
month = order_time.month
month_label = f"2024年{month}月"
except ValueError:
month_label = "未知月份"
# 添加主标题
document.add_heading(f"{index + 1}){month_label}订单", level=1)
# 添加子标题(订单编号)
order_id = item.get('订单号', '未知编号')
document.add_heading(f"{index + 1}.{order_id}验收单及发票", level=3)
# 添加验收单描述
# document.add_paragraph("根据数据连接下载的高清“验收单”图片")
# 下载并插入验收单图片(支持多个链接)
receipt_links = item.get('验收单', '')
if receipt_links.strip():
receipt_link_list = [link.strip() for link in receipt_links.split(',') if link.strip()]
for i, url in enumerate(receipt_link_list):
yanshou_img_path = f"yanshou_temp_{uuid.uuid4().hex}.jpg"
if download_image(url, yanshou_img_path):
try:
document.add_picture(yanshou_img_path, width=Inches(5))
except Exception as e:
print(f"插入验收单图片失败: {e}")
finally:
if os.path.exists(yanshou_img_path):
os.remove(yanshou_img_path)
# 添加发票描述
# document.add_paragraph(" 根据数据连接下载的高清“发票”图片")
# 下载并插入发票图片(支持多个链接)
invoice_links = item.get('发票', '')
if invoice_links.strip():
invoice_link_list = [link.strip() for link in invoice_links.split(',') if link.strip()]
for i, url in enumerate(invoice_link_list):
fapiao_img_path = f"fapiao_temp_{uuid.uuid4().hex}.jpg"
if download_image(url, fapiao_img_path):
try:
document.add_picture(fapiao_img_path, width=Inches(5))
except Exception as e:
print(f"插入发票图片失败: {e}")
finally:
if os.path.exists(fapiao_img_path):
os.remove(fapiao_img_path)
# 保存文档
document.save(output_file)
print(f"文档已保存为 {output_file}")
# 示例调用
if __name__ == "__main__":
excel_file = r'F:\订单250430.xlsx'
output_word_file = r'F:\订单汇总.docx'
print("开始读取Excel文件...")
raw_data = read_excel_to_dict(excel_file)
print(f"读取到 {len(raw_data)} 条原始数据")
print("开始按月份筛选数据...")
target_order_ids = ['b064536e31', '96937b09bc', 'e2dd08d858', '4cf8df81ae']
filtered_data = filter_data_by_month(raw_data, target_order_ids)
print(f"筛选后共 {len(filtered_data)} 条有效数据")
if not filtered_data:
print("⚠️ 没有符合要求的数据,请检查字段名称和数据内容!")
else:
print("开始写入Word文档...")
write_data_to_word(filtered_data, output_word_file)