DIFY单次上传的文件数量有所限制。
本篇解决大批量文件的问题:用脚本逐个上传文件、调用工作流抽取文字,并将结果整理到一起。
DIFY流程中,在开始节点添加两个输入变量:file(单文件)、var(文本类型);我的文件内容用/file引用。
示例prompt
请你抽取以下指标内容:/var
并按示例的以下格式回复{"姓名":"x","年龄":25}
其中姓名、年龄都是示例指标,"x",25是对应的示例数字。
如果找不到相关的指标内容,请你返回null。例如{"姓名":"x","年龄":null}。请你注意!一定不要返回多余备注或者其他内容!!例如
“注:您的文件中并没有提供具体的"转让金额"数据,所以我用null表示。如果您能提供更多详细信息,我可以更准确地回答您的问题。”这段话请一定不要返回给我!!!
import requests
import json
import os
# Raw string: "\c" and "\大" are not valid escape sequences, so the non-raw
# literal only worked by accident (and triggers a warning on recent Pythons).
# The resulting path is unchanged.
os.chdir(r'D:\cindy\大模型')
import pandas as pd
import os
def get_pdf_files(directory):
    """Recursively collect the paths of all PDF files under *directory*.

    Files are matched by a case-insensitive ``.pdf`` suffix.

    Args:
        directory: Root directory to search.

    Returns:
        A sorted list of file paths (each joined onto the directory it was
        found in). The list is empty when nothing matches or when the
        directory cannot be walked.
    """
    def _report(error):
        # os.walk ignores listing errors unless given an ``onerror`` callback,
        # so without this a missing/unreadable directory fails silently.
        print(f"发生错误: {error}")

    pdf_files = []
    try:
        for root, _dirs, files in os.walk(directory, onerror=_report):
            pdf_files.extend(
                os.path.join(root, name)
                for name in files
                if name.lower().endswith('.pdf')
            )
    except (OSError, TypeError, ValueError) as e:
        # Invalid ``directory`` argument (e.g. None or an embedded NUL).
        print(f"发生错误: {e}")
        return []
    # Sort so the batch is processed in a reproducible order.
    return sorted(pdf_files)
if __name__ == "__main__":
    # Raw string: the path contains backslashes that are not valid escape
    # sequences ("\c", "\大", "\测"); the resulting value is unchanged.
    # Replace this with the directory you want to search.
    target_directory = r'D:\cindy\大模型\测试批量PDF'
    pdfs = get_pdf_files(target_directory)
    # List what was found before any upload starts.
    for pdf in pdfs:
        print(pdf)
def upload_file(file_path, user, timeout=60):
    """Upload one local file to the Dify file-upload endpoint.

    Args:
        file_path: Path of the local file to upload.
        user: End-user identifier sent in the ``user`` form field.
        timeout: Timeout in seconds handed to ``requests.post`` so that a
            stalled connection cannot hang the whole batch.

    Returns:
        The ``id`` field of the JSON response when the server answers with
        HTTP 201, otherwise ``None`` (non-201 status, unreadable file,
        network error, ...).
    """
    import mimetypes  # local import: only this function needs it

    # Placeholder URL/key: replace "dizhi" with your own Dify API base
    # address and "app-XX" with your app's API key.
    upload_url = "dizhi/files/upload"
    headers = {
        "Authorization": "Bearer app-XX",
    }
    # Declare a MIME type that matches the file (application/pdf for PDFs)
    # instead of a fixed text/plain, and send only the file name rather than
    # the full local path.
    mime_type = mimetypes.guess_type(file_path)[0] or "application/octet-stream"
    file_name = os.path.basename(file_path)
    try:
        print("上传文件中...")
        with open(file_path, 'rb') as file:
            files = {
                'file': (file_name, file, mime_type),
            }
            data = {
                "user": user,
                "type": "PDF",  # file type label sent with the upload
            }
            response = requests.post(
                upload_url,
                headers=headers,
                files=files,
                data=data,
                timeout=timeout,
            )
        if response.status_code == 201:  # 201 means the file was created
            print("文件上传成功")
            return response.json().get("id")  # ID of the uploaded file
        print(f"文件上传失败,状态码: {response.status_code}")
        return None
    except Exception as e:
        # Per-file boundary of a batch job: report and let the caller skip
        # this file instead of aborting the run.
        print(f"发生错误: {str(e)}")
        return None
def run_workflow(file_id, user, response_mode="blocking",
                 var="出让机构,出让金额", timeout=600):
    """Run the Dify workflow on a previously uploaded file.

    Args:
        file_id: ID returned by the upload endpoint for the file.
        user: End-user identifier sent with the request.
        response_mode: Value of the ``response_mode`` request field.
        var: Text passed as the workflow's ``var`` input (the indicators to
            extract). Defaults to the value that used to be hard-coded.
        timeout: Timeout in seconds handed to ``requests.post``; generous by
            default because a blocking run only answers once it is finished.

    Returns:
        The decoded JSON response on HTTP 200. Otherwise a dict
        ``{"status": "error", "message": ...}`` describing the failure.
    """
    # Placeholder URL/key: "dizhi" stands for your own Dify address.
    workflow_url = "dizhi/workflows/run"
    headers = {
        "Authorization": "Bearer app-XX",
        "Content-Type": "application/json"
    }
    data = {
        "inputs": {
            "file": {
                "transfer_method": "local_file",
                "upload_file_id": file_id,
                "type": "document"
            },
            "var": var
        },
        "response_mode": response_mode,
        "user": user
    }
    try:
        print("运行工作流...")
        response = requests.post(
            workflow_url, headers=headers, json=data, timeout=timeout
        )
        if response.status_code == 200:
            print("工作流执行成功")
            return response.json()
        print(f"工作流执行失败,状态码: {response.status_code}")
        return {"status": "error", "message": f"Failed to execute workflow, status code: {response.status_code}"}
    except Exception as e:
        # Per-file boundary of a batch job: report the failure as data so
        # the caller can decide to skip this file.
        print(f"发生错误: {str(e)}")
        return {"status": "error", "message": str(e)}
# Usage example: upload every PDF, run the workflow on it, and collect the
# extracted indicators into one table.
def _parse_model_json(text):
    """Parse the model's reply as JSON, tolerating a bare ``NULL`` value.

    The prompt lets the model answer with ``NULL`` for missing indicators,
    which is not valid JSON; such values are rewritten to ``null`` on a
    second attempt. Raises ``ValueError`` if the text still cannot be parsed.
    """
    import re  # local import: only needed for the fallback

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Only touch NULL that appears in value position (right after ':').
        return json.loads(re.sub(r'(:\s*)NULL\b', r'\1null', text))


user = "c"
MYDATA = []
for pdf in pdfs:
    file_path = pdf
    file_id = upload_file(file_path, user)
    if not file_id:
        print("文件上传失败,无法执行工作流")
        continue
    # Upload succeeded: run the workflow on the uploaded file.
    result = run_workflow(file_id, user)
    try:
        MYDATA.append(_parse_model_json(result['data']['outputs']['text']))
    except (KeyError, TypeError, ValueError) as e:
        # run_workflow returns an error dict without 'data' on failure, and
        # the model may return text that is not JSON; skip this file rather
        # than abort the whole batch.
        print(f"解析工作流结果失败,已跳过 {file_path}: {e!r}")
# One row per successfully processed file.
excel = pd.DataFrame(MYDATA)