GPT==问答实例
import pandas as pd
from openai.embeddings_utils import get_embedding, cosine_similarity
import openai
import os
import logging as logger
from flask_cors import CORS
import os
# Configure the OpenAI client from the OPENAI_API_KEY environment variable.
# os.getenv returns None when the variable is unset, so the key may be None here.
openai.api_key = os.getenv('OPENAI_API_KEY')
class Chatbot():
    """Groups the text of a parsed PDF into blobs of consecutive same-font runs."""

    def parse_paper(self, pdf):
        """Extract the text of every page as a list of same-font blobs.

        Args:
            pdf: A reader object exposing ``pages``. Each page must provide
                ``extract_text(visitor_text=...)`` and invoke the visitor as
                ``visitor(text, cm, tm, font_dict, font_size)``, where
                ``tm[4]`` / ``tm[5]`` are the x / y position of the run.

        Returns:
            A list of dicts with the keys ``'fontsize'``, ``'text'`` and
            ``'page'`` (0-based page index), in reading order.
        """
        logger.info("Parsing paper")
        number_of_pages = len(pdf.pages)
        logger.info(f"Total number of pages: {number_of_pages}")
        paper_text = []
        for i, page in enumerate(pdf.pages):
            page_text = []

            def visitor_body(text, cm, tm, fontDict, fontSize):
                x = tm[4]
                y = tm[5]
                stripped = text.strip()
                # Ignore header/footer (outside the 50..720 band) and runs of
                # at most one visible character.
                if 50 < y < 720 and len(stripped) > 1:
                    page_text.append({
                        'fontsize': fontSize,
                        'text': stripped.replace('\x03', ''),
                        'x': x,
                        'y': y,
                    })

            # Only the visitor's side effect is needed; the returned text is unused.
            page.extract_text(visitor_text=visitor_body)

            blob_font_size = None
            blob_text = ''
            for t in page_text:
                if t['fontsize'] == blob_font_size:
                    blob_text += f" {t['text']}"
                    # Cap blobs at roughly 2000 characters.
                    if len(blob_text) >= 2000:
                        paper_text.append({
                            'fontsize': blob_font_size,
                            'text': blob_text,
                            'page': i,
                        })
                        blob_font_size = None
                        blob_text = ''
                else:
                    # Font changed: emit the finished blob and start a new one.
                    if blob_font_size is not None and blob_text:
                        paper_text.append({
                            'fontsize': blob_font_size,
                            'text': blob_text,
                            'page': i,
                        })
                    blob_font_size = t['fontsize']
                    blob_text = t['text']

            # Flush the blob still being built when the page ends; without
            # this the last same-font run of every page would be lost.
            if blob_font_size is not None and blob_text:
                paper_text.append({
                    'fontsize': blob_font_size,
                    'text': blob_text,
                    'page': i,
                })
        logger.info("Done parsing paper")
        return paper_text
通用imaplib通过账号密码的方式获取邮箱的收件箱内容以及附件
# 如果你的 Outlook 帐号启用了两步验证,你不能使用常规密码进行 IMAP 登录。你需要创建一个应用密码:
# 登录到 Outlook、Hotmail 或 Microsoft 账户。
# 转到 安全性 设置。
# 找到 应用密码 部分,生成一个新的应用密码。
# 使用这个应用密码替代常规密码进行登录。
# 个人邮箱需要去开启POP/IMAP和Exchange服务
import datetime
# IMAP 服务器配置
from email import policy
from email.parser import BytesParser
from email.utils import parsedate_to_datetime
from io import BytesIO
import logging
import re
import imaplib
import email
from email.header import decode_header
import socket
import ssl
from pia.utils.constants import COS_OUTLOOK_DIR, EMAIL_TYPE
from pia.utils.cos_upload import check_exists, upload_stream_to_cos
from tencheck import settings
# Module-level logger named after this module; used by every function below.
logger = logging.getLogger(__name__)
def login_mailbox(email_type, email_user, email_pass):
    """Open an authenticated IMAP-over-SSL connection for the given account.

    Args:
        email_type: Mailbox provider identifier, resolved to a host/port by
            ``get_email_imap_server_by_type``.
        email_user: Login name (the mailbox address).
        email_pass: Password or app password for the account.

    Returns:
        The logged-in ``imaplib.IMAP4_SSL`` connection. The caller owns it and
        is responsible for calling ``logout()``.

    Raises:
        imaplib.IMAP4.error: If the server rejects the credentials.
        OSError: If the server cannot be reached (DNS, TLS, socket errors).
    """
    # Connect to the IMAP server.
    imap_server, port = get_email_imap_server_by_type(email_type)
    mail = imaplib.IMAP4_SSL(imap_server, port=port)
    try:
        mail.login(email_user, email_pass)
    except Exception:
        # The caller never receives the connection when login fails, so close
        # the socket here instead of leaking it.
        try:
            mail.shutdown()
        except OSError:
            pass
        raise
    return mail
def check_mailbox(email_type, email_user, email_pass):
    """Check whether the given credentials can log in to the IMAP server.

    The connection is opened only for the check and is always logged out
    before returning.

    Args:
        email_type: Mailbox provider identifier, resolved to a host/port by
            ``get_email_imap_server_by_type``.
        email_user: Login name (the mailbox address).
        email_pass: Password or app password for the account.

    Returns:
        ``(True, None)`` on success, otherwise ``(False, message)`` where
        ``message`` describes the failure. This function does not raise for
        ordinary errors.
    """
    mail = None
    try:
        # Connect to the IMAP server.
        imap_server, port = get_email_imap_server_by_type(email_type)
        mail = imaplib.IMAP4_SSL(imap_server, port=port)
        mail.login(email_user, email_pass)
        return True, None
    except imaplib.IMAP4.error as e:
        message = f"IMAP登录失败: {str(e)}"
        logger.error(message)
        return False, message
    except OSError as e:
        # socket.gaierror and ssl.SSLError are OSError subclasses; catching the
        # base class also reports refused connections and timeouts as
        # connection failures rather than as unknown errors.
        message = f"连接服务器失败: {str(e)}"
        logger.error(message)
        return False, message
    except Exception as e:
        message = f"未知错误: {str(e)}"
        logger.error(message)
        return False, message
    finally:
        if mail is not None:
            try:
                mail.logout()
            except Exception as e:
                # Best effort: a failed logout must not change the result, but
                # it should not swallow KeyboardInterrupt/SystemExit either.
                logger.debug("IMAP logout failed: %s", e)
def get_email_imap_server_by_type(email_type):
    """Return the ``(host, port)`` of the IMAP server for a mailbox type.

    Args:
        email_type: One of the ``EMAIL_TYPE`` enum *values* (compared against
            ``EMAIL_TYPE.OUTLOOK.value`` and ``EMAIL_TYPE.WECOM.value``).

    Returns:
        A ``(host, port)`` tuple.

    Raises:
        ValueError: If ``email_type`` is not a supported mailbox type.
            Previously the function fell through and returned ``None``, which
            made callers fail later with an opaque unpacking ``TypeError``.
    """
    if email_type == EMAIL_TYPE.OUTLOOK.value:
        return 'outlook.office365.com', 993
    if email_type == EMAIL_TYPE.WECOM.value:
        if settings.TIER in ("local", "dev"):
            return 'imap.qq.com', 993
        # NOTE(review): this literal looks like a placeholder rather than a
        # real host name - confirm the production IMAP server and replace it.
        return '对应的邮箱服务器', 993
    raise ValueError(f"Unsupported email type: {email_type!r}")
# English month abbreviations required by the IMAP date syntax; strftime('%b')
# is locale-dependent and would produce invalid dates under a non-English locale.
_IMAP_MONTHS = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')


def _imap_date(day):
    """Format a date/datetime as an IMAP ``date`` (``DD-Mon-YYYY``)."""
    return f"{day.day:02d}-{_IMAP_MONTHS[day.month - 1]}-{day.year}"


def _upload_mail_object(payload, email_user, mail_id, name):
    """Store ``payload`` under the per-mail COS prefix (if absent) and return its URL."""
    object_name = f'{COS_OUTLOOK_DIR}/{email_user}/{mail_id}/{name}'
    is_exists, url = check_exists(object_name)
    if not is_exists:
        url = upload_stream_to_cos(BytesIO(payload), object_name)
    return url


def _decode_text_part(part):
    """Return the text of a non-multipart MIME part without raising on bad bytes."""
    payload = part.get_payload(decode=True)
    if not isinstance(payload, bytes):
        return payload if isinstance(payload, str) else ""
    charset = part.get_content_charset() or 'utf-8'
    try:
        return payload.decode(charset, errors='replace')
    except LookupError:
        # The message declares a charset Python does not know.
        return payload.decode('utf-8', errors='replace')


def _received_time_utc(date_header):
    """Convert a Date header to ``YYYY-MM-DDTHH:MM:SSZ`` (UTC), or None if unusable."""
    if not date_header:
        return None
    try:
        parsed = parsedate_to_datetime(str(date_header))
    except (TypeError, ValueError):
        logger.warning("Unparseable Date header: %r", date_header)
        return None
    if parsed.tzinfo is None:
        # parsedate_to_datetime returns a naive datetime for a "-0000" zone,
        # which denotes UTC; astimezone() would wrongly assume local time.
        parsed = parsed.replace(tzinfo=datetime.timezone.utc)
    return parsed.astimezone(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")


def _content_id(part):
    """Return the part's Content-ID without angle brackets, or '' if absent."""
    content_ids = part.get_all("Content-ID")
    if not content_ids:
        return ""
    return str(content_ids[0]).strip().strip("<>")


def _attachment_filename(part):
    """Return the decoded attachment file name, or a falsy value if none is given."""
    filename = part.get_filename()
    if not filename:
        # No parsed file name: try to pull it out of the raw Content-Disposition.
        header = part.get('Content-Disposition')
        if header:
            match = re.search(r'filename="(.+?)"', str(header))
            if match:
                filename = match.group(1)
    # File names may be RFC 2047 encoded words (e.g. Chinese names).
    return msg_decode(filename) if filename else filename


def _walk_multipart(msg, email_user, mail_id):
    """Collect body, attachments and inline (cid) objects from a multipart message.

    Returns:
        ``(content_type, body, has_attachments, attachments, embedded)`` where
        ``attachments`` is a list of ``{"name", "url", "size"}`` dicts and
        ``embedded`` maps Content-ID -> uploaded URL.
    """
    content_type = "text/plain"
    body = ""
    has_attachments = False
    attachments = []
    embedded = {}

    for part in msg.walk():
        c_type = part.get_content_type()
        is_attachment = part.get_content_disposition() == 'attachment'

        if c_type in ('text/html', 'text/plain'):
            # The last non-attachment text part wins (HTML normally follows
            # plain text). Text *attachments* must not replace the body.
            if not is_attachment:
                content_type = c_type
                body = _decode_text_part(part)
        elif c_type in ('image/png', 'image/jpeg'):
            if not is_attachment:
                cid = _content_id(part)
                payload = part.get_payload(decode=True)
                if cid and payload:
                    # Fall back to the cid so that unnamed inline images do
                    # not all collide on the same object name.
                    image_name = part.get_filename() or cid
                    embedded[cid] = _upload_mail_object(payload, email_user, mail_id, image_name)

        if is_attachment:
            has_attachments = True
            filename = _attachment_filename(part)
            if filename:
                payload = part.get_payload(decode=True)
                if payload:
                    url = _upload_mail_object(payload, email_user, mail_id, filename)
                    attachments.append({"name": filename, "url": url, "size": len(payload)})
                    logger.info(f'Attachment saved: {filename}')
                else:
                    logger.info(f'Skipping attachment {filename} (empty payload)')
        elif c_type == 'application/octet-stream':
            # Binary part that is not marked as an attachment: treated as an
            # embedded object referenced from the body by its Content-ID.
            raw_name = part.get_filename()
            payload = part.get_payload(decode=True)
            if raw_name and payload:
                filename = f"embedded-{msg_decode(raw_name)}"
                url = _upload_mail_object(payload, email_user, mail_id, filename)
                cid = _content_id(part)
                # An empty cid would make the later "cid:" replacement rewrite
                # every cid reference in the body, so only map real ids.
                if cid:
                    embedded[cid] = url

    return content_type, body, has_attachments, attachments, embedded


def _build_mail_record(raw_message, mail_id, email_user):
    """Parse one RFC 822 message into the dict returned by ``deal_user_email_imap``."""
    msg = email.message_from_bytes(raw_message)

    # Decode the subject; a message without a Subject header yields "".
    raw_subject = msg['Subject']
    subject = msg_decode(raw_subject) if raw_subject is not None else ""

    # Sender and recipients (missing headers yield empty values).
    sender_name, sender_address = email.utils.parseaddr(str(msg.get('From') or ''))
    recipients = email.utils.getaddresses([str(v) for v in msg.get_all('To', [])])

    if msg.is_multipart():
        content_type, body, has_attachments, attachments, embedded = _walk_multipart(
            msg, email_user, mail_id)
    else:
        # Single-part message: the payload is the body.
        content_type = msg.get_content_type()
        body = _decode_text_part(msg)
        has_attachments, attachments, embedded = False, [], {}

    # Point inline "cid:" references at the uploaded objects.
    for content_id, url in embedded.items():
        if url:
            body = body.replace(f"cid:{content_id}", url)

    return {
        'id': mail_id,
        'sender': {
            "emailAddress": {
                "address": sender_address,
                "name": msg_decode(sender_name)
            }
        },
        'toRecipients': [
            {"emailAddress": {"address": address, "name": msg_decode(name)}}
            for name, address in recipients
        ],
        'subject': subject,
        'receivedDateTime': _received_time_utc(msg['Date']),
        'body': {
            "contentType": content_type,
            "content": body
        },
        'hasAttachments': has_attachments,
        'attachments': attachments,
    }


def deal_user_email_imap(email_type, email_user, email_pass, str_start, str_end):
    """Fetch today's inbox messages over IMAP, uploading attachments to COS.

    Args:
        email_type: Mailbox provider identifier passed to ``login_mailbox``.
        email_user: Login name; also used as part of the COS object path.
        email_pass: Password or app password.
        str_start: Currently unused; kept for interface compatibility.
        str_end: Currently unused; kept for interface compatibility. The
            search window is always the current local day.

    Returns:
        A list with one dict per message containing the keys ``id``,
        ``sender``, ``toRecipients``, ``subject``, ``receivedDateTime``
        (UTC, ``YYYY-MM-DDTHH:MM:SSZ``, or None when the Date header is
        missing/invalid), ``body`` (``contentType`` / ``content``),
        ``hasAttachments`` and ``attachments``. An empty list is returned when
        the inbox cannot be selected or the search fails.

    Raises:
        Whatever ``login_mailbox`` raises, plus IMAP/socket/upload errors that
        occur while processing; the connection is logged out in every case.
    """
    mail = login_mailbox(email_type, email_user, email_pass)
    try:
        # Select the inbox.
        status, _ = mail.select('inbox')
        if status != 'OK':
            return []

        # SINCE today BEFORE tomorrow == messages dated today. IMAP search
        # dates carry no time component, so only the date is sent.
        today = datetime.datetime.now()
        tomorrow = today + datetime.timedelta(days=1)
        since_date = f'"{_imap_date(today)}"'
        before_date = f'"{_imap_date(tomorrow)}"'
        status, messages = mail.search(None, f'SINCE {since_date} BEFORE {before_date}')
        if status != 'OK':
            return []

        result = []
        for raw_id in (messages[0] or b'').split():
            status, msg_data = mail.fetch(raw_id, '(RFC822)')
            if status != 'OK':
                continue
            # A message expunged between SEARCH and FETCH yields no literal.
            if not msg_data or not isinstance(msg_data[0], tuple):
                continue
            result.append(_build_mail_record(msg_data[0][1], raw_id.decode(), email_user))
        return result
    finally:
        # Always close the connection, including on early return or error.
        try:
            mail.logout()
        except (imaplib.IMAP4.error, OSError) as e:
            logger.warning("IMAP logout failed: %s", e)
def msg_decode(msg):
    """Decode an RFC 2047 encoded header value into readable text.

    All fragments returned by ``decode_header`` are decoded and joined, so
    values that mix plain text and encoded words (``"Re: =?utf-8?B?...?="``)
    or use several charsets are returned in full rather than truncated to
    their first fragment.

    Args:
        msg: The raw header value (``str`` or ``email.header.Header``).
            ``None`` and ``''`` are returned unchanged.

    Returns:
        The decoded string. If decoding fails the error is logged and the
        original ``msg`` is returned unchanged.
    """
    if not msg:
        return msg
    try:
        pieces = []
        for fragment, charset in decode_header(msg):
            if isinstance(fragment, bytes):
                try:
                    fragment = fragment.decode(charset if charset else 'utf-8', errors='replace')
                except LookupError:
                    # The header declares a charset Python does not know.
                    fragment = fragment.decode('utf-8', errors='replace')
            pieces.append(fragment)
        return ''.join(pieces)
    except Exception as e:
        logger.error(f"msg_decode error: {str(e)}")
        return msg