通用imaplib通过账号密码的方式获取邮箱的收件箱内容以及附件

本文链接：https://blog.youkuaiyun.com/weixin_39184713/article/details/144290306

GPT 问答实例

import pandas as pd
from openai.embeddings_utils import get_embedding, cosine_similarity
import openai
import os
import logging as logger
from flask_cors import CORS
import os
# Configure the OpenAI client from the environment; os.getenv returns None
# when OPENAI_API_KEY is not set.
openai.api_key = os.getenv('OPENAI_API_KEY')

class Chatbot():
    """Helper that turns a parsed PDF into font-size-grouped text blobs."""

    def parse_paper(self, pdf):
        """Extract the text of *pdf* as a list of same-font-size blobs.

        Args:
            pdf: An object exposing ``pages``; each page must provide
                ``extract_text(visitor_text=callback)`` where the callback is
                invoked as ``callback(text, cm, tm, font_dict, font_size)``.

        Returns:
            A list of dicts ``{'fontsize', 'text', 'page'}`` in reading
            order. Consecutive fragments sharing a font size are joined with
            a single space; a blob is closed when the font size changes, when
            it reaches 2000 characters, or when the page ends.
        """
        logger.info("Parsing paper")
        number_of_pages = len(pdf.pages)
        logger.info(f"Total number of pages: {number_of_pages}")
        paper_text = []
        for i, page in enumerate(pdf.pages):
            page_text = []

            def visitor_body(text, cm, tm, fontDict, fontSize):
                # tm[4] / tm[5] are read as the fragment's x / y position.
                x = tm[4]
                y = tm[5]
                # ignore header/footer
                if (y > 50 and y < 720) and (len(text.strip()) > 1):
                    page_text.append({
                        'fontsize': fontSize,
                        'text': text.strip().replace('\x03', ''),
                        'x': x,
                        'y': y
                    })

            # The return value is unused; fragments are collected by the visitor.
            _ = page.extract_text(visitor_text=visitor_body)

            blob_font_size = None
            blob_text = ''
            processed_text = []

            for t in page_text:
                if t['fontsize'] == blob_font_size:
                    blob_text += f" {t['text']}"
                    if len(blob_text) >= 2000:
                        processed_text.append({
                            'fontsize': blob_font_size,
                            'text': blob_text,
                            'page': i
                        })
                        blob_font_size = None
                        blob_text = ''
                else:
                    if blob_font_size is not None and len(blob_text) >= 1:
                        processed_text.append({
                            'fontsize': blob_font_size,
                            'text': blob_text,
                            'page': i
                        })
                    blob_font_size = t['fontsize']
                    blob_text = t['text']

            # Flush the blob still being accumulated when the page ends;
            # otherwise the last run of text on every page would be dropped.
            if blob_font_size is not None and len(blob_text) >= 1:
                processed_text.append({
                    'fontsize': blob_font_size,
                    'text': blob_text,
                    'page': i
                })

            # Extend once per page. Doing this inside the fragment loop would
            # re-append every earlier blob on each iteration.
            paper_text += processed_text
        logger.info("Done parsing paper")
        return paper_text

通用imaplib通过账号密码的方式获取邮箱的收件箱内容以及附件

# 如果你的 Outlook 帐号启用了两步验证，你不能使用常规密码进行 IMAP 登录。你需要创建一个应用密码：
# 登录到 Outlook、Hotmail 或 Microsoft 账户。
# 转到 安全性 设置。
# 找到 应用密码 部分，生成一个新的应用密码。
# 使用这个应用密码替代常规密码进行登录。
# 个人邮箱需要去开启POP/IMAP和Exchange服务
import datetime
# IMAP 服务器配置
from email import policy
from email.parser import BytesParser
from email.utils import parsedate_to_datetime
from io import BytesIO
import logging
import re


import imaplib
import email
from email.header import decode_header
import socket
import ssl

from pia.utils.constants import COS_OUTLOOK_DIR, EMAIL_TYPE
from pia.utils.cos_upload import check_exists, upload_stream_to_cos
from tencheck import settings

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
def login_mailbox(email_type, email_user, email_pass):
    """Open an IMAP-over-SSL connection and authenticate.

    Args:
        email_type: Mailbox provider identifier, resolved to a server by
            ``get_email_imap_server_by_type``.
        email_user: Login name (usually the e-mail address).
        email_pass: Password or app password.

    Returns:
        The authenticated ``imaplib.IMAP4_SSL`` connection. The caller owns
        it and is responsible for calling ``logout()``.

    Raises:
        imaplib.IMAP4.error: If the server rejects the credentials.
        OSError: If the connection cannot be established.
    """
    imap_server, port = get_email_imap_server_by_type(email_type)
    mail = imaplib.IMAP4_SSL(imap_server, port=port)
    try:
        mail.login(email_user, email_pass)
    except Exception:
        # The caller never receives the connection when login fails, so
        # close the socket here instead of leaking it, then re-raise.
        mail.shutdown()
        raise
    return mail

def check_mailbox(email_type, email_user, email_pass):
    """Check whether the given credentials can log in over IMAP.

    The connection is always closed before returning; this function never
    raises.

    Args:
        email_type: Mailbox provider identifier, resolved to a server by
            ``get_email_imap_server_by_type``.
        email_user: Login name (usually the e-mail address).
        email_pass: Password or app password.

    Returns:
        ``(True, None)`` on success, otherwise ``(False, message)`` where
        *message* describes the failure.
    """
    mail = None
    try:
        imap_server, port = get_email_imap_server_by_type(email_type)
        mail = imaplib.IMAP4_SSL(imap_server, port=port)
        mail.login(email_user, email_pass)
        return True, None
    except imaplib.IMAP4.error as e:
        logger.error(f"IMAP登录失败: {str(e)}")
        return False, f"IMAP登录失败: {str(e)}"
    except OSError as e:
        # socket.gaierror and ssl.SSLError are OSError subclasses; catching
        # OSError also classifies refused/timed-out connections as
        # connection failures instead of "unknown error".
        logger.error(f"连接服务器失败: {str(e)}")
        return False, f"连接服务器失败: {str(e)}"
    except Exception as e:
        logger.error(f"未知错误: {str(e)}")
        return False, f"未知错误: {str(e)}"
    finally:
        if mail:
            try:
                mail.logout()
            except Exception:
                # Best-effort cleanup: the result of the check is already
                # decided, so a failing LOGOUT must not change it.
                pass

def get_email_imap_server_by_type(email_type):
    """Return the ``(host, port)`` of the IMAP-over-SSL server for *email_type*.

    Args:
        email_type: One of the ``EMAIL_TYPE`` enum values (compared against
            ``EMAIL_TYPE.<member>.value``).

    Returns:
        A ``(host, port)`` tuple.

    Raises:
        ValueError: If *email_type* is not a supported mailbox type.
            Previously the function fell through and returned ``None``,
            which made callers fail with an opaque unpacking ``TypeError``.
    """
    if email_type == EMAIL_TYPE.OUTLOOK.value:
        return 'outlook.office365.com', 993
    if email_type == EMAIL_TYPE.WECOM.value:
        if settings.TIER in ("local", "dev"):
            return 'imap.qq.com', 993
        # NOTE(review): this host string is a placeholder ("the corresponding
        # mail server") and must be replaced with the real production host.
        return '对应的邮箱服务器', 993
    raise ValueError(f"Unsupported email_type: {email_type!r}")

def _decode_text_part(part):
    """Return the text payload of *part* as ``str`` (never raises on bad bytes)."""
    payload = part.get_payload(decode=True)
    if payload is None:
        return ""
    if not isinstance(payload, bytes):
        return payload
    try:
        return payload.decode(part.get_content_charset() or 'utf-8', errors='replace')
    except LookupError:
        # The message declared a charset Python does not know.
        return payload.decode('utf-8', errors='replace')


def _store_in_cos(payload, email_user, mail_id, filename):
    """Upload *payload* under the mail's COS prefix unless the object exists; return its URL."""
    object_name = f'{COS_OUTLOOK_DIR}/{email_user}/{mail_id}/{filename}'
    is_exists, url = check_exists(object_name)
    if not is_exists:
        url = upload_stream_to_cos(BytesIO(payload or b''), object_name)
    return url


def _build_mail_record(msg, mail_id, email_user):
    """Convert one parsed e-mail into the result dict returned by deal_user_email_imap."""
    # Missing headers are treated as empty instead of crashing the whole run.
    subject = msg_decode(msg.get('Subject') or '')
    sender_name, sender_address = email.utils.parseaddr(str(msg.get('From') or ''))
    recipients = email.utils.getaddresses([str(v) for v in msg.get_all('To', [])])

    try:
        utc_datetime = parsedate_to_datetime(msg['Date']).astimezone(datetime.timezone.utc)
        received_time = utc_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")
    except (TypeError, ValueError):
        # Missing or malformed Date header.
        logger.warning("Mail %s has no parsable Date header", mail_id)
        received_time = None

    value = {
        'id': mail_id,
        'sender': {
            "emailAddress": {
                "address": sender_address,
                "name": msg_decode(sender_name)
            }
        },
        'toRecipients': [
            {"emailAddress": {"address": address, "name": msg_decode(name)}}
            for name, address in recipients
        ],
        'subject': subject,
        'receivedDateTime': received_time,
    }

    hasAttachments = False
    body = ""
    attachment_list = []
    # Maps Content-ID -> uploaded URL for inline (embedded) parts.
    embedded_attachments = {}
    content_type = "text/plain"

    if msg.is_multipart():
        for part in msg.walk():
            c_type = part.get_content_type()
            is_attachment = part.get_content_disposition() == 'attachment'

            if c_type in ('text/html', 'text/plain'):
                # A text file sent as an attachment must not replace the
                # body; it is handled by the attachment branch below.
                if not is_attachment:
                    content_type = c_type
                    body = _decode_text_part(part)

            elif c_type in ('image/png', 'image/jpeg'):
                if not is_attachment:
                    cid = str(part.get('Content-ID') or '').strip().strip("<>")
                    payload = part.get_payload(decode=True)
                    if cid and payload:
                        # Fall back to the Content-ID so that several unnamed
                        # inline images do not all map to the object "None".
                        image_name = part.get_filename() or cid
                        embedded_attachments[cid] = _store_in_cos(
                            payload, email_user, mail_id, image_name)

            # Regular attachment.
            if is_attachment:
                hasAttachments = True
                filename = part.get_filename()
                if not filename:
                    # No parsed filename: try the raw Content-Disposition header.
                    content_disposition_header = part.get('Content-Disposition')
                    if content_disposition_header:
                        match = re.search(r'filename="(.+?)"', str(content_disposition_header))
                        if match:
                            filename = match.group(1)

                if filename:
                    # Decode RFC 2047 encoded file names.
                    filename = msg_decode(filename)

                if filename:
                    # NOTE(review): filename comes from the message and is used
                    # verbatim in the object key - confirm that is acceptable.
                    payload = part.get_payload(decode=True)
                    if payload:
                        url = _store_in_cos(payload, email_user, mail_id, filename)
                        attachment_list.append(
                            {"name": filename, "url": url, "size": len(payload)})
                        logger.info(f'Attachment saved: {filename}')
                    else:
                        logger.info(f'Skipping attachment {filename} (empty payload)')

            # Inline binary part referenced from the body via cid:.
            if c_type == 'application/octet-stream' and not is_attachment:
                raw_name = part.get_filename()
                if raw_name:
                    content_ids = part.get_all("Content-ID")
                    content_id = str(content_ids[0]).strip().strip("<>") if content_ids else ""
                    url = _store_in_cos(
                        part.get_payload(decode=True), email_user, mail_id,
                        f"embedded-{msg_decode(raw_name)}")
                    # An empty Content-ID would turn the replacement below
                    # into body.replace("cid:", url) and corrupt every cid
                    # reference, so only register real IDs.
                    if content_id:
                        embedded_attachments[content_id] = url
    else:
        # Single-part message: the payload is the body.
        content_type = msg.get_content_type()
        body = _decode_text_part(msg)

    # Point cid: references in the body at the uploaded objects.
    for content_id, url in embedded_attachments.items():
        if url:
            body = body.replace(f"cid:{content_id}", url)

    value['body'] = {
        "contentType": content_type,
        "content": body
    }
    value['hasAttachments'] = hasAttachments
    value['attachments'] = attachment_list
    return value


def deal_user_email_imap(email_type, email_user, email_pass, str_start, str_end):
    """Fetch today's inbox messages over IMAP and upload their attachments.

    Args:
        email_type: Mailbox provider identifier passed to ``login_mailbox``.
        email_user: Login name; also used in the uploaded object path.
        email_pass: Password or app password.
        str_start: Currently unused; the search window is always "today"
            (local time). Kept for interface compatibility.
        str_end: Currently unused; see *str_start*.

    Returns:
        A list of dicts with the keys ``id``, ``sender``, ``toRecipients``,
        ``subject``, ``receivedDateTime`` (UTC ``YYYY-MM-DDTHH:MM:SSZ``, or
        ``None`` when the Date header is missing/unparsable), ``body``
        (``contentType`` / ``content``), ``hasAttachments`` and
        ``attachments`` (``name`` / ``url`` / ``size``). An empty list is
        returned when the inbox cannot be selected or the search fails.

    Raises:
        Whatever ``login_mailbox`` raises on connection or login failure.
    """
    mail = login_mailbox(email_type, email_user, email_pass)
    try:
        status, _ = mail.select('inbox')
        if status != 'OK':
            return []

        today = datetime.datetime.now()
        tomorrow = today + datetime.timedelta(days=1)

        # IMAP SEARCH dates are date-only (DD-Mon-YYYY); a time-of-day is
        # not valid syntax for SINCE/BEFORE.
        # NOTE(review): %b follows the process locale, while IMAP needs the
        # English month abbreviation - confirm the service runs with a C/en locale.
        since_date = f'"{today.strftime("%d-%b-%Y")}"'
        before_date = f'"{tomorrow.strftime("%d-%b-%Y")}"'

        status, messages = mail.search(None, f'SINCE {since_date} BEFORE {before_date}')
        if status != 'OK':
            return []

        # imaplib yields [None] when the server sends no SEARCH response.
        mail_ids = messages[0].split() if messages and messages[0] else []

        result = []
        for raw_id in mail_ids:
            status, msg_data = mail.fetch(raw_id, '(RFC822)')
            # A successful fetch yields a (header, bytes) tuple first;
            # anything else has no message content to parse.
            if status != 'OK' or not msg_data or not isinstance(msg_data[0], tuple):
                continue
            msg = email.message_from_bytes(msg_data[0][1])
            result.append(_build_mail_record(msg, raw_id.decode(), email_user))
        return result
    finally:
        # Always release the connection, including on early return or error.
        try:
            mail.logout()
        except (imaplib.IMAP4.error, OSError) as exc:
            logger.warning("IMAP logout failed: %s", exc)

def msg_decode(msg):
    """Decode an RFC 2047 encoded header value into a plain string.

    All chunks returned by ``decode_header`` are decoded and concatenated,
    so values mixing plain and encoded text (e.g. ``Re: =?utf-8?B?...?=``)
    are returned in full rather than truncated to the first chunk.

    Args:
        msg: The raw header value.

    Returns:
        The decoded string, or *msg* unchanged if decoding fails (the error
        is logged).
    """
    try:
        pieces = []
        for chunk, encoding in decode_header(msg):
            if isinstance(chunk, bytes):
                chunk = chunk.decode(encoding if encoding else 'utf-8')
            pieces.append(chunk)
        return ''.join(pieces)
    except Exception as e:
        logger.error(f"msg_decode error: {str(e)}")
        return msg