根据 URL 链接下载 Word 文档（doc、docx）、Excel（xlsx、xls）、PDF、TXT 文件，并解析提取其中的内容（段落、表格）

原创已于 2023-05-25 11:05:09 修改 · 1.2k 阅读

2 ·

CC 4.0 BY-SA版权

文章标签：

#word #excel #pdf #python #txt

于 2023-03-23 17:32:06 首次发布

爬虫同时被 2 个专栏收录

18 篇文章

订阅专栏

python基础

9 篇文章

订阅专栏

主函数的逻辑代码如下

def main(link):
    if not os.path.isdir('./yuxi'):
        os.mkdir('./yuxi')
    files_path = './yuxi'
    rename_title = hashlib.md5(title.encode('utf-8'))
    title_md5 = rename_title.hexdigest()
    the_path = os.path.join(files_path, title_md5)
    if 'zip' in link or 'rar' in link:
        with open('yuxi.txt', 'a', encoding='utf-8') as file_txt:
            file_txt.write(url)
        continue
    if '.docx' in link:
        the_path = f'{the_path}.docx'
    elif '.doc' in link:
        the_path = f'{the_path}.doc'
    elif 'xlsx' in link:
        the_path = f'{the_path}.xlsx'
    elif 'xls' in link:
        the_path = f'{the_path}.xls'
    elif 'txt' in link:
        the_path = f'{the_path}.txt'
    else:
        the_path = f'{the_path}.pdf'
    state = download_file(link, the_path)
    if state:
        if '.docx' in link:
            new_content = convert_doc_to_txt(files_path, the_path, 'docx')
        elif '.doc' in link:
            new_content = convert_doc_to_txt(files_path, the_path, 'doc')
        elif 'xlsx' in link:
            new_content = convert_xlsx_to_txt(the_path)
        elif 'xls' in link:
            new_content = read_xls(the_path)
        elif 'txt' in link:
            new_content = read_txt(the_path)
        else:
            new_content = convert_pdf_to_txt(the_path)
    else:
        new_content = None
    if new_content:
        new_content = new_content.strip()
    if not new_content:
        new_content = None
    if not re.search(r'[\u4e00-\u9fa5]', new_content):
        new_content = None


# Script entry point: run main() once when the file is executed directly.
if __name__ == '__main__':
    # NOTE(review): 'xxxxxxxxxx' is a placeholder; substitute a real document link.
    url = 'xxxxxxxxxx'
    main(url)

根据url下载文件的函数如下

def download_file(link, the_path):
    """Download *link* to *the_path*, streaming the body in 1 KiB chunks.

    NOTE(review): `req` is presumably the `requests` package imported under
    an alias; confirm against the file's imports.

    Args:
        link: URL to fetch.
        the_path: Destination file path; opened in binary write mode.

    Returns:
        True when the body was written completely, False when the request
        failed (connection error, timeout, or an HTTP error status).
    """
    chunk_size = 1024
    try:
        # The context manager releases the connection even if writing fails.
        with req.get(link, stream=True, timeout=500) as response:
            # Treat 4xx/5xx as failures so an error page is never saved and
            # parsed as if it were the requested document.
            response.raise_for_status()
            with open(the_path, "wb") as file:
                for data in response.iter_content(chunk_size=chunk_size):
                    file.write(data)
        return True
    except req.RequestException:
        print('下载文件失败')
        return False

解析 txt 文件，返回其文本内容

def read_txt(the_path):
    """Read a text file and return its contents as a decoded string.

    The original called ``str()`` on the raw bytes, which yields the
    ``b'...'`` repr with every non-ASCII byte escaped instead of the text.
    The bytes are now decoded: UTF-8 (with optional BOM) is tried first,
    then GB18030, and as a last resort UTF-8 with undecodable bytes
    replaced.

    Args:
        the_path: Path of the text file.

    Returns:
        The decoded file contents.
    """
    with open(the_path, "rb") as file:
        raw = file.read()
    for encoding in ("utf-8-sig", "gb18030"):
        try:
            return raw.decode(encoding)
        except UnicodeDecodeError:
            continue
    return raw.decode("utf-8", errors="replace")

解析word文档，返回内容

def convert_doc_to_txt(p_path, pat, the_type):
    """Extract the text of a Word document, then delete the file(s).

    A ``.doc`` file is first converted to ``.docx`` with headless
    LibreOffice; the resulting ``.docx`` is then read paragraph by paragraph
    and table cell by table cell.

    Args:
        p_path: Directory that receives the converted ``.docx``.
        pat: Path of the downloaded Word file.
        the_type: ``'doc'`` to convert before reading; any other value reads
            *pat* directly as ``.docx``.

    Returns:
        The concatenated paragraph and table-cell text. Empty (or partial)
        when the document cannot be read; the read error is printed.

    Raises:
        subprocess.CalledProcessError: If the LibreOffice conversion fails.
    """
    source_path = f"{pat}"
    converted_path = None
    parts = []
    try:
        if the_type == 'doc':
            # Convert the .doc file to .docx.
            subprocess.run(
                ["/bin/libreoffice", "--headless", "--convert-to", "docx", "--outdir", p_path, pat],
                shell=False,
                check=True,
                encoding="utf-8",
            )
            # LibreOffice writes <outdir>/<input stem>.docx.
            stem = os.path.splitext(os.path.basename(source_path))[0]
            converted_path = os.path.join(p_path, f"{stem}.docx")
        docx_path = converted_path or source_path
        try:
            file = docx.Document(docx_path)
            # Paragraphs
            parts.extend(paragraph.text for paragraph in file.paragraphs)
            # Tables
            for table in file.tables:
                for row in table.rows:
                    parts.extend(cell.text for cell in row.cells)
        except Exception as exception:
            # Best effort: an unreadable document yields whatever was read.
            print(f"ERROR !!! {exception}")
    finally:
        # Delete the download and the converted copy, even after an error.
        for leftover in (source_path, converted_path):
            if leftover is None:
                continue
            try:
                os.remove(leftover)
            except FileNotFoundError:
                pass
    return "".join(parts)

解析excel，返回excel内容

def convert_xlsx_to_txt(file_name):
    """Return the text of every non-empty cell in the first worksheet.

    Cell values are converted with ``str()`` and concatenated row by row
    with no separator.

    Args:
        file_name: Path of the ``.xlsx`` workbook.

    Returns:
        The concatenated cell text of the first sheet.
    """
    wb = load_workbook(file_name)
    # `sheetnames` / `wb[name]` replace the deprecated get_sheet_names() and
    # get_sheet_by_name() accessors.
    first_sheet_name = wb.sheetnames[0]
    ws = wb[first_sheet_name]
    parts = []
    for row in ws.rows:
        for cell in row:
            # Skip only truly empty cells; a plain truthiness test would
            # also drop legitimate values such as 0.
            if cell.value is not None:
                parts.append(str(cell.value))
    return ''.join(parts)


def read_xls(excel_name):
    """Return the text of every non-empty cell in the first sheet of an .xls file.

    Cell values are converted with ``str()`` and concatenated row by row
    with no separator.

    Args:
        excel_name: Path of the ``.xls`` workbook.

    Returns:
        The concatenated cell text of the first sheet.
    """
    workbook = open_workbook(excel_name)
    first_sheet_name = workbook.sheet_names()[0]
    sheet = workbook.sheet_by_name(first_sheet_name)

    parts = []
    for row in sheet.get_rows():
        for cell in row:
            value = cell.value
            # Skip empty cells only; a plain truthiness test would also
            # drop legitimate values such as 0.
            if value is None or value == '':
                continue
            parts.append(str(value))
    return ''.join(parts)

解析pdf，返回pdf内容

def convert_pdf_to_txt(pdf_path):
    """Extract the text of a PDF, page by page.

    The last line of each page's text is discarded and the pages are
    concatenated with no separator.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        The concatenated page text. Partial when extraction raises midway;
        the error is printed.
    """
    pieces = []
    # The context manager closes the PDF; no explicit close() is needed.
    with pdfplumber.open(pdf_path) as pdf:
        try:
            for page in pdf.pages:
                # A page without text may yield None; treat it as empty
                # rather than aborting the remaining pages.
                page_text = page.extract_text() or ''
                # NOTE(review): dropping the last line presumably removes a
                # page-number footer - confirm this is intended.
                pieces.append('\n'.join(page_text.split('\n')[:-1]))
        except Exception as exception:
            print(f"ERROR !!! {exception}")
    return ''.join(pieces)