pyautogui_pdf批量转换为TXT

本文介绍使用PyAutoGUI实现PDF文件批量转换为TXT文本的方法。代码详细展示了从打开PDF阅读器到批量转换的具体步骤,包括文件路径设置、热键操作及等待时间调整等内容。

 

pyautogui_pdf批量转换为TXT

使用PDF阅读器自带的功能进行无损转换

 

# -*- coding: utf-8 -*-
"""
Created on Thu May  5 15:39:54 2016

一定要有time.sleep(1)时间控制,否则出错
pdf另存文本,效果可能很差
typewrite("content") 用于输入文字
typewrite(["right","left","up"]) 用于输入连续键盘按钮
@author: daxiong
"""

import pyautogui,time,os

# Folder that holds the PDFs to convert; the same path is typed into the
# save dialog further down as the output folder.
dir_file="C:/Users/daxiong/Desktop/test"
# Collect the names of the PDF files in that folder. Anything that is not a
# PDF (for example TXT output left over from an earlier run) is skipped,
# because every listed name is opened in the PDF reader.
fileNames=[name for name in os.listdir(dir_file)
           if os.path.splitext(name)[1].lower()==".pdf"]


# Launch the PDF reader by double-clicking its icon; (50,50) is the screen
# position of that icon.
pyautogui.doubleClick(50, 50)
time.sleep(1)

# Key sequences for the "save as" dialog, built by repetition instead of
# being spelled out by hand:
#   20 x "down" followed by "enter" walks the format list and confirms,
#   10 x "tab" followed by "enter" reaches the save button and presses it.
_format_keys = ["down"] * 20 + ["enter"]
_save_keys = ["tab"] * 10 + ["enter"]

for fileName in fileNames:
    # Bring up the open-file dialog with Ctrl+O.
    pyautogui.hotkey("ctrl", "o")
    time.sleep(1)
    # Type the PDF file name and confirm to open it.
    pyautogui.typewrite(fileName)
    time.sleep(1)
    pyautogui.press("enter")
    time.sleep(1)

    # Save as plain text: Shift+Ctrl+S opens the "save as" dialog.
    pyautogui.hotkey("shift", "ctrl", "s")
    time.sleep(1)
    # Move focus to the save-format selector below the file name.
    pyautogui.press("tab")
    time.sleep(1)
    pyautogui.press("down")
    time.sleep(1)
    pyautogui.typewrite(_format_keys)

    # Choose the output folder.
    pyautogui.press('f4')  # focus the address bar
    time.sleep(1)
    pyautogui.hotkey("ctrl", "a")  # select the current address
    time.sleep(1)
    pyautogui.press('delete')  # remove the old address
    time.sleep(1)
    pyautogui.typewrite(dir_file)
    time.sleep(1)
    # Ten tabs move focus to the save button; "enter" presses it.
    pyautogui.typewrite(_save_keys)

    # Give the save a moment, then close the PDF with Ctrl+W.
    time.sleep(2)
    pyautogui.hotkey("ctrl", "w")

# All files handled: send Ctrl+Q to the reader.
pyautogui.hotkey("ctrl", "q")

  

 

办公室电脑测试代码

pdf_to_txt

# -*- coding: utf-8 -*-
"""
Created on Thu May 12 11:22:57 2016
pdf更换为最新版本,尝试提高转换成功率。
txt必须转换为纯文本格式
等待时间必须和pdf页码数匹配
@author: Administrator
"""

import pyautogui,time,os,PyPDF2

# Folder expected to hold the PDFs. It is not used by the listing below,
# which reads the current working directory instead.
dir_file="C:/Users/Administrator/Desktop/test/pdf/"
# Every entry of the current working directory ...
fileNames = os.listdir('.')
# ... and the subset whose extension is exactly ".pdf".
pdf_fileNames = list(
    filter(lambda entry: os.path.splitext(entry)[1] == ".pdf", fileNames)
)
 
  


def Get_time(filename):
    """Return the number of seconds to wait for *filename* to be converted.

    The wait is derived from the page count read with PyPDF2:
    ``pages + 2`` seconds for PDFs of up to 10 pages and 15 seconds for
    longer ones. If the file cannot be opened or parsed, a message is
    printed and a default of 10 seconds is returned.
    """
    try:
        # The context manager closes the file even when parsing fails.
        with open(filename,'rb') as pdfFileObj:
            pdfReader=PyPDF2.PdfFileReader(pdfFileObj)
            # Page count. The original author notes that reading some PDFs
            # with pdfReader fails, hence the fallback below.
            pages=pdfReader.numPages
    except Exception:
        # Best effort: a PDF that cannot be read must not abort the batch.
        # Unlike a bare ``except``, this lets Ctrl+C / SystemExit through.
        print ("wrong when read pdf:",filename)
        return 10

    if pages<=10:
        return pages+2
    return 15
        


# Launch the PDF reader by double-clicking its icon; (50,50) is the screen
# position of that icon.
pyautogui.doubleClick(50,50)
time.sleep(3)

# 18 x "down" walks the format list towards the plain-text entry (the
# original author marks this count as not accurate); the two "enter"
# presses that follow confirm the dialog.
_format_keys=["down"]*18+["enter","enter"]

# Iterate over the PDF names only: the unfiltered directory listing may
# also contain files that are not PDFs.
for fileName in pdf_fileNames:
    # Bring up the open-file dialog with Ctrl+O.
    pyautogui.hotkey("ctrl","o")
    time.sleep(1)
    # Type the PDF file name and confirm to open it.
    pyautogui.typewrite(fileName)
    time.sleep(2)
    pyautogui.press("enter")
    time.sleep(1)
    # Save as plain text: Shift+Ctrl+S opens the "save as" dialog.
    pyautogui.hotkey("shift","ctrl","s")
    time.sleep(1)
    pyautogui.press("tab") # move focus to the save-format selector below
    time.sleep(1)
    pyautogui.press("down")

    time.sleep(1)
    pyautogui.typewrite(_format_keys)

    # The wait has to match the page count of the PDF being converted.
    sleepTime=Get_time(fileName)
    # Wait for the conversion, then close the PDF with Ctrl+W.
    time.sleep(sleepTime)
    pyautogui.hotkey("ctrl","w")

# All files handled: send Ctrl+Q to the reader.
pyautogui.hotkey("ctrl","q")

  

txt 文件包提取到excel

# -*- coding: utf-8 -*-
"""
Created on Thu May 12 14:05:06 2016
1.先用filenameToExcel.exe程序导入文件名
2.B11写入me_txt
3.批量写入内容

list不能写入cell,str才可以.txt必须是纯文本格式
@author: Administrator
"""

import PyPDF2,os,openpyxl,sys,time,xlrd
from openpyxl.cell import get_column_letter,column_index_from_string

# Start time of the run. time.clock() was removed in Python 3.8;
# time.perf_counter() is the documented replacement.
timeBegin=time.perf_counter()
excelFileName="test.xlsx"
wb=openpyxl.load_workbook(excelFileName)
sheet=wb.active
columnIndex="A"
start=1
expandName=".txt"
expandName_upper=expandName.upper()

# The same workbook is also opened with xlrd, for reading column values.
excelFile = xlrd.open_workbook(excelFileName)
table = excelFile.sheet_by_index(0) # first sheet, looked up by index
# sheet.columns is a generator in current openpyxl releases and cannot be
# indexed directly, so materialise it once and pick the columns from that.
_sheet_columns=tuple(sheet.columns)
# Cells of column A
cells_columnA=_sheet_columns[0]
# Cells of column B
cells_columnB=_sheet_columns[1]

#content="你好"
    
def Get_col_values(i):
    """Return the values of column *i* of ``table`` without the first row."""
    return table.col_values(i)[1:]
    
    


def single_txt_extract(filename,i):
    """Copy the text of *filename* into the column-B cell at index ``i+1``.

    The file must be plain text. Its whole content is read as one string,
    because a cell accepts a str but not a list. If the file cannot be
    opened or read, a message is printed and the sheet is left untouched.
    """
    try:
        # The context manager closes the file even when read() fails.
        with open(filename) as txtFileObj:
            content=txtFileObj.read()
    except (OSError, ValueError):
        # ValueError covers decoding errors (UnicodeDecodeError).
        print ("wrong when read txt:",filename)
        # Nothing was read, so there is nothing to write into the cell.
        return

    cells_columnB[i+1].value=content
    

# Values of the first column (index 0) of the xlrd sheet, minus the first
# row. The result is not used by the statements below.
list_pdf_fileNames=Get_col_values(0)

# Copy the text of one hard-coded file into the column-B cell at index 1.
single_txt_extract("1151.txt",0)

# Write the modified workbook back to the same file.
wb.save(excelFileName)

  

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值