# 添词、计数
import os
import re
import time
import math
import openpyxl
import fitz
import jieba
import jieba.analyse
from collections import Counter
'''
使用Python操作PDF:常用PDF库总结 - 知乎
https://zhuanlan.zhihu.com/p/352722932
'''
def pdr_reader(file):
    """Return the plain text of every page of a PDF, joined in page order.

    Args:
        file: Path of the PDF to open with PyMuPDF (``fitz``).

    Returns:
        One string holding the text of all pages; ``''`` when the document
        has no pages.
    """
    # The context manager closes the document even if text extraction raises,
    # so the underlying file handle is always released.
    with fitz.open(file) as doc:
        # Join once instead of growing the string page by page.
        return ''.join(page.get_text() for page in doc)
def keyword_count(content, words):
    """Count how often each keyword occurs among the jieba tokens of a text.

    Args:
        content: Text to segment with ``jieba.lcut``.
        words: Iterable of keyword strings to look for.

    Returns:
        A dict mapping every keyword that occurs at least once to its number
        of occurrences; keywords that never occur are absent.
    """
    # Build the lookup set once: testing membership in a list costs
    # O(len(words)) for every single token.
    wanted = set(words)
    matches = (token for token in jieba.lcut(content) if token in wanted)
    # Convert back to a plain dict so callers get the same type as before.
    return dict(Counter(matches))
def count_sum(counts):
    """Summarise a keyword-frequency mapping.

    Args:
        counts: Mapping from keyword to its occurrence count.

    Returns:
        A two-item list ``[distinct, total]``: the number of keys in
        ``counts`` and the sum of all their counts.
    """
    distinct = len(counts)
    total = sum(counts.values())
    return [distinct, total]
# Load the keyword list. Every keyword in the file is expected to be followed
# by the delimiter "、".
file=r"E:\Alark\Desktop\数字化关键词.txt"
requests=re.compile(r'\w*[\u4e00-\u9fa5]*、',re.S)
# `with` guarantees the handle is closed even if reading or decoding fails.
with open(file,'r',encoding='utf-8') as f:
    text = f.read()
# Each match ends with the delimiter; strip it to keep only the term itself.
# NOTE(review): a final term that is not followed by "、" is not matched by
# the pattern and is therefore left out - confirm the file ends with "、".
words=[p.group(0).replace("、",'') for p in requests.finditer(text)]
# Register every keyword in jieba's dictionary so that segmentation can emit
# it as a single token.
for word in words:
    jieba.add_word(word)
#########
# Load the results workbook and collect the (code, year) pairs it already
# holds, so that reports which are already recorded can be skipped later.
mywb = openpyxl.load_workbook(r'G:\词频统计.xlsx')
mysheet = mywb.active
# `row` is the 1-based index of the last row whose first cell is filled; new
# results are written below it. Tracking the real sheet index (instead of
# counting filled rows) keeps later writes from landing on existing data when
# the sheet contains blank rows.
row=0
# NOTE: the name `list` shadows the builtin; it is kept because the code
# further down refers to it by this name.
list=[]
# The former `mysheet['A'+str(row+1)]==None` early exit compared a Cell object
# with None, which is never true, so it has been dropped.
for index, r in enumerate(mysheet.rows, start=1):
    if r[0].value is not None:
        row = index
        list.append((r[0].value,r[1].value))
# Walk the annual-report folders year by year, count the keywords in every PDF
# that is not yet recorded in the workbook, and append one result row per
# report: code, year, distinct keywords, total hits, ln(distinct + 1),
# ln(total + 1), folder flag.
report_root = r'G:\年报'  # was named `input`, which shadowed the builtin
output_path = r'G:\词频统计.xlsx'
order = 0  # running number of PDFs seen; used only for progress output
try:
    for year in range(2015, 2021):
        # The last column is 1 for reports in "制造" and 0 for those in "其他".
        for category, flag in (("制造", 1), ("其他", 0)):
            # Full paths are used instead of os.chdir so the process working
            # directory is left untouched.
            folder = os.path.join(report_root, str(year), category)
            for file in os.listdir(folder):
                # Accept ".PDF" as well as ".pdf".
                if os.path.splitext(file)[1].lower() != '.pdf':
                    continue
                print(f'{order + 1}、{file}:processing......')
                t1 = time.monotonic()
                order += 1
                # The first six characters of the file name are used as the
                # report's code (presumably the stock code).
                code = os.path.splitext(file)[0][:6]
                if (code, year) in list:
                    print(f'{order}、{file}: is already there.')
                    continue
                content = pdr_reader(os.path.join(folder, file))
                # keyword_count requires the keyword list as second argument;
                # calling it with the text alone raises TypeError.
                counts = keyword_count(content, words)
                count1, count2 = count_sum(counts)
                values = [
                    code,
                    year,
                    count1,
                    count2,
                    math.log(count1 + 1),  # natural log; +1 keeps zero counts defined
                    math.log(count2 + 1),
                    flag,
                ]
                row += 1
                list.append((code, year))
                for column, value in enumerate(values, start=1):
                    mysheet.cell(row=row, column=column, value=value)
                t2 = time.monotonic()
                print(f'{order}、{file}:done.', f'{t2 - t1}s')
            # Persist after every folder so a later failure loses little work.
            mywb.save(output_path)
finally:
    # Also save when a report fails part-way through a folder, so rows that
    # were already computed are not lost.
    mywb.save(output_path)
# 特定章节选取
# 确定成功提取内容是关键
# 思路:返回最大中文片段==标题