参考文章链接在这儿,感谢(没有列出的我会仔细找一下列出来)
http://python3-cookbook.readthedocs.io/zh_CN/latest/c13/p07_copy_move_files_and_directories.html
这是我的GitHub地址:https://github.com/NieShengyuan/pythonSpider/tree/master/PDF_Download
这是文档下载程序
包括:URL解析,requests方法,文档命名保存
# _*_ coding:utf-8_*_
# author :nsy12
# date :2018/2/25
# time :11:20
import requests
import re, os
from bs4 import BeautifulSoup
import time
import random
# NOTE: a module-level ``global`` statement is a no-op in Python, so the
# original ``global NUM`` line here was removed; ``global`` only has an
# effect inside a function body.

# Listing pages to scrape.  The addresses were redacted before the source
# was published — fill in real URLs before running.
url_datas = [
    'https://**.cn',
    'https://.cn',
    'https://.cn',
    'https://.cn',
    'https://.cn'
]
def download(url, pdf_name, dir):
    '''
    Download *url* into *dir*/*pdf_name* unless the file already exists.

    Relies on the module-level ``files`` list (populated in ``__main__``
    from the target directory) for the existence check, so that list must
    be built before this function is called.

    :param url: direct URL of the document to fetch
    :param pdf_name: file name to save the document under
    :param dir: destination directory (created on demand)
    :return: None
    '''
    # Skip anything we already have on disk.
    if pdf_name in files:
        print(pdf_name + ' exist')
        return
    # stream=True defers the body download; iter_content() with no
    # chunk_size yields ONE BYTE per iteration, which is pathologically
    # slow — pass an explicit chunk size.  Close the response when done
    # so the connection is returned to the pool.
    response = requests.get(url, stream=True)
    try:
        if not os.path.exists(dir):
            os.makedirs(dir)
        with open(os.path.join(dir, pdf_name), "wb") as pdf_file:
            for content in response.iter_content(chunk_size=8192):
                pdf_file.write(content)
    finally:
        response.close()
    print(" " + pdf_name + " has been downloaded!!")
def get_urls(url, dir):
    '''
    Parse the listing page at *url* and download every linked document
    into *dir*.

    Uses the module-level counter ``NUM`` for progress numbering and the
    sibling ``download()`` helper for the actual transfer.

    :param url: URL of the listing page to parse
    :param dir: directory the documents are saved into
    :return: None
    '''
    print("Please wait for second ...")
    global NUM
    html = requests.get(url)
    # NOTE(review): the original selector was
    # soup.find('div', class_='cvideotitle'); the bare find('div') only
    # inspects the first <div> on the page — adjust per target site.
    soup = BeautifulSoup(html.text, 'lxml')
    all_a = soup.find('div').find_all('a')
    for a in all_a:
        title = a.get_text()
        url_pdf = a['href']
        # Strip site-specific boilerplate surrounding the title; these
        # slice offsets are tied to the scraped page's markup.
        name = title[19:-18]
        print(str(NUM) + ' 开始保存:' + name)
        NUM = NUM + 1
        download(url_pdf, str(name), dir)
if __name__ == "__main__":
    # NOTE: the original ``global NUM`` here was removed — at module level
    # a ``global`` statement has no effect.
    FILE_DIR = r'E:\\1smart Car\paper\Twelfth'
    # Snapshot of what is already on disk; read as a module-level global
    # by download() to skip existing files.
    files = os.listdir(FILE_DIR)
    NUM = 1  # progress counter consumed by get_urls()
    for url_data in url_datas:
        get_urls(url_data, FILE_DIR)
        print("finsh" + url_data)
    print(" finsh download")
下面是文档特定关键词筛选,并复制进特定地址
包括:文件夹下所有文件名获取,对特定关键词筛选,复制进特定路径的方法
# _*_ coding:utf-8_*_
# author :nsy12
# date :2018/2/28
# time :16:25
# 筛选关键字,并将文件分类保存
import os
import shutil
# NOTE: the original ``global NUM, RESULT`` statement was removed — at
# module level a ``global`` statement is a no-op; the declaration only
# matters inside function bodies.
def name_analyze(sub, dir):
    '''
    Check whether *sub* contains one of the target keywords and, if it
    does, copy the corresponding file into *dir* (via fileClassify) and
    bump the module-level ``RESULT`` counter.

    :param sub: file name to screen
    :param dir: destination directory for matching files
    :return: None
    '''
    global RESULT
    # Membership tests replace the original str.find() logic, which had
    # a real bug: find() returns 0 when the keyword sits at position 0,
    # and the old "-1 -> 0" remapping then treated that hit as a miss.
    if '双车' in sub or '追逐' in sub:
        fileClassify(sub, dir)
        RESULT += 1
        print(' exist' + str(sub))
def fileClassify(fileName, dir):
    '''
    Copy *fileName* from the module-level ``DIR_PATH`` directory into
    *dir*, skipping the copy when a file of the same name is already
    present in the destination.

    :param fileName: name of the file to copy (relative to ``DIR_PATH``)
    :param dir: destination directory (created on demand)
    :return: None
    '''
    if not os.path.exists(dir):
        os.makedirs(dir)
    # De-duplicate against what is already in the destination.
    filesDirs = os.listdir(dir)
    if fileName in filesDirs:
        print(fileName + ' exist')
    else:
        # os.path.join is separator-safe, unlike the original manual
        # '/'-concatenation of a Windows path.
        shutil.copy(os.path.join(DIR_PATH, str(fileName)), dir)
        print(" " + fileName + " has been moved!!")
if __name__ == "__main__":
    # Counters start at 0 so the totals printed below are exact; the
    # original seeded both with 1, over-reporting each total by one.
    # (The original module-level ``global NUM, RESULT`` was also removed
    # — ``global`` has no effect outside a function body.)
    NUM = 0      # number of files examined
    RESULT = 0   # number of keyword matches (incremented in name_analyze)
    # Source directory to screen; read by fileClassify() as a global.
    DIR_PATH = 'E:\\1smart Car\paper\Twelfth'
    # Destination for matching files.
    NEW_DIR = 'E:\\1smart Car\paper\sc'
    files = os.listdir(DIR_PATH)
    for file in files:
        name_analyze(file, NEW_DIR)
        NUM += 1
    print(NUM)
    print("共计" + str(RESULT) + "个结果")