1、getDatas.py:获取网页数据,并创建输出文件夹
#coding:utf-8
import urllib.request
from bs4 import BeautifulSoup
import os
class GetDatas:
    """Scrape the Douban Top 250 movie page.

    Provides the page URL, an output-folder name shared by the saver
    scripts, a scraper returning one dict per movie, and a helper that
    creates the output folder.
    """

    def __init__(self):
        # Real page URL; the original held an unusable placeholder
        # ('https:~~~网址~~~') while the rest of this project scrapes
        # https://movie.douban.com/top250 .
        self.url = 'https://movie.douban.com/top250'
        # Directory the downstream savers write into.
        self.folder_name = 'output'

    # Scrape the page.
    def datas(self):
        """Return a list of dicts with keys 'rank', 'title', 'picUrl'."""
        # Close the HTTP response deterministically (the original never
        # closed it).
        with urllib.request.urlopen(self.url) as res:
            soup = BeautifulSoup(res, 'html.parser')
        movies = []
        # Each movie sits in a <div class="item">.
        for item in soup.find_all('div', {'class': 'item'}):
            pic = item.find('div', {'class': 'pic'})
            hd = item.find('div', {'class': 'info'}).find('div', {'class': 'hd'})
            movies.append({
                'rank': pic.find('em').get_text(),
                'title': hd.find('a').find('span', {'class': 'title'}).get_text(),
                'picUrl': pic.find('a').find('img').get('src'),
            })
        return movies

    # Create the output folder. (Misspelled name kept so existing callers
    # such as the __main__ guard below keep working.)
    def mikDir(self):
        """Create self.folder_name if it does not already exist."""
        # exist_ok avoids the check-then-create race of the original
        # os.path.exists + os.mkdir pair.
        os.makedirs(self.folder_name, exist_ok=True)
# Script entry point: make sure the output folder exists.
if __name__ == '__main__':
    scraper = GetDatas()
    scraper.mikDir()
2、处理数据
(1)保存为txt格式
from getDatas import GetDatas
import time
# 保存数据
def saveData():
    """Scrape the movie list and append it to a dated .txt file.

    Writes rank, title and picture URL for every movie into
    output/doubanMovie<YYYY-MM-DD>.txt; I/O errors are reported and
    swallowed (best-effort, as in the original).
    """
    getDatas = GetDatas()
    # One file per day, e.g. doubanMovie2024-01-01.txt
    current_time = time.strftime('%Y-%m-%d', time.localtime())
    file_name = 'doubanMovie' + current_time + '.txt'
    file_path = getDatas.folder_name + '/' + file_name
    movies = getDatas.datas()
    try:
        # Open once for the whole list instead of re-opening per record.
        # The original also called fp.close() in a finally clause, which
        # raised NameError whenever open() itself failed and was redundant
        # otherwise — the with-statement already closes the file.
        with open(file_path, 'a', encoding='utf-8') as fp:
            for item in movies:
                fp.write('排名' + item['rank'] + '\n')
                fp.write('标题' + item['title'] + '\n')
                fp.write('图片路径' + item['picUrl'] + '\n\n')
    except IOError as err:
        print("error:" + str(err))


if __name__ == '__main__':
    saveData()
(2)保存为json格式
from getDatas import GetDatas
import json
import codecs
import time
# 保存数据
def saveData():
    """Scrape the movie list and append it to a dated .json file.

    Each movie dict is written as one JSON object per line (followed by
    ',\\n', matching the original output format) into
    output/doubanMovie<YYYY-MM-DD>.json.
    """
    getDatas = GetDatas()
    # One file per day, e.g. doubanMovie2024-01-01.json
    current_time = time.strftime('%Y-%m-%d', time.localtime())
    file_name = 'doubanMovie' + current_time + '.json'
    file_path = getDatas.folder_name + '/' + file_name
    movies = getDatas.datas()
    try:
        # Open once for the whole list instead of re-opening per record.
        # The original's finally: fp.close() raised NameError when open()
        # failed and was redundant after the with-statement.
        with codecs.open(file_path, 'a', encoding='utf-8') as fp:
            for item in movies:
                # ensure_ascii=False keeps Chinese titles human-readable.
                fp.write(json.dumps(item, ensure_ascii=False) + ',\n')
    except IOError as err:
        print("error:" + str(err))


if __name__ == '__main__':
    saveData()
(3)保存数据为xls文件
from getDatas import GetDatas
# 创建工作簿
from xlutils.copy import copy
import xlwt
import xlrd
import time
# 保存数据
def saveData():
    """Scrape the movie list and write it to a dated .xls workbook.

    Produces output/doubanMovie<YYYY-MM-DD>.xls with a header row followed
    by one row (rank, title, picture URL) per movie.
    """
    getDatas = GetDatas()
    # One file per day, e.g. doubanMovie2024-01-01.xls
    current_time = time.strftime('%Y-%m-%d', time.localtime())
    file_name = 'doubanMovie' + current_time + '.xls'
    file_path = getDatas.folder_name + '/' + file_name
    workbook = xlwt.Workbook(encoding='utf-8')
    sheet = workbook.add_sheet(u'豆瓣电影top250')
    # Header row.
    head = ['排行', '电影标题', '图片路径']
    for col, caption in enumerate(head):
        sheet.write(0, col, caption)
    # The original saved the workbook, then re-opened it with xlrd and
    # copied it with xlutils for EVERY data row — O(n^2) file I/O. Writing
    # all rows into the in-memory sheet and saving once yields the same
    # spreadsheet.
    for row, data in enumerate(getDatas.datas(), start=1):
        line = [data['rank'], data['title'], data['picUrl']]
        for col, value in enumerate(line):
            sheet.write(row, col, value)
    workbook.save(file_path)


if __name__ == '__main__':
    saveData()
(4)将数据保存到 MySQL 数据库(需提前建立好数据库及数据表)
import urllib.request
#
from bs4 import BeautifulSoup
import pymysql
import os
import time
# 找到网址
def getDatas():
    """Scrape the Douban Top 250 page and insert each movie into MySQL.

    Expects a local `douban` database with a `doubaninfo` table already
    created (id, rank, title, picUrl). One row is inserted and committed
    per movie, matching the original's per-row commit behavior.
    """
    url = "https://movie.douban.com/top250"
    # Close the HTTP response deterministically.
    with urllib.request.urlopen(url) as res:
        response = BeautifulSoup(res, 'html.parser')
    datas = response.find_all('div', {'class': 'item'})
    con = pymysql.connect(host='localhost', user='root', passwd='1',
                          db='douban', port=3306, charset='utf8')
    if con:
        print('链接成功')
    try:
        # One cursor for all rows (the original created a new cursor per
        # movie and never closed the connection on error).
        cur = con.cursor()
        # Parameterized query — values are never interpolated into SQL.
        sql = 'insert into doubaninfo values(null,%s,%s,%s)'
        for item in datas:
            pic = item.find('div', {'class': 'pic'})
            hd = item.find('div', {'class': 'info'}).find('div', {'class': 'hd'})
            rank = pic.find('em').get_text()
            title = hd.find('a').find('span', {'class': 'title'}).get_text()
            pic_url = pic.find('a').find('img').get('src')
            cur.execute(sql, (rank, title, pic_url))
            con.commit()
        cur.close()
    finally:
        con.close()


if __name__ == '__main__':
    # Guarded so importing this module no longer triggers network access
    # and database writes as a side effect.
    getDatas()
(5)保存网页中的图片
from getDatas import GetDatas
import urllib.request
import os
# 找到网址
# 保存数据
def saveData():
    """Download every movie poster into an images/ directory.

    Filenames are taken from the last path segment of each picture URL.
    Failed downloads are reported and skipped (best-effort, as in the
    original).
    """
    getDatas = GetDatas()
    folder_name = 'images'
    # exist_ok avoids the check-then-create race of the original
    # os.path.exists + os.mkdir pair.
    os.makedirs(folder_name, exist_ok=True)
    for item in getDatas.datas():
        try:
            # Keep the server's filename: the last URL path segment.
            image_name = item['picUrl'].split('/')[-1]
            urllib.request.urlretrieve(item['picUrl'], folder_name + '/%s' % image_name)
        except IOError as err:
            # Best-effort: one bad poster should not abort the rest.
            print("error:" + str(err))


if __name__ == '__main__':
    saveData()