#!/usr/bin/env python3
#-*- coding:utf-8-*-
import os
import sys
import logging
from urllib import request
from bs4 import BeautifulSoup as bs
import pandas as pd
import re
from urllib.parse import urljoin
import codecs
import matplotlib.pyplot as plt
from matplotlib import font_manager
import matplotlib
import numpy as np
import logging
import time
import threading
# Configure the root logger once at import time: DEBUG level, with every
# record prefixed by timestamp, source file, line number and severity.
logging.basicConfig(
    format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s',
    level=logging.DEBUG,
)
logging.info('start...')

# Module-level accumulators filled by the scraping script further down:
#   BookData    - `title` attribute values taken from <img> tags
#   UserfulData - first text line of each <div class="action"> element
#   UrlData     - URLs of the pages to scrape
BookData, UserfulData, UrlData = [], [], []
def Timer(interval=1):
    """Print a growing progress bar until the module-level ``flag`` is truthy.

    Each tick lengthens the bar by one ``=``, rewrites the current terminal
    line (the output ends with a carriage return, not a newline), sleeps for
    *interval* seconds and then checks ``flag``.  Intended to run in a
    background thread while the main script does its work.

    Args:
        interval: Seconds to sleep between ticks.  The number printed after
            ``wait`` is the tick count, so it equals the elapsed seconds only
            for the default value of 1.

    Returns:
        None, once ``flag`` has been set to a truthy value.
    """
    ticks = 0
    while True:
        ticks += 1
        print("=" * ticks + ">wait " + str(ticks) + " s", end='\r')
        time.sleep(interval)
        # ``flag`` does not exist until the main script assigns it, so look it
        # up with a default instead of catching every exception around a bare
        # name reference (which would also hide unrelated errors).
        if globals().get("flag"):
            break
def fetch_html(url):
    """Download *url* and return the response body decoded as UTF-8."""
    # The context manager closes the HTTP response even when reading fails.
    with request.urlopen(url) as response:
        return response.read().decode("utf-8")


def collect_page_urls(first_url):
    """Return *first_url* followed by each page reached via ``rel="next"`` links.

    Pagination is best effort: it stops at the first page that has no next
    link, that cannot be fetched or parsed (the error is logged with its
    traceback), or whose next link points at a page already collected.
    """
    urls = [first_url]
    current = first_url
    while True:
        try:
            soup = bs(fetch_html(current), 'html.parser')
        except Exception:
            # Keep whatever was collected so far, but leave a trace of why
            # pagination ended early instead of silently reporting success.
            logging.exception('could not read %s; stopping pagination', current)
            break
        next_link = soup.find(rel='next')
        href = next_link.get('href') if next_link is not None else None
        if not href:
            break
        # Resolve against the current page so that relative links such as
        # "?start=20" work as well as absolute paths.
        current = urljoin(current, href)
        if current in urls:
            # A link back to a page already seen would loop forever.
            break
        urls.append(current)
    logging.info(' url get has done')
    return urls


def parse_review_page(html):
    """Return ``(titles, votes)`` scraped from one page of HTML.

    *titles* are the ``title`` attributes of all ``<img>`` tags that have one;
    *votes* are the first text line (spaces removed) of every
    ``<div class="action">`` element, still as strings.
    """
    soup = bs(html, 'html.parser')
    titles = [img.attrs['title'] for img in soup.find_all('img') if 'title' in img.attrs]
    votes = [
        div.get_text().replace(' ', '').strip().split('\n')[0]
        for div in soup.find_all("div", class_="action")
    ]
    return titles, votes


def rank_by_votes(titles, votes):
    """Pair titles with integer vote counts and sort ascending by count.

    Titles and votes are paired positionally; surplus items on either side are
    dropped, as ``zip`` does.  A pair whose vote text is not an integer (for
    example an empty string) is skipped with a warning rather than aborting
    the whole run.  Ties keep their input order.
    """
    ranking = []
    for title, raw_count in zip(titles, votes):
        try:
            ranking.append((title, int(raw_count)))
        except ValueError:
            logging.warning('skipping %r: vote count %r is not an integer', title, raw_count)
    ranking.sort(key=lambda item: item[1])
    return ranking


def plot_ranking(ranking):
    """Draw a horizontal bar chart of ``(title, count)`` pairs and show it."""
    labels = [title for title, _ in ranking]
    counts = [count for _, count in ranking]
    plt.rc('font', family='SimHei', size=13)
    plt.title(" TOP 10", color='blue')
    plt.barh(range(len(labels)), counts, tick_label=labels)
    # Write each count at the end of its bar.
    for row, count in enumerate(counts):
        plt.text(count, row, str(count), fontsize=15, ha='left', wrap=True, va='center')
    plt.show()


if __name__ == "__main__":
    base_url = "https://movie.douban.com"
    First_page_url = base_url + "/review/best"
    # Start the progress indicator only when run as a script, so that merely
    # importing this module does not spawn a thread that never stops.
    threading.Thread(target=Timer).start()
    try:
        UrlData.extend(collect_page_urls(First_page_url))
        # NOTE(review): every page is downloaded a second time here; the two
        # passes could be merged if the extra requests become a problem.
        for page_url in UrlData:
            html = fetch_html(page_url)
            # Dump of the most recently fetched page; overwritten on each pass.
            with codecs.open('1.html', 'w', 'utf-8') as html_file:
                html_file.write(html)
            titles, votes = parse_review_page(html)
            BookData.extend(titles)
            UserfulData.extend(votes)
        print(BookData)
        print(UserfulData)
        LIST = rank_by_votes(BookData, UserfulData)
        print(LIST)
    finally:
        # Timer polls this module-level name.  Setting it in ``finally`` stops
        # the timer thread even when scraping fails, so the process can exit.
        flag = True
    plot_ranking(LIST)
# Python web scraper that displays its results as a chart
# (Latest recommended article published 2022-05-09 19:07:50)