Python 数据抓取与可视化(POST 方法):网易云课堂人工智能课程

网易云课堂数据分析
本文通过爬取网易云课堂的数据,并利用Python进行清洗和可视化分析,探究了课程来源、内容、价格及用户评价等多个维度的特点。

 

数据爬取

import json,time
from bs4 import BeautifulSoup
import requests

# HTTP headers sent with every search request.
# NOTE(review): 'edu-script-token' is hard-coded here; it looks like a token
# captured from a browser session and may need refreshing — confirm.
headers={
    "Content-Type": "application/json",
    "edu-script-token": "70d2f62d6584454f8b6378680f8f58fa",
    "Host": "study.163.com",
    "Origin": "https://study.163.com",
    "Referer": "https://study.163.com/category/400000001310004",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
}

# JSON body template for the search request; 'pageIndex' and 'relativeOffset'
# are overwritten per page by getcourses().
Payloads=dict(
    activityId=0,
    frontCategoryId="400000001310004",
    orderType=50,
    pageIndex=1,
    pageSize=50,
    priceType=-1,
    relativeOffset=0,
    searchTimeType=-1,
)

# Search endpoint and the module-level list that collects every course record.
url="https://study.163.com/p/search/studycourse.json"
fullinfo=list()

def getcourses(url,headers=headers,Payloads=Payloads,pages=7,timeout=10):
    """Fetch course listings page by page and append them to ``fullinfo``.

    Args:
        url: Search endpoint that accepts a JSON POST body.
        headers: HTTP headers sent with every request.
        Payloads: Template for the JSON body. It is copied, so the caller's
            dict (and the module-level default) is left untouched.
        pages: Number of result pages to request, starting at page 1.
        timeout: Per-request timeout in seconds.

    Returns:
        The module-level ``fullinfo`` list, extended with the entries found
        under ``result.list`` of every response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If a request exceeds ``timeout``.
    """
    # Copy the template: writing the page counters into the shared default
    # dict would leak state between calls.
    payload=dict(Payloads)
    # The original stepped the offset by a literal 50, which equals the
    # default pageSize. NOTE(review): tying the step to pageSize assumes the
    # offset counts skipped records — confirm against the API.
    page_size=payload.get('pageSize',50)
    for i in range(1,pages+1):
        payload['pageIndex']=i
        payload['relativeOffset']=page_size*(i-1)
        r=requests.post(url,data=json.dumps(payload).encode('utf-8'),headers=headers,timeout=timeout)
        # Fail on an HTTP error here instead of on a confusing JSON/KeyError below.
        r.raise_for_status()
        content=r.json()
        fullinfo.extend(content['result']['list'])
        print("the {} page was finished".format(i))
        # Pause between pages to avoid hammering the server.
        time.sleep(1)
    print("All page is OK")
    return fullinfo
getcourses(url,headers=headers,Payloads=Payloads)

提取所要分析的列

import pandas as pd
# Load every scraped course record into a DataFrame, one row per course.
mydata=pd.DataFrame(fullinfo)
# Course ids are identifiers, not quantities, so store them as text.
mydata['courseId']=mydata['courseId'].astype('str')
# Columns that play no part in the analysis below.
no_use_columns=[
    'activityIds','bigImgUrl','gmtModified','published','schoolShortName',
    'tagLectorTime','courseCardProps','displayType','endTime','imgUrl',
    'productId','startTime','tagIap','termType',
]
mydata=mydata.drop(columns=no_use_columns)
mydata.head(10)
mydata.info()

可视化分析

1.课程来源分析

sum(mydata.lectorName==mydata.provider)/len(mydata)#个人发布所占比例
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('classic')
labels=['个人','机构']
colors=['green','blue']
plt.figure(figsize=(4,3),facecolor='#ebebeb')
plt.rcParams['font.sans-serif']=['Microsoft YaHei']
plt.rcParams['axes.unicode_minus']=False
plt.axes(aspect='equal',facecolor='#ebebeb')
ax=plt.gca()
ax.spines['left'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['bottom'].set_visible(False)
plt.xlim(0,4)
plt.ylim(0,4)
percent=[0.2143,1-0.2143]
plt.pie(percent,labels=labels,colors=colors,labeldistance=1.3,textprops={'color':'k'},autopct='%1.1f%%',radius=1,center=(1.8,1.8))
plt.title('课程来源性质分布',fontdict={'fontsize':16,'color':'k'},loc='left')
plt.show()

import squarify
import random
# Number of courses per provider, largest first.
count=mydata['provider'].value_counts()
count=count.sort_values(ascending=False)

plt.style.use('classic')
plt.rcParams['font.sans-serif']=['Microsoft YaHei']
plt.rcParams['axes.unicode_minus']=False
plt.figure(figsize=(10,8),facecolor='#ebebeb')
colors=['#6794a7','#014d64','#76c0c1','#01a2d9','#7ad2f6','#00887d']
# Providers with at least 5 courses get a tile; only those with at least 10
# get a caption. The blank captions are sized from the data rather than a
# fixed count, so the label list always lines up with the tiles.
shown=count[count>=5]
labelled=count[count>=10]
blanks=['']*(len(shown)-len(labelled))
plot=squarify.plot(sizes=shown.values,label=list(labelled.index)+blanks,color=random.choices(colors,k=len(shown)),
                   alpha=0.6,value=list(labelled)+blanks,edgecolor='white',linewidth=1)
plt.axis('off')
# tick_params expects booleans; the string 'off' is truthy in current matplotlib.
plt.tick_params(top=False,right=False)
plt.show()

 

# Horizontal bar chart of the ten providers with the most courses.
plt.style.use('ggplot')
plt.figure(figsize=(8,4),facecolor='#ebebeb')
# Reverse the bar positions so the largest provider is drawn at the top.
positions=range(9,-1,-1)
top10_counts=count.values[:10]
plt.barh(positions,top10_counts,align='center',color=random.choice(colors),alpha=0.8)
plt.xlim(0,35)
plt.yticks(positions,count.index[:10])
plt.xlabel('课程发布数')
plt.title('课程发布量Top10')
ax=plt.gca()
for side in ('left','right','top','bottom'):
    ax.spines[side].set_visible(False)
ax.grid(axis='x',which='major',linestyle='dashed',color='grey')
# Write each count just to the right of its bar.
for x,y in zip(positions,top10_counts):
    plt.text(y+0.2,x,str(y),va='center')
plt.show()

 

# value_counts() returns the per-provider course counts in descending order.
count2=mydata['provider'].value_counts()
plt.style.use('ggplot')
fig=plt.figure(figsize=(8,4),facecolor='white')
# Take the x positions from the data: a fixed range(0,136) raises a shape
# mismatch as soon as the number of providers differs from 136.
plt.plot(range(len(count2)),count2.cumsum(),linestyle='-',linewidth=2,color='r',marker='o',markersize=2,markeredgecolor='g')
plt.xlim(0,150)
plt.ylim(0,400)
# tick_params expects booleans; the string 'off' is truthy in current matplotlib.
plt.tick_params(top=False,right=False)
plt.xlabel('作者累计数')
plt.ylabel('课程发布频数')
plt.title('课程累计发布频率分布')
ax=plt.gca()
ax.grid(axis='x',which='major',linestyle='--',color='grey')
plt.show()

 

2.课程内容分析 

import jieba
from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator
import matplotlib.pyplot as plt
import numpy as np

# Word cloud of course titles.
# Segment each title with jieba and join the tokens with spaces so WordCloud
# can split them.
word_list=[" ".join(jieba.cut(sentence))for sentence in mydata['productName']]
new_text=" ".join(word_list)
# set.add() returns None, so build the stop-word set explicitly; the raw string
# keeps the backslashes of the Windows font path literal.
wc=WordCloud(background_color='white',max_words=2000,stopwords=STOPWORDS|{'said'},max_font_size=50,font_path=r"C:\Windows\Fonts\simhei.ttf",random_state=42).generate(new_text)
fig=plt.figure(figsize=(10,8),facecolor='grey')
plt.imshow(wc,interpolation='bilinear')
plt.axis('off')
plt.show()

# Word cloud of course descriptions.
# Skip missing descriptions (None or NaN): jieba can only segment strings.
word_list1=[" ".join(jieba.cut(sentence))for sentence in mydata['description'] if isinstance(sentence,str)]
# Build the text from the description tokens; the previous code reused the
# title tokens here and therefore drew the title cloud a second time.
new_text1=" ".join(word_list1)
new_text1=new_text1.replace('\n','')
# set.add() returns None, so build the stop-word set explicitly; the raw string
# keeps the backslashes of the Windows font path literal.
wc=WordCloud(background_color='white',max_words=2000,stopwords=STOPWORDS|{'said'},max_font_size=50,font_path=r"C:\Windows\Fonts\simhei.ttf",random_state=42).generate(new_text1)
fig=plt.figure(figsize=(10,8),facecolor='grey')
plt.imshow(wc,interpolation='bilinear')
plt.axis('off')
plt.show()

 

 

3.课程价格分析 

# Paid courses only (non-zero original price), most expensive first.
is_paid=mydata['originalPrice']!=0
price_columns=['lectorName','originalPrice','productName','learnerCount','score']
price_data1=mydata.loc[is_paid,price_columns].sort_values(by='originalPrice',ascending=False)

# Share of paid courses, then the ten most expensive ones.
len(price_data1)/len(mydata)
price_data1.iloc[:10]

# Courses priced strictly between 10 and 1000.
# NOTE(review): prices exactly equal to 10 or 1000 fall into none of the three
# groups counted below — confirm whether that is intended.
in_band=(price_data1.originalPrice>10)&(price_data1.originalPrice<1000)
course1000=price_data1.loc[in_band]

len(price_data1.loc[price_data1.originalPrice<10]),len(price_data1.loc[price_data1.originalPrice>1000]),len(course1000)

# Mean price per lecturer within that band; keep the ten highest.
course1000mean=course1000.groupby('lectorName')['originalPrice'].mean()
mean_price_top10=course1000mean.sort_values(ascending=False)[:10]
mean_price_top10

fig,axes=plt.subplots(1,1)
bar_positions=range(10)
plt.bar(bar_positions,mean_price_top10,color='g',alpha=0.6,align='center')
plt.ylabel('课程平均价格')
plt.title("课程平均价格分析")
plt.xticks(bar_positions,mean_price_top10.index,rotation=60)
plt.ylim(400,1100)
# Print each mean price just above its bar.
for x,y in enumerate(mean_price_top10.values):
    plt.text(x,y+10,str(y),ha='center')
plt.show()

4.课程用户分析 

def _plot_learner_top10(frame,title,color,xmax):
    """Draw a horizontal bar chart of learner counts for the first ten rows of ``frame``.

    ``frame`` must already be sorted by ``learnerCount`` in descending order
    and provide the ``learnerCount`` and ``productName`` columns.
    """
    learners=frame.learnerCount[:10]
    # Positions run from high to low so the first (largest) row is drawn on
    # top; sizing them from the data keeps the chart working with <10 rows.
    positions=range(len(learners)-1,-1,-1)
    plt.style.use('ggplot')
    plt.figure(figsize=(8,4))
    plt.barh(positions,learners,align='center',color=color,alpha=0.8)
    plt.xlim(0,xmax)
    plt.yticks(positions,frame.productName[:10])
    # tick_params expects booleans; the string 'off' is truthy in current
    # matplotlib and would switch the ticks on instead of hiding them.
    plt.tick_params(top=False,left=False,right=False,bottom=False)
    plt.xlabel('学员数量')
    plt.title(title)
    ax=plt.gca()
    for side in ('left','right','top','bottom'):
        ax.spines[side].set_visible(False)
    ax.grid(axis='x',which='major',linestyle='dashed',color='grey')
    # Write each learner count just to the right of its bar.
    for x,y in zip(positions,learners):
        plt.text(y+2,x,'%s'%y,va='center')
    plt.show()

learner_data=mydata.loc[:,['provider','originalPrice','learnerCount','score','productName']]
priceno=learner_data.loc[learner_data.originalPrice!=0,['productName','provider','originalPrice','learnerCount']].sort_values(by='learnerCount',ascending=False)

# Paid courses
_plot_learner_top10(priceno,'付费课程Top10',random.choice(colors),6000)

# Free courses
free=learner_data.loc[learner_data.originalPrice==0,['productName','provider','originalPrice','learnerCount']].sort_values(by='learnerCount',ascending=False)
_plot_learner_top10(free,'免费课程Top10','g',150000)

 

5.用户评价分析 



def _plot_score_top10(scores,title):
    """Draw a horizontal bar chart of the ten highest summed scores in ``scores``."""
    positions=range(9,-1,-1)
    top_values=scores.values[:10]
    plt.figure(figsize=(8,4))
    plt.barh(positions,top_values,align='center',color=random.choice(colors),alpha=0.8)
    plt.yticks(positions,scores.index[:10])
    plt.xlabel('累计评分')
    plt.title(title)
    # Write each summed score just to the right of its bar.
    for x,y in zip(positions,top_values):
        plt.text(y+0.2,x,str(y),va='center')
    plt.show()

# Free courses: summed score per provider, highest first.
is_free=learner_data.originalPrice==0
scoreno=learner_data.loc[is_free].groupby('provider')['score'].sum().sort_values(ascending=False)
_plot_score_top10(scoreno,'免费课程评分Top10统计')

# Paid courses: summed score per provider, highest first.
is_charged=learner_data.originalPrice!=0
scoreyes=learner_data.loc[is_charged].groupby('provider')['score'].sum().sort_values(ascending=False)
_plot_score_top10(scoreyes,'付费课程评分Top10统计')

 

 

# Providers that appear in both the free and the paid score top 10.
coursename=list(set(scoreno.index[:10])&set(scoreyes.index[:10]))
# Course counts of those providers, split into free and paid.
in_both=learner_data['provider'].isin(coursename)
princeno=learner_data.loc[in_both&(learner_data.originalPrice==0),'provider'].value_counts()
princeyes=learner_data.loc[in_both&(learner_data.originalPrice!=0),'provider'].value_counts()
# Side by side: first column free, second column paid.
princeresult=pd.concat([princeno,princeyes],axis=1)
princeresult
输出结果(原表格的列分隔在转载时丢失,每行数字为免费、付费两列计数连写):
 provider provider
唐宇迪 328
城市数据团 317
# Radar chart comparing free and paid course counts of the selected providers.
plt.style.use('classic')
plt.rcParams['font.sans-serif']=['Microsoft YaHei']
plt.rcParams['axes.unicode_minus']=False

# First column holds the free counts, second column the paid counts.
values1=princeresult.iloc[:,0].values
values2=princeresult.iloc[:,1].values
feature=princeresult.index
angles=np.linspace(0,2*np.pi,len(values1),endpoint=False)

# Repeat the first point at the end so each outline closes on itself.
values1=np.concatenate((values1,[values1[0]]))
values2=np.concatenate((values2,[values2[0]]))
angles=np.concatenate((angles,[angles[0]]))

fig=plt.figure(figsize=(10,8),facecolor='#ebebeb')
ax=fig.add_subplot(111,polar=True)

ax.plot(angles,values1,'o-',linewidth=2,label='免费')
ax.fill(angles,values1,alpha=0.25)
ax.plot(angles,values2,'o-',linewidth=2,label='付费')
ax.fill(angles,values2,alpha=0.25)

# Leave out the duplicated closing angle: there is one label per provider, and
# matplotlib rejects tick locations and labels of different lengths.
ax.set_thetagrids(angles[:-1]*180/np.pi,feature)
ax.set_ylim(0,50)
plt.title('高价值课程发布者免费与付费发布量')
ax.grid(True)
plt.legend(loc='best')
plt.show()

 

参考链接: http://www.360doc.com/content/17/1106/11/16619343_701315844.shtml

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值