题目:

代码:
import random
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statistics as sta
import scipy.stats as stats
# Anscombe's quartet: for each of the four groups, report the mean and
# sample variance of x and y, the Pearson correlation between x and y, and
# the ordinary-least-squares regression line; then draw one scatter plot
# per group.
anscombe = sns.load_dataset("anscombe")

# Select every group by its label in the 'dataset' column instead of by
# hard-coded row positions, so the analysis does not depend on row order.
labels = ['I', 'II', 'III', 'IV']
xs = {}
ys = {}
for label in labels:
    subset = anscombe[anscombe['dataset'] == label]
    xs[label] = subset['x'].values
    ys[label] = subset['y'].values

# Means and variances, printed as four blank-line-separated sections:
# mean of x, variance of x, mean of y, variance of y.
for axis_name, values in (('x', xs), ('y', ys)):
    for label in labels:
        print('the mean of {} of dataset {}: {}'.format(
            axis_name, label, np.mean(values[label])))
    print('')
    for label in labels:
        # statistics.variance computes the sample variance (n - 1 divisor).
        print('the variance of {} of dataset {}: {}'.format(
            axis_name, label, sta.variance(values[label])))
    print('')

# Pearson correlation coefficient; pearsonr returns (coefficient, p-value)
# and only the coefficient is reported.
for label in labels:
    coefficient = stats.pearsonr(xs[label], ys[label])[0]
    print('the correlation coefficient of dataset {}: {}'.format(
        label, coefficient))
print('')

# Ordinary least squares fit of y on x. add_constant prepends a column of
# ones, so params[0] is the intercept and params[1] is the slope.
for label in labels:
    design = sm.add_constant(xs[label])
    fit = sm.OLS(ys[label], design).fit()
    print('the linear regression line of dataset {}:'.format(label))
    print('y = {} + {} * x'.format(fit.params[0], fit.params[1]))

# One scatter panel per group. The FacetGrid keyword `size` was renamed to
# `height` in seaborn 0.9 and is rejected by current releases.
sns.set(style='whitegrid')
g = sns.FacetGrid(anscombe, col='dataset', hue='dataset', height=3)
g.map(plt.scatter, 'x', 'y')
plt.show()
结果:


参考资料:
求相关系数:https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pearsonr.html
求线性拟合:http://www.statsmodels.org/devel/generated/statsmodels.regression.linear_model.OLS.html
本文通过Python的多种库,如Numpy、Pandas、Seaborn等,对Anscombe四组数据集进行了深入分析。包括计算每组数据集中x和y变量的均值、方差,并求出相关系数,进行线性回归拟合,最后绘制散点图进行直观展示。
158

被折叠的 条评论
为什么被折叠?



