第十四周作业

最新推荐文章于 2025-03-14 22:09:59 发布

原创最新推荐文章于 2025-03-14 22:09:59 发布 · 225 阅读

0 ·

CC 4.0 BY-SA版权

本文使用 NumPy、Pandas、Seaborn 等多个 Python 库，对 Anscombe 四组数据集进行了深入分析：计算每组数据中 x 和 y 变量的均值与方差，求出相关系数，进行线性回归拟合，最后绘制散点图进行直观展示。

题目：

代码：

import random
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm    
import statsmodels.formula.api as smf    
import statistics as sta     
import scipy.stats as stats 

# Load Anscombe's quartet as a DataFrame; the code below reads its
# `dataset`, `x` and `y` columns.
anscombe = sns.load_dataset("anscombe")

# Select each group's x values by its `dataset` label instead of by row
# position, so the result does not depend on row order or group size.
# NOTE(review): assumes the labels are exactly 'I', 'II', 'III', 'IV'.
ansxI = anscombe.loc[anscombe.dataset == 'I', 'x'].values
ansxII = anscombe.loc[anscombe.dataset == 'II', 'x'].values
ansxIII = anscombe.loc[anscombe.dataset == 'III', 'x'].values
ansxIV = anscombe.loc[anscombe.dataset == 'IV', 'x'].values

# Arithmetic mean of x for each of the four groups.
meanIx, meanIIx, meanIIIx, meanIVx = (
    np.mean(values) for values in (ansxI, ansxII, ansxIII, ansxIV)
)

for label, mean_x in zip(('I', 'II', 'III', 'IV'),
                         (meanIx, meanIIx, meanIIIx, meanIVx)):
    print('the mean of x of dataset {}: {}'.format(label, mean_x))

# Blank line separating this section's output from the previous one.
print('')

# Sample variance (n - 1 denominator, per statistics.variance) of x for
# each of the four groups.
varIx, varIIx, varIIIx, varIVx = (
    sta.variance(values) for values in (ansxI, ansxII, ansxIII, ansxIV)
)

for label, var_x in zip(('I', 'II', 'III', 'IV'),
                        (varIx, varIIx, varIIIx, varIVx)):
    print('the variance of x of dataset {}: {}'.format(label, var_x))

# Blank line separating this section's output from the previous one.
print('')

# Select each group's y values by its `dataset` label instead of by row
# position, so the result does not depend on row order or group size.
# NOTE(review): assumes the labels are exactly 'I', 'II', 'III', 'IV'.
ansyI = anscombe.loc[anscombe.dataset == 'I', 'y'].values
ansyII = anscombe.loc[anscombe.dataset == 'II', 'y'].values
ansyIII = anscombe.loc[anscombe.dataset == 'III', 'y'].values
ansyIV = anscombe.loc[anscombe.dataset == 'IV', 'y'].values

# Arithmetic mean of y for each of the four groups.
meanIy, meanIIy, meanIIIy, meanIVy = (
    np.mean(values) for values in (ansyI, ansyII, ansyIII, ansyIV)
)

for label, mean_y in zip(('I', 'II', 'III', 'IV'),
                         (meanIy, meanIIy, meanIIIy, meanIVy)):
    print('the mean of y of dataset {}: {}'.format(label, mean_y))

# Blank line separating this section's output from the previous one.
print('')

# Sample variance (n - 1 denominator, per statistics.variance) of y for
# each of the four groups.
varIy, varIIy, varIIIy, varIVy = (
    sta.variance(values) for values in (ansyI, ansyII, ansyIII, ansyIV)
)

for label, var_y in zip(('I', 'II', 'III', 'IV'),
                        (varIy, varIIy, varIIIy, varIVy)):
    print('the variance of y of dataset {}: {}'.format(label, var_y))

# Blank line separating this section's output from the previous one.
print('')

# Pearson correlation coefficient between x and y for each group.
# pearsonr also returns a p-value as its second item, which is not used.
cofI, _ = stats.pearsonr(ansxI, ansyI)
cofII, _ = stats.pearsonr(ansxII, ansyII)
cofIII, _ = stats.pearsonr(ansxIII, ansyIII)
cofIV, _ = stats.pearsonr(ansxIV, ansyIV)

for label, coefficient in zip(('I', 'II', 'III', 'IV'),
                              (cofI, cofII, cofIII, cofIV)):
    print('the correlation coefficient of dataset {}: {}'.format(
        label, coefficient))

# Blank line separating this section's output from the previous one.
print('')

# Design matrices: add_constant prepends a column of ones so the fitted
# model has an intercept term (params[0]) alongside the slope (params[1]).
xI, xII, xIII, xIV = (
    sm.add_constant(values) for values in (ansxI, ansxII, ansxIII, ansxIV)
)

# Ordinary least squares models of y on [1, x], one per group.
modI = sm.OLS(ansyI, xI)
modII = sm.OLS(ansyII, xII)
modIII = sm.OLS(ansyIII, xIII)
modIV = sm.OLS(ansyIV, xIV)

# Fit each model.
resI, resII, resIII, resIV = (
    model.fit() for model in (modI, modII, modIII, modIV)
)

for label, result in zip(('I', 'II', 'III', 'IV'),
                         (resI, resII, resIII, resIV)):
    intercept, slope = result.params[0], result.params[1]
    print('the linear regression line of dataset {}:'.format(label))
    print('y = {} + {} * x'.format(intercept, slope))

# Draw one scatter panel per group, laid out side by side and coloured
# by group.
sns.set(style='whitegrid')
# `height` replaces the former `size` keyword, which seaborn renamed in 0.9
# and later removed; passing `size` raises TypeError on current releases.
g = sns.FacetGrid(anscombe, col='dataset', hue='dataset', height=3)
g.map(plt.scatter, 'x', 'y')
plt.show()

结果：