Python练习题(二)
.json文件转.csv文件
import csv
import json
import codecs

# Convert a JSON-lines file (one JSON object per line) into a CSV file:
# the first object's keys become the header row, then one data row per object.
# Both files are managed by `with` so they are closed even if a line fails
# to parse (the original left result.csv open on error).
with open('result.csv', 'w', encoding='utf-8', newline='') as csvfile, \
        codecs.open('result.json', 'r', encoding='utf-8') as f:
    writer = csv.writer(csvfile)
    header_written = False
    for line in f:
        record = json.loads(line)
        if not header_written:
            # Header comes from the first record's keys.
            writer.writerow(list(record.keys()))
            header_written = True
        writer.writerow(list(record.values()))
链接生成静图/动图二维码
# 第一条指令
pip install myqr
# 第二条指令
myqr 链接 -p 图片路径 -c
.csv文件 数据预处理
1. 去除空格、转为整型、列与列相除

代码:
import numpy as np
import pandas as pd

# Load the raw scraped listings; read everything as strings so the
# whitespace cleanup below works uniformly on every column.
df = pd.read_csv(r'result.csv', encoding='gbk', dtype=str)

# Strip surrounding whitespace from every text column (was eight
# copy-pasted lines).
text_columns = ['house', 'locatea', 'locateb', 'locatec',
                'typemin', 'areamin', 'priceavg', 'priceall']
for col in text_columns:
    df[col] = df[col].str.strip()

# Convert the numeric columns to floats.
# NOTE: np.float was removed in NumPy 1.24 — astype(np.float) now raises
# AttributeError; the builtin float is the documented replacement.
df['priceall'] = df['priceall'].astype(float)
df['areamin'] = df['areamin'].astype(float)

# Recompute the average price (total price / area), rounded to 4 decimals.
priceavg = round(df['priceall'] / df['areamin'], 4)

# Assemble the cleaned columns plus the recomputed average into one frame
# and write it out with human-readable headers.
result = pd.concat([df['house'], df['locatea'], df['locateb'], df['locatec'],
                    df['typemin'], df['areamin'], df['priceall'], priceavg],
                   axis=1, ignore_index=True)
result.columns = ['名称', '地理位置1', '地理位置2', '地理位置3',
                  '最小房型', '最小面积', '总价(万元/套)', '均价(万元/㎡)']
result.to_csv("result-1.csv", encoding="gbk")
2. 洗掉某几列数据全为空的行、列与列相加、groupby().mean()求平均值

代码:
import numpy as np
import pandas as pd

# Load the Beijing PM2.5 data (first 13 columns only).
df = pd.read_csv(r'BeijingPM.csv', encoding='gbk',
                 usecols=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
# Drop rows where all four PM station readings are missing.
pm_cols = ['PM_Dongsi', 'PM_Dongsihuan', 'PM_Nongzhanguan', 'PM_US Post']
df.dropna(axis='index', how='all', subset=pm_cols, inplace=True)
# Row-wise average over whichever PM stations reported a value
# (count() ignores NaN, so partially-missing rows are handled correctly).
df['sum'] = df[pm_cols].sum(axis=1)
df['count'] = df[pm_cols].count(axis=1)
df['avg'] = round(df['sum'] / df['count'], 2)
# Aggregate to one mean per (year, month).
# numeric_only=True: without it, non-numeric columns (e.g. the wind
# direction column) make .mean() raise a TypeError under pandas >= 2.0.
df = df.groupby(["year", "month"]).mean(numeric_only=True)
df.to_csv('BeijingPM-avg.csv', encoding='gbk')
# Re-read to flatten the (year, month) index into plain columns,
# keep only year/month/avg, and overwrite the output file.
df = pd.read_csv(r'BeijingPM-avg.csv', encoding='gbk', dtype=str)
df = pd.concat([df['year'], df['month'], df['avg']], axis=1, ignore_index=True)
df.columns = ['year', 'month', 'avg']
df.to_csv('BeijingPM-avg.csv', encoding='gbk')
3. 线性插值处理(将超过2倍标准差数据修正)、后项填充

代码:
import numpy as np
import pandas as pd
import scipy.interpolate
import matplotlib.pyplot as plt

# Load the Beijing PM2.5 data.
df = pd.read_csv(r'BeijingPM.csv', encoding='gbk')

# Linearly interpolate the weather columns, then cap each one at
# mean +/- 2 standard deviations (computed on the interpolated series,
# as the original did). Series.clip replaces the original chained
# assignments (df[col][mask] = ...), which trigger SettingWithCopyWarning
# and silently write to a copy under pandas copy-on-write (pandas 3.0).
for col in ('HUMI', 'PRES', 'TEMP'):
    series = df[col].interpolate()
    lower = series.mean() - series.std() * 2
    upper = series.mean() + series.std() * 2
    df[col] = series.clip(lower, upper)

# Cap PM readings at the index ceiling of 500, via .loc (not chained
# assignment) for the same copy-on-write reason.
for col in ('PM_Dongsi', 'PM_Dongsihuan', 'PM_Nongzhanguan'):
    df.loc[df[col] > 500, col] = 500

# 'cv' (calm/variable) wind direction carries no direction information:
# treat it as missing and backfill from the next observation.
df.loc[df['cbwd'] == 'cv', 'cbwd'] = np.nan
df['cbwd'] = df['cbwd'].bfill()
df.to_csv('BeijingPM-1.csv', encoding='gbk')
4. (0-1)归一化、Z-Score归一化、散点图、离散化

代码1:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Compare the DEWP-vs-TEMP scatter plot under three treatments:
# raw values, (0-1) min-max scaling, and Z-score standardisation.
figure = plt.figure()
minmax_scaler = MinMaxScaler()
zscore_scaler = StandardScaler()

# Load the data and reshape the two columns to the (n, 1) shape the
# sklearn scalers expect.
df = pd.read_csv(r'BeijingPM.csv', encoding='gbk')
dewp = df['DEWP'].values.reshape(-1, 1)
temp = df['TEMP'].values.reshape(-1, 1)

# Subplot 1: the raw data.
axis_raw = figure.add_subplot(131)
axis_raw.scatter(df['DEWP'], df['TEMP'], s=10)
axis_raw.set_title('Original')

# Subplot 2: both columns rescaled independently to [0, 1].
axis_minmax = figure.add_subplot(132)
axis_minmax.scatter(minmax_scaler.fit_transform(dewp),
                    minmax_scaler.fit_transform(temp), s=10)
axis_minmax.set_title('MinMaxScaler')

# Subplot 3: both columns standardised to zero mean, unit variance.
axis_zscore = figure.add_subplot(133)
axis_zscore.scatter(zscore_scaler.fit_transform(dewp),
                    zscore_scaler.fit_transform(temp), s=10)
axis_zscore.set_title('StandardScaler')

# Render the figure window.
plt.show()
输出1:
代码2:
import pandas as pd

# Load the PM data and drop rows where all four station readings are missing.
df = pd.read_csv(r'BeijingPM.csv', encoding='gbk')
pm_cols = ['PM_Dongsi', 'PM_Dongsihuan', 'PM_Nongzhanguan', 'PM_US Post']
df.dropna(axis=0, how='all', subset=pm_cols, inplace=True)
# Row-wise mean over the stations that reported a value.
df['sum'] = df[pm_cols].sum(axis=1)
df['count'] = df[pm_cols].count(axis=1)
df['avg'] = round(df['sum'] / df['count'], 2)
# One average per calendar day. numeric_only=True keeps non-numeric
# columns (e.g. 'cbwd') from raising a TypeError under pandas >= 2.0.
df = df.groupby(["year", "month", "day"]).mean(numeric_only=True)
# AQI-style bands for the daily average.
sections = [0, 50, 100, 150, 200, 300, 1200]
# One colour label per band (6 bands from 7 edges).
names = ['green', 'yellow', 'orange', 'red', 'purple', 'brownish red']
# Count how many days fall into each band.
# Series.value_counts() replaces pd.value_counts(), deprecated since
# pandas 2.1; output (descending by count) is unchanged.
result = pd.cut(df['avg'], sections, labels=names)
print(result.value_counts())
输出2:
green 699
yellow 655
orange 401
red 196
purple 153
brownish red 51
Name: avg, dtype: int64
请按任意键继续. . .