"""Explore and preprocess the housing dataset stored in ``./housing.csv``.

The script runs top to bottom and:

1. loads the CSV and draws a histogram grid plus a longitude/latitude scatter;
2. ranks every numeric column by its correlation with ``median_house_value``,
   before and after adding three ratio features;
3. makes an 80/20 train/test split;
4. median-imputes the numeric columns, encodes ``ocean_proximity`` in two
   different ways, and standardises the imputed numeric features.

The CSV is expected to provide the columns referenced below: ``longitude``,
``latitude``, ``population``, ``median_house_value``, ``total_rooms``,
``total_bedrooms``, ``households`` and ``ocean_proximity``.
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

try:
    # scikit-learn >= 0.20 ships the imputer in ``sklearn.impute``.
    from sklearn.impute import SimpleImputer as _MedianImputer
except ImportError:
    # Older releases only have ``preprocessing.Imputer`` (removed in 0.22).
    from sklearn.preprocessing import Imputer as _MedianImputer

# ---------------------------------------------------------------------------
# Load and visualise
# ---------------------------------------------------------------------------
house_data = pd.read_csv("./housing.csv")
house_data.hist(bins=50, figsize=(15, 10))

# Marker size follows population, marker colour follows median house value.
house_data.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    alpha=0.3,
    s=house_data["population"] / 100,
    label="population",
    c=house_data["median_house_value"],
    cmap=plt.get_cmap("jet"),
    colorbar=True,
)
plt.legend()
# Call plt.show() here to display the figures when running interactively.

# ---------------------------------------------------------------------------
# Correlation with the target, before and after feature engineering
# ---------------------------------------------------------------------------
# ``ocean_proximity`` is still in the frame at this point; correlate only the
# numeric columns so the call does not fail on pandas versions that refuse to
# silently drop non-numeric data.
corr_matrix = house_data.select_dtypes(include=[np.number]).corr()
a = corr_matrix["median_house_value"].sort_values(ascending=False)

house_data["rooms_per_households"] = (
    house_data["total_rooms"] / house_data["households"]
)
house_data["bedrooms_per_room"] = (
    house_data["total_bedrooms"] / house_data["total_rooms"]
)
house_data["population_per_households"] = (
    house_data["population"] / house_data["households"]
)

corr_matrix = house_data.select_dtypes(include=[np.number]).corr()
b = corr_matrix["median_house_value"].sort_values(ascending=False)

# ---------------------------------------------------------------------------
# Train / test split
# ---------------------------------------------------------------------------
# NOTE(review): the split is not used by the steps below, which fit on the
# full ``house_data`` frame -- confirm whether that is intended.
train_set, test_set = train_test_split(
    house_data, test_size=0.2, random_state=16
)

# ---------------------------------------------------------------------------
# Numeric features: fill missing values with each column's median
# ---------------------------------------------------------------------------
imputer = _MedianImputer(strategy="median")
house_num = house_data.drop("ocean_proximity", axis=1)
imputer.fit(house_num)
X = imputer.transform(house_num)
# ``transform`` returns a plain array; restore column labels and row index.
house_tr = pd.DataFrame(X, columns=house_num.columns, index=house_data.index)
# ``info()`` prints its report itself and returns None, so it is not wrapped
# in ``print`` (that would emit an extra "None" line).
house_tr.info()

# ---------------------------------------------------------------------------
# Categorical feature: ``ocean_proximity``
# ---------------------------------------------------------------------------
house_cat = house_data["ocean_proximity"]
a = house_cat.value_counts()
print(a)

# Integer codes plus the category labels they index into.
house_cat_encoded, house_cat_categories = house_cat.factorize()
print(house_cat_encoded)
print(house_cat_encoded.shape)

# One-hot encoding of the integer codes (the encoder needs a 2-D column).
encoder = OneHotEncoder()
house_cat_onehot = encoder.fit_transform(house_cat_encoded.reshape(-1, 1))
print(house_cat_onehot)
print(house_cat_onehot.shape)

# Same idea in one step, straight from the text labels.
encoder = LabelBinarizer()
house_cat_onehot1 = encoder.fit_transform(house_cat)
print(house_cat_onehot1)
print(house_cat_onehot1.shape)

# ---------------------------------------------------------------------------
# Standardise the imputed numeric features
# ---------------------------------------------------------------------------
std_scaler = StandardScaler()
std = std_scaler.fit_transform(house_tr)
print(std)
# preprocessing
# (web-page footer, not part of the script) Latest recommended article published 2024-01-09 00:24:34