import pandas as pd
import numpy as np
t = pd.read_csv('C:/Users/user/Desktop/train.csv')
T = t.set_index('Loan_ID')
T.head(10)
| Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status |
---|
Loan_ID | | | | | | | | | | | | |
---|
LP001002 | Male | No | 0 | Graduate | No | 5849 | 0.0 | NaN | 360.0 | 1.0 | Urban | Y |
---|
LP001003 | Male | Yes | 1 | Graduate | No | 4583 | 1508.0 | 128.0 | 360.0 | 1.0 | Rural | N |
---|
LP001005 | Male | Yes | 0 | Graduate | Yes | 3000 | 0.0 | 66.0 | 360.0 | 1.0 | Urban | Y |
---|
LP001006 | Male | Yes | 0 | Not Graduate | No | 2583 | 2358.0 | 120.0 | 360.0 | 1.0 | Urban | Y |
---|
LP001008 | Male | No | 0 | Graduate | No | 6000 | 0.0 | 141.0 | 360.0 | 1.0 | Urban | Y |
---|
LP001011 | Male | Yes | 2 | Graduate | Yes | 5417 | 4196.0 | 267.0 | 360.0 | 1.0 | Urban | Y |
---|
LP001013 | Male | Yes | 0 | Not Graduate | No | 2333 | 1516.0 | 95.0 | 360.0 | 1.0 | Urban | Y |
---|
LP001014 | Male | Yes | 3+ | Graduate | No | 3036 | 2504.0 | 158.0 | 360.0 | 0.0 | Semiurban | N |
---|
LP001018 | Male | Yes | 2 | Graduate | No | 4006 | 1526.0 | 168.0 | 360.0 | 1.0 | Urban | Y |
---|
LP001020 | Male | Yes | 1 | Graduate | No | 12841 | 10968.0 | 349.0 | 360.0 | 1.0 | Semiurban | N |
---|
mask=(T['Education']=='Not Graduate')&(T['Loan_Status']=='Y')&(T['Gender']=='Female')
T.loc[mask,['Gender','Education','Loan_Status']]
| Gender | Education | Loan_Status |
---|
Loan_ID | | | |
---|
LP001155 | Female | Not Graduate | Y |
---|
LP001669 | Female | Not Graduate | Y |
---|
LP001692 | Female | Not Graduate | Y |
---|
LP001908 | Female | Not Graduate | Y |
---|
LP002300 | Female | Not Graduate | Y |
---|
LP002314 | Female | Not Graduate | Y |
---|
LP002407 | Female | Not Graduate | Y |
---|
LP002489 | Female | Not Graduate | Y |
---|
LP002502 | Female | Not Graduate | Y |
---|
LP002534 | Female | Not Graduate | Y |
---|
LP002582 | Female | Not Graduate | Y |
---|
LP002731 | Female | Not Graduate | Y |
---|
LP002757 | Female | Not Graduate | Y |
---|
LP002917 | Female | Not Graduate | Y |
---|
def num_missing(x):
return sum(x.isnull())
T.apply(num_missing,axis=0)
Gender 13
Married 3
Dependents 15
Education 0
Self_Employed 32
ApplicantIncome 0
CoapplicantIncome 0
LoanAmount 22
Loan_Amount_Term 14
Credit_History 50
Property_Area 0
Loan_Status 0
dtype: int64
T.apply(num_missing,axis=1)[:10]
Loan_ID
LP001002 1
LP001003 0
LP001005 0
LP001006 0
LP001008 0
LP001011 0
LP001013 0
LP001014 0
LP001018 0
LP001020 0
dtype: int64
T['Gender'].fillna(T['Gender'].mode().iloc[0],inplace=True)
T['Married'].fillna(T['Married'].mode().iloc[0],inplace=True)
T['Self_Employed'].fillna(T['Self_Employed'].mode().iloc[0],inplace=True)
pd.crosstab(T['Credit_History'],T['Loan_Status'],margins=True)
Loan_Status | N | Y | All |
---|
Credit_History | | | |
---|
0.0 | 82 | 7 | 89 |
---|
1.0 | 97 | 378 | 475 |
---|
All | 179 | 385 | 564 |
---|
prop_rates = pd.DataFrame([1000, 5000, 12000], index=['Rural','Semiurban','Urban'],columns=['rates'])
prop_rates
| rates |
---|
Rural | 1000 |
---|
Semiurban | 5000 |
---|
Urban | 12000 |
---|
T1=T.merge(right=prop_rates,how='inner',left_on='Property_Area',right_index=True,sort=False)
T1
| Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status | rates |
---|
Loan_ID | | | | | | | | | | | | | |
---|
LP001002 | Male | No | 0 | Graduate | No | 5849 | 0.0 | NaN | 360.0 | 1.0 | Urban | Y | 12000 |
---|
LP001005 | Male | Yes | 0 | Graduate | Yes | 3000 | 0.0 | 66.0 | 360.0 | 1.0 | Urban | Y | 12000 |
---|
LP001006 | Male | Yes | 0 | Not Graduate | No | 2583 | 2358.0 | 120.0 | 360.0 | 1.0 | Urban | Y | 12000 |
---|
LP001008 | Male | No | 0 | Graduate | No | 6000 | 0.0 | 141.0 | 360.0 | 1.0 | Urban | Y | 12000 |
---|
LP001011 | Male | Yes | 2 | Graduate | Yes | 5417 | 4196.0 | 267.0 | 360.0 | 1.0 | Urban | Y | 12000 |
---|
LP001013 | Male | Yes | 0 | Not Graduate | No | 2333 | 1516.0 | 95.0 | 360.0 | 1.0 | Urban | Y | 12000 |
---|
LP001018 | Male | Yes | 2 | Graduate | No | 4006 | 1526.0 | 168.0 | 360.0 | 1.0 | Urban | Y | 12000 |
---|
LP001024 | Male | Yes | 2 | Graduate | No | 3200 | 700.0 | 70.0 | 360.0 | 1.0 | Urban | Y | 12000 |
---|
LP001027 | Male | Yes | 2 | Graduate | No | 2500 | 1840.0 | 109.0 | 360.0 | 1.0 | Urban | Y | 12000 |
---|
LP001028 | Male | Yes | 2 | Graduate | No | 3073 | 8106.0 | 200.0 | 360.0 | 1.0 | Urban | Y | 12000 |
---|
LP001030 | Male | Yes | 2 | Graduate | No | 1299 | 1086.0 | 17.0 | 120.0 | 1.0 | Urban | Y | 12000 |
---|
LP001032 | Male | No | 0 | Graduate | No | 4950 | 0.0 | 125.0 | 360.0 | 1.0 | Urban | Y | 12000 |
---|
LP001034 | Male | No | 1 | Not Graduate | No | 3596 | 0.0 | 100.0 | 240.0 | NaN | Urban | Y | 12000 |
---|
LP001036 | Female | No | 0 | Graduate | No | 3510 | 0.0 | 76.0 | 360.0 | 0.0 | Urban | N | 12000 |
---|
LP001041 | Male | Yes | 0 | Graduate | No | 2600 | 3500.0 | 115.0 | NaN | 1.0 | Urban | Y | 12000 |
---|
LP001043 | Male | Yes | 0 | Not Graduate | No | 7660 | 0.0 | 104.0 | 360.0 | 0.0 | Urban | N | 12000 |
---|
LP001046 | Male | Yes | 1 | Graduate | No | 5955 | 5625.0 | 315.0 | 360.0 | 1.0 | Urban | Y | 12000 |
---|
LP001073 | Male | Yes | 2 | Not Graduate | No | 4226 | 1040.0 | 110.0 | 360.0 | 1.0 | Urban | Y | 12000 |
---|
LP001086 | Male | No | 0 | Not Graduate | No | 1442 | 0.0 | 35.0 | 360.0 | 1.0 | Urban | N | 12000 |
---|
LP001091 | Male | Yes | 1 | Graduate | No | 4166 | 3369.0 | 201.0 | 360.0 | NaN | Urban | N | 12000 |
---|
LP001095 | Male | No | 0 | Graduate | No | 3167 | 0.0 | 74.0 | 360.0 | 1.0 | Urban | N | 12000 |
---|
LP001106 | Male | Yes | 0 | Graduate | No | 2275 | 2067.0 | NaN | 360.0 | 1.0 | Urban | Y | 12000 |
---|
LP001109 | Male | Yes | 0 | Graduate | No | 1828 | 1330.0 | 100.0 | NaN | 0.0 | Urban | N | 12000 |
---|
LP001114 | Male | No | 0 | Graduate | No | 4166 | 7210.0 | 184.0 | 360.0 | 1.0 | Urban | Y | 12000 |
---|
LP001119 | Male | No | 0 | Graduate | No | 3600 | 0.0 | 80.0 | 360.0 | 1.0 | Urban | N | 12000 |
---|
LP001120 | Male | No | 0 | Graduate | No | 1800 | 1213.0 | 47.0 | 360.0 | 1.0 | Urban | Y | 12000 |
---|
LP001123 | Male | Yes | 0 | Graduate | No | 2400 | 0.0 | 75.0 | 360.0 | NaN | Urban | Y | 12000 |
---|
LP001136 | Male | Yes | 0 | Not Graduate | Yes | 4695 | 0.0 | 96.0 | NaN | 1.0 | Urban | Y | 12000 |
---|
LP001137 | Female | No | 0 | Graduate | No | 3410 | 0.0 | 88.0 | NaN | 1.0 | Urban | Y | 12000 |
---|
LP001138 | Male | Yes | 1 | Graduate | No | 5649 | 0.0 | 44.0 | 360.0 | 1.0 | Urban | Y | 12000 |
---|
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
---|
LP002729 | Male | No | 1 | Graduate | No | 11250 | 0.0 | 196.0 | 360.0 | NaN | Semiurban | N | 5000 |
---|
LP002738 | Male | No | 2 | Graduate | No | 3617 | 0.0 | 107.0 | 360.0 | 1.0 | Semiurban | Y | 5000 |
---|
LP002741 | Female | Yes | 1 | Graduate | No | 4608 | 2845.0 | 140.0 | 180.0 | 1.0 | Semiurban | Y | 5000 |
---|
LP002743 | Female | No | 0 | Graduate | No | 2138 | 0.0 | 99.0 | 360.0 | 0.0 | Semiurban | N | 5000 |
---|
LP002753 | Female | No | 1 | Graduate | No | 3652 | 0.0 | 95.0 | 360.0 | 1.0 | Semiurban | Y | 5000 |
---|
LP002757 | Female | Yes | 0 | Not Graduate | No | 3017 | 663.0 | 102.0 | 360.0 | NaN | Semiurban | Y | 5000 |
---|
LP002768 | Male | No | 0 | Not Graduate | No | 3358 | 0.0 | 80.0 | 36.0 | 1.0 | Semiurban | N | 5000 |
---|
LP002776 | Female | No | 0 | Graduate | No | 5000 | 0.0 | 103.0 | 360.0 | 0.0 | Semiurban | N | 5000 |
---|
LP002792 | Male | Yes | 1 | Graduate | No | 5468 | 1032.0 | 26.0 | 360.0 | 1.0 | Semiurban | Y | 5000 |
---|
LP002795 | Male | Yes | 3+ | Graduate | Yes | 10139 | 0.0 | 260.0 | 360.0 | 1.0 | Semiurban | Y | 5000 |
---|
LP002798 | Male | Yes | 0 | Graduate | No | 3887 | 2669.0 | 162.0 | 360.0 | 1.0 | Semiurban | Y | 5000 |
---|
LP002804 | Female | Yes | 0 | Graduate | No | 4180 | 2306.0 | 182.0 | 360.0 | 1.0 | Semiurban | Y | 5000 |
---|
LP002807 | Male | Yes | 2 | Not Graduate | No | 3675 | 242.0 | 108.0 | 360.0 | 1.0 | Semiurban | Y | 5000 |
---|
LP002813 | Female | Yes | 1 | Graduate | Yes | 19484 | 0.0 | 600.0 | 360.0 | 1.0 | Semiurban | Y | 5000 |
---|
LP002821 | Male | No | 0 | Not Graduate | Yes | 5800 | 0.0 | 132.0 | 360.0 | 1.0 | Semiurban | Y | 5000 |
---|
LP002862 | Male | Yes | 2 | Not Graduate | No | 6125 | 1625.0 | 187.0 | 480.0 | 1.0 | Semiurban | N | 5000 |
---|
LP002863 | Male | Yes | 3+ | Graduate | No | 6406 | 0.0 | 150.0 | 360.0 | 1.0 | Semiurban | N | 5000 |
---|
LP002872 | Male | Yes | 0 | Graduate | No | 3087 | 2210.0 | 136.0 | 360.0 | 0.0 | Semiurban | N | 5000 |
---|
LP002892 | Male | Yes | 2 | Graduate | No | 6540 | 0.0 | 205.0 | 360.0 | 1.0 | Semiurban | Y | 5000 |
---|
LP002894 | Female | Yes | 0 | Graduate | No | 3166 | 0.0 | 36.0 | 360.0 | 1.0 | Semiurban | Y | 5000 |
---|
LP002917 | Female | No | 0 | Not Graduate | No | 2165 | 0.0 | 70.0 | 360.0 | 1.0 | Semiurban | Y | 5000 |
---|
LP002925 | Male | No | 0 | Graduate | No | 4750 | 0.0 | 94.0 | 360.0 | 1.0 | Semiurban | Y | 5000 |
---|
LP002926 | Male | Yes | 2 | Graduate | Yes | 2726 | 0.0 | 106.0 | 360.0 | 0.0 | Semiurban | N | 5000 |
---|
LP002928 | Male | Yes | 0 | Graduate | No | 3000 | 3416.0 | 56.0 | 180.0 | 1.0 | Semiurban | Y | 5000 |
---|
LP002931 | Male | Yes | 2 | Graduate | Yes | 6000 | 0.0 | 205.0 | 240.0 | 1.0 | Semiurban | N | 5000 |
---|
LP002933 | Male | No | 3+ | Graduate | Yes | 9357 | 0.0 | 292.0 | 360.0 | 1.0 | Semiurban | Y | 5000 |
---|
LP002943 | Male | No | NaN | Graduate | No | 2987 | 0.0 | 88.0 | 360.0 | 0.0 | Semiurban | N | 5000 |
---|
LP002959 | Female | Yes | 1 | Graduate | No | 12000 | 0.0 | 496.0 | 360.0 | 1.0 | Semiurban | Y | 5000 |
---|
LP002961 | Male | Yes | 1 | Graduate | No | 3400 | 2500.0 | 173.0 | 360.0 | 1.0 | Semiurban | Y | 5000 |
---|
LP002990 | Female | No | 0 | Graduate | Yes | 4583 | 0.0 | 133.0 | 360.0 | 0.0 | Semiurban | N | 5000 |
---|
614 rows × 13 columns
impute_grps = T1.pivot_table(values=["Credit_History"],index=["Property_Area","rates"],aggfunc=np.mean)
ind = tuple([row['Property_Area'],row['rates']])
T1.loc["Credit_History"] = impute_grps.loc[ind].values[0]
grps = T1.groupby(["Property_Area","rates"])
grps["Credit_History"].mean()
Property_Area rates
0.8617511520737328 0.861751 0.861751
Rural 1000.000000 0.830303
Semiurban 5000.000000 0.861751
Urban 12000.000000 0.829670
Name: Credit_History, dtype: float64
T2=T1.groupby(["Property_Area","rates"])["Credit_History"].count().reset_index()
T2.set_index("Property_Area")
| rates | Credit_History |
---|
Property_Area | | |
---|
Rural | 1000 | 165 |
---|
Semiurban | 5000 | 217 |
---|
Urban | 12000 | 182 |
---|