https://www.kaggle.com/c/house-prices-advanced-regression-techniques
🟡 Description
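The snippets below assume the standard imports and that the competition's train.csv / test.csv have been read with Id as the index (a sketch of the assumed setup, which is not shown in the original):

import numpy as np
import pandas as pd
import seaborn as sns

train = pd.read_csv("train.csv", index_col="Id")
test = pd.read_csv("test.csv", index_col="Id")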
df = pd.concat([train, test])
print(df.shape)
>>> (2919, 80)
print("왜도(Skewness):", train["SalePrice"].skew())
print("첨도(Kurtosis):", train["SalePrice"].kurtosis())
>>> 왜도(Skewness): 1.8828757597682129
>>> 첨도(Kurtosis): 6.536281860064529
train["SalePrice_log1p"] = np.log1p(train["SalePrice"])
print("왜도(Skewness):", train["SalePrice_log1p"].skew())
print("첨도(Kurtosis):", train["SalePrice_log1p"].kurt())
>>> 왜도(Skewness): 0.12134661989685333
>>> 첨도(Kurtosis): 0.809519155707878
isna_sum = df.isnull().sum()
isna_mean = df.isnull().mean()
pd.concat([isna_sum, isna_mean], axis=1).nlargest(10,1)
# Drop columns where more than 80% of the values are missing
null_feature = isna_mean[isna_mean > 0.8].index
df = df.drop(columns=null_feature)
print(df.shape)
>>> (2919, 76)
# Check the result
df.isnull().mean().sort_values(ascending=False).nlargest(10)
>>> SalePrice 0.499829
FireplaceQu 0.486468
LotFrontage 0.166495
GarageCond 0.054471
GarageYrBlt 0.054471
GarageFinish 0.054471
GarageQual 0.054471
GarageType 0.053786
BsmtExposure 0.028092
BsmtCond 0.028092
dtype: float64
corr = df.corr(numeric_only=True)  # correlation over numeric columns only
corr.loc[corr["SalePrice"] >= 0.5, "SalePrice"].sort_values(ascending=False)
SalePrice 1.000000
OverallQual 0.790982
GrLivArea 0.708624
GarageCars 0.640409
GarageArea 0.623431
TotalBsmtSF 0.613581
1stFlrSF 0.605852
FullBath 0.560664
TotRmsAbvGrd 0.533723
YearBuilt 0.522897
YearRemodAdd 0.507101
Name: SalePrice, dtype: float64
sp_cor_up = corr.loc[corr["SalePrice"] >= 0.7, "SalePrice"].index
sns.pairplot(train[sp_cor_up], corner=True)
df["TotalSF"] = df["TotalBsmtSF"] + df["1stFlrSF"] + df["2ndFlrSF"]
df[["TotalSF","TotalBsmtSF","1stFlrSF","2ndFlrSF"]]
# Fill missing values in Garage-related categorical features with 'None'
Garage_None = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
# One-liner alternative: df[Garage_None] = df[Garage_None].fillna("None")
for col in Garage_None:
    df.loc[df[col].isnull(), col] = "None"
# Fill missing values in Garage-related numeric features with 0
Garage_0 = ['GarageYrBlt', 'GarageArea', 'GarageCars']
# One-liner alternative: df[Garage_0] = df[Garage_0].fillna(0)
for col in Garage_0:
    df.loc[df[col].isnull(), col] = 0
# Fill with the mode
fill_mod = ['MSZoning', 'KitchenQual', 'Exterior1st', 'Exterior2nd', 'SaleType', 'Functional']
# One-liner alternative: df[fill_mod] = df[fill_mod].fillna(df[fill_mod].mode().loc[0])
for col in fill_mod:
    df.loc[df[col].isnull(), col] = df[col].value_counts().index[0]  # value_counts() already sorts descending
# Fill with the median
feature_num = df.select_dtypes(include="number").columns.tolist()
feature_num.remove("SalePrice")
# One-liner alternative: df[feature_num] = df[feature_num].fillna(df[feature_num].median())
for col in feature_num:
    df.loc[df[col].isnull(), col] = df[col].median()
num_to_str_col = ["MSSubClass", "OverallCond", "YrSold", "MoSold"]
# One-liner alternative: df[num_to_str_col] = df[num_to_str_col].astype('object')
# Once these are strings, they will go through one-hot encoding later.
for col in num_to_str_col:
    df[col] = df[col].astype("object")
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2919 entries, 1 to 2919
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 MSSubClass 2919 non-null object
1 OverallCond 2919 non-null object
2 YrSold 2919 non-null object
3 MoSold 2919 non-null object
dtypes: object(4)
memory usage: 178.6+ KB
📍 Highly skewed features
# Find features whose absolute skewness exceeds 2
feature_skew = abs(df.skew(numeric_only=True))
skewed_col = feature_skew[feature_skew > 2].index
skewed_col
📍 Features with a small nunique()
# Among the numeric features, find those with few unique values
log_features = ['LotFrontage','LotArea','MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF',
'TotalBsmtSF','1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea',
'BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr',
'TotRmsAbvGrd','Fireplaces','GarageCars','GarageArea','WoodDeckSF','OpenPorchSF',
'EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MiscVal','YearRemodAdd']
lf_nu = df[log_features].nunique()[df[log_features].nunique() < 30].index
num_log_feature = list(set(log_features) - set(lf_nu))
num_log_feature # The features listed here are candidates for log transformation, as sketched below.
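The original stops at identifying the candidates; actually applying the transform could look like this (a sketch only — np.log1p is used because most of these area-type features contain zeros):

# Sketch: log-transform the candidate features in place.
# log1p rather than log, since these columns contain zeros.
df[num_log_feature] = np.log1p(df[num_log_feature])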
Besides the transformers sklearn provides, this can be computed directly with pandas.
Squaring is mainly used to emphasize or separate values when a feature's distribution is close to uniform.
squared_features = ['YearRemodAdd', 'LotFrontage',
'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea','GarageArea', 'TotalSF']
df_square = df[squared_features] ** 2
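For comparison, the sklearn counterpart of the pandas one-liner above would be FunctionTransformer (a sketch only; the pipeline here sticks with pandas):

from sklearn.preprocessing import FunctionTransformer

# Sketch: the same squaring expressed as an sklearn transformer,
# which can be dropped into a Pipeline if preferred.
squarer = FunctionTransformer(np.square)
df_square_sk = squarer.fit_transform(df[squared_features])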
df_cate = df.select_dtypes(include="object")
feature_cate = df_cate.isnull().mean().sort_values()[:-2].index  # drop the two with the most missing values
feature_cate # features to use
label_name = "SalePrice_log1p"
feature_names = []
feature_names.extend(num_log_feature)
feature_names.append("TotalSF")
feature_names.extend(feature_cate)
feature_names.remove("2ndFlrSF")
feature_names.remove("1stFlrSF")
feature_names.remove("BsmtFinSF1")
feature_names.remove("BsmtFinSF2")
feature_names
list.append() drops the whole bag of chips into the list; list.extend() tears the bag open and pours the chips in.
a = []
# append
a.append([1, 2, 3])
a
>>> [[1, 2, 3]] # the list holds 1 element
# extend
a = []
a.extend([1, 2, 3])
a
>>> [1, 2, 3] # the list holds 3 elements
If train and test are concatenated, pd.get_dummies is convenient because both sets end up with exactly the same dummy columns.
df_ohe = pd.get_dummies(df[feature_names])
df[feature_names].shape, df_ohe.shape
>>> ((2919, 57), (2919, 297))
X_train = df_ohe.loc[train.index]
X_test = df_ohe.loc[test.index]
y_train = train[label_name]
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state=42)
model
4.1 Cross validation with KFold & getting predictions
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# cross_val_predict returns an out-of-fold prediction for every training row
y_val_predict = cross_val_predict(model, X_train, y_train, cv=kf, n_jobs=-1)
4.2 metrics
rmse = np.sqrt(np.square(y_train - y_val_predict).mean())
rmse
>>> 0.15360940285057656
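The same value can be obtained with sklearn's metric (shown as an alternative sketch):

from sklearn.metrics import mean_squared_error

# Alternative: RMSE via sklearn on the out-of-fold predictions.
rmse_sk = np.sqrt(mean_squared_error(y_train, y_val_predict))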
from sklearn.metrics import r2_score
r2score = r2_score(y_train, y_val_predict)
r2score
>>> 0.8520176585188195
4.3 Comparing with regplot / kdeplot
sns.regplot(x = y_train, y = y_val_predict)
sns.kdeplot(y_train)
sns.kdeplot(y_val_predict)
y_predict = model.fit(X_train, y_train).predict(X_test)
# Visualize feature importances
fi = pd.Series(model.feature_importances_)
fi.index = model.feature_names_in_
fi.nlargest(20).plot(kind="barh")
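Since the model was trained on SalePrice_log1p, the test predictions must be mapped back with np.expm1 before submitting. A minimal sketch, assuming the competition's usual sample_submission.csv format:

# Sketch: invert the log1p transform and write the submission file.
submission = pd.DataFrame({"Id": X_test.index, "SalePrice": np.expm1(y_predict)})
submission.to_csv("submission.csv", index=False)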