import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_boston
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


# Load the Boston housing data and wrap its feature matrix in a DataFrame
# whose columns are labelled with the dataset's feature names.
boston = load_boston()
df = pd.DataFrame(data=boston.data, columns=boston.feature_names)
# Preview the first three rows (only rendered in an interactive session).
df.head(3)


from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures

# Build the extended Boston dataset
# Reference: https://github.com/amueller/mglearn/blob/master/mglearn/datasets.py#L30
def load_extended_boston(degree=None):
    """Return the Boston housing features scaled with MinMaxScaler, plus the target.

    Args:
        degree: Optional polynomial degree. When given, the scaled features
            are expanded with ``PolynomialFeatures(degree=degree,
            include_bias=False)``. The default ``None`` skips the expansion
            and returns only the scaled original features.

    Returns:
        A tuple ``(X, y)`` where ``X`` is the scaled (and optionally
        expanded) feature matrix and ``y`` is ``boston.target``.
    """
    boston = load_boston()

    # Scale the raw features with MinMaxScaler (default settings).
    X = MinMaxScaler().fit_transform(boston.data)

    # Optional feature generation, e.g. degree=2 for pairwise products.
    if degree is not None:
        X = PolynomialFeatures(degree=degree, include_bias=False).fit_transform(X)

    return X, boston.target


# Load the feature-extended Boston housing price data.
X, y = load_extended_boston()
# Print the feature matrix, then the target vector, each on its own line.
print(X, y, sep='\n')
print('Extended Feature Shape :', X.shape)

[[0.00000000e+00 1.80000000e-01 6.78152493e-02 ... 2.87234043e-01
  1.00000000e+00 8.96799117e-02]
 [2.35922539e-04 0.00000000e+00 2.42302053e-01 ... 5.53191489e-01
  1.00000000e+00 2.04470199e-01]
 [2.35697744e-04 0.00000000e+00 2.42302053e-01 ... 5.53191489e-01
  9.89737254e-01 6.34657837e-02]
 ...
 [6.11892474e-04 0.00000000e+00 4.20454545e-01 ... 8.93617021e-01
  1.00000000e+00 1.07891832e-01]
 [1.16072990e-03 0.00000000e+00 4.20454545e-01 ... 8.93617021e-01
  9.91300620e-01 1.31070640e-01]
 [4.61841693e-04 0.00000000e+00 4.20454545e-01 ... 8.93617021e-01
  1.00000000e+00 1.69701987e-01]]
[24.  21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 15.  18.9 21.7 20.4
 18.2 19.9 23.1 17.5 20.2 18.2 13.6 19.6 15.2 14.5 15.6 13.9 16.6 14.8
 18.4 21.  12.7 14.5 13.2 13.1 13.5 18.9 20.  21.  24.7 30.8 34.9 26.6
 25.3 24.7 21.2 19.3 20.  16.6 14.4 19.4 19.7 20.5 25.  23.4 18.9 35.4
 24.7 31.6 23.3 19.6 18.7 16.  22.2 25.  33.  23.5 19.4 22.  17.4 20.9
 24.2 21.7 22.8 23.4 24.1 21.4 20.  20.8 21.2 20.3 28.  23.9 24.8 22.9
 23.9 26.6 22.5 22.2 23.6 28.7 22.6 22.  22.9 25.  20.6 28.4 21.4 38.7
 43.8 33.2 27.5 26.5 18.6 19.3 20.1 19.5 19.5 20.4 19.8 19.4 21.7 22.8
 18.8 18.7 18.5 18.3 21.2 19.2 20.4 19.3 22.  20.3 20.5 17.3 18.8 21.4
 15.7 16.2 18.  14.3 19.2 19.6 23.  18.4 15.6 18.1 17.4 17.1 13.3 17.8
 14.  14.4 13.4 15.6 11.8 13.8 15.6 14.6 17.8 15.4 21.5 19.6 15.3 19.4
 17.  15.6 13.1 41.3 24.3 23.3 27.  50.  50.  50.  22.7 25.  50.  23.8
 23.8 22.3 17.4 19.1 23.1 23.6 22.6 29.4 23.2 24.6 29.9 37.2 39.8 36.2
 37.9 32.5 26.4 29.6 50.  32.  29.8 34.9 37.  30.5 36.4 31.1 29.1 50.
 33.3 30.3 34.6 34.9 32.9 24.1 42.3 48.5 50.  22.6 24.4 22.5 24.4 20.
 21.7 19.3 22.4 28.1 23.7 25.  23.3 28.7 21.5 23.  26.7 21.7 27.5 30.1
 44.8 50.  37.6 31.6 46.7 31.5 24.3 31.7 41.7 48.3 29.  24.  25.1 31.5
 23.7 23.3 22.  20.1 22.2 23.7 17.6 18.5 24.3 20.5 24.5 26.2 24.4 24.8
 29.6 42.8 21.9 20.9 44.  50.  36.  30.1 33.8 43.1 48.8 31.  36.5 22.8
 30.7 50.  43.5 20.7 21.1 25.2 24.4 35.2 32.4 32.  33.2 33.1 29.1 35.1
 45.4 35.4 46.  50.  32.2 22.  20.1 23.2 22.3 24.8 28.5 37.3 27.9 23.9
 21.7 28.6 27.1 20.3 22.5 29.  24.8 22.  26.4 33.1 36.1 28.4 33.4 28.2
 22.8 20.3 16.1 22.1 19.4 21.6 23.8 16.2 17.8 19.8 23.1 21.  23.8 23.1
 20.4 18.5 25.  24.6 23.  22.2 19.3 22.6 19.8 17.1 19.4 22.2 20.7 21.1
 19.5 18.5 20.6 19.  18.7 32.7 16.5 23.9 31.2 17.5 17.2 23.1 24.5 26.6
 22.9 24.1 18.6 30.1 18.2 20.6 17.8 21.7 22.7 22.6 25.  19.9 20.8 16.8
 21.9 27.5 21.9 23.1 50.  50.  50.  50.  50.  13.8 13.8 15.  13.9 13.3
 13.1 10.2 10.4 10.9 11.3 12.3  8.8  7.2 10.5  7.4 10.2 11.5 15.1 23.2
  9.7 13.8 12.7 13.1 12.5  8.5  5.   6.3  5.6  7.2 12.1  8.3  8.5  5.
 11.9 27.9 17.2 27.5 15.  17.2 17.9 16.3  7.   7.2  7.5 10.4  8.8  8.4
 16.7 14.2 20.8 13.4 11.7  8.3 10.2 10.9 11.   9.5 14.5 14.1 16.1 14.3
 11.7 13.4  9.6  8.7  8.4 12.8 10.5 17.1 18.4 15.4 10.8 11.8 14.9 12.6
 14.1 13.  13.4 15.2 16.1 17.8 14.9 14.1 12.7 13.5 14.9 20.  16.4 17.7
 19.5 20.2 21.4 19.9 19.  19.1 19.1 20.1 19.9 19.6 23.2 29.8 13.8 13.3
 16.7 12.  14.6 21.4 23.  23.7 25.  21.8 20.6 21.2 19.1 20.6 15.2  7.
  8.1 13.6 20.1 21.8 24.5 23.1 19.7 18.3 21.2 17.5 16.8 22.4 20.6 23.9
 22.  11.9]
Extended Feature Shape : (506, 13)


from sklearn.model_selection import KFold

# n_splits: number of validation splits (folds).
num_split = 5
# Candidate parameters (not applied here): shuffle=True, random_state=40
kf = KFold(n_splits=num_split)

# Running sum of the per-fold test MSE.
tot_MSE = 0.0
for train_idx, test_idx in kf.split(X):
    # Slice out this fold's training portion and held-out portion.
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # Declare a Linear Regression model and fit it on the training portion.
    model_lr = LinearRegression().fit(X_train, y_train)

    # Predict on the held-out portion and accumulate its MSE.
    y_pred = model_lr.predict(X_test)
    tot_MSE += mean_squared_error(y_test, y_pred)

# Average the error over the folds.
avg_MSE = tot_MSE / num_split
print('Average MSE :', avg_MSE)
print('Avergae RMSE :', np.sqrt(avg_MSE))

Average MSE : 23.99167263753876
Avergae RMSE : 4.898129503957481


from sklearn.linear_model import Ridge  # L2 규제

# n_splits: number of validation splits (folds).
num_split = 5
# Candidate parameters (not applied here): shuffle=True, random_state=40
kf = KFold(n_splits=num_split)

# Running sum of the per-fold test MSE.
tot_MSE = 0.0
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Declare the Ridge (L2-regularised) regression model.
    ridge_reg = Ridge(alpha=0.8)

    # Fit the model on this fold's training portion.
    ridge_reg.fit(X_train, y_train)

    # Predict on this fold's held-out portion.
    y_pred = ridge_reg.predict(X_test)

    # Accumulate the fold's MSE (Mean Squared Error).
    tot_MSE = tot_MSE + mean_squared_error(y_test, y_pred)

# Average the error over the folds and report that average; the labels say
# "Average", so the summed total must not be printed here.
avg_MSE = tot_MSE / num_split
print('Average MSE :', avg_MSE)
print('Avergae RMSE :', np.sqrt(avg_MSE))

Average MSE : 119.58522647540913
Avergae RMSE : 10.935503028000547


from sklearn.linear_model import Lasso  # L1 규제

# n_splits: number of validation splits (folds).
num_split = 5
# Candidate parameters (not applied here): shuffle=True, random_state=40
kf = KFold(n_splits=num_split)

# Running sum of the per-fold test MSE.
tot_MSE = 0.0
for train_idx, test_idx in kf.split(X):
    # Training portion first, then the held-out portion of this fold.
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # Declare the Lasso (L1-regularised) model and fit it on the training portion.
    lasso_reg = Lasso(alpha=0.02).fit(X_train, y_train)

    # Predict on the held-out portion, then add the fold's MSE to the total.
    y_pred = lasso_reg.predict(X_test)
    tot_MSE += mean_squared_error(y_test, y_pred)

# Average the error over the folds.
avg_MSE = tot_MSE / num_split
print('Average MSE :', avg_MSE)
print('Avergae RMSE :', np.sqrt(avg_MSE))

Average MSE : 35.36971883670263
Avergae RMSE : 5.947244642412369


from sklearn.linear_model import ElasticNet

# n_splits: number of validation splits (folds).
num_split = 5
# Candidate parameters (not applied here): shuffle=True, random_state=40
kf = KFold(n_splits=num_split)

# Running sum of the per-fold test MSE.
tot_MSE = 0.0
for train_idx, test_idx in kf.split(X):
    # Split features and targets into this fold's train / test portions.
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # Declare the ElasticNet model and fit it on the training portion.
    elasticnet_reg = ElasticNet(alpha=0.01).fit(X_train, y_train)

    # Predict on the held-out portion and measure its MSE (Mean Squared Error).
    y_pred = elasticnet_reg.predict(X_test)
    tot_MSE += mean_squared_error(y_test, y_pred)

# Average the error over the folds.
avg_MSE = tot_MSE / num_split
print('Average MSE :', avg_MSE)
print('Avergae RMSE :', np.sqrt(avg_MSE))

Average MSE : 31.723194699963994
Avergae RMSE : 5.632334746795861

	CRIM	ZN	INDUS	NOX	RM	AGE	DIS	RAD	TAX	PTRATIO	B	LSTAT
0	0.00632	18.0	2.31	0.538	6.575	65.2	4.0900	1.0	296.0	15.3	396.90	4.98
1	0.02731	0.0	7.07	0.469	6.421	78.9	4.9671	2.0	242.0	17.8	396.90	9.14
2	0.02729	0.0	7.07	0.469	7.185	61.1	4.9671	2.0	242.0	17.8	392.83	4.03

일상 코딩

일상 코딩

[kaggle] Boston 부동산 집 값 예측 본문

[kaggle] Boston 부동산 집 값 예측

보스턴 부동산 데이터의 특징들(Features)¶

필요한 라이브러리 임포트¶

데이터 확인¶

데이터 정규화 - MinMaxScaler()¶

baseline 성능¶

baseline #1 - Average MSE : 37.1318(기본 Linear Regression)¶

baseline #2 - Average MSE : 34.10008 (기본 Linear Regression + Feature Selection 적용)¶

KFold 교차검증 + L2 규제 알고리즘¶

KFold 교차검증 + L1 규제 알고리즘¶

KFold 교차검증 + ElasticNet(L1+ L2) 규제 알고리즘¶

'머신러닝 > 선형회귀' 카테고리의 다른 글

티스토리툴바

티스토리툴바

« 2025/06 »
일	월	화	수	목	금	토
1	2	3	4	5	6	7
8	9	10	11	12	13	14
15	16	17	18	19	20	21
22	23	24	25	26	27	28
29	30