[Python 머신러닝] 05-10 회귀 실습 - 캐글 주택 가격: 고급 회귀 기법
회귀
회귀 실습 - 캐글 주택 가격: 고급 회귀 기법
- 피처 엔지니어링
- 로그 변환
- 이상치 제거
- 선형 회귀 모델
- 회귀 트리 모델
- 혼합 결합 및 스태킹 모델
데이터 사전 처리(Preprocessing)
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Load the house-price CSV once and keep that frame untouched;
# all preprocessing below is applied to a separate copy.
house_df_org = pd.read_csv('house_price.csv')
house_df = house_df_org.copy()
# Preview the first three rows.
house_df.head(3)
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
| 1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
| 2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
3 rows × 81 columns
# NOTE(review): the table rendered below does not match the raw frame at this
# point - it lacks Id/Alley/FireplaceQu/PoolQC/Fence/MiscFeature and shows
# log1p-scaled values (e.g. MSSubClass 4.110874 ~= log1p(60)), so it appears
# to have been captured after later preprocessing cells were run. Confirm.
house_df.head(20)
| MSSubClass | MSZoning | LotFrontage | LotArea | Street | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | HeatingQC | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4.110874 | RL | 4.189655 | 9.042040 | Pave | Reg | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2003 | 2003 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 5.283204 | Gd | TA | PConc | Gd | TA | No | GLQ | 6.561031 | Unf | 0.000000 | 150 | 6.753438 | GasA | Ex | Y | SBrkr | 6.753438 | 854 | 0.0 | 7.444833 | 1 | 0.000000 | 2 | 1 | 3 | 0.693147 | Gd | 8 | Typ | 0 | Attchd | 2003.0 | RFn | 2 | 548 | TA | TA | Y | 0.000000 | 4.127134 | 0.000000 | 0.000000 | 0.00000 | 0.0 | 0.000000 | 2 | 2008 | WD | Normal | 12.247699 |
| 1 | 3.044522 | RL | 4.394449 | 9.169623 | Pave | Reg | Lvl | AllPub | FR2 | Gtl | Veenker | Feedr | Norm | 1Fam | 1Story | 6 | 8 | 1976 | 1976 | Gable | CompShg | MetalSd | MetalSd | None | 0.000000 | TA | TA | CBlock | Gd | TA | Gd | ALQ | 6.886532 | Unf | 0.000000 | 284 | 7.141245 | GasA | Ex | Y | SBrkr | 7.141245 | 0 | 0.0 | 7.141245 | 0 | 0.693147 | 2 | 0 | 3 | 0.693147 | TA | 6 | Typ | 1 | Attchd | 1976.0 | RFn | 2 | 460 | TA | TA | Y | 5.700444 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.0 | 0.000000 | 5 | 2007 | WD | Normal | 12.109016 |
| 2 | 4.110874 | RL | 4.234107 | 9.328212 | Pave | IR1 | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2001 | 2002 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 5.093750 | Gd | TA | PConc | Gd | TA | Mn | GLQ | 6.188264 | Unf | 0.000000 | 434 | 6.825460 | GasA | Ex | Y | SBrkr | 6.825460 | 866 | 0.0 | 7.488294 | 1 | 0.000000 | 2 | 1 | 3 | 0.693147 | Gd | 6 | Typ | 1 | Attchd | 2001.0 | RFn | 2 | 608 | TA | TA | Y | 0.000000 | 3.761200 | 0.000000 | 0.000000 | 0.00000 | 0.0 | 0.000000 | 9 | 2008 | WD | Normal | 12.317171 |
| 3 | 4.262680 | RL | 4.110874 | 9.164401 | Pave | IR1 | Lvl | AllPub | Corner | Gtl | Crawfor | Norm | Norm | 1Fam | 2Story | 7 | 5 | 1915 | 1970 | Gable | CompShg | Wd Sdng | Wd Shng | None | 0.000000 | TA | TA | BrkTil | TA | Gd | No | ALQ | 5.379897 | Unf | 0.000000 | 540 | 6.629363 | GasA | Gd | Y | SBrkr | 6.869014 | 756 | 0.0 | 7.448916 | 1 | 0.000000 | 1 | 0 | 3 | 0.693147 | Gd | 7 | Typ | 1 | Detchd | 1998.0 | Unf | 3 | 642 | TA | TA | Y | 0.000000 | 3.583519 | 5.609472 | 0.000000 | 0.00000 | 0.0 | 0.000000 | 2 | 2006 | WD | Abnorml | 11.849405 |
| 4 | 4.110874 | RL | 4.442651 | 9.565284 | Pave | IR1 | Lvl | AllPub | FR2 | Gtl | NoRidge | Norm | Norm | 1Fam | 2Story | 8 | 5 | 2000 | 2000 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 5.860786 | Gd | TA | PConc | Gd | TA | Av | GLQ | 6.486161 | Unf | 0.000000 | 490 | 7.044033 | GasA | Ex | Y | SBrkr | 7.044033 | 1053 | 0.0 | 7.695758 | 1 | 0.000000 | 2 | 1 | 4 | 0.693147 | Gd | 9 | Typ | 1 | Attchd | 2000.0 | RFn | 3 | 836 | TA | TA | Y | 5.262690 | 4.442651 | 0.000000 | 0.000000 | 0.00000 | 0.0 | 0.000000 | 12 | 2008 | WD | Normal | 12.429220 |
| 5 | 3.931826 | RL | 4.454347 | 9.555064 | Pave | IR1 | Lvl | AllPub | Inside | Gtl | Mitchel | Norm | Norm | 1Fam | 1.5Fin | 5 | 5 | 1993 | 1995 | Gable | CompShg | VinylSd | VinylSd | None | 0.000000 | TA | TA | Wood | Gd | TA | No | GLQ | 6.597146 | Unf | 0.000000 | 64 | 6.680855 | GasA | Ex | Y | SBrkr | 6.680855 | 566 | 0.0 | 7.217443 | 1 | 0.000000 | 1 | 1 | 1 | 0.693147 | TA | 5 | Typ | 0 | Attchd | 1993.0 | Unf | 2 | 480 | TA | TA | Y | 3.713572 | 3.433987 | 0.000000 | 5.771441 | 0.00000 | 0.0 | 6.552508 | 10 | 2009 | WD | Normal | 11.870607 |
| 6 | 3.044522 | RL | 4.330733 | 9.218804 | Pave | Reg | Lvl | AllPub | Inside | Gtl | Somerst | Norm | Norm | 1Fam | 1Story | 8 | 5 | 2004 | 2005 | Gable | CompShg | VinylSd | VinylSd | Stone | 5.231109 | Gd | TA | PConc | Ex | TA | Av | GLQ | 7.222566 | Unf | 0.000000 | 317 | 7.430707 | GasA | Ex | Y | SBrkr | 7.435438 | 0 | 0.0 | 7.435438 | 1 | 0.000000 | 2 | 0 | 3 | 0.693147 | Gd | 7 | Typ | 1 | Attchd | 2004.0 | RFn | 2 | 636 | TA | TA | Y | 5.545177 | 4.060443 | 0.000000 | 0.000000 | 0.00000 | 0.0 | 0.000000 | 8 | 2007 | WD | Normal | 12.634606 |
| 7 | 4.110874 | RL | 4.263383 | 9.247925 | Pave | IR1 | Lvl | AllPub | Corner | Gtl | NWAmes | PosN | Norm | 1Fam | 2Story | 7 | 6 | 1973 | 1973 | Gable | CompShg | HdBoard | HdBoard | Stone | 5.484797 | TA | TA | CBlock | Gd | TA | Mn | ALQ | 6.756932 | BLQ | 3.496508 | 216 | 7.010312 | GasA | Ex | Y | SBrkr | 7.010312 | 983 | 0.0 | 7.645398 | 1 | 0.000000 | 2 | 1 | 3 | 0.693147 | TA | 7 | Typ | 2 | Attchd | 1973.0 | RFn | 2 | 484 | TA | TA | Y | 5.463832 | 5.323010 | 5.433722 | 0.000000 | 0.00000 | 0.0 | 5.860786 | 11 | 2009 | WD | Normal | 12.206078 |
| 8 | 3.931826 | RM | 3.951244 | 8.719481 | Pave | Reg | Lvl | AllPub | Inside | Gtl | OldTown | Artery | Norm | 1Fam | 1.5Fin | 7 | 5 | 1931 | 1950 | Gable | CompShg | BrkFace | Wd Shng | None | 0.000000 | TA | TA | BrkTil | TA | TA | No | Unf | 0.000000 | Unf | 0.000000 | 952 | 6.859615 | GasA | Gd | Y | FuseF | 6.930495 | 752 | 0.0 | 7.481556 | 0 | 0.000000 | 2 | 0 | 2 | 1.098612 | TA | 8 | Min1 | 2 | Detchd | 1931.0 | Unf | 2 | 468 | Fa | TA | Y | 4.510860 | 0.000000 | 5.327876 | 0.000000 | 0.00000 | 0.0 | 0.000000 | 4 | 2008 | WD | Abnorml | 11.774528 |
| 9 | 5.252273 | RL | 3.931826 | 8.912069 | Pave | Reg | Lvl | AllPub | Corner | Gtl | BrkSide | Artery | Artery | 2fmCon | 1.5Unf | 5 | 6 | 1939 | 1950 | Gable | CompShg | MetalSd | MetalSd | None | 0.000000 | TA | TA | BrkTil | TA | TA | No | GLQ | 6.747587 | Unf | 0.000000 | 140 | 6.899723 | GasA | Ex | Y | SBrkr | 6.982863 | 0 | 0.0 | 6.982863 | 1 | 0.000000 | 1 | 0 | 2 | 1.098612 | TA | 5 | Typ | 2 | Attchd | 1939.0 | RFn | 1 | 205 | Gd | TA | Y | 0.000000 | 1.609438 | 0.000000 | 0.000000 | 0.00000 | 0.0 | 0.000000 | 1 | 2008 | WD | Normal | 11.678448 |
| 10 | 3.044522 | RL | 4.262680 | 9.323758 | Pave | Reg | Lvl | AllPub | Inside | Gtl | Sawyer | Norm | Norm | 1Fam | 1Story | 5 | 5 | 1965 | 1965 | Hip | CompShg | HdBoard | HdBoard | None | 0.000000 | TA | TA | CBlock | TA | TA | No | Rec | 6.810142 | Unf | 0.000000 | 134 | 6.947937 | GasA | Ex | Y | SBrkr | 6.947937 | 0 | 0.0 | 6.947937 | 1 | 0.000000 | 1 | 0 | 3 | 0.693147 | TA | 5 | Typ | 0 | Detchd | 1965.0 | Unf | 1 | 384 | TA | TA | Y | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.0 | 0.000000 | 2 | 2008 | WD | Normal | 11.771444 |
| 11 | 4.110874 | RL | 4.454347 | 9.386392 | Pave | IR1 | Lvl | AllPub | Inside | Gtl | NridgHt | Norm | Norm | 1Fam | 2Story | 9 | 5 | 2005 | 2006 | Hip | CompShg | WdShing | Wd Shng | Stone | 5.659482 | Ex | TA | PConc | Ex | TA | No | GLQ | 6.906755 | Unf | 0.000000 | 177 | 7.069874 | GasA | Ex | Y | SBrkr | 7.075809 | 1142 | 0.0 | 7.751475 | 1 | 0.000000 | 3 | 0 | 4 | 0.693147 | Ex | 11 | Typ | 2 | BuiltIn | 2005.0 | Fin | 3 | 736 | TA | TA | Y | 4.997212 | 3.091042 | 0.000000 | 0.000000 | 0.00000 | 0.0 | 0.000000 | 7 | 2006 | New | Partial | 12.751303 |
| 12 | 3.044522 | RL | 4.263383 | 9.470317 | Pave | IR2 | Lvl | AllPub | Inside | Gtl | Sawyer | Norm | Norm | 1Fam | 1Story | 5 | 6 | 1962 | 1962 | Hip | CompShg | HdBoard | Plywood | None | 0.000000 | TA | TA | CBlock | TA | TA | No | ALQ | 6.603944 | Unf | 0.000000 | 175 | 6.816736 | GasA | TA | Y | SBrkr | 6.816736 | 0 | 0.0 | 6.816736 | 1 | 0.000000 | 1 | 0 | 2 | 0.693147 | TA | 4 | Typ | 0 | Detchd | 1962.0 | Unf | 1 | 352 | TA | TA | Y | 4.948760 | 0.000000 | 0.000000 | 0.000000 | 5.17615 | 0.0 | 0.000000 | 9 | 2008 | WD | Normal | 11.877576 |
| 13 | 3.044522 | RL | 4.521789 | 9.273597 | Pave | IR1 | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 1Story | 7 | 5 | 2006 | 2007 | Gable | CompShg | VinylSd | VinylSd | Stone | 5.726848 | Gd | TA | PConc | Gd | TA | Av | Unf | 0.000000 | Unf | 0.000000 | 1494 | 7.309881 | GasA | Ex | Y | SBrkr | 7.309881 | 0 | 0.0 | 7.309881 | 0 | 0.000000 | 2 | 0 | 3 | 0.693147 | Gd | 7 | Typ | 1 | Attchd | 2006.0 | RFn | 3 | 840 | TA | TA | Y | 5.081404 | 3.526361 | 0.000000 | 0.000000 | 0.00000 | 0.0 | 0.000000 | 8 | 2007 | New | Partial | 12.540761 |
| 14 | 3.044522 | RL | 4.263383 | 9.298443 | Pave | IR1 | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 6 | 5 | 1960 | 1960 | Hip | CompShg | MetalSd | MetalSd | BrkFace | 5.361292 | TA | TA | CBlock | TA | TA | No | BLQ | 6.598509 | Unf | 0.000000 | 520 | 7.134094 | GasA | TA | Y | SBrkr | 7.134094 | 0 | 0.0 | 7.134094 | 1 | 0.000000 | 1 | 1 | 2 | 0.693147 | TA | 5 | Typ | 1 | Attchd | 1960.0 | RFn | 1 | 352 | TA | TA | Y | 0.000000 | 5.365976 | 5.176150 | 0.000000 | 0.00000 | 0.0 | 0.000000 | 5 | 2008 | WD | Normal | 11.964007 |
| 15 | 3.828641 | RM | 3.951244 | 8.719481 | Pave | Reg | Lvl | AllPub | Corner | Gtl | BrkSide | Norm | Norm | 1Fam | 1.5Unf | 7 | 8 | 1929 | 2001 | Gable | CompShg | Wd Sdng | Wd Sdng | None | 0.000000 | TA | TA | BrkTil | TA | TA | No | Unf | 0.000000 | Unf | 0.000000 | 832 | 6.725034 | GasA | Ex | Y | FuseA | 6.751101 | 0 | 0.0 | 6.751101 | 0 | 0.000000 | 1 | 0 | 2 | 0.693147 | TA | 5 | Typ | 0 | Detchd | 1991.0 | Unf | 2 | 576 | TA | TA | Y | 3.891820 | 4.727388 | 0.000000 | 0.000000 | 0.00000 | 0.0 | 0.000000 | 7 | 2007 | WD | Normal | 11.790565 |
| 16 | 3.044522 | RL | 4.263383 | 9.327412 | Pave | IR1 | Lvl | AllPub | CulDSac | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 6 | 7 | 1970 | 1970 | Gable | CompShg | Wd Sdng | Wd Sdng | BrkFace | 5.198497 | TA | TA | CBlock | TA | TA | No | ALQ | 6.361302 | Unf | 0.000000 | 426 | 6.912743 | GasA | Ex | Y | SBrkr | 6.912743 | 0 | 0.0 | 6.912743 | 1 | 0.000000 | 1 | 0 | 2 | 0.693147 | TA | 5 | Typ | 1 | Attchd | 1970.0 | Fin | 2 | 480 | TA | TA | Y | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.0 | 6.552508 | 3 | 2010 | WD | Normal | 11.911708 |
| 17 | 4.510860 | RL | 4.290459 | 9.286560 | Pave | Reg | Lvl | AllPub | Inside | Gtl | Sawyer | Norm | Norm | Duplex | 1Story | 4 | 5 | 1967 | 1967 | Gable | CompShg | MetalSd | MetalSd | None | 0.000000 | TA | TA | Slab | NaN | NaN | NaN | NaN | 0.000000 | NaN | 0.000000 | 0 | 0.000000 | GasA | TA | Y | SBrkr | 7.167809 | 0 | 0.0 | 7.167809 | 0 | 0.000000 | 2 | 0 | 2 | 1.098612 | TA | 6 | Typ | 0 | CarPort | 1967.0 | Unf | 2 | 516 | TA | TA | Y | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.0 | 6.216606 | 10 | 2006 | WD | Normal | 11.407576 |
| 18 | 3.044522 | RL | 4.204693 | 9.524859 | Pave | Reg | Lvl | AllPub | Inside | Gtl | SawyerW | RRAe | Norm | 1Fam | 1Story | 5 | 5 | 2004 | 2004 | Gable | CompShg | VinylSd | VinylSd | None | 0.000000 | TA | TA | PConc | TA | TA | No | GLQ | 6.472346 | Unf | 0.000000 | 468 | 7.016610 | GasA | Ex | Y | SBrkr | 7.016610 | 0 | 0.0 | 7.016610 | 1 | 0.000000 | 1 | 1 | 3 | 0.693147 | Gd | 6 | Typ | 0 | Detchd | 2004.0 | Unf | 2 | 576 | TA | TA | Y | 0.000000 | 4.634729 | 0.000000 | 0.000000 | 0.00000 | 0.0 | 0.000000 | 6 | 2008 | WD | Normal | 11.976666 |
| 19 | 3.044522 | RL | 4.262680 | 8.930759 | Pave | Reg | Lvl | AllPub | Inside | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 5 | 6 | 1958 | 1965 | Hip | CompShg | BrkFace | Plywood | None | 0.000000 | TA | TA | CBlock | TA | TA | No | LwQ | 6.224558 | Unf | 0.000000 | 525 | 6.937314 | GasA | TA | Y | SBrkr | 7.200425 | 0 | 0.0 | 7.200425 | 0 | 0.000000 | 1 | 0 | 3 | 0.693147 | TA | 6 | Min1 | 0 | Attchd | 1958.0 | Unf | 1 | 294 | TA | TA | Y | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.0 | 0.000000 | 5 | 2009 | COD | Abnorml | 11.842236 |
주요 컬럼 설명
1stFlrSF: First Floor square feet
2ndFlrSF: Second floor square feet
GrLivArea: Above grade (ground) living area square feet
BedroomAbvGr: Bedrooms above grade (does NOT include basement bedrooms)
LotArea: Lot size in square feet
GarageArea: Size of garage in square feet
OverallQual: Rates the overall material and finish of the house
10 Very Excellent
9 Excellent
8 Very Good
7 Good
6 Above Average
5 Average
4 Below Average
3 Fair
2 Poor
1 Very Poor
OverallCond: Rates the overall condition of the house
10 Very Excellent
9 Excellent
8 Very Good
7 Good
6 Above Average
5 Average
4 Below Average
3 Fair
2 Poor
1 Very Poor
YearBuilt: Original construction date
Neighborhood: Physical locations within Ames city limits
Blmngtn Bloomington Heights Blueste Bluestem BrDale Briardale …..
RoofMatl: Roof material ClyTile Clay or Tile CompShg Standard (Composite) Shingle Membran Membrane Metal Metal …..
RoofStyle: Type of roof Flat Flat Gable Gable Gambrel Gambrel (Barn) …..
데이터 타입과 Null 값 개수 확인
house_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Id 1460 non-null int64
1 MSSubClass 1460 non-null int64
2 MSZoning 1460 non-null object
3 LotFrontage 1201 non-null float64
4 LotArea 1460 non-null int64
5 Street 1460 non-null object
6 Alley 91 non-null object
7 LotShape 1460 non-null object
8 LandContour 1460 non-null object
9 Utilities 1460 non-null object
10 LotConfig 1460 non-null object
11 LandSlope 1460 non-null object
12 Neighborhood 1460 non-null object
13 Condition1 1460 non-null object
14 Condition2 1460 non-null object
15 BldgType 1460 non-null object
16 HouseStyle 1460 non-null object
17 OverallQual 1460 non-null int64
18 OverallCond 1460 non-null int64
19 YearBuilt 1460 non-null int64
20 YearRemodAdd 1460 non-null int64
21 RoofStyle 1460 non-null object
22 RoofMatl 1460 non-null object
23 Exterior1st 1460 non-null object
24 Exterior2nd 1460 non-null object
25 MasVnrType 1452 non-null object
26 MasVnrArea 1452 non-null float64
27 ExterQual 1460 non-null object
28 ExterCond 1460 non-null object
29 Foundation 1460 non-null object
30 BsmtQual 1423 non-null object
31 BsmtCond 1423 non-null object
32 BsmtExposure 1422 non-null object
33 BsmtFinType1 1423 non-null object
34 BsmtFinSF1 1460 non-null int64
35 BsmtFinType2 1422 non-null object
36 BsmtFinSF2 1460 non-null int64
37 BsmtUnfSF 1460 non-null int64
38 TotalBsmtSF 1460 non-null int64
39 Heating 1460 non-null object
40 HeatingQC 1460 non-null object
41 CentralAir 1460 non-null object
42 Electrical 1459 non-null object
43 1stFlrSF 1460 non-null int64
44 2ndFlrSF 1460 non-null int64
45 LowQualFinSF 1460 non-null int64
46 GrLivArea 1460 non-null int64
47 BsmtFullBath 1460 non-null int64
48 BsmtHalfBath 1460 non-null int64
49 FullBath 1460 non-null int64
50 HalfBath 1460 non-null int64
51 BedroomAbvGr 1460 non-null int64
52 KitchenAbvGr 1460 non-null int64
53 KitchenQual 1460 non-null object
54 TotRmsAbvGrd 1460 non-null int64
55 Functional 1460 non-null object
56 Fireplaces 1460 non-null int64
57 FireplaceQu 770 non-null object
58 GarageType 1379 non-null object
59 GarageYrBlt 1379 non-null float64
60 GarageFinish 1379 non-null object
61 GarageCars 1460 non-null int64
62 GarageArea 1460 non-null int64
63 GarageQual 1379 non-null object
64 GarageCond 1379 non-null object
65 PavedDrive 1460 non-null object
66 WoodDeckSF 1460 non-null int64
67 OpenPorchSF 1460 non-null int64
68 EnclosedPorch 1460 non-null int64
69 3SsnPorch 1460 non-null int64
70 ScreenPorch 1460 non-null int64
71 PoolArea 1460 non-null int64
72 PoolQC 7 non-null object
73 Fence 281 non-null object
74 MiscFeature 54 non-null object
75 MiscVal 1460 non-null int64
76 MoSold 1460 non-null int64
77 YrSold 1460 non-null int64
78 SaleType 1460 non-null object
79 SaleCondition 1460 non-null object
80 SalePrice 1460 non-null int64
dtypes: float64(3), int64(35), object(43)
memory usage: 924.0+ KB
# Optional: raise pandas' row display limit to see longer listings in full.
# pd.set_option('display.max_rows', 100)
print('데이터 세트의 Shape:', house_df.shape)

# Number of columns per dtype.
dtype_counts = house_df.dtypes.value_counts()
print('\n전체 feature 들의 type \n', dtype_counts)

# Null count per column; report only columns that have nulls, largest first.
isnull_series = house_df.isnull().sum()
null_only_desc = isnull_series[isnull_series > 0].sort_values(ascending=False)
print('\nNull 컬럼과 그 건수:\n ', null_only_desc)
데이터 세트의 Shape: (1460, 81)
전체 feature 들의 type
object 43
int64 35
float64 3
dtype: int64
Null 컬럼과 그 건수:
PoolQC 1453
MiscFeature 1406
Alley 1369
Fence 1179
FireplaceQu 690
LotFrontage 259
GarageType 81
GarageYrBlt 81
GarageFinish 81
GarageQual 81
GarageCond 81
BsmtExposure 38
BsmtFinType2 38
BsmtFinType1 37
BsmtCond 37
BsmtQual 37
MasVnrArea 8
MasVnrType 8
Electrical 1
dtype: int64
타겟값 SalePrice의 분포도 확인
# Histogram (with a KDE curve) of the untransformed SalePrice target.
plt.title('Original Sale Price Histogram')
# Tilt the x tick labels by 15 degrees.
plt.xticks(rotation=15)
sns.histplot(house_df['SalePrice'], kde=True)
plt.show()
로그 변환을 통해 SalePrice 값 분포도 확인
# Same histogram after a log1p transform, i.e. log(1 + SalePrice).
plt.title('Log Transformed Sale Price Histogram')
# Computed into a separate Series; house_df itself is not modified here.
log_SalePrice = np.log1p(house_df['SalePrice'])
sns.histplot(log_SalePrice, kde=True)
plt.show()
# Count nulls per column once, keep only the columns that have any,
# then look up the dtypes of exactly those columns.
null_totals = house_df.isnull().sum()
null_column_count = null_totals[null_totals > 0]
house_df.dtypes[null_column_count.index]
LotFrontage float64
Alley object
MasVnrType object
MasVnrArea float64
BsmtQual object
BsmtCond object
BsmtExposure object
BsmtFinType1 object
BsmtFinType2 object
Electrical object
FireplaceQu object
GarageType object
GarageYrBlt float64
GarageFinish object
GarageQual object
GarageCond object
PoolQC object
Fence object
MiscFeature object
dtype: object
타겟값인 SalePrice를 로그 변환하여 정규 분포에 가까운 형태로 만들고, 피처들 중 숫자형 컬럼의 Null 값을 처리
# Log-transform the target with log1p. A handle on the original (unscaled)
# SalePrice column is kept first, because the column is overwritten next.
original_SalePrice = house_df['SalePrice']
house_df['SalePrice'] = np.log1p(house_df['SalePrice'])
# Drop the Id column and the columns that have too many nulls to be useful.
house_df.drop(['Id','PoolQC' , 'MiscFeature', 'Alley', 'Fence','FireplaceQu'], axis=1 , inplace=True)
# Fill nulls in the remaining numeric columns with each column's mean.
# numeric_only=True is required: the frame still holds object (string)
# columns, and pandas >= 2.0 raises TypeError when asked for their mean.
house_df.fillna(house_df.mean(numeric_only=True), inplace=True)
# Collect the columns that still contain nulls and print their dtypes.
null_totals = house_df.isnull().sum()
null_column_count = null_totals[null_totals > 0]
print('## Null 피처의 Type :\n', house_df.dtypes[null_column_count.index])
## Null 피처의 Type :
MasVnrType object
BsmtQual object
BsmtCond object
BsmtExposure object
BsmtFinType1 object
BsmtFinType2 object
Electrical object
GarageType object
GarageFinish object
GarageQual object
GarageCond object
dtype: object
house_df['BsmtQual'].value_counts()
TA 649
Gd 618
Ex 121
Fa 35
Name: BsmtQual, dtype: int64
문자열값은 모두 카테고리값. 판다스의 get_dummies( )를 이용하여 원-핫 인코딩 수행
- get_dummies()는 Null 값을 별도로 대체하지 않아도 원-핫 인코딩이 가능함: 기본 설정에서는 Null인 행의 해당 더미 컬럼이 모두 0이 되고, dummy_na=True를 주면 아래 예시처럼 NaN 전용 컬럼이 추가됨
# Small demo: a list with two categories and one missing value.
s1 = ['a', 'b', np.nan]
# dummy_na=True adds a dedicated NaN indicator column (see the table below).
pd.get_dummies(s1, dummy_na=True)
| a | b | NaN | |
|---|---|---|---|
| 0 | 1 | 0 | 0 |
| 1 | 0 | 1 | 0 |
| 2 | 0 | 0 | 1 |
# One-hot encode the string columns and report the shape before and after.
print('get_dummies() 수행 전 데이터 Shape:', house_df.shape)
house_df_ohe = pd.get_dummies(house_df)
print('get_dummies() 수행 후 데이터 Shape:', house_df_ohe.shape)

# Re-check for remaining nulls in the encoded frame (counted once).
ohe_null_totals = house_df_ohe.isnull().sum()
null_column_count = ohe_null_totals[ohe_null_totals > 0]
remaining_null_dtypes = house_df_ohe.dtypes[null_column_count.index]
print('## Null 피처의 Type :\n', remaining_null_dtypes)
get_dummies() 수행 전 데이터 Shape: (1460, 75)
get_dummies() 수행 후 데이터 Shape: (1460, 271)
## Null 피처의 Type :
Series([], dtype: object)
pd.set_option('display.max_columns', 300)
house_df_ohe.head(5)
| MSSubClass | LotFrontage | LotArea | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | BsmtFinSF1 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | TotRmsAbvGrd | Fireplaces | GarageYrBlt | GarageCars | GarageArea | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | MoSold | YrSold | SalePrice | MSZoning_C (all) | MSZoning_FV | MSZoning_RH | MSZoning_RL | MSZoning_RM | Street_Grvl | Street_Pave | LotShape_IR1 | LotShape_IR2 | LotShape_IR3 | LotShape_Reg | LandContour_Bnk | LandContour_HLS | LandContour_Low | LandContour_Lvl | Utilities_AllPub | Utilities_NoSeWa | LotConfig_Corner | LotConfig_CulDSac | LotConfig_FR2 | LotConfig_FR3 | LotConfig_Inside | LandSlope_Gtl | LandSlope_Mod | LandSlope_Sev | Neighborhood_Blmngtn | Neighborhood_Blueste | Neighborhood_BrDale | Neighborhood_BrkSide | Neighborhood_ClearCr | Neighborhood_CollgCr | Neighborhood_Crawfor | Neighborhood_Edwards | Neighborhood_Gilbert | Neighborhood_IDOTRR | Neighborhood_MeadowV | Neighborhood_Mitchel | Neighborhood_NAmes | Neighborhood_NPkVill | Neighborhood_NWAmes | Neighborhood_NoRidge | Neighborhood_NridgHt | Neighborhood_OldTown | Neighborhood_SWISU | Neighborhood_Sawyer | Neighborhood_SawyerW | Neighborhood_Somerst | Neighborhood_StoneBr | Neighborhood_Timber | Neighborhood_Veenker | Condition1_Artery | Condition1_Feedr | Condition1_Norm | Condition1_PosA | Condition1_PosN | Condition1_RRAe | Condition1_RRAn | Condition1_RRNe | Condition1_RRNn | Condition2_Artery | Condition2_Feedr | Condition2_Norm | Condition2_PosA | Condition2_PosN | Condition2_RRAe | Condition2_RRAn | Condition2_RRNn | BldgType_1Fam | BldgType_2fmCon | BldgType_Duplex | BldgType_Twnhs | BldgType_TwnhsE | HouseStyle_1.5Fin | HouseStyle_1.5Unf | HouseStyle_1Story | HouseStyle_2.5Fin | HouseStyle_2.5Unf | HouseStyle_2Story | HouseStyle_SFoyer | 
HouseStyle_SLvl | RoofStyle_Flat | RoofStyle_Gable | RoofStyle_Gambrel | RoofStyle_Hip | RoofStyle_Mansard | RoofStyle_Shed | RoofMatl_ClyTile | RoofMatl_CompShg | RoofMatl_Membran | RoofMatl_Metal | RoofMatl_Roll | RoofMatl_Tar&Grv | RoofMatl_WdShake | RoofMatl_WdShngl | Exterior1st_AsbShng | Exterior1st_AsphShn | Exterior1st_BrkComm | Exterior1st_BrkFace | Exterior1st_CBlock | Exterior1st_CemntBd | Exterior1st_HdBoard | Exterior1st_ImStucc | Exterior1st_MetalSd | Exterior1st_Plywood | Exterior1st_Stone | Exterior1st_Stucco | Exterior1st_VinylSd | Exterior1st_Wd Sdng | Exterior1st_WdShing | Exterior2nd_AsbShng | Exterior2nd_AsphShn | Exterior2nd_Brk Cmn | Exterior2nd_BrkFace | Exterior2nd_CBlock | Exterior2nd_CmentBd | Exterior2nd_HdBoard | Exterior2nd_ImStucc | Exterior2nd_MetalSd | Exterior2nd_Other | Exterior2nd_Plywood | Exterior2nd_Stone | Exterior2nd_Stucco | Exterior2nd_VinylSd | Exterior2nd_Wd Sdng | Exterior2nd_Wd Shng | MasVnrType_BrkCmn | MasVnrType_BrkFace | MasVnrType_None | MasVnrType_Stone | ExterQual_Ex | ExterQual_Fa | ExterQual_Gd | ExterQual_TA | ExterCond_Ex | ExterCond_Fa | ExterCond_Gd | ExterCond_Po | ExterCond_TA | Foundation_BrkTil | Foundation_CBlock | Foundation_PConc | Foundation_Slab | Foundation_Stone | Foundation_Wood | BsmtQual_Ex | BsmtQual_Fa | BsmtQual_Gd | BsmtQual_TA | BsmtCond_Fa | BsmtCond_Gd | BsmtCond_Po | BsmtCond_TA | BsmtExposure_Av | BsmtExposure_Gd | BsmtExposure_Mn | BsmtExposure_No | BsmtFinType1_ALQ | BsmtFinType1_BLQ | BsmtFinType1_GLQ | BsmtFinType1_LwQ | BsmtFinType1_Rec | BsmtFinType1_Unf | BsmtFinType2_ALQ | BsmtFinType2_BLQ | BsmtFinType2_GLQ | BsmtFinType2_LwQ | BsmtFinType2_Rec | BsmtFinType2_Unf | Heating_Floor | Heating_GasA | Heating_GasW | Heating_Grav | Heating_OthW | Heating_Wall | HeatingQC_Ex | HeatingQC_Fa | HeatingQC_Gd | HeatingQC_Po | HeatingQC_TA | CentralAir_N | CentralAir_Y | Electrical_FuseA | Electrical_FuseF | Electrical_FuseP | Electrical_Mix | Electrical_SBrkr | KitchenQual_Ex | 
KitchenQual_Fa | KitchenQual_Gd | KitchenQual_TA | Functional_Maj1 | Functional_Maj2 | Functional_Min1 | Functional_Min2 | Functional_Mod | Functional_Sev | Functional_Typ | GarageType_2Types | GarageType_Attchd | GarageType_Basment | GarageType_BuiltIn | GarageType_CarPort | GarageType_Detchd | GarageFinish_Fin | GarageFinish_RFn | GarageFinish_Unf | GarageQual_Ex | GarageQual_Fa | GarageQual_Gd | GarageQual_Po | GarageQual_TA | GarageCond_Ex | GarageCond_Fa | GarageCond_Gd | GarageCond_Po | GarageCond_TA | PavedDrive_N | PavedDrive_P | PavedDrive_Y | SaleType_COD | SaleType_CWD | SaleType_Con | SaleType_ConLD | SaleType_ConLI | SaleType_ConLw | SaleType_New | SaleType_Oth | SaleType_WD | SaleCondition_Abnorml | SaleCondition_AdjLand | SaleCondition_Alloca | SaleCondition_Family | SaleCondition_Normal | SaleCondition_Partial | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 60 | 65.0 | 8450 | 7 | 5 | 2003 | 2003 | 196.0 | 706 | 0 | 150 | 856 | 856 | 854 | 0 | 1710 | 1 | 0 | 2 | 1 | 3 | 1 | 8 | 0 | 2003.0 | 2 | 548 | 0 | 61 | 0 | 0 | 0 | 0 | 0 | 2 | 2008 | 12.247699 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 1 | 20 | 80.0 | 9600 | 6 | 8 | 1976 | 1976 | 0.0 | 978 | 0 | 284 | 1262 | 1262 | 0 | 0 | 1262 | 0 | 1 | 2 | 0 | 3 | 1 | 6 | 1 | 1976.0 | 2 | 460 | 298 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 2007 | 12.109016 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2 | 60 | 68.0 | 11250 | 7 | 5 | 2001 | 2002 | 162.0 | 486 | 0 | 434 | 920 | 920 | 866 | 0 | 1786 | 1 | 0 | 2 | 1 | 3 | 1 | 6 | 1 | 2001.0 | 2 | 608 | 0 | 42 | 0 | 0 | 0 | 0 | 0 | 9 | 2008 | 12.317171 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 3 | 70 | 60.0 | 9550 | 7 | 5 | 1915 | 1970 | 0.0 | 216 | 0 | 540 | 756 | 961 | 756 | 0 | 1717 | 1 | 0 | 1 | 0 | 3 | 1 | 7 | 1 | 1998.0 | 3 | 642 | 0 | 35 | 272 | 0 | 0 | 0 | 0 | 2 | 2006 | 11.849405 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
| 4 | 60 | 84.0 | 14260 | 8 | 5 | 2000 | 2000 | 350.0 | 655 | 0 | 490 | 1145 | 1145 | 1053 | 0 | 2198 | 1 | 0 | 2 | 1 | 4 | 1 | 9 | 1 | 2000.0 | 3 | 836 | 192 | 84 | 0 | 0 | 0 | 0 | 0 | 12 | 2008 | 12.429220 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
선형 회귀 모델 학습/예측/평가
- RMSE 평가 함수 생성: 타겟값을 로그 변환하여 학습했으므로 모델의 예측값도 로그 스케일로 나옴. 실제 타겟값과 예측값이 모두 로그값이므로 RMSE를 적용하면 자연스럽게 RMSLE가 됨.
# Takes an already-fitted model, predicts on the test split and computes RMSE.
def get_rmse(model):
    """Predict on the held-out test split with a fitted model and report RMSE.

    Reads the module-level ``X_test`` / ``y_test`` split and calls
    ``mean_squared_error``, so those names must exist before this is called.
    The target in this walkthrough is log1p-transformed, so the value is an
    RMSE on the log scale.

    Args:
        model: Fitted estimator exposing ``predict``.

    Returns:
        Square root of the mean squared error between ``y_test`` and the
        model's predictions. The value is also printed, rounded to 3 places.
    """
    pred = model.predict(X_test)
    # Both y_test and pred are on the log scale.
    mse = mean_squared_error(y_test, pred)
    rmse = np.sqrt(mse)
    print(f'{model.__class__.__name__} 로그 변환된 RMSE: {np.round(rmse, 3)}')
    return rmse
def get_rmses(models):
    """Return a list holding the test RMSE of each model in ``models``, in order."""
    return [get_rmse(model) for model in models]
LinearRegression, Ridge, Lasso 학습, 예측, 평가
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Separate the target column from the features, then hold out 20% for testing.
y_target = house_df_ohe['SalePrice']
X_features = house_df_ohe.drop(columns=['SalePrice'])
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y_target, test_size=0.2, random_state=156
)

# Fit LinearRegression, Ridge and Lasso with default settings, then evaluate them.
lr_reg = LinearRegression().fit(X_train, y_train)
ridge_reg = Ridge().fit(X_train, y_train)
lasso_reg = Lasso().fit(X_train, y_train)

models = [lr_reg, ridge_reg, lasso_reg]
get_rmses(models)
LinearRegression 로그 변환된 RMSE: 0.132
Ridge 로그 변환된 RMSE: 0.128
Lasso 로그 변환된 RMSE: 0.176
[0.13189576579154258, 0.12750846334053145, 0.17628250556471395]
회귀 계수값과 컬럼명 시각화를 위해 상위 10개, 하위 10개(-값으로 가장 큰 10개) 회귀 계수값과 컬럼명을 가지는 Series생성 함수.
# Pair every LinearRegression coefficient with its feature name.
coef = pd.Series(lr_reg.coef_, index=X_features.columns)
# Descending order: the head holds the 10 largest coefficients, the tail the 10 smallest.
coef_high = coef.sort_values(ascending=False).iloc[:10]
coef_low = coef.sort_values(ascending=False).iloc[-10:]
coef_low
Neighborhood_MeadowV -0.228006
Condition2_RRAe -0.243308
Electrical_FuseP -0.258129
Heating_Grav -0.264520
GarageCond_Ex -0.265441
Electrical_Mix -0.300984
MSZoning_C (all) -0.323040
Functional_Maj2 -0.426389
Condition2_PosN -0.663744
RoofMatl_ClyTile -2.372268
dtype: float64
def get_top_bottom_coef(model):
    """Return the 10 largest and the 10 smallest coefficients of a fitted model.

    Both results are Series indexed by the column names of the module-level
    ``X_features``, ordered from largest to smallest coefficient.
    """
    coef_by_feature = pd.Series(model.coef_, index=X_features.columns)
    ranked = coef_by_feature.sort_values(ascending=False)
    return ranked.head(10), ranked.tail(10)
인자로 입력되는 여러개의 회귀 모델들에 대한 회귀계수값과 컬럼명 시각화
def visualize_coefficient(models):
    """Plot the top and bottom 10 coefficients of each model as horizontal bars.

    One subplot column is created per model, so any number of fitted linear
    models can be passed (the layout used to be fixed at three columns).
    Nothing is drawn when ``models`` is empty.
    """
    models = list(models)
    if not models:
        # plt.subplots() cannot build a grid with zero columns.
        return
    # squeeze=False keeps axs two-dimensional even when only one model is given.
    fig, axs = plt.subplots(figsize=(24, 10), nrows=1, ncols=len(models), squeeze=False)
    fig.tight_layout()
    for ax, model in zip(axs[0], models):
        # Stack the 10 largest and 10 smallest coefficients into one Series.
        coef_high, coef_low = get_top_bottom_coef(model)
        coef_concat = pd.concat([coef_high, coef_low])
        ax.set_title(model.__class__.__name__+' Coeffiecents', size=25)
        # Shift the y tick labels (negative pad) and enlarge the fonts so
        # all panels stay readable side by side.
        ax.tick_params(axis="y", direction="in", pad=-120)
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontsize(22)
        sns.barplot(x=coef_concat.values, y=coef_concat.index, ax=ax)
# Visualize the regression coefficients of the lr_reg, ridge_reg and lasso_reg models fitted above.
models = [lr_reg, ridge_reg, lasso_reg]
visualize_coefficient(models)
5 폴드 교차검증으로 모델별로 RMSE와 평균 RMSE출력
from sklearn.model_selection import cross_val_score
def get_avg_rmse_cv(models):
    """Print the per-fold and mean 5-fold cross-validation RMSE of each model.

    Cross-validation runs on the whole module-level ``X_features`` /
    ``y_target`` rather than on the train split.
    """
    for model in models:
        model_name = model.__class__.__name__
        # The scorer yields negated MSE values; flip the sign before the square root.
        neg_mse_scores = cross_val_score(model, X_features, y_target,
                                         scoring="neg_mean_squared_error", cv=5)
        rmse_list = np.sqrt(-neg_mse_scores)
        rmse_avg = np.mean(rmse_list)
        print('\n{0} CV RMSE 값 리스트: {1}'.format(model_name, np.round(rmse_list, 3)))
        print('{0} CV 평균 RMSE 값: {1}'.format(model_name, np.round(rmse_avg, 3)))
# Print the CV RMSE values of the ridge_reg and lasso_reg models fitted above.
models = [ridge_reg, lasso_reg]
get_avg_rmse_cv(models)
Ridge CV RMSE 값 리스트: [0.117 0.154 0.142 0.117 0.189]
Ridge CV 평균 RMSE 값: 0.144
Lasso CV RMSE 값 리스트: [0.161 0.204 0.177 0.181 0.265]
Lasso CV 평균 RMSE 값: 0.198
각 모델들의 alpha값을 변경하면서 하이퍼 파라미터 튜닝 후 다시 재 학습/예측/평가
from sklearn.model_selection import GridSearchCV
def print_best_params(model, params):
    """Grid-search ``params`` for ``model`` with 5-fold CV and report the best result.

    The search is fitted on the module-level ``X_features`` / ``y_target``.
    Prints the best mean RMSE together with the winning parameters and
    returns the search's ``best_estimator_``.
    """
    searcher = GridSearchCV(model, param_grid=params,
                            scoring='neg_mean_squared_error', cv=5)
    searcher.fit(X_features, y_target)
    # best_score_ is a negated MSE; negate it back before taking the root.
    best_rmse = np.sqrt(-1 * searcher.best_score_)
    report = '{0} 5 CV 시 최적 평균 RMSE 값: {1}, 최적 alpha:{2}'.format(
        model.__class__.__name__, np.round(best_rmse, 4), searcher.best_params_)
    print(report)
    return searcher.best_estimator_
# Candidate alpha values for the Ridge and Lasso grid searches.
ridge_params = { 'alpha':[0.05, 0.1, 1, 5, 8, 10, 12, 15, 20] }
lasso_params = { 'alpha':[0.001, 0.005, 0.008, 0.05, 0.03, 0.1, 0.5, 1,5, 10] }
# The result was previously bound to the misspelled name `best_rige`;
# `best_ridge` matches the name used by the later tuning cells.
best_ridge = print_best_params(ridge_reg, ridge_params)
best_lasso = print_best_params(lasso_reg, lasso_params)
Ridge 5 CV 시 최적 평균 RMSE 값: 0.1418, 최적 alpha:{'alpha': 12}
Lasso 5 CV 시 최적 평균 RMSE 값: 0.142, 최적 alpha:{'alpha': 0.001}
# Refit on the training split with the alpha values chosen by the grid search above,
# then predict and evaluate on the test split.
lr_reg = LinearRegression().fit(X_train, y_train)
ridge_reg = Ridge(alpha=12).fit(X_train, y_train)
lasso_reg = Lasso(alpha=0.001).fit(X_train, y_train)

models = [lr_reg, ridge_reg, lasso_reg]
# Print the RMSE of every model.
get_rmses(models)
# Plot the regression coefficients of every model.
visualize_coefficient(models)
LinearRegression 로그 변환된 RMSE: 0.132
Ridge 로그 변환된 RMSE: 0.124
Lasso 로그 변환된 RMSE: 0.12
숫자 피처들에 대한 데이터 분포 왜곡도 확인 후 높은 왜곡도를 가지는 피처 추출
Skew 유형에 따른 변환 방법
- Right Skew된 경우는 로그 변환(Log Transformation) 적용
- 일상 업무의 데이터는 Right Skew된 경우가 많음
- 음수 값이 포함된 데이터는 로그 변환 될 수 없음 (이 경우 최소 음수 값에 해당되는 양수 값을 일괄적으로 더해서 보정 후 변환)
- 학습 데이터를 보정하여 스케일 변환하면 테스트 데이터도 동일하게 변환해야 함
- Left Skew된 경우는 Exponential/Power 변환 적용
Skewness(왜도)의 이해
-0.5 ~ 0.5 사이는 대칭에 가깝고, -1보다 작거나(Left Skew) 또는 1보다 클 경우(Right Skew) 왜도가 심함
- Right Skew: mean > median > mode
- Left Skew: mode > median > mean
# Imported here because this cell calls skew() ahead of the file's later scipy import.
from scipy.stats import skew

# Column labels of the numeric (non-object dtype) features.
features_index = house_df.dtypes[house_df.dtypes != 'object'].index
# Selecting those columns gives a sub-frame; compute the skewness of each column.
skew_features = house_df[features_index].apply(lambda x : skew(x))
# Keep only the features whose skewness is greater than 1. This name was
# previously used on the next line before being defined.
skew_features_top = skew_features[skew_features > 1]
house_df[skew_features_top.index]
| MSSubClass | LotFrontage | LotArea | MasVnrArea | BsmtFinSF1 | BsmtFinSF2 | TotalBsmtSF | 1stFlrSF | LowQualFinSF | GrLivArea | BsmtHalfBath | KitchenAbvGr | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 60 | 65.0 | 8450 | 196.0 | 706 | 0 | 856 | 856 | 0 | 1710 | 0 | 1 | 0 | 61 | 0 | 0 | 0 | 0 | 0 |
| 1 | 20 | 80.0 | 9600 | 0.0 | 978 | 0 | 1262 | 1262 | 0 | 1262 | 1 | 1 | 298 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 60 | 68.0 | 11250 | 162.0 | 486 | 0 | 920 | 920 | 0 | 1786 | 0 | 1 | 0 | 42 | 0 | 0 | 0 | 0 | 0 |
| 3 | 70 | 60.0 | 9550 | 0.0 | 216 | 0 | 756 | 961 | 0 | 1717 | 0 | 1 | 0 | 35 | 272 | 0 | 0 | 0 | 0 |
| 4 | 60 | 84.0 | 14260 | 350.0 | 655 | 0 | 1145 | 1145 | 0 | 2198 | 0 | 1 | 192 | 84 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1455 | 60 | 62.0 | 7917 | 0.0 | 0 | 0 | 953 | 953 | 0 | 1647 | 0 | 1 | 0 | 40 | 0 | 0 | 0 | 0 | 0 |
| 1456 | 20 | 85.0 | 13175 | 119.0 | 790 | 163 | 1542 | 2073 | 0 | 2073 | 0 | 1 | 349 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1457 | 70 | 66.0 | 9042 | 0.0 | 275 | 0 | 1152 | 1188 | 0 | 2340 | 0 | 1 | 0 | 60 | 0 | 0 | 0 | 0 | 2500 |
| 1458 | 20 | 68.0 | 9717 | 0.0 | 49 | 1029 | 1078 | 1078 | 0 | 1078 | 0 | 1 | 366 | 0 | 112 | 0 | 0 | 0 | 0 |
| 1459 | 20 | 75.0 | 9937 | 0.0 | 830 | 290 | 1256 | 1256 | 0 | 1256 | 0 | 1 | 736 | 68 | 0 | 0 | 0 | 0 | 0 |
1460 rows × 19 columns
from scipy.stats import skew
# Column labels of the numeric (non-object dtype) features.
features_index = house_df.dtypes.loc[house_df.dtypes != 'object'].index
# Selecting those columns gives a sub-frame; compute the skewness of each column.
skew_features = house_df[features_index].apply(skew)
# Keep only the features whose skewness is greater than 1.
skew_features_top = skew_features.loc[skew_features > 1]
print(skew_features_top.sort_values(ascending=False))
MiscVal 24.451640
PoolArea 14.813135
LotArea 12.195142
3SsnPorch 10.293752
LowQualFinSF 9.002080
KitchenAbvGr 4.483784
BsmtFinSF2 4.250888
ScreenPorch 4.117977
BsmtHalfBath 4.099186
EnclosedPorch 3.086696
MasVnrArea 2.673661
LotFrontage 2.382499
OpenPorchSF 2.361912
BsmtFinSF1 1.683771
WoodDeckSF 1.539792
TotalBsmtSF 1.522688
MSSubClass 1.406210
1stFlrSF 1.375342
GrLivArea 1.365156
dtype: float64
왜곡도가 1보다 큰 피처들에 로그 변환(log1p)을 적용하고, 다시 하이퍼 파라미터 튜닝 후 재학습/예측/평가
# Apply log1p to the highly skewed features.
house_df[skew_features_top.index] = np.log1p(house_df[skew_features_top.index])

# The features changed, so redo the one-hot encoding and rebuild the
# feature/target sets and the train/test split.
house_df_ohe = pd.get_dummies(house_df)
y_target = house_df_ohe['SalePrice']
X_features = house_df_ohe.drop(columns=['SalePrice'])
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y_target, test_size=0.2, random_state=156
)

# Search for the best hyperparameters again and print the RMSE after the log transform.
ridge_params = {'alpha': [0.05, 0.1, 1, 5, 8, 10, 12, 15, 20]}
lasso_params = {'alpha': [0.001, 0.005, 0.008, 0.05, 0.03, 0.1, 0.5, 1, 5, 10]}
best_ridge = print_best_params(ridge_reg, ridge_params)
best_lasso = print_best_params(lasso_reg, lasso_params)
Ridge 5 CV 시 최적 평균 RMSE 값: 0.1275, 최적 alpha:{'alpha': 10}
Lasso 5 CV 시 최적 평균 RMSE 값: 0.1252, 최적 alpha:{'alpha': 0.001}
# Refit on the training split with the alpha values chosen by the grid search above,
# then predict and evaluate on the test split.
lr_reg = LinearRegression().fit(X_train, y_train)
ridge_reg = Ridge(alpha=10).fit(X_train, y_train)
lasso_reg = Lasso(alpha=0.001).fit(X_train, y_train)

models = [lr_reg, ridge_reg, lasso_reg]
# Print the RMSE of every model.
get_rmses(models)
# Plot the regression coefficients of every model.
visualize_coefficient(models)
LinearRegression 로그 변환된 RMSE: 0.128
Ridge 로그 변환된 RMSE: 0.122
Lasso 로그 변환된 RMSE: 0.119
이상치 데이터 검출을 위해 주요 피처인 GrLivArea값에 대한 산점도 확인
# Scatter GrLivArea against SalePrice from the untouched copy of the CSV to spot outliers.
plt.scatter(house_df_org['GrLivArea'], house_df_org['SalePrice'])
plt.xlabel('GrLivArea', fontsize=15)
plt.ylabel('SalePrice', fontsize=15)
plt.show()
이상치 데이터 삭제 후 재 학습/예측/평가
# Inspect GrLivArea in the encoded frame; it was log1p-transformed with the other skewed features.
house_df_ohe['GrLivArea']
0 7.444833
1 7.141245
2 7.488294
3 7.448916
4 7.695758
...
1455 7.407318
1456 7.637234
1457 7.758333
1458 6.983790
1459 7.136483
Name: GrLivArea, Length: 1460, dtype: float64
# NOTE(review): this displays outlier_index, which is only assigned further down;
# the notebook cells appear to be listed out of execution order — confirm.
outlier_index
Int64Index([523, 1298], dtype='int64')
# GrLivArea and SalePrice are both log-transformed, so the thresholds are expressed with log1p too.
cond1 = house_df_ohe['GrLivArea'] > np.log1p(4000)
cond2 = house_df_ohe['SalePrice'] < np.log1p(500000)
outlier_index = house_df_ohe.loc[cond1 & cond2].index

print('아웃라이어 레코드 index :', outlier_index.values)
print('아웃라이어 삭제 전 house_df_ohe shape:', house_df_ohe.shape)

# Remove the outlier rows in place, addressing them by their index labels.
house_df_ohe.drop(index=outlier_index, inplace=True)
print('아웃라이어 삭제 후 house_df_ohe shape:', house_df_ohe.shape)
아웃라이어 레코드 index : [ 523 1298]
아웃라이어 삭제 전 house_df_ohe shape: (1460, 271)
아웃라이어 삭제 후 house_df_ohe shape: (1458, 271)
# Rebuild the target/feature sets and the train/test split from the frame without outliers.
y_target = house_df_ohe['SalePrice']
X_features = house_df_ohe.drop(columns=['SalePrice'])
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y_target, test_size=0.2, random_state=156
)

# Repeat the alpha grid search for Ridge and Lasso.
ridge_params = {'alpha': [0.05, 0.1, 1, 5, 8, 10, 12, 15, 20]}
lasso_params = {'alpha': [0.001, 0.005, 0.008, 0.05, 0.03, 0.1, 0.5, 1, 5, 10]}
best_ridge = print_best_params(ridge_reg, ridge_params)
best_lasso = print_best_params(lasso_reg, lasso_params)
Ridge 5 CV 시 최적 평균 RMSE 값: 0.1125, 최적 alpha:{'alpha': 8}
Lasso 5 CV 시 최적 평균 RMSE 값: 0.1122, 최적 alpha:{'alpha': 0.001}
# Refit on the training split with the alpha values chosen by the grid search above,
# then predict and evaluate on the test split.
lr_reg = LinearRegression().fit(X_train, y_train)
ridge_reg = Ridge(alpha=8).fit(X_train, y_train)
lasso_reg = Lasso(alpha=0.001).fit(X_train, y_train)

models = [lr_reg, ridge_reg, lasso_reg]
# Print the RMSE of every model.
get_rmses(models)
# Plot the regression coefficients of every model.
visualize_coefficient(models)
LinearRegression 로그 변환된 RMSE: 0.129
Ridge 로그 변환된 RMSE: 0.103
Lasso 로그 변환된 RMSE: 0.1
회귀 트리 모델 학습/예측/평가
XGBoost와 LightGBM 학습/예측/평가
from xgboost import XGBRegressor

# Single-value grid: print_best_params() is used here for its 5-fold CV RMSE report.
xgb_params = {'n_estimators': [1000]}
xgb_reg = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    colsample_bytree=0.5,
    subsample=0.8,
)
best_xgb = print_best_params(xgb_reg, xgb_params)

from lightgbm import LGBMRegressor

lgbm_params = {'n_estimators': [1000]}
lgbm_reg = LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=4,
    subsample=0.6,
    colsample_bytree=0.4,
    reg_lambda=10,
    n_jobs=-1,
)
best_lgbm = print_best_params(lgbm_reg, lgbm_params)
def get_top_features(model):
    """Return the 20 highest feature importances of ``model`` as a Series.

    The Series is indexed by the column names of the module-level
    ``X_features`` and sorted from most to least important.
    """
    importances = pd.Series(model.feature_importances_, index=X_features.columns)
    return importances.sort_values(ascending=False).head(20)
def visualize_ftr_importances(models):
    """Plot the top 20 feature importances of each model as horizontal bars.

    One subplot column is created per model, so any number of fitted models
    exposing ``feature_importances_`` can be passed (the layout used to be
    fixed at two columns). Nothing is drawn when ``models`` is empty.
    """
    models = list(models)
    if not models:
        # plt.subplots() cannot build a grid with zero columns.
        return
    # squeeze=False keeps axs two-dimensional even when only one model is given.
    fig, axs = plt.subplots(figsize=(24, 10), nrows=1, ncols=len(models), squeeze=False)
    fig.tight_layout()
    for ax, model in zip(axs[0], models):
        # Names and values of the 20 most important features.
        ftr_top20 = get_top_features(model)
        ax.set_title(model.__class__.__name__+' Feature Importances', size=25)
        # Enlarge the tick label fonts.
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontsize(22)
        sns.barplot(x=ftr_top20.values, y=ftr_top20.index, ax=ax)
# Visualize the feature importances of the estimators returned by print_best_params() above.
models = [best_xgb, best_lgbm]
visualize_ftr_importances(models)
회귀 모델의 예측 결과 혼합을 통한 최종 예측
def get_rmse_pred(preds):
    """Print the RMSE of every prediction in ``preds`` against the module-level ``y_test``.

    ``preds`` maps a display name to the predictions to score.
    """
    for name, pred_value in preds.items():
        rmse = np.sqrt(mean_squared_error(y_test, pred_value))
        print('{0} 모델의 RMSE: {1}'.format(name, rmse))
# Train the individual models.
ridge_reg = Ridge(alpha=8).fit(X_train, y_train)
lasso_reg = Lasso(alpha=0.001).fit(X_train, y_train)

# Predict with each model on the test split.
ridge_pred = ridge_reg.predict(X_test)
lasso_pred = lasso_reg.predict(X_test)

# Blend the individual predictions with fixed weights to get the final prediction.
pred = 0.4 * ridge_pred + 0.6 * lasso_pred
preds = {
    '최종 혼합': pred,
    'Ridge': ridge_pred,
    'Lasso': lasso_pred,
}

# Print the RMSE of the blend and of each individual model.
get_rmse_pred(preds)
# Fresh XGBoost and LightGBM regressors with the same settings as before.
xgb_reg = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    colsample_bytree=0.5,
    subsample=0.8,
)
lgbm_reg = LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=4,
    subsample=0.6,
    colsample_bytree=0.4,
    reg_lambda=10,
    n_jobs=-1,
)

# Fit both on the training split and predict the test split.
xgb_reg.fit(X_train, y_train)
lgbm_reg.fit(X_train, y_train)
xgb_pred = xgb_reg.predict(X_test)
lgbm_pred = lgbm_reg.predict(X_test)

# Equal-weight blend of the two predictions.
pred = 0.5 * xgb_pred + 0.5 * lgbm_pred
preds = {
    '최종 혼합': pred,
    'XGBM': xgb_pred,
    'LGBM': lgbm_pred,
}
get_rmse_pred(preds)
스태킹 앙상블 모델을 통한 회귀 예측
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
def get_stacking_base_datasets(model, X_train_n, y_train_n, X_test_n, n_folds ):
    """Build the meta-model's train and test features from one base model.

    The training data is split into ``n_folds`` unshuffled folds. In each
    round the model is fitted on the other folds and then predicts (a) the
    held-out fold, which fills that part of the training column, and (b) the
    whole test set. The per-fold test predictions are averaged at the end.

    Returns ``(train_fold_pred, test_pred_mean)`` as column vectors of shape
    ``(n_train, 1)`` and ``(n_test, 1)``.
    """
    splitter = KFold(n_splits=n_folds, shuffle=False)
    n_train, n_test = X_train_n.shape[0], X_test_n.shape[0]
    # Out-of-fold predictions (one column) and per-fold test predictions.
    oof_pred = np.zeros((n_train, 1))
    per_fold_test_pred = np.zeros((n_test, n_folds))
    print(model.__class__.__name__ , ' model 시작 ')

    for fold_no, (fit_idx, holdout_idx) in enumerate(splitter.split(X_train_n)):
        print('\t 폴드 세트: ',fold_no,' 시작 ')
        # Fit on this round's training folds only.
        model.fit(X_train_n[fit_idx], y_train_n[fit_idx])
        # Store the predictions for the held-out fold.
        oof_pred[holdout_idx, :] = model.predict(X_train_n[holdout_idx]).reshape(-1, 1)
        # Predict the original test set with the model fitted in this round.
        per_fold_test_pred[:, fold_no] = model.predict(X_test_n)

    # Average the per-fold test predictions into a single column.
    test_pred_mean = per_fold_test_pred.mean(axis=1).reshape(-1, 1)
    return oof_pred, test_pred_mean
# get_stacking_base_datasets() takes NumPy ndarrays, so convert the pandas objects first.
X_train_n, X_test_n, y_train_n = X_train.values, X_test.values, y_train.values

# Train/test meta-features produced by each base model.
ridge_train, ridge_test = get_stacking_base_datasets(ridge_reg, X_train_n, y_train_n, X_test_n, 5)
lasso_train, lasso_test = get_stacking_base_datasets(lasso_reg, X_train_n, y_train_n, X_test_n, 5)
xgb_train, xgb_test = get_stacking_base_datasets(xgb_reg, X_train_n, y_train_n, X_test_n, 5)
lgbm_train, lgbm_test = get_stacking_base_datasets(lgbm_reg, X_train_n, y_train_n, X_test_n, 5)

# Put the base-model outputs side by side, one column per base model.
Stack_final_X_train = np.concatenate(
    (ridge_train, lasso_train, xgb_train, lgbm_train), axis=1)
Stack_final_X_test = np.concatenate(
    (ridge_test, lasso_test, xgb_test, lgbm_test), axis=1)

# The final meta-model is a Lasso regressor.
meta_model_lasso = Lasso(alpha=0.0005)

# Fit on the stacked train features, predict the stacked test features and measure the RMSE.
meta_model_lasso.fit(Stack_final_X_train, y_train)
final = meta_model_lasso.predict(Stack_final_X_test)
mse = mean_squared_error(y_test, final)
rmse = np.sqrt(mse)
print('스태킹 회귀 모델의 최종 RMSE 값은:', rmse)