8 분 소요

Regression with a Flood Prediction Dataset

import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 

import warnings
warnings.filterwarnings("ignore")
# Load the training set, the test set and the sample submission from CSV
# files in the current working directory.
train = pd.read_csv('train.csv')

test = pd.read_csv('test.csv')

submission = pd.read_csv('sample_submission.csv')
train.shape
(1117957, 22)
train.head()
id MonsoonIntensity TopographyDrainage RiverManagement Deforestation Urbanization ClimateChange DamsQuality Siltation AgriculturalPractices ... DrainageSystems CoastalVulnerability Landslides Watersheds DeterioratingInfrastructure PopulationScore WetlandLoss InadequatePlanning PoliticalFactors FloodProbability
0 0 5 8 5 8 6 4 4 3 3 ... 5 3 3 5 4 7 5 7 3 0.445
1 1 6 7 4 4 8 8 3 5 4 ... 7 2 0 3 5 3 3 4 3 0.450
2 2 6 5 6 7 3 7 1 5 4 ... 7 3 7 5 6 8 2 3 3 0.530
3 3 3 4 6 5 4 8 4 7 6 ... 2 4 7 4 4 6 5 7 5 0.535
4 4 5 3 2 6 4 4 3 3 3 ... 2 2 6 6 4 1 2 3 5 0.415

5 rows × 22 columns

columns 설명

train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1117957 entries, 0 to 1117956
Data columns (total 22 columns):
 #   Column                           Non-Null Count    Dtype  
---  ------                           --------------    -----  
 0   id                               1117957 non-null  int64  
 1   MonsoonIntensity                 1117957 non-null  int64  
 2   TopographyDrainage               1117957 non-null  int64  
 3   RiverManagement                  1117957 non-null  int64  
 4   Deforestation                    1117957 non-null  int64  
 5   Urbanization                     1117957 non-null  int64  
 6   ClimateChange                    1117957 non-null  int64  
 7   DamsQuality                      1117957 non-null  int64  
 8   Siltation                        1117957 non-null  int64  
 9   AgriculturalPractices            1117957 non-null  int64  
 10  Encroachments                    1117957 non-null  int64  
 11  IneffectiveDisasterPreparedness  1117957 non-null  int64  
 12  DrainageSystems                  1117957 non-null  int64  
 13  CoastalVulnerability             1117957 non-null  int64  
 14  Landslides                       1117957 non-null  int64  
 15  Watersheds                       1117957 non-null  int64  
 16  DeterioratingInfrastructure      1117957 non-null  int64  
 17  PopulationScore                  1117957 non-null  int64  
 18  WetlandLoss                      1117957 non-null  int64  
 19  InadequatePlanning               1117957 non-null  int64  
 20  PoliticalFactors                 1117957 non-null  int64  
 21  FloodProbability                 1117957 non-null  float64
dtypes: float64(1), int64(21)
memory usage: 187.6 MB
train.describe()
id MonsoonIntensity TopographyDrainage RiverManagement Deforestation Urbanization ClimateChange DamsQuality Siltation AgriculturalPractices ... DrainageSystems CoastalVulnerability Landslides Watersheds DeterioratingInfrastructure PopulationScore WetlandLoss InadequatePlanning PoliticalFactors FloodProbability
count 1.117957e+06 1.117957e+06 1.117957e+06 1.117957e+06 1.117957e+06 1.117957e+06 1.117957e+06 1.117957e+06 1.117957e+06 1.117957e+06 ... 1.117957e+06 1.117957e+06 1.117957e+06 1.117957e+06 1.117957e+06 1.117957e+06 1.117957e+06 1.117957e+06 1.117957e+06 1.117957e+06
mean 5.589780e+05 4.921450e+00 4.926671e+00 4.955322e+00 4.942240e+00 4.942517e+00 4.934093e+00 4.955878e+00 4.927791e+00 4.942619e+00 ... 4.946893e+00 4.953999e+00 4.931376e+00 4.929032e+00 4.925907e+00 4.927520e+00 4.950859e+00 4.940587e+00 4.939004e+00 5.044803e-01
std 3.227265e+05 2.056387e+00 2.093879e+00 2.072186e+00 2.051689e+00 2.083391e+00 2.057742e+00 2.083063e+00 2.065992e+00 2.068545e+00 ... 2.072333e+00 2.088899e+00 2.078287e+00 2.082395e+00 2.064813e+00 2.074176e+00 2.068696e+00 2.081123e+00 2.090350e+00 5.102610e-02
min 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 ... 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 2.850000e-01
25% 2.794890e+05 3.000000e+00 3.000000e+00 4.000000e+00 4.000000e+00 3.000000e+00 3.000000e+00 4.000000e+00 3.000000e+00 3.000000e+00 ... 4.000000e+00 3.000000e+00 3.000000e+00 3.000000e+00 3.000000e+00 3.000000e+00 4.000000e+00 3.000000e+00 3.000000e+00 4.700000e-01
50% 5.589780e+05 5.000000e+00 5.000000e+00 5.000000e+00 5.000000e+00 5.000000e+00 5.000000e+00 5.000000e+00 5.000000e+00 5.000000e+00 ... 5.000000e+00 5.000000e+00 5.000000e+00 5.000000e+00 5.000000e+00 5.000000e+00 5.000000e+00 5.000000e+00 5.000000e+00 5.050000e-01
75% 8.384670e+05 6.000000e+00 6.000000e+00 6.000000e+00 6.000000e+00 6.000000e+00 6.000000e+00 6.000000e+00 6.000000e+00 6.000000e+00 ... 6.000000e+00 6.000000e+00 6.000000e+00 6.000000e+00 6.000000e+00 6.000000e+00 6.000000e+00 6.000000e+00 6.000000e+00 5.400000e-01
max 1.117956e+06 1.600000e+01 1.800000e+01 1.600000e+01 1.700000e+01 1.700000e+01 1.700000e+01 1.600000e+01 1.600000e+01 1.600000e+01 ... 1.700000e+01 1.700000e+01 1.600000e+01 1.600000e+01 1.700000e+01 1.800000e+01 1.900000e+01 1.600000e+01 1.600000e+01 7.250000e-01

8 rows × 22 columns

test.shape
(745305, 21)
test.head()
id MonsoonIntensity TopographyDrainage RiverManagement Deforestation Urbanization ClimateChange DamsQuality Siltation AgriculturalPractices ... IneffectiveDisasterPreparedness DrainageSystems CoastalVulnerability Landslides Watersheds DeterioratingInfrastructure PopulationScore WetlandLoss InadequatePlanning PoliticalFactors
0 1117957 4 6 3 5 6 7 8 7 8 ... 8 5 7 5 6 3 6 4 4 5
1 1117958 4 4 2 9 5 5 4 7 5 ... 2 4 7 4 5 1 7 4 4 3
2 1117959 1 3 6 5 7 2 4 6 4 ... 7 9 2 5 5 2 3 6 8 3
3 1117960 2 4 4 6 4 5 4 3 4 ... 7 8 4 6 7 6 4 2 4 4
4 1117961 6 3 2 4 6 4 5 5 3 ... 4 3 2 6 4 6 8 4 5 5

5 rows × 21 columns

test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 745305 entries, 0 to 745304
Data columns (total 21 columns):
 #   Column                           Non-Null Count   Dtype
---  ------                           --------------   -----
 0   id                               745305 non-null  int64
 1   MonsoonIntensity                 745305 non-null  int64
 2   TopographyDrainage               745305 non-null  int64
 3   RiverManagement                  745305 non-null  int64
 4   Deforestation                    745305 non-null  int64
 5   Urbanization                     745305 non-null  int64
 6   ClimateChange                    745305 non-null  int64
 7   DamsQuality                      745305 non-null  int64
 8   Siltation                        745305 non-null  int64
 9   AgriculturalPractices            745305 non-null  int64
 10  Encroachments                    745305 non-null  int64
 11  IneffectiveDisasterPreparedness  745305 non-null  int64
 12  DrainageSystems                  745305 non-null  int64
 13  CoastalVulnerability             745305 non-null  int64
 14  Landslides                       745305 non-null  int64
 15  Watersheds                       745305 non-null  int64
 16  DeterioratingInfrastructure      745305 non-null  int64
 17  PopulationScore                  745305 non-null  int64
 18  WetlandLoss                      745305 non-null  int64
 19  InadequatePlanning               745305 non-null  int64
 20  PoliticalFactors                 745305 non-null  int64
dtypes: int64(21)
memory usage: 119.4 MB
test.describe()
id MonsoonIntensity TopographyDrainage RiverManagement Deforestation Urbanization ClimateChange DamsQuality Siltation AgriculturalPractices ... IneffectiveDisasterPreparedness DrainageSystems CoastalVulnerability Landslides Watersheds DeterioratingInfrastructure PopulationScore WetlandLoss InadequatePlanning PoliticalFactors
count 7.453050e+05 745305.000000 745305.000000 745305.000000 745305.000000 745305.000000 745305.000000 745305.000000 745305.000000 745305.000000 ... 745305.000000 745305.000000 745305.000000 745305.000000 745305.000000 745305.000000 745305.000000 745305.000000 745305.000000 745305.000000
mean 1.490609e+06 4.915610 4.930288 4.960027 4.946084 4.938424 4.933524 4.958468 4.927651 4.945308 ... 4.947436 4.944003 4.957209 4.927620 4.930720 4.926062 4.926957 4.948424 4.940204 4.943918
std 2.151512e+05 2.056295 2.094117 2.071722 2.052602 2.081816 2.059243 2.089312 2.068110 2.073404 ... 2.081322 2.072335 2.088787 2.079006 2.083348 2.065638 2.073692 2.065891 2.079128 2.087387
min 1.117957e+06 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 1.304283e+06 3.000000 3.000000 4.000000 4.000000 3.000000 3.000000 4.000000 3.000000 3.000000 ... 3.000000 4.000000 3.000000 3.000000 3.000000 3.000000 3.000000 4.000000 3.000000 3.000000
50% 1.490609e+06 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 ... 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000
75% 1.676935e+06 6.000000 6.000000 6.000000 6.000000 6.000000 6.000000 6.000000 6.000000 6.000000 ... 6.000000 6.000000 6.000000 6.000000 6.000000 6.000000 6.000000 6.000000 6.000000 6.000000
max 1.863261e+06 16.000000 17.000000 16.000000 17.000000 17.000000 17.000000 16.000000 16.000000 16.000000 ... 16.000000 17.000000 17.000000 16.000000 16.000000 17.000000 19.000000 22.000000 16.000000 16.000000

8 rows × 21 columns


def get_preprocessed_df(df=None):
    """Return a copy of ``df`` with the ``id`` column removed.

    The caller's DataFrame is never modified. If ``df`` has no ``id`` column
    (for example because it has already been preprocessed), an unchanged copy
    is returned instead of raising ``KeyError``, so the function can safely be
    applied more than once.

    Args:
        df: pandas DataFrame to preprocess.

    Returns:
        A new DataFrame holding every column of ``df`` except ``id``.
    """
    # drop() without inplace=True already returns a new object, so a separate
    # copy() is unnecessary; errors='ignore' turns a missing 'id' into a no-op.
    return df.drop(columns='id', errors='ignore')
# Working copy of the training data without the id column.
train_copy = get_preprocessed_df(train)

# Print the shape as a sanity check.
print("train_copy shape:", train_copy.shape)
train_copy shape: (1117957, 21)
# Working copy of the test data without the id column.
test_copy = get_preprocessed_df(test)

# Print the shape as a sanity check.
print("test_copy shape:", test_copy.shape)
test_copy shape: (745305, 20)
def get_train_dataset(df=None):
    """Split a raw training frame into a feature matrix and a target vector.

    ``df`` is first passed through ``get_preprocessed_df``. The last column of
    the result is taken as the target and all preceding columns as features.

    Returns:
        Tuple ``(X_features, y_target)``.
    """
    prepared = get_preprocessed_df(df)
    # Positional split: every column but the last vs. the last column.
    features, target = prepared.iloc[:, :-1], prepared.iloc[:, -1]
    return features, target

# Split the training data into features and target.
X_features, y_target = get_train_dataset(train)

# Print the resulting shapes (sanity check).
print("X_features shape:", X_features.shape)
print("y_target shape:", y_target.shape)
X_features shape: (1117957, 20)
y_target shape: (1117957,)

# Pairwise correlation matrix of the id-free training frame.
corr = train_copy.corr()

# Draw the matrix as an annotated heatmap, values shown to three decimals.
plt.figure(figsize=(14, 14))
sns.heatmap(corr, annot=True, fmt='.3f')
plt.show()

png

corr
MonsoonIntensity TopographyDrainage RiverManagement Deforestation Urbanization ClimateChange DamsQuality Siltation AgriculturalPractices Encroachments ... DrainageSystems CoastalVulnerability Landslides Watersheds DeterioratingInfrastructure PopulationScore WetlandLoss InadequatePlanning PoliticalFactors FloodProbability
MonsoonIntensity 1.000000 -0.007362 -0.008070 -0.007251 -0.009309 -0.008031 -0.007787 -0.007836 -0.008232 -0.010309 ... -0.009716 -0.010659 -0.009121 -0.008900 -0.008486 -0.008679 -0.006811 -0.008155 -0.008474 0.189098
TopographyDrainage -0.007362 1.000000 -0.009924 -0.008548 -0.010532 -0.009619 -0.007607 -0.009824 -0.009496 -0.012887 ... -0.010056 -0.012526 -0.010240 -0.011067 -0.006628 -0.010815 -0.010267 -0.011617 -0.012350 0.187635
RiverManagement -0.008070 -0.009924 1.000000 -0.008574 -0.012292 -0.009237 -0.008711 -0.010058 -0.010783 -0.011615 ... -0.011277 -0.011680 -0.008994 -0.011412 -0.005827 -0.006727 -0.010069 -0.009673 -0.011550 0.187131
Deforestation -0.007251 -0.008548 -0.008574 1.000000 -0.012248 -0.008266 -0.009356 -0.011536 -0.010039 -0.013175 ... -0.010490 -0.012388 -0.009257 -0.010671 -0.008862 -0.011777 -0.011004 -0.010424 -0.009661 0.184001
Urbanization -0.009309 -0.010532 -0.012292 -0.012248 1.000000 -0.011199 -0.011128 -0.010153 -0.010559 -0.010784 ... -0.012572 -0.014497 -0.010582 -0.012107 -0.010656 -0.011485 -0.011023 -0.011584 -0.013005 0.180861
ClimateChange -0.008031 -0.009619 -0.009237 -0.008266 -0.011199 1.000000 -0.008427 -0.009457 -0.011517 -0.012533 ... -0.009650 -0.013005 -0.009352 -0.009882 -0.006324 -0.010332 -0.009376 -0.010772 -0.011379 0.184761
DamsQuality -0.007787 -0.007607 -0.008711 -0.009356 -0.011128 -0.008427 1.000000 -0.009401 -0.009033 -0.010890 ... -0.010439 -0.012096 -0.009924 -0.009085 -0.009831 -0.009599 -0.009372 -0.011374 -0.013081 0.187996
Siltation -0.007836 -0.009824 -0.010058 -0.011536 -0.010153 -0.009457 -0.009401 1.000000 -0.010179 -0.010628 ... -0.010702 -0.011762 -0.009173 -0.009457 -0.006312 -0.008739 -0.008667 -0.009079 -0.008493 0.186789
AgriculturalPractices -0.008232 -0.009496 -0.010783 -0.010039 -0.010559 -0.011517 -0.009033 -0.010179 1.000000 -0.012244 ... -0.010845 -0.011415 -0.010624 -0.009901 -0.008708 -0.008958 -0.011392 -0.010418 -0.011157 0.183366
Encroachments -0.010309 -0.012887 -0.011615 -0.013175 -0.010784 -0.012533 -0.010890 -0.010628 -0.012244 1.000000 ... -0.011994 -0.016203 -0.011299 -0.013542 -0.010404 -0.010645 -0.010364 -0.012175 -0.011860 0.178841
IneffectiveDisasterPreparedness -0.008032 -0.010746 -0.010675 -0.009512 -0.012685 -0.011346 -0.009515 -0.009099 -0.010763 -0.012211 ... -0.012613 -0.009798 -0.009669 -0.011588 -0.009871 -0.011893 -0.010299 -0.012080 -0.010563 0.183109
DrainageSystems -0.009716 -0.010056 -0.011277 -0.010490 -0.012572 -0.009650 -0.010439 -0.010702 -0.010845 -0.011994 ... 1.000000 -0.013467 -0.011215 -0.012690 -0.009601 -0.011975 -0.011199 -0.013060 -0.012244 0.179305
CoastalVulnerability -0.010659 -0.012526 -0.011680 -0.012388 -0.014497 -0.013005 -0.012096 -0.011762 -0.011415 -0.016203 ... -0.013467 1.000000 -0.013177 -0.012018 -0.009699 -0.013256 -0.011989 -0.014006 -0.013275 0.177774
Landslides -0.009121 -0.010240 -0.008994 -0.009257 -0.010582 -0.009352 -0.009924 -0.009173 -0.010624 -0.011299 ... -0.011215 -0.013177 1.000000 -0.012123 -0.007216 -0.008959 -0.011226 -0.011052 -0.010277 0.185346
Watersheds -0.008900 -0.011067 -0.011412 -0.010671 -0.012107 -0.009882 -0.009085 -0.009457 -0.009901 -0.013542 ... -0.012690 -0.012018 -0.012123 1.000000 -0.009812 -0.010315 -0.012403 -0.013059 -0.011324 0.181907
DeterioratingInfrastructure -0.008486 -0.006628 -0.005827 -0.008862 -0.010656 -0.006324 -0.009831 -0.006312 -0.008708 -0.010404 ... -0.009601 -0.009699 -0.007216 -0.009812 1.000000 -0.007608 -0.008607 -0.008214 -0.009388 0.190007
PopulationScore -0.008679 -0.010815 -0.006727 -0.011777 -0.011485 -0.010332 -0.009599 -0.008739 -0.008958 -0.010645 ... -0.011975 -0.013256 -0.008959 -0.010315 -0.007608 1.000000 -0.011128 -0.009847 -0.012772 0.185890
WetlandLoss -0.006811 -0.010267 -0.010069 -0.011004 -0.011023 -0.009376 -0.009372 -0.008667 -0.011392 -0.010364 ... -0.011199 -0.011989 -0.011226 -0.012403 -0.008607 -0.011128 1.000000 -0.009830 -0.011061 0.183396
InadequatePlanning -0.008155 -0.011617 -0.009673 -0.010424 -0.011584 -0.010772 -0.011374 -0.009079 -0.010418 -0.012175 ... -0.013060 -0.014006 -0.011052 -0.013059 -0.008214 -0.009847 -0.009830 1.000000 -0.011540 0.180968
PoliticalFactors -0.008474 -0.012350 -0.011550 -0.009661 -0.013005 -0.011379 -0.013081 -0.008493 -0.011157 -0.011860 ... -0.012244 -0.013275 -0.010277 -0.011324 -0.009388 -0.012772 -0.011061 -0.011540 1.000000 0.182417
FloodProbability 0.189098 0.187635 0.187131 0.184001 0.180861 0.184761 0.187996 0.186789 0.183366 0.178841 ... 0.179305 0.177774 0.185346 0.181907 0.190007 0.185890 0.183396 0.180968 0.182417 1.000000

21 rows × 21 columns


# One histogram per training feature column.
X_features.hist(figsize=(10, 9))
plt.tight_layout()
plt.show()

png


# Distribution of the target values, with a KDE curve overlaid.
plt.title('Histogram')
plt.xticks(rotation=15)
sns.histplot(y_target, kde=True)
plt.show()

png


# One histogram per test-set column, for visual comparison with the training
# feature histograms.
test_copy.hist(figsize=(10, 9))
plt.tight_layout()
plt.show()

png


from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for evaluation; random_state is fixed so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X_features, y_target, test_size=0.2, random_state=42)

from sklearn.metrics import r2_score

def get_r2_score(model):
    """Score ``model`` on the hold-out split and report the result.

    Predicts on the module-level ``X_test``, compares the predictions with the
    module-level ``y_test`` using ``r2_score``, prints the model's class name
    next to the score rounded to three decimals, and returns the unrounded
    score.

    Args:
        model: Object exposing a ``predict`` method.

    Returns:
        The value returned by ``r2_score`` for the hold-out split.
    """
    score = r2_score(y_test, model.predict(X_test))
    report = '{0} r2 score: {1}'
    print(report.format(model.__class__.__name__, np.round(score, 3)))
    return score

def get_r2_scores(models):
    """Return the R² score of every model in ``models``, in input order.

    Takes several models as a list and scores each one with ``get_r2_score``
    (which also prints one line per model), collecting the scores in a list.
    """
    return [get_r2_score(model) for model in models]