[kaggle] Regression with a Flood Prediction Dataset
Regression with a Flood Prediction Dataset
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
# Load the training set, the test set and the sample submission file.
# The paths are relative, so the CSVs must sit in the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')
train.shape
(1117957, 22)
train.head()
| id | MonsoonIntensity | TopographyDrainage | RiverManagement | Deforestation | Urbanization | ClimateChange | DamsQuality | Siltation | AgriculturalPractices | ... | DrainageSystems | CoastalVulnerability | Landslides | Watersheds | DeterioratingInfrastructure | PopulationScore | WetlandLoss | InadequatePlanning | PoliticalFactors | FloodProbability | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 5 | 8 | 5 | 8 | 6 | 4 | 4 | 3 | 3 | ... | 5 | 3 | 3 | 5 | 4 | 7 | 5 | 7 | 3 | 0.445 |
| 1 | 1 | 6 | 7 | 4 | 4 | 8 | 8 | 3 | 5 | 4 | ... | 7 | 2 | 0 | 3 | 5 | 3 | 3 | 4 | 3 | 0.450 |
| 2 | 2 | 6 | 5 | 6 | 7 | 3 | 7 | 1 | 5 | 4 | ... | 7 | 3 | 7 | 5 | 6 | 8 | 2 | 3 | 3 | 0.530 |
| 3 | 3 | 3 | 4 | 6 | 5 | 4 | 8 | 4 | 7 | 6 | ... | 2 | 4 | 7 | 4 | 4 | 6 | 5 | 7 | 5 | 0.535 |
| 4 | 4 | 5 | 3 | 2 | 6 | 4 | 4 | 3 | 3 | 3 | ... | 2 | 2 | 6 | 6 | 4 | 1 | 2 | 3 | 5 | 0.415 |
5 rows × 22 columns
Column descriptions
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1117957 entries, 0 to 1117956
Data columns (total 22 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 1117957 non-null int64
1 MonsoonIntensity 1117957 non-null int64
2 TopographyDrainage 1117957 non-null int64
3 RiverManagement 1117957 non-null int64
4 Deforestation 1117957 non-null int64
5 Urbanization 1117957 non-null int64
6 ClimateChange 1117957 non-null int64
7 DamsQuality 1117957 non-null int64
8 Siltation 1117957 non-null int64
9 AgriculturalPractices 1117957 non-null int64
10 Encroachments 1117957 non-null int64
11 IneffectiveDisasterPreparedness 1117957 non-null int64
12 DrainageSystems 1117957 non-null int64
13 CoastalVulnerability 1117957 non-null int64
14 Landslides 1117957 non-null int64
15 Watersheds 1117957 non-null int64
16 DeterioratingInfrastructure 1117957 non-null int64
17 PopulationScore 1117957 non-null int64
18 WetlandLoss 1117957 non-null int64
19 InadequatePlanning 1117957 non-null int64
20 PoliticalFactors 1117957 non-null int64
21 FloodProbability 1117957 non-null float64
dtypes: float64(1), int64(21)
memory usage: 187.6 MB
train.describe()
| id | MonsoonIntensity | TopographyDrainage | RiverManagement | Deforestation | Urbanization | ClimateChange | DamsQuality | Siltation | AgriculturalPractices | ... | DrainageSystems | CoastalVulnerability | Landslides | Watersheds | DeterioratingInfrastructure | PopulationScore | WetlandLoss | InadequatePlanning | PoliticalFactors | FloodProbability | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.117957e+06 | 1.117957e+06 | 1.117957e+06 | 1.117957e+06 | 1.117957e+06 | 1.117957e+06 | 1.117957e+06 | 1.117957e+06 | 1.117957e+06 | 1.117957e+06 | ... | 1.117957e+06 | 1.117957e+06 | 1.117957e+06 | 1.117957e+06 | 1.117957e+06 | 1.117957e+06 | 1.117957e+06 | 1.117957e+06 | 1.117957e+06 | 1.117957e+06 |
| mean | 5.589780e+05 | 4.921450e+00 | 4.926671e+00 | 4.955322e+00 | 4.942240e+00 | 4.942517e+00 | 4.934093e+00 | 4.955878e+00 | 4.927791e+00 | 4.942619e+00 | ... | 4.946893e+00 | 4.953999e+00 | 4.931376e+00 | 4.929032e+00 | 4.925907e+00 | 4.927520e+00 | 4.950859e+00 | 4.940587e+00 | 4.939004e+00 | 5.044803e-01 |
| std | 3.227265e+05 | 2.056387e+00 | 2.093879e+00 | 2.072186e+00 | 2.051689e+00 | 2.083391e+00 | 2.057742e+00 | 2.083063e+00 | 2.065992e+00 | 2.068545e+00 | ... | 2.072333e+00 | 2.088899e+00 | 2.078287e+00 | 2.082395e+00 | 2.064813e+00 | 2.074176e+00 | 2.068696e+00 | 2.081123e+00 | 2.090350e+00 | 5.102610e-02 |
| min | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | ... | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 2.850000e-01 |
| 25% | 2.794890e+05 | 3.000000e+00 | 3.000000e+00 | 4.000000e+00 | 4.000000e+00 | 3.000000e+00 | 3.000000e+00 | 4.000000e+00 | 3.000000e+00 | 3.000000e+00 | ... | 4.000000e+00 | 3.000000e+00 | 3.000000e+00 | 3.000000e+00 | 3.000000e+00 | 3.000000e+00 | 4.000000e+00 | 3.000000e+00 | 3.000000e+00 | 4.700000e-01 |
| 50% | 5.589780e+05 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | ... | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | 5.000000e+00 | 5.050000e-01 |
| 75% | 8.384670e+05 | 6.000000e+00 | 6.000000e+00 | 6.000000e+00 | 6.000000e+00 | 6.000000e+00 | 6.000000e+00 | 6.000000e+00 | 6.000000e+00 | 6.000000e+00 | ... | 6.000000e+00 | 6.000000e+00 | 6.000000e+00 | 6.000000e+00 | 6.000000e+00 | 6.000000e+00 | 6.000000e+00 | 6.000000e+00 | 6.000000e+00 | 5.400000e-01 |
| max | 1.117956e+06 | 1.600000e+01 | 1.800000e+01 | 1.600000e+01 | 1.700000e+01 | 1.700000e+01 | 1.700000e+01 | 1.600000e+01 | 1.600000e+01 | 1.600000e+01 | ... | 1.700000e+01 | 1.700000e+01 | 1.600000e+01 | 1.600000e+01 | 1.700000e+01 | 1.800000e+01 | 1.900000e+01 | 1.600000e+01 | 1.600000e+01 | 7.250000e-01 |
8 rows × 22 columns
test.shape
(745305, 21)
test.head()
| id | MonsoonIntensity | TopographyDrainage | RiverManagement | Deforestation | Urbanization | ClimateChange | DamsQuality | Siltation | AgriculturalPractices | ... | IneffectiveDisasterPreparedness | DrainageSystems | CoastalVulnerability | Landslides | Watersheds | DeterioratingInfrastructure | PopulationScore | WetlandLoss | InadequatePlanning | PoliticalFactors | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1117957 | 4 | 6 | 3 | 5 | 6 | 7 | 8 | 7 | 8 | ... | 8 | 5 | 7 | 5 | 6 | 3 | 6 | 4 | 4 | 5 |
| 1 | 1117958 | 4 | 4 | 2 | 9 | 5 | 5 | 4 | 7 | 5 | ... | 2 | 4 | 7 | 4 | 5 | 1 | 7 | 4 | 4 | 3 |
| 2 | 1117959 | 1 | 3 | 6 | 5 | 7 | 2 | 4 | 6 | 4 | ... | 7 | 9 | 2 | 5 | 5 | 2 | 3 | 6 | 8 | 3 |
| 3 | 1117960 | 2 | 4 | 4 | 6 | 4 | 5 | 4 | 3 | 4 | ... | 7 | 8 | 4 | 6 | 7 | 6 | 4 | 2 | 4 | 4 |
| 4 | 1117961 | 6 | 3 | 2 | 4 | 6 | 4 | 5 | 5 | 3 | ... | 4 | 3 | 2 | 6 | 4 | 6 | 8 | 4 | 5 | 5 |
5 rows × 21 columns
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 745305 entries, 0 to 745304
Data columns (total 21 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 745305 non-null int64
1 MonsoonIntensity 745305 non-null int64
2 TopographyDrainage 745305 non-null int64
3 RiverManagement 745305 non-null int64
4 Deforestation 745305 non-null int64
5 Urbanization 745305 non-null int64
6 ClimateChange 745305 non-null int64
7 DamsQuality 745305 non-null int64
8 Siltation 745305 non-null int64
9 AgriculturalPractices 745305 non-null int64
10 Encroachments 745305 non-null int64
11 IneffectiveDisasterPreparedness 745305 non-null int64
12 DrainageSystems 745305 non-null int64
13 CoastalVulnerability 745305 non-null int64
14 Landslides 745305 non-null int64
15 Watersheds 745305 non-null int64
16 DeterioratingInfrastructure 745305 non-null int64
17 PopulationScore 745305 non-null int64
18 WetlandLoss 745305 non-null int64
19 InadequatePlanning 745305 non-null int64
20 PoliticalFactors 745305 non-null int64
dtypes: int64(21)
memory usage: 119.4 MB
test.describe()
| id | MonsoonIntensity | TopographyDrainage | RiverManagement | Deforestation | Urbanization | ClimateChange | DamsQuality | Siltation | AgriculturalPractices | ... | IneffectiveDisasterPreparedness | DrainageSystems | CoastalVulnerability | Landslides | Watersheds | DeterioratingInfrastructure | PopulationScore | WetlandLoss | InadequatePlanning | PoliticalFactors | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 7.453050e+05 | 745305.000000 | 745305.000000 | 745305.000000 | 745305.000000 | 745305.000000 | 745305.000000 | 745305.000000 | 745305.000000 | 745305.000000 | ... | 745305.000000 | 745305.000000 | 745305.000000 | 745305.000000 | 745305.000000 | 745305.000000 | 745305.000000 | 745305.000000 | 745305.000000 | 745305.000000 |
| mean | 1.490609e+06 | 4.915610 | 4.930288 | 4.960027 | 4.946084 | 4.938424 | 4.933524 | 4.958468 | 4.927651 | 4.945308 | ... | 4.947436 | 4.944003 | 4.957209 | 4.927620 | 4.930720 | 4.926062 | 4.926957 | 4.948424 | 4.940204 | 4.943918 |
| std | 2.151512e+05 | 2.056295 | 2.094117 | 2.071722 | 2.052602 | 2.081816 | 2.059243 | 2.089312 | 2.068110 | 2.073404 | ... | 2.081322 | 2.072335 | 2.088787 | 2.079006 | 2.083348 | 2.065638 | 2.073692 | 2.065891 | 2.079128 | 2.087387 |
| min | 1.117957e+06 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 1.304283e+06 | 3.000000 | 3.000000 | 4.000000 | 4.000000 | 3.000000 | 3.000000 | 4.000000 | 3.000000 | 3.000000 | ... | 3.000000 | 4.000000 | 3.000000 | 3.000000 | 3.000000 | 3.000000 | 3.000000 | 4.000000 | 3.000000 | 3.000000 |
| 50% | 1.490609e+06 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | ... | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 |
| 75% | 1.676935e+06 | 6.000000 | 6.000000 | 6.000000 | 6.000000 | 6.000000 | 6.000000 | 6.000000 | 6.000000 | 6.000000 | ... | 6.000000 | 6.000000 | 6.000000 | 6.000000 | 6.000000 | 6.000000 | 6.000000 | 6.000000 | 6.000000 | 6.000000 |
| max | 1.863261e+06 | 16.000000 | 17.000000 | 16.000000 | 17.000000 | 17.000000 | 17.000000 | 16.000000 | 16.000000 | 16.000000 | ... | 16.000000 | 17.000000 | 17.000000 | 16.000000 | 16.000000 | 17.000000 | 19.000000 | 22.000000 | 16.000000 | 16.000000 |
8 rows × 21 columns
def get_preprocessed_df(df=None):
    """Return a copy of ``df`` with the ``id`` column removed.

    The input frame is never modified; all work happens on a copy.

    Args:
        df: pandas DataFrame to preprocess. The ``None`` default exists only
            for signature compatibility and is rejected explicitly.

    Returns:
        A new DataFrame holding every column of ``df`` except ``id``.

    Raises:
        ValueError: If ``df`` is None.
    """
    if df is None:
        raise ValueError("get_preprocessed_df() requires a DataFrame, got None")

    df_copy = df.copy()
    # errors="ignore" keeps the function idempotent: a frame that has already
    # been preprocessed (no ``id`` column left) passes through unchanged
    # instead of raising KeyError.
    df_copy.drop(columns="id", inplace=True, errors="ignore")
    return df_copy
# Preprocessed copy of the training data (without `id`); print its shape
# as a sanity check.
train_copy = get_preprocessed_df(train)
print("train_copy shape:", train_copy.shape)
train_copy shape: (1117957, 21)
# Same preprocessing for the test data; print its shape as a sanity check.
test_copy = get_preprocessed_df(test)
print("test_copy shape:", test_copy.shape)
test_copy shape: (745305, 20)
def get_train_dataset(df=None, target_col=None):
    """Split a raw training frame into features and target.

    The frame is first passed through ``get_preprocessed_df`` and then
    separated into a feature matrix and a target vector.

    Args:
        df: Raw training DataFrame that includes the target column.
        target_col: Name of the target column. When ``None`` (the default)
            the last column of the preprocessed frame is used, which is the
            original positional behaviour.

    Returns:
        A ``(X_features, y_target)`` tuple: a DataFrame of feature columns
        and a Series holding the target.
    """
    df_copy = get_preprocessed_df(df)

    if target_col is None:
        # Positional fallback: the target is whichever column comes last.
        X_features = df_copy.iloc[:, :-1]
        y_target = df_copy.iloc[:, -1]
    else:
        # Name-based selection does not depend on column order.
        X_features = df_copy.drop(columns=target_col)
        y_target = df_copy[target_col]

    # Return the training data set
    return X_features, y_target
# Split the training data into features and target
X_features, y_target = get_train_dataset(train)
# Print the resulting shapes (sanity check)
print("X_features shape:", X_features.shape)
print("y_target shape:", y_target.shape)
X_features shape: (1117957, 20)
y_target shape: (1117957,)
# Pairwise correlation between every column of train_copy (features and
# target), drawn as a heatmap annotated with values to 3 decimal places.
corr = train_copy.corr()
plt.figure(figsize=(14, 14))
sns.heatmap(corr, annot=True, fmt='.3f')
plt.show()

corr
| MonsoonIntensity | TopographyDrainage | RiverManagement | Deforestation | Urbanization | ClimateChange | DamsQuality | Siltation | AgriculturalPractices | Encroachments | ... | DrainageSystems | CoastalVulnerability | Landslides | Watersheds | DeterioratingInfrastructure | PopulationScore | WetlandLoss | InadequatePlanning | PoliticalFactors | FloodProbability | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MonsoonIntensity | 1.000000 | -0.007362 | -0.008070 | -0.007251 | -0.009309 | -0.008031 | -0.007787 | -0.007836 | -0.008232 | -0.010309 | ... | -0.009716 | -0.010659 | -0.009121 | -0.008900 | -0.008486 | -0.008679 | -0.006811 | -0.008155 | -0.008474 | 0.189098 |
| TopographyDrainage | -0.007362 | 1.000000 | -0.009924 | -0.008548 | -0.010532 | -0.009619 | -0.007607 | -0.009824 | -0.009496 | -0.012887 | ... | -0.010056 | -0.012526 | -0.010240 | -0.011067 | -0.006628 | -0.010815 | -0.010267 | -0.011617 | -0.012350 | 0.187635 |
| RiverManagement | -0.008070 | -0.009924 | 1.000000 | -0.008574 | -0.012292 | -0.009237 | -0.008711 | -0.010058 | -0.010783 | -0.011615 | ... | -0.011277 | -0.011680 | -0.008994 | -0.011412 | -0.005827 | -0.006727 | -0.010069 | -0.009673 | -0.011550 | 0.187131 |
| Deforestation | -0.007251 | -0.008548 | -0.008574 | 1.000000 | -0.012248 | -0.008266 | -0.009356 | -0.011536 | -0.010039 | -0.013175 | ... | -0.010490 | -0.012388 | -0.009257 | -0.010671 | -0.008862 | -0.011777 | -0.011004 | -0.010424 | -0.009661 | 0.184001 |
| Urbanization | -0.009309 | -0.010532 | -0.012292 | -0.012248 | 1.000000 | -0.011199 | -0.011128 | -0.010153 | -0.010559 | -0.010784 | ... | -0.012572 | -0.014497 | -0.010582 | -0.012107 | -0.010656 | -0.011485 | -0.011023 | -0.011584 | -0.013005 | 0.180861 |
| ClimateChange | -0.008031 | -0.009619 | -0.009237 | -0.008266 | -0.011199 | 1.000000 | -0.008427 | -0.009457 | -0.011517 | -0.012533 | ... | -0.009650 | -0.013005 | -0.009352 | -0.009882 | -0.006324 | -0.010332 | -0.009376 | -0.010772 | -0.011379 | 0.184761 |
| DamsQuality | -0.007787 | -0.007607 | -0.008711 | -0.009356 | -0.011128 | -0.008427 | 1.000000 | -0.009401 | -0.009033 | -0.010890 | ... | -0.010439 | -0.012096 | -0.009924 | -0.009085 | -0.009831 | -0.009599 | -0.009372 | -0.011374 | -0.013081 | 0.187996 |
| Siltation | -0.007836 | -0.009824 | -0.010058 | -0.011536 | -0.010153 | -0.009457 | -0.009401 | 1.000000 | -0.010179 | -0.010628 | ... | -0.010702 | -0.011762 | -0.009173 | -0.009457 | -0.006312 | -0.008739 | -0.008667 | -0.009079 | -0.008493 | 0.186789 |
| AgriculturalPractices | -0.008232 | -0.009496 | -0.010783 | -0.010039 | -0.010559 | -0.011517 | -0.009033 | -0.010179 | 1.000000 | -0.012244 | ... | -0.010845 | -0.011415 | -0.010624 | -0.009901 | -0.008708 | -0.008958 | -0.011392 | -0.010418 | -0.011157 | 0.183366 |
| Encroachments | -0.010309 | -0.012887 | -0.011615 | -0.013175 | -0.010784 | -0.012533 | -0.010890 | -0.010628 | -0.012244 | 1.000000 | ... | -0.011994 | -0.016203 | -0.011299 | -0.013542 | -0.010404 | -0.010645 | -0.010364 | -0.012175 | -0.011860 | 0.178841 |
| IneffectiveDisasterPreparedness | -0.008032 | -0.010746 | -0.010675 | -0.009512 | -0.012685 | -0.011346 | -0.009515 | -0.009099 | -0.010763 | -0.012211 | ... | -0.012613 | -0.009798 | -0.009669 | -0.011588 | -0.009871 | -0.011893 | -0.010299 | -0.012080 | -0.010563 | 0.183109 |
| DrainageSystems | -0.009716 | -0.010056 | -0.011277 | -0.010490 | -0.012572 | -0.009650 | -0.010439 | -0.010702 | -0.010845 | -0.011994 | ... | 1.000000 | -0.013467 | -0.011215 | -0.012690 | -0.009601 | -0.011975 | -0.011199 | -0.013060 | -0.012244 | 0.179305 |
| CoastalVulnerability | -0.010659 | -0.012526 | -0.011680 | -0.012388 | -0.014497 | -0.013005 | -0.012096 | -0.011762 | -0.011415 | -0.016203 | ... | -0.013467 | 1.000000 | -0.013177 | -0.012018 | -0.009699 | -0.013256 | -0.011989 | -0.014006 | -0.013275 | 0.177774 |
| Landslides | -0.009121 | -0.010240 | -0.008994 | -0.009257 | -0.010582 | -0.009352 | -0.009924 | -0.009173 | -0.010624 | -0.011299 | ... | -0.011215 | -0.013177 | 1.000000 | -0.012123 | -0.007216 | -0.008959 | -0.011226 | -0.011052 | -0.010277 | 0.185346 |
| Watersheds | -0.008900 | -0.011067 | -0.011412 | -0.010671 | -0.012107 | -0.009882 | -0.009085 | -0.009457 | -0.009901 | -0.013542 | ... | -0.012690 | -0.012018 | -0.012123 | 1.000000 | -0.009812 | -0.010315 | -0.012403 | -0.013059 | -0.011324 | 0.181907 |
| DeterioratingInfrastructure | -0.008486 | -0.006628 | -0.005827 | -0.008862 | -0.010656 | -0.006324 | -0.009831 | -0.006312 | -0.008708 | -0.010404 | ... | -0.009601 | -0.009699 | -0.007216 | -0.009812 | 1.000000 | -0.007608 | -0.008607 | -0.008214 | -0.009388 | 0.190007 |
| PopulationScore | -0.008679 | -0.010815 | -0.006727 | -0.011777 | -0.011485 | -0.010332 | -0.009599 | -0.008739 | -0.008958 | -0.010645 | ... | -0.011975 | -0.013256 | -0.008959 | -0.010315 | -0.007608 | 1.000000 | -0.011128 | -0.009847 | -0.012772 | 0.185890 |
| WetlandLoss | -0.006811 | -0.010267 | -0.010069 | -0.011004 | -0.011023 | -0.009376 | -0.009372 | -0.008667 | -0.011392 | -0.010364 | ... | -0.011199 | -0.011989 | -0.011226 | -0.012403 | -0.008607 | -0.011128 | 1.000000 | -0.009830 | -0.011061 | 0.183396 |
| InadequatePlanning | -0.008155 | -0.011617 | -0.009673 | -0.010424 | -0.011584 | -0.010772 | -0.011374 | -0.009079 | -0.010418 | -0.012175 | ... | -0.013060 | -0.014006 | -0.011052 | -0.013059 | -0.008214 | -0.009847 | -0.009830 | 1.000000 | -0.011540 | 0.180968 |
| PoliticalFactors | -0.008474 | -0.012350 | -0.011550 | -0.009661 | -0.013005 | -0.011379 | -0.013081 | -0.008493 | -0.011157 | -0.011860 | ... | -0.012244 | -0.013275 | -0.010277 | -0.011324 | -0.009388 | -0.012772 | -0.011061 | -0.011540 | 1.000000 | 0.182417 |
| FloodProbability | 0.189098 | 0.187635 | 0.187131 | 0.184001 | 0.180861 | 0.184761 | 0.187996 | 0.186789 | 0.183366 | 0.178841 | ... | 0.179305 | 0.177774 | 0.185346 | 0.181907 | 0.190007 | 0.185890 | 0.183396 | 0.180968 | 0.182417 | 1.000000 |
21 rows × 21 columns
# One histogram per training feature column.
X_features.hist(figsize=(10, 9))
plt.tight_layout()  # keep subplot titles and tick labels from overlapping
plt.show()

# Distribution of the target values with a KDE curve overlaid.
plt.title('Histogram')
plt.xticks(rotation=15)
sns.histplot(y_target, kde=True)
plt.show()

# One histogram per test-set column (presumably for comparison with the
# training feature histograms).
test_copy.hist(figsize=(10, 9))
plt.tight_layout()
plt.show()

from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled training data. Note that X_test / y_test are
# this hold-out split, not the rows loaded from test.csv. random_state is
# fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X_features, y_target, test_size=0.2, random_state=42)
from sklearn.metrics import r2_score
def get_r2_score(model):
    """Compute, print and return a model's R² on the hold-out split.

    Predictions are made on the module-level ``X_test`` and compared with
    the module-level ``y_test``.

    Args:
        model: Object exposing ``predict(X)``.

    Returns:
        The unrounded R² score; only the printed value is rounded to three
        decimal places.
    """
    score = r2_score(y_test, model.predict(X_test))
    model_name = model.__class__.__name__
    print('{0} r2 score: {1}'.format(model_name, np.round(score, 3)))
    return score
# Take several models as a list and return each model's R² score as a list.
def get_r2_scores(models):
    """Score each model with ``get_r2_score`` and collect the results.

    Args:
        models: Iterable of models accepted by ``get_r2_score``.

    Returns:
        A list of R² scores in the same order as ``models``.
    """
    return [get_r2_score(candidate) for candidate in models]