In [ ]:
# matplotlib.pyplot: A plotting library used for creating static, animated, and interactive visualizations in Python.
# seaborn: A Python visualization library based on matplotlib that provides a high-level interface for drawing attractive statistical graphics.
# scipy.stats: A module in SciPy that contains a large number of probability distributions and statistical functions.
# numpy: A library for working with arrays and performing numerical computations.
# import modules
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy.stats as st
from sklearn import ensemble, tree, linear_model
import missingno as msno
In [ ]:
!pip install missingno  # install the missingno package — run this cell before the import cell above, since `import missingno` fails if the package is absent
1. First, load the data - test and train datasets¶
In [ ]:
# Load the training set (includes the SalePrice target) and the test set
# (same features, no target) from the local data/ directory.
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
2. Get quick summary of the data (excludes NaN values) - descriptive statistics¶
In [ ]:
# Descriptive statistics (count, mean, std, min/max, quartiles) for every
# numeric column; NaN values are excluded from each column's computation.
train.describe()
Out[ ]:
| Id | MSSubClass | LotFrontage | LotArea | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | BsmtFinSF1 | ... | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | MoSold | YrSold | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1460.000000 | 1460.000000 | 1201.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1452.000000 | 1460.000000 | ... | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 |
| mean | 730.500000 | 56.897260 | 70.049958 | 10516.828082 | 6.099315 | 5.575342 | 1971.267808 | 1984.865753 | 103.685262 | 443.639726 | ... | 94.244521 | 46.660274 | 21.954110 | 3.409589 | 15.060959 | 2.758904 | 43.489041 | 6.321918 | 2007.815753 | 180921.195890 |
| std | 421.610009 | 42.300571 | 24.284752 | 9981.264932 | 1.382997 | 1.112799 | 30.202904 | 20.645407 | 181.066207 | 456.098091 | ... | 125.338794 | 66.256028 | 61.119149 | 29.317331 | 55.757415 | 40.177307 | 496.123024 | 2.703626 | 1.328095 | 79442.502883 |
| min | 1.000000 | 20.000000 | 21.000000 | 1300.000000 | 1.000000 | 1.000000 | 1872.000000 | 1950.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 2006.000000 | 34900.000000 |
| 25% | 365.750000 | 20.000000 | 59.000000 | 7553.500000 | 5.000000 | 5.000000 | 1954.000000 | 1967.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 5.000000 | 2007.000000 | 129975.000000 |
| 50% | 730.500000 | 50.000000 | 69.000000 | 9478.500000 | 6.000000 | 5.000000 | 1973.000000 | 1994.000000 | 0.000000 | 383.500000 | ... | 0.000000 | 25.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.000000 | 2008.000000 | 163000.000000 |
| 75% | 1095.250000 | 70.000000 | 80.000000 | 11601.500000 | 7.000000 | 6.000000 | 2000.000000 | 2004.000000 | 166.000000 | 712.250000 | ... | 168.000000 | 68.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 8.000000 | 2009.000000 | 214000.000000 |
| max | 1460.000000 | 190.000000 | 313.000000 | 215245.000000 | 10.000000 | 9.000000 | 2010.000000 | 2010.000000 | 1600.000000 | 5644.000000 | ... | 857.000000 | 547.000000 | 552.000000 | 508.000000 | 480.000000 | 738.000000 | 15500.000000 | 12.000000 | 2010.000000 | 755000.000000 |
8 rows × 38 columns
3. Get the first and last few rows with 'head' and 'tail' functions of the Pandas library¶
In [ ]:
# First 5 rows — quick sanity check of column names and raw values.
train.head()
Out[ ]:
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
| 1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
| 2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
| 3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
| 4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
5 rows × 81 columns
In [ ]:
# Last 5 rows — confirms the file was read to the end without truncation.
train.tail()
Out[ ]:
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1455 | 1456 | 60 | RL | 62.0 | 7917 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 8 | 2007 | WD | Normal | 175000 |
| 1456 | 1457 | 20 | RL | 85.0 | 13175 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | MnPrv | NaN | 0 | 2 | 2010 | WD | Normal | 210000 |
| 1457 | 1458 | 70 | RL | 66.0 | 9042 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | GdPrv | Shed | 2500 | 5 | 2010 | WD | Normal | 266500 |
| 1458 | 1459 | 20 | RL | 68.0 | 9717 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 4 | 2010 | WD | Normal | 142125 |
| 1459 | 1460 | 20 | RL | 75.0 | 9937 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 6 | 2008 | WD | Normal | 147500 |
5 rows × 81 columns
Show shape of test and train¶
In [ ]:
# (rows, columns) of each DataFrame: train is (1460, 81), test is (1459, 80) —
# test has one fewer column, presumably the withheld SalePrice target.
train.shape, test.shape
Out[ ]:
((1460, 81), (1459, 80))
4. We examine the numeric and categorical features in the dataset¶
In [ ]:
# Select all columns with a NumPy numeric dtype (ints and floats); this
# includes Id and the SalePrice target, as the printed index below shows.
numeric_features = train.select_dtypes(include=[np.number])
print(numeric_features.columns)
Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
'MiscVal', 'MoSold', 'YrSold', 'SalePrice'],
dtype='object')
In [ ]:
# Select all columns stored with object dtype (strings) — the categorical features.
categorical_features = train.select_dtypes(include='object')
print(categorical_features.columns)
Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
'SaleType', 'SaleCondition'],
dtype='object')
5a. Visualising missing data. msno (missingno) is a Python library for visualizing missing data in a dataset.¶
In [ ]:
# Nullity matrix for a random sample of 250 rows: each white gap marks a
# missing value, so sparsely-filled columns (e.g. Alley, PoolQC, Fence —
# all NaN-heavy in the head/tail output above) stand out at a glance.
msno.matrix(train.sample(250))
Out[ ]:
<Axes: >
5b. Heatmap¶
The missingno correlation heatmap measures nullity correlation: how strongly the presence or absence of one variable affects the presence of another:
In [ ]:
# Nullity correlation heatmap: how strongly the presence/absence of one
# column predicts the presence/absence of another (values range -1 to 1).
msno.heatmap(train)
Out[ ]:
<Axes: >
5c. Missing-values bar chart. Purpose: quickly assess how complete each column of the train DataFrame is — which columns have missing data and to what extent. This is crucial for data cleaning and preprocessing in any data analysis or machine learning task.¶
In [ ]:
# Bar chart of non-null counts per column for a 1,000-row sample — a quick
# read of how complete each column is before deciding on imputation.
msno.bar(train.sample(1000))
Out[ ]:
<Axes: >
5d. Dendrogram: The dendrogram allows you to more fully correlate variable completion, revealing trends deeper than the pairwise ones visible in the correlation heatmap:¶
Understanding the Dendrogram: NEW¶
- Each branch of the dendrogram represents a variable in your dataset. Variables that are grouped together (i.e., branches that emerge from a common "root") tend to have similar patterns of missing data. The arrangement helps you understand which variables have missing values that are correlated or occur together.
- Usage: This function is particularly useful in the exploratory data analysis phase to understand the structure of missingness in your data. By identifying patterns of missing data, it can help in formulating strategies for handling missing values, such as imputation, or in identifying potential issues with data collection processes.
In [ ]:
# Dendrogram: hierarchical clustering of columns by their missing-data
# pattern; columns that branch together tend to be missing together.
msno.dendrogram(train)
Out[ ]:
<Axes: >
6a. Estimate skewness and kurtosis¶
- Skewness: If your training dataset has a high skewness, it could lead to biased models, especially in linear models like regression. For example, right-skewed data might lead a model to predict higher values more often. You might need to transform the data (e.g., using a logarithmic transformation) to reduce skewness.
- Kurtosis: High kurtosis might indicate the presence of outliers that could affect the model's performance. Models like linear regression can be particularly sensitive to outliers, so it’s important to detect and possibly address them.
In [ ]:
# Per-column skewness and kurtosis for the numeric columns only
# (numeric_only=True skips the object-dtype columns, which would raise).
train.skew(numeric_only=True), train.kurtosis(numeric_only=True)
Out[ ]:
(Id 0.000000 MSSubClass 1.407657 LotFrontage 2.163569 LotArea 12.207688 OverallQual 0.216944 OverallCond 0.693067 YearBuilt -0.613461 YearRemodAdd -0.503562 MasVnrArea 2.669084 BsmtFinSF1 1.685503 BsmtFinSF2 4.255261 BsmtUnfSF 0.920268 TotalBsmtSF 1.524255 1stFlrSF 1.376757 2ndFlrSF 0.813030 LowQualFinSF 9.011341 GrLivArea 1.366560 BsmtFullBath 0.596067 BsmtHalfBath 4.103403 FullBath 0.036562 HalfBath 0.675897 BedroomAbvGr 0.211790 KitchenAbvGr 4.488397 TotRmsAbvGrd 0.676341 Fireplaces 0.649565 GarageYrBlt -0.649415 GarageCars -0.342549 GarageArea 0.179981 WoodDeckSF 1.541376 OpenPorchSF 2.364342 EnclosedPorch 3.089872 3SsnPorch 10.304342 ScreenPorch 4.122214 PoolArea 14.828374 MiscVal 24.476794 MoSold 0.212053 YrSold 0.096269 SalePrice 1.882876 dtype: float64, Id -1.200000 MSSubClass 1.580188 LotFrontage 17.452867 LotArea 203.243271 OverallQual 0.096293 OverallCond 1.106413 YearBuilt -0.439552 YearRemodAdd -1.272245 MasVnrArea 10.082417 BsmtFinSF1 11.118236 BsmtFinSF2 20.113338 BsmtUnfSF 0.474994 TotalBsmtSF 13.250483 1stFlrSF 5.745841 2ndFlrSF -0.553464 LowQualFinSF 83.234817 GrLivArea 4.895121 BsmtFullBath -0.839098 BsmtHalfBath 16.396642 FullBath -0.857043 HalfBath -1.076927 BedroomAbvGr 2.230875 KitchenAbvGr 21.532404 TotRmsAbvGrd 0.880762 Fireplaces -0.217237 GarageYrBlt -0.418341 GarageCars 0.220998 GarageArea 0.917067 WoodDeckSF 2.992951 OpenPorchSF 8.490336 EnclosedPorch 10.430766 3SsnPorch 123.662379 ScreenPorch 18.439068 PoolArea 223.268499 MiscVal 701.003342 MoSold -0.404109 YrSold -1.190601 SalePrice 6.536282 dtype: float64)
6b. Plot distribution graphs - Johnson SU, Normal (i.e. Gaussian), and Logarithmic¶
The code plots histograms of the SalePrice data and overlays them with the fitted probability density functions (PDFs) (in red) for three different distributions: Johnson SU, Normal, and Log Normal. This helps to visually compare how well each distribution fits the data.¶
In [ ]:
# Overlay fitted probability density functions on the SalePrice histogram for
# three candidate distributions (Johnson SU, Normal, Log Normal), one figure
# each, to visually compare goodness of fit.
y = train['SalePrice']
# 1,000 evenly spaced evaluation points spanning the observed SalePrice range,
# used as the x-grid for every fitted PDF below.
x = np.linspace(y.min(), y.max(), 1000)

# (figure number, title, scipy.stats distribution) for each candidate fit.
# The loop replaces three copy-pasted figure blocks that differed only in
# these three values.
candidates = [
    (1, 'Johnson SU', st.johnsonsu),
    (2, 'Normal', st.norm),
    (3, 'Log Normal', st.lognorm),
]
for fig_num, title, dist in candidates:
    plt.figure(fig_num)
    plt.title(title)
    # stat="density" normalizes the histogram so its y-axis is on the same
    # scale as the fitted PDF. With the default raw counts (y-axis in the
    # hundreds) the PDF values (~1e-5 for prices in the 10^5 range) render
    # as an invisible flat line along the x-axis.
    sns.histplot(y, kde=False, stat="density", color="blue")
    # Fit the distribution's parameters to the data by maximum likelihood,
    # then draw its PDF in red over the histogram.
    params = dist.fit(y)
    plt.plot(x, dist.pdf(x, *params), 'r')
plt.show()