In [ ]:
# matplotlib.pyplot: A plotting library used for creating static, animated, and interactive visualizations in Python.
# seaborn: A Python visualization library based on matplotlib that provides a high-level interface for drawing attractive statistical graphics.
# scipy.stats: A module in SciPy that contains a large number of probability distributions and statistical functions.
# numpy: A library for working with arrays and performing numerical computations.
# import modules
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy.stats as st
from sklearn import ensemble, tree, linear_model
import missingno as msno
In [ ]:
!pip install missingno  # install the missingno package — run this cell before the import cell above, since `import missingno` fails if the package is absent
1. First, load the data - test and train datasets¶
In [ ]:
# Load the training set (includes the SalePrice target) and the test set
# (same features, no target) from the local data/ directory.
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
2. Get quick summary of the data (excludes NaN values) - descriptive statistics¶
In [ ]:
# Descriptive statistics (count, mean, std, min/max, quartiles) for every
# numeric column; NaN values are excluded from each column's computation.
train.describe()
Out[ ]:
| Id | MSSubClass | LotFrontage | LotArea | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | BsmtFinSF1 | ... | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | MoSold | YrSold | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1460.000000 | 1460.000000 | 1201.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1452.000000 | 1460.000000 | ... | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 |
| mean | 730.500000 | 56.897260 | 70.049958 | 10516.828082 | 6.099315 | 5.575342 | 1971.267808 | 1984.865753 | 103.685262 | 443.639726 | ... | 94.244521 | 46.660274 | 21.954110 | 3.409589 | 15.060959 | 2.758904 | 43.489041 | 6.321918 | 2007.815753 | 180921.195890 |
| std | 421.610009 | 42.300571 | 24.284752 | 9981.264932 | 1.382997 | 1.112799 | 30.202904 | 20.645407 | 181.066207 | 456.098091 | ... | 125.338794 | 66.256028 | 61.119149 | 29.317331 | 55.757415 | 40.177307 | 496.123024 | 2.703626 | 1.328095 | 79442.502883 |
| min | 1.000000 | 20.000000 | 21.000000 | 1300.000000 | 1.000000 | 1.000000 | 1872.000000 | 1950.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 2006.000000 | 34900.000000 |
| 25% | 365.750000 | 20.000000 | 59.000000 | 7553.500000 | 5.000000 | 5.000000 | 1954.000000 | 1967.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 5.000000 | 2007.000000 | 129975.000000 |
| 50% | 730.500000 | 50.000000 | 69.000000 | 9478.500000 | 6.000000 | 5.000000 | 1973.000000 | 1994.000000 | 0.000000 | 383.500000 | ... | 0.000000 | 25.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.000000 | 2008.000000 | 163000.000000 |
| 75% | 1095.250000 | 70.000000 | 80.000000 | 11601.500000 | 7.000000 | 6.000000 | 2000.000000 | 2004.000000 | 166.000000 | 712.250000 | ... | 168.000000 | 68.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 8.000000 | 2009.000000 | 214000.000000 |
| max | 1460.000000 | 190.000000 | 313.000000 | 215245.000000 | 10.000000 | 9.000000 | 2010.000000 | 2010.000000 | 1600.000000 | 5644.000000 | ... | 857.000000 | 547.000000 | 552.000000 | 508.000000 | 480.000000 | 738.000000 | 15500.000000 | 12.000000 | 2010.000000 | 755000.000000 |
8 rows × 38 columns
3. Get the first and last few rows with 'head' and 'tail' functions of the Pandas library¶
In [ ]:
# First 5 rows — quick sanity check of column names and raw values.
train.head()
Out[ ]:
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
| 1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
| 2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
| 3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
| 4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
5 rows × 81 columns
In [ ]:
# Last 5 rows — confirms the file was read to the end without truncation.
train.tail()
Out[ ]:
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1455 | 1456 | 60 | RL | 62.0 | 7917 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 8 | 2007 | WD | Normal | 175000 |
| 1456 | 1457 | 20 | RL | 85.0 | 13175 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | MnPrv | NaN | 0 | 2 | 2010 | WD | Normal | 210000 |
| 1457 | 1458 | 70 | RL | 66.0 | 9042 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | GdPrv | Shed | 2500 | 5 | 2010 | WD | Normal | 266500 |
| 1458 | 1459 | 20 | RL | 68.0 | 9717 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 4 | 2010 | WD | Normal | 142125 |
| 1459 | 1460 | 20 | RL | 75.0 | 9937 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 6 | 2008 | WD | Normal | 147500 |
5 rows × 81 columns
Show shape of test and train¶
In [ ]:
# (rows, columns) of each DataFrame: train is (1460, 81), test is (1459, 80) —
# test has one fewer column, presumably the withheld SalePrice target.
train.shape, test.shape
Out[ ]:
((1460, 81), (1459, 80))
4. We examine the numeric and categorical features in the dataset¶
In [ ]:
# Select all columns with a NumPy numeric dtype (ints and floats); this
# includes Id and the SalePrice target, as the printed index below shows.
numeric_features = train.select_dtypes(include=[np.number])
print(numeric_features.columns)
Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
'MiscVal', 'MoSold', 'YrSold', 'SalePrice'],
dtype='object')
In [ ]:
# Select all columns stored with object dtype (strings) — the categorical features.
categorical_features = train.select_dtypes(include='object')
print(categorical_features.columns)
Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
'SaleType', 'SaleCondition'],
dtype='object')
5a. Visualising missing data. msno (missingno) is a Python library for visualizing missing data in a dataset.¶
In [ ]:
# Nullity matrix for a random sample of 250 rows: each white gap marks a
# missing value, so sparsely-filled columns (e.g. Alley, PoolQC, Fence —
# all NaN-heavy in the head/tail output above) stand out at a glance.
msno.matrix(train.sample(250))
Out[ ]:
<Axes: >
5b. Heatmap¶
The missingno correlation heatmap measures nullity correlation: how strongly the presence or absence of one variable affects the presence of another:
In [ ]:
# Nullity correlation heatmap: how strongly the presence/absence of one
# column predicts the presence/absence of another (values range -1 to 1).
msno.heatmap(train)
Out[ ]:
<Axes: >
5c. Missing-values bar chart. Purpose: quickly assess how complete each column of the train DataFrame is — which columns have missing data and to what extent. This is crucial for data cleaning and preprocessing in any data analysis or machine learning task.¶
In [ ]:
# Bar chart of non-null counts per column for a 1,000-row sample — a quick
# read of how complete each column is before deciding on imputation.
msno.bar(train.sample(1000))
Out[ ]:
<Axes: >
5d. Dendrogram: The dendrogram allows you to more fully correlate variable completion, revealing trends deeper than the pairwise ones visible in the correlation heatmap:¶
Understanding the Dendrogram: NEW¶
- Each branch of the dendrogram represents a variable in your dataset. Variables that are grouped together (i.e., branches that emerge from a common "root") tend to have similar patterns of missing data. The arrangement helps you understand which variables have missing values that are correlated or occur together.
- Usage: This function is particularly useful in the exploratory data analysis phase to understand the structure of missingness in your data. By identifying patterns of missing data, it can help in formulating strategies for handling missing values, such as imputation, or in identifying potential issues with data collection processes.
In [ ]:
# Dendrogram: hierarchical clustering of columns by their missing-data
# pattern; columns that branch together tend to be missing together.
msno.dendrogram(train)
Out[ ]:
<Axes: >
6a. Estimate skewness and kurtosis¶
- Skewness: If your training dataset has a high skewness, it could lead to biased models, especially in linear models like regression. For example, right-skewed data might lead a model to predict higher values more often. You might need to transform the data (e.g., using a logarithmic transformation) to reduce skewness.
- Kurtosis: High kurtosis might indicate the presence of outliers that could affect the model's performance. Models like linear regression can be particularly sensitive to outliers, so it’s important to detect and possibly address them.
In [ ]:
# Per-column skewness and kurtosis for the numeric columns only
# (numeric_only=True skips the object-dtype columns, which would raise).
train.skew(numeric_only=True), train.kurtosis(numeric_only=True)
Out[ ]:
(Id 0.000000 MSSubClass 1.407657 LotFrontage 2.163569 LotArea 12.207688 OverallQual 0.216944 OverallCond 0.693067 YearBuilt -0.613461 YearRemodAdd -0.503562 MasVnrArea 2.669084 BsmtFinSF1 1.685503 BsmtFinSF2 4.255261 BsmtUnfSF 0.920268 TotalBsmtSF 1.524255 1stFlrSF 1.376757 2ndFlrSF 0.813030 LowQualFinSF 9.011341 GrLivArea 1.366560 BsmtFullBath 0.596067 BsmtHalfBath 4.103403 FullBath 0.036562 HalfBath 0.675897 BedroomAbvGr 0.211790 KitchenAbvGr 4.488397 TotRmsAbvGrd 0.676341 Fireplaces 0.649565 GarageYrBlt -0.649415 GarageCars -0.342549 GarageArea 0.179981 WoodDeckSF 1.541376 OpenPorchSF 2.364342 EnclosedPorch 3.089872 3SsnPorch 10.304342 ScreenPorch 4.122214 PoolArea 14.828374 MiscVal 24.476794 MoSold 0.212053 YrSold 0.096269 SalePrice 1.882876 dtype: float64, Id -1.200000 MSSubClass 1.580188 LotFrontage 17.452867 LotArea 203.243271 OverallQual 0.096293 OverallCond 1.106413 YearBuilt -0.439552 YearRemodAdd -1.272245 MasVnrArea 10.082417 BsmtFinSF1 11.118236 BsmtFinSF2 20.113338 BsmtUnfSF 0.474994 TotalBsmtSF 13.250483 1stFlrSF 5.745841 2ndFlrSF -0.553464 LowQualFinSF 83.234817 GrLivArea 4.895121 BsmtFullBath -0.839098 BsmtHalfBath 16.396642 FullBath -0.857043 HalfBath -1.076927 BedroomAbvGr 2.230875 KitchenAbvGr 21.532404 TotRmsAbvGrd 0.880762 Fireplaces -0.217237 GarageYrBlt -0.418341 GarageCars 0.220998 GarageArea 0.917067 WoodDeckSF 2.992951 OpenPorchSF 8.490336 EnclosedPorch 10.430766 3SsnPorch 123.662379 ScreenPorch 18.439068 PoolArea 223.268499 MiscVal 701.003342 MoSold -0.404109 YrSold -1.190601 SalePrice 6.536282 dtype: float64)
6b. Plot distribution graphs - Johnson SU, Normal (i.e. Gaussian), and Logarithmic¶
The code plots histograms of the SalePrice data and overlays them with the fitted probability density functions (PDFs) (in red) for three different distributions: Johnson SU, Normal, and Log Normal. This helps to visually compare how well each distribution fits the data.¶
In [ ]:
# Overlay fitted probability density functions on the SalePrice histogram for
# three candidate distributions (Johnson SU, Normal, Log Normal), one figure
# each, to visually compare goodness of fit.
y = train['SalePrice']
# 1,000 evenly spaced evaluation points spanning the observed SalePrice range,
# used as the x-grid for every fitted PDF below.
x = np.linspace(y.min(), y.max(), 1000)

# (figure number, title, scipy.stats distribution) for each candidate fit.
# The loop replaces three copy-pasted figure blocks that differed only in
# these three values.
candidates = [
    (1, 'Johnson SU', st.johnsonsu),
    (2, 'Normal', st.norm),
    (3, 'Log Normal', st.lognorm),
]
for fig_num, title, dist in candidates:
    plt.figure(fig_num)
    plt.title(title)
    # stat="density" normalizes the histogram so its y-axis is on the same
    # scale as the fitted PDF. With the default raw counts (y-axis in the
    # hundreds) the PDF values (~1e-5 for prices in the 10^5 range) render
    # as an invisible flat line along the x-axis.
    sns.histplot(y, kde=False, stat="density", color="blue")
    # Fit the distribution's parameters to the data by maximum likelihood,
    # then draw its PDF in red over the histogram.
    params = dist.fit(y)
    plt.plot(x, dist.pdf(x, *params), 'r')
plt.show()