In [1]:
Copied!
import matplotlib.pyplot as plt
from scipy.stats import probplot, norm
import pandas as pd

# Baseline Q-Q plot: 100 draws from a standard normal, plotted against the
# theoretical normal quantiles.
fig, ax = plt.subplots(figsize=(4, 4))
# Fixed seed so the sample (and therefore the plot) is identical on every
# Restart & Run All.
norm_sample = norm.rvs(size=100, random_state=42)
probplot(norm_sample, plot=ax)
import matplotlib.pyplot as plt
from scipy.stats import probplot, norm
import pandas as pd

# Baseline Q-Q plot: 100 draws from a standard normal, plotted against the
# theoretical normal quantiles.
fig, ax = plt.subplots(figsize=(4, 4))
# Fixed seed so the sample (and therefore the plot) is identical on every
# Restart & Run All.
norm_sample = norm.rvs(size=100, random_state=42)
probplot(norm_sample, plot=ax)
Out[1]:
((array([-2.46203784, -2.12570747, -1.93122778, -1.79044653, -1.67819304,
-1.58381122, -1.50174123, -1.42869743, -1.36256869, -1.30191411,
-1.24570419, -1.19317644, -1.14374949, -1.09696931, -1.05247413,
-1.00997067, -0.96921765, -0.93001393, -0.89218993, -0.85560121,
-0.82012357, -0.78564937, -0.75208458, -0.71934648, -0.68736185,
-0.65606548, -0.62539893, -0.59530962, -0.56574992, -0.53667655,
-0.50804994, -0.47983378, -0.45199463, -0.42450149, -0.39732558,
-0.37044003, -0.34381966, -0.31744076, -0.29128096, -0.26531902,
-0.23953472, -0.21390872, -0.18842244, -0.16305799, -0.13779803,
-0.1126257 , -0.08752455, -0.06247843, -0.03747145, -0.01248789,
0.01248789, 0.03747145, 0.06247843, 0.08752455, 0.1126257 ,
0.13779803, 0.16305799, 0.18842244, 0.21390872, 0.23953472,
0.26531902, 0.29128096, 0.31744076, 0.34381966, 0.37044003,
0.39732558, 0.42450149, 0.45199463, 0.47983378, 0.50804994,
0.53667655, 0.56574992, 0.59530962, 0.62539893, 0.65606548,
0.68736185, 0.71934648, 0.75208458, 0.78564937, 0.82012357,
0.85560121, 0.89218993, 0.93001393, 0.96921765, 1.00997067,
1.05247413, 1.09696931, 1.14374949, 1.19317644, 1.24570419,
1.30191411, 1.36256869, 1.42869743, 1.50174123, 1.58381122,
1.67819304, 1.79044653, 1.93122778, 2.12570747, 2.46203784]),
array([-2.44830184, -2.02998617, -2.01832697, -1.88827648, -1.8829128 ,
-1.8796135 , -1.6358162 , -1.55985054, -1.52759732, -1.51916628,
-1.45915343, -1.45518746, -1.37631179, -1.34355166, -1.30785552,
-1.25498437, -1.20370201, -1.12883074, -1.02799441, -1.02142495,
-0.98536632, -0.92549378, -0.89739628, -0.84865924, -0.84004157,
-0.80309913, -0.78787241, -0.72460806, -0.72362519, -0.65810047,
-0.63201213, -0.60986112, -0.60155744, -0.60143905, -0.59571075,
-0.56221674, -0.55689617, -0.54552105, -0.54364474, -0.51663865,
-0.48852302, -0.47480099, -0.45639851, -0.44956551, -0.4402891 ,
-0.41923124, -0.40817537, -0.40626393, -0.32432492, -0.31037607,
-0.29542827, -0.26741507, -0.23194245, -0.18548701, -0.1694805 ,
-0.14924765, -0.14730364, -0.13969416, -0.08049323, -0.06968245,
-0.05378667, 0.0084616 , 0.0273424 , 0.1562903 , 0.1642044 ,
0.18881956, 0.19401566, 0.21096791, 0.23489804, 0.26099298,
0.27488471, 0.33182229, 0.36487213, 0.40807126, 0.46939119,
0.47178008, 0.48553555, 0.49873911, 0.50231611, 0.63153126,
0.64858313, 0.65465624, 0.65651203, 0.70579422, 0.80309328,
0.82924662, 0.87432743, 0.99881214, 1.05294332, 1.05721602,
1.11938246, 1.28842505, 1.39150953, 1.44047513, 1.50378834,
1.57025122, 1.62162806, 2.17055666, 2.39380068, 3.24126204])),
(np.float64(1.0312192825303765),
np.float64(-0.18989314332756427),
np.float64(0.9909872901968868)))
In [2]:
Copied!
# Load the loan data and standardise column "x" to z-scores:
# subtract the column mean, then divide by the column standard deviation.
loans = pd.read_csv("../data/loans_income.csv")
loan_values = loans["x"].dropna()
z_loan = loan_values.sub(loan_values.mean()).div(loan_values.std())
z_loan
# Load the loan data and standardise column "x" to z-scores:
# subtract the column mean, then divide by the column standard deviation.
loans = pd.read_csv("../data/loans_income.csv")
loan_values = loans["x"].dropna()
z_loan = loan_values.sub(loan_values.mean()).div(loan_values.std())
z_loan
Out[2]:
0 -0.053557
1 -0.509872
2 0.950336
3 0.304255
4 -0.964939
...
49995 -0.874924
49996 -0.449030
49997 -0.570714
49998 0.402758
49999 0.037706
Name: x, Length: 50000, dtype: float64
In [3]:
Copied!
# Q-Q plot of the standardised loan values against the normal distribution
# ("norm" is probplot's default reference distribution, spelled out here).
fig, ax = plt.subplots(figsize=(4, 4))
probplot(x=z_loan, dist="norm", plot=ax)
# Q-Q plot of the standardised loan values against the normal distribution
# ("norm" is probplot's default reference distribution, spelled out here).
fig, ax = plt.subplots(figsize=(4, 4))
probplot(x=z_loan, dist="norm", plot=ax)
Out[3]:
((array([-4.19138481, -3.98563638, -3.8734641 , ..., 3.8734641 ,
3.98563638, 4.19138481], shape=(50000,)),
array([-1.97007936, -1.88185848, -1.87881638, ..., 3.94452245,
3.96201452, 3.96201452], shape=(50000,))),
(np.float64(0.9658789452608367),
np.float64(-4.2414428944933633e-16),
np.float64(0.9658085995476677)))
In [4]:
Copied!
import numpy as np

# Approximate the sampling distribution of the mean: draw many random samples
# from column "x" and record the mean of each one.
N_RESAMPLES = 10000  # number of sample means to collect
SAMPLE_SIZE = 100    # observations per sample
rng = np.random.default_rng(42)  # fixed seed so the means are reproducible on re-run

# Only column "x" is needed, so sample it directly rather than whole rows.
# The one generator is shared across draws: every sample differs, but the
# sequence as a whole is repeatable.
loan_dist = [
    loans["x"].sample(SAMPLE_SIZE, random_state=rng).mean()
    for _ in range(N_RESAMPLES)
]

# Q-Q plot of the sample means against the normal distribution.
fig, ax = plt.subplots(figsize=(4, 4))
probplot(x=loan_dist, plot=ax)
import numpy as np

# Approximate the sampling distribution of the mean: draw many random samples
# from column "x" and record the mean of each one.
N_RESAMPLES = 10000  # number of sample means to collect
SAMPLE_SIZE = 100    # observations per sample
rng = np.random.default_rng(42)  # fixed seed so the means are reproducible on re-run

# Only column "x" is needed, so sample it directly rather than whole rows.
# The one generator is shared across draws: every sample differs, but the
# sequence as a whole is repeatable.
loan_dist = [
    loans["x"].sample(SAMPLE_SIZE, random_state=rng).mean()
    for _ in range(N_RESAMPLES)
]

# Q-Q plot of the sample means against the normal distribution.
fig, ax = plt.subplots(figsize=(4, 4))
probplot(x=loan_dist, plot=ax)
Out[4]:
((array([-3.81060943, -3.58545756, -3.46184782, ..., 3.46184782,
3.58545756, 3.81060943], shape=(10000,)),
array([57848.29, 57874.55, 58274.11, ..., 81005.91, 81481.89, 82403.12],
shape=(10000,))),
(np.float64(3271.061390190755),
np.float64(68758.103299),
np.float64(0.9996216628566901)))
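We can see that the sampling distribution of the sample mean is approximately normal: the points on the probability plot fall almost exactly on the reference line (r ≈ 0.9996), unlike the raw loan values plotted above (r ≈ 0.966).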
In [5]:
Copied!
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of the sample means with a kernel density estimate on top.
# Both layers are drawn on an explicitly created Axes, so the result does not
# depend on whichever figure pyplot currently treats as "current".
fig, ax = plt.subplots()
ax.hist(loan_dist, bins=30, density=True, alpha=0.5, edgecolor='black')
# Label the figure so it can be read without the surrounding cells.
ax.set(xlabel="Sample mean of x", title="Sampling distribution of the sample mean")
sns.kdeplot(loan_dist, color='orange', linewidth=2, ax=ax)
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of the sample means with a kernel density estimate on top.
# Both layers are drawn on an explicitly created Axes, so the result does not
# depend on whichever figure pyplot currently treats as "current".
fig, ax = plt.subplots()
ax.hist(loan_dist, bins=30, density=True, alpha=0.5, edgecolor='black')
# Label the figure so it can be read without the surrounding cells.
ax.set(xlabel="Sample mean of x", title="Sampling distribution of the sample mean")
sns.kdeplot(loan_dist, color='orange', linewidth=2, ax=ax)
Out[5]:
<Axes: ylabel='Density'>