# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
set_matplotlib_formats("svg")
plt.style.use('ggplot')
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
# Animations
import time
from IPython.display import display, HTML, IFrame, clear_output
import ipywidgets as widgets
import warnings
warnings.filterwarnings('ignore')
def normal_curve(x, mu=0, sigma=1):
    """Evaluate the normal (Gaussian) probability density function at x.

    Parameters
    ----------
    x : float or array-like
        Point(s) at which to evaluate the density. NumPy arrays are handled
        element-wise.
    mu : float, default 0
        Mean of the distribution.
    sigma : float, default 1
        Standard deviation of the distribution.

    Returns
    -------
    float or numpy.ndarray
        The density value(s), with the same shape as x.
    """
    # The normalising constant must include sigma; without it the curve only
    # integrates to 1 when sigma == 1.
    return 1 / (sigma * np.sqrt(2 * np.pi)) * np.exp(-(x - mu)**2 / (2 * sigma**2))
def normal_area(a, b, bars=False):
    """Plot the standard normal curve and shade the region between a and b.

    The curve is drawn on a fixed grid from -4 to 4. When `bars` is true,
    red vertical lines are also drawn at a and at b.
    """
    grid = np.linspace(-4, 4, 1000)
    density = normal_curve(grid)
    # Boolean mask picking out the grid points that fall inside [a, b].
    shaded = (grid >= a) & (grid <= b)

    plt.figure(figsize=(10, 5))
    plt.plot(grid, density, color='black')
    plt.fill_between(grid[shaded], density[shaded], color='gold')
    if bars:
        for edge in (a, b):
            plt.axvline(edge, color='red')
    plt.title(f'Area between {np.round(a, 2)} and {np.round(b, 2)}')
    plt.show()
def show_clt_slides():
    """Display the embedded Google Slides presentation as a 960x509 IFrame."""
    display(
        IFrame(
            "https://docs.google.com/presentation/d/e/2PACX-1vTcJd3U1H1KoXqBFcWGKFUPjZbeW4oiNZZLCFY8jqvSDsl4L1rRTg7980nPs1TGCAecYKUZxH5MZIBh/embed?start=false&loop=false&delayms=3000",
            960,
            509,
        )
    )
The Final project is due on Tuesday 3/14 at 11:59PM and has 8 sections. How much progress have you made?
A. Not started or barely started ⏳
B. Finished 1 or 2 sections
C. Finished 3 or 4 sections ❤️
D. Finished 5 or 6 sections
E. Finished 7 or 8 sections 🤯
SAT scores range from 0 to 1600. The distribution of SAT scores has a mean of 950 and a standard deviation of 300. Your friend tells you that their SAT score, in standard units, is 2.5. What do you conclude?
`scipy.stats.norm.cdf`.
Last time, we looked at a data set of heights and weights of 5000 adult males.
# Load the heights and weights data set from a CSV file into a DataFrame,
# then display it.
height_and_weight = bpd.read_csv('data/height_and_weight.csv')
height_and_weight
| | Height | Weight |
---|---|---|
0 | 73.85 | 241.89 |
1 | 68.78 | 162.31 |
2 | 74.11 | 212.74 |
... | ... | ... |
4997 | 67.01 | 199.20 |
4998 | 71.56 | 185.91 |
4999 | 70.35 | 198.90 |
5000 rows × 2 columns
Both variables are roughly normal. What benefit is there to knowing that the two distributions are roughly normal?
Let's suppose, as is often the case, that we don't have access to the entire distribution of heights, just the mean and SD.
# Pull out the 'Height' column and compute its mean.
heights = height_and_weight.get('Height')
height_mean = heights.mean()
height_mean
69.02634590621737
# Standard deviation of the heights. np.std defaults to ddof=0, i.e. it
# divides by n rather than n - 1.
height_std = np.std(heights)
height_std
2.863075878119538
Using just this information, we can estimate the proportion of heights between 65 and 70 inches:
1. Convert 65 to standard units.
2. Convert 70 to standard units.
3. Use `stats.norm.cdf` to find the area between (1) and (2).
left = (65 - height_mean) / height_std
left
-1.4063008029189459
right = (70 - height_mean) / height_std
right
0.3400727522534686
normal_area(left, right)
from scipy import stats
# Area under the standard normal curve between `left` and `right`:
# the difference of the two CDF values.
approximation = stats.norm.cdf(right) - stats.norm.cdf(left)
approximation
0.5532817187111865
Since we have access to the entire set of heights, we can compute the true proportion of heights between 65 and 70 inches.
# True proportion of values between 65 and 70 (both endpoints inclusive):
# count the matching rows and divide by the total number of rows.
height_col = height_and_weight.get('Height')
between_65_and_70 = height_and_weight[(height_col >= 65) & (height_col <= 70)]
between_65_and_70.shape[0] / height_and_weight.shape[0]
0.554
# Approximation using the standard normal curve.
approximation
0.5532817187111865
Pretty good for an approximation! 🤩
Consider the distribution of delays from earlier in the lecture.
# Load the flight delays and draw a density histogram of the 'Delay' column.
delays = bpd.read_csv('data/delays.csv')
delays.plot(
    kind='hist',
    y='Delay',
    bins=np.arange(-20.5, 210, 5),  # bin edges spaced 5 apart, starting at -20.5
    density=True,
    ec='w',
    figsize=(10, 5),
    title='Flight Delays',
)
plt.xlabel('Delay (minutes)');
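The distribution above does not look normal. It won't look normal even if we standardize it. By standardizing a distribution, all we do is shift it horizontally (subtracting the mean) and rescale it (dividing by the SD, which here squeezes it horizontally and so stretches the density histogram vertically) – the shape itself doesn't change.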
HTML('data/delay_anim.html')
Percent in Range | Normal Distribution |
---|---|
$\text{mean} \pm 1 \: \text{SD}$ | $\approx 68\%$ |
$\text{mean} \pm 2 \: \text{SDs}$ | $\approx 95\%$ |
$\text{mean} \pm 3 \: \text{SDs}$ | $\approx 99.73\%$ |
normal_area(-1, 1, bars=True)
stats.norm.cdf(1) - stats.norm.cdf(-1)
0.6826894921370859
This means that if a variable follows a normal distribution, approximately 68% of values will be within 1 SD of the mean.
normal_area(-2, 2, bars=True)
stats.norm.cdf(2) - stats.norm.cdf(-2)
0.9544997361036416
Range | All Distributions (via Chebyshev's inequality) | Normal Distribution |
---|---|---|
mean $\pm \ 1$ SD | $\geq 0\%$ | $\approx 68\%$ |
mean $\pm \ 2$ SDs | $\geq 75\%$ | $\approx 95\%$ |
mean $\pm \ 3$ SDs | $\geq 88.8\%$ | $\approx 99.73\%$ |
Remember: The distribution of heights is roughly normal, but it is not a standard normal distribution.
# Density histogram of heights with 40 bins; x-axis ticks at 60, 62, ..., 76.
height_and_weight.plot(
    kind='hist',
    y='Height',
    density=True,
    ec='w',
    bins=40,
    alpha=0.8,
    figsize=(10, 5),
);
plt.xticks(np.arange(60, 78, 2));
np.std(height_and_weight.get('Height'))
2.863075878119538
The distribution of flight delays that we've been looking at is not roughly normal.
# Reload the flight delays and plot the 'Delay' column as a density histogram.
delays = bpd.read_csv('data/delays.csv')
delays.plot(
    kind='hist',
    y='Delay',
    bins=np.arange(-20.5, 210, 5),  # bin edges spaced 5 apart, starting at -20.5
    density=True,
    ec='w',
    figsize=(10, 5),
    title='Population Distribution of Flight Delays',
)
plt.xlabel('Delay (minutes)');
delays.get('Delay').describe()
count    13825.00
mean        16.66
std         39.48
           ...
50%          2.00
75%         18.00
max        580.00
Name: Delay, Length: 8, dtype: float64
Since we have access to the population of flight delays, let's remind ourselves what the distribution of the sample mean looks like by drawing samples repeatedly from the population.
# Simulate the distribution of the sample mean: repeatedly draw a sample of
# 500 rows from `delays` and record the mean of its 'Delay' column.
repetitions = 2000
sample_means = np.array([])

for i in np.arange(repetitions):
    # One simulated sample and its mean delay.
    sample = delays.sample(500)
    sample_mean = sample.get('Delay').mean()
    # Collect this repetition's mean alongside the earlier ones.
    sample_means = np.append(sample_means, sample_mean)

sample_means
array([15.65, 17.02, 16.58, ..., 18.76, 16.87, 13.23])
# Density histogram of the simulated sample means.
bpd.DataFrame().assign(sample_means=sample_means).plot(
    kind='hist',
    density=True,
    ec='w',
    alpha=0.65,
    bins=20,
    figsize=(10, 5),
);
# Mark the mean of the sample means with a green triangle just below the
# x-axis and a matching vertical line.
plt.scatter(
    [sample_means.mean()],
    [-0.005],
    marker='^',
    color='green',
    s=250,
)
plt.axvline(
    sample_means.mean(),
    color='green',
    label=f'mean={np.round(sample_means.mean(), 2)}',
    linewidth=4,
)
# The lower y-limit is negative so the triangle marker stays visible.
plt.xlim(5, 30)
plt.ylim(-0.013, 0.26)
plt.legend();