# Run this cell to set up packages for lecture.
from lec16_imports import *

# np.std uses ddof=0 by default, so this is the population SD (divides by n, not n - 1).
np.std([2, 3, 3, 9])

2.7726341266023544

# Load the flight-delay data and draw a density histogram of the 'Delay' column.
# Bin edges start at -20.5 and advance in steps of 5, stopping before 210.
delays = bpd.read_csv('data/united_summer2015.csv')
delays.plot(kind='hist', y='Delay', bins=np.arange(-20.5, 210, 5), density=True, ec='w', figsize=(10, 5), title='Flight Delays')
plt.xlabel('Delay (minutes)');

delay_mean = delays.get('Delay').mean()
delay_mean

16.658155515370705

delay_std = np.std(delays.get('Delay')) # There is no .std() method in babypandas!
delay_std

39.480199851609314

delay_mean - 2 * delay_std, delay_mean + 2 * delay_std

(-62.30224418784792, 95.61855521858934)

delay_mean - 3 * delay_std, delay_mean + 3 * delay_std

(-101.78244403945723, 135.09875507019865)

# Redraw the delay histogram and overlay vertical lines at the mean
# and at 2 and 3 SDs on either side of it.
delays.plot(kind='hist', y='Delay', bins=np.arange(-20.5, 210, 5), density=True, alpha=0.65, ec='w', figsize=(10, 5), title='Flight Delays')
# Only one line of each pair carries a label, so the legend gets one entry per color.
plt.axvline(delay_mean - 2 * delay_std, color='maroon', label='± 2 SD')
plt.axvline(delay_mean + 2 * delay_std, color='maroon')

plt.axvline(delay_mean + 3 * delay_std, color='blue',  label='± 3 SD')
plt.axvline(delay_mean - 3 * delay_std, color='blue')

plt.axvline(delay_mean, color='green', label='Mean')
# Triangle marker at the mean, drawn slightly below y = 0; the lower y-limit
# is set below the marker so it is not clipped.
plt.scatter([delay_mean], [-0.0017], color='green', marker='^', s=250)
plt.ylim(-0.0038, 0.06)
plt.legend();

delay_mean - 2 * delay_std, delay_mean + 2 * delay_std

(-62.30224418784792, 95.61855521858934)

# Keep only the flights whose delay lies within 2 SDs of the mean (endpoints included).
within_2_sds = delays[(delays.get('Delay') >= delay_mean - 2 * delay_std) & 
                      (delays.get('Delay') <= delay_mean + 2 * delay_std)]

# Proportion of all flights that fall inside that range.
within_2_sds.shape[0] / delays.shape[0]

0.9560940325497288

height_and_weight = bpd.read_csv('data/height_and_weight.csv')
height_and_weight

height_and_weight.plot(kind='hist', y='Height', density=True, ec='w', bins=30, alpha=0.8, figsize=(10, 5));

height_and_weight.plot(kind='hist', y='Weight', density=True, ec='w', bins=30, alpha=0.8, color='C1', figsize=(10, 5));

height_and_weight.plot(kind='hist', density=True, ec='w', bins=60, alpha=0.8, figsize=(10, 5));

show_many_normal_distributions()

# Express a weight of 225 in standard units: its distance from the mean weight,
# measured in SDs of the 'Weight' column.
weights = height_and_weight.get('Weight')
(225 - weights.mean()) / np.std(weights)

1.9201699181580782

def standard_units(col):
    """Convert a numeric column to standard units.

    Each value is replaced by its signed distance from the column's mean,
    measured in standard deviations (as computed by ``np.std``).

    Parameters
    ----------
    col : array-like with a ``.mean()`` method
        The values to standardize, e.g. a Series or a NumPy array.

    Returns
    -------
    Same kind of object as ``col``, holding the standardized values.
    """
    deviations_from_mean = col - col.mean()
    spread = np.std(col)
    return deviations_from_mean / spread

standardized_height = standard_units(height_and_weight.get('Height'))
standardized_height

0       1.68
1      -0.09
2       1.78
        ... 
4997   -0.70
4998    0.88
4999    0.46
Name: Height, Length: 5000, dtype: float64

standardized_weight = standard_units(height_and_weight.get('Weight'))
standardized_weight

0       2.77
1      -1.25
2       1.30
        ... 
4997    0.62
4998   -0.06
4999    0.60
Name: Weight, Length: 5000, dtype: float64

# e-15 means 10^(-15), which is a very small number, effectively zero.
standardized_height.describe()

count    5.00e+03
mean     1.49e-15
std      1.00e+00
           ...   
50%      4.76e-04
75%      6.85e-01
max      3.48e+00
Name: Height, Length: 8, dtype: float64

standardized_weight.describe()

count    5.00e+03
mean     5.96e-16
std      1.00e+00
           ...   
50%      6.53e-04
75%      6.74e-01
max      4.19e+00
Name: Weight, Length: 8, dtype: float64

HTML('data/height_anim.html')

HTML('data/weight_anim.html')

# Put the two standardized columns side by side in one DataFrame, then plot it
# as a density histogram (no y= is given, so the plot is not limited to one column).
standardized_height_and_weight = bpd.DataFrame().assign(
    Height=standardized_height,
    Weight=standardized_weight
)
standardized_height_and_weight.plot(kind='hist', density=True, ec='w',bins=30, alpha=0.8, figsize=(10, 5));

def normal_curve(z):
    """Evaluate the standard normal curve at ``z``.

    Computes ``(1 / sqrt(2 * pi)) * exp(-z**2 / 2)``. ``z`` may be a single
    number or a NumPy array, in which case the curve is evaluated elementwise.
    """
    normalizing_constant = 1 / np.sqrt(2 * np.pi)
    exponent = -(z ** 2) / 2
    return normalizing_constant * np.exp(exponent)

# Evaluate the curve at 1000 evenly spaced z-values from -4 to 4 and draw it.
x = np.linspace(-4, 4, 1000)
y = normal_curve(x)

plt.figure(figsize=(10, 5))
plt.plot(x, y, color='black');
plt.xlabel('$z$');
# Raw string so the backslashes in the LaTeX title are not read as escape sequences.
plt.title(r'$\phi(z) = \frac{1}{\sqrt{2 \pi}} e^{-\frac{1}{2}z^2}$');

# Plot the standardized heights and weights as density histograms, then draw the
# previously computed (x, y) normal curve as a thick dashed line for comparison.
standardized_height_and_weight.plot(kind='hist', density=True, ec='w', bins=120, alpha=0.8, figsize=(10, 5));
plt.plot(x, y, color='black', linestyle='--', label='Normal', linewidth=5)
plt.legend(loc='upper right');

sliders()

HBox(children=(FloatSlider(value=0.0, description='a', max=3.0, min=-4.0, step=0.25), FloatSlider(value=1.0, d…

Output()

normal_area(-np.inf, 0)

from scipy import stats
stats.norm.cdf(0)

0.5

normal_area(2, np.inf)

stats.norm.cdf(2)

0.9772498680518208

normal_area(-np.inf, 2)

1 - stats.norm.cdf(2)

0.02275013194817921

normal_area(-1, 0)

stats.norm.cdf(0) - stats.norm.cdf(-1)

0.3413447460685429

# General pattern: area under the standard normal curve between a and b.
# NOTE(review): a and b are not assigned anywhere in the visible code; unless the
# star import provides them, running this line raises a NameError — confirm it is
# meant as an illustration only.
stats.norm.cdf(b) - stats.norm.cdf(a)

height_and_weight

weight_mean = weights.mean()
weight_mean

187.0206206581932

weight_std = np.std(weights)
weight_std

19.779176302396458

left = (200 - weight_mean) / weight_std
left

0.656214351061435

right = (225 - weight_mean) / weight_std
right

1.9201699181580782

normal_area(left, right)

# Area under the standard normal curve between the two standardized endpoints
# (left and right are 200 and 225 converted to standard units).
approximation = stats.norm.cdf(right) - stats.norm.cdf(left)
approximation

0.22842488819306406

# True proportion of values between 200 and 225 (both endpoints included):
# count the rows whose 'Weight' is in range and divide by the total number of rows.
height_and_weight[
    (height_and_weight.get('Weight') >= 200) &
    (height_and_weight.get('Weight') <= 225)
].shape[0] / height_and_weight.shape[0]

0.2294

# Approximation using the standard normal curve.
approximation

0.22842488819306406

delays.plot(kind='hist', y='Delay', bins=np.arange(-20.5, 210, 5), density=True, ec='w', figsize=(10, 5), title='Flight Delays')
plt.xlabel('Delay (minutes)');

HTML('data/delay_anim.html')

normal_area(-1, 1, bars=True)

stats.norm.cdf(1) - stats.norm.cdf(-1)

0.6826894921370859

normal_area(-2, 2, bars=True)

stats.norm.cdf(2) - stats.norm.cdf(-2)

0.9544997361036416

normal_area(-1, 1)

height_and_weight.plot(kind='hist', y='Height', density=True, ec='w', bins=40, alpha=0.8, figsize=(10, 5));
plt.xticks(np.arange(60, 78, 2));

np.std(height_and_weight.get('Height'))

2.863075878119538

Range	Proportion
mean ± 2 SDs	at least $1 - \frac{1}{4}$ (75%)
mean ± 3 SDs	at least $1 - \frac{1}{9}$ (88.88..%)
mean ± 4 SDs	at least $1 - \frac{1}{16}$ (93.75%)
mean ± 5 SDs	at least $1 - \frac{1}{25}$ (96%)

	Height	Weight
0	73.85	241.89
1	68.78	162.31
2	74.11	212.74
...	...	...
4997	67.01	199.20
4998	71.56	185.91
4999	70.35	198.90

	Height	Weight
0	73.85	241.89
1	68.78	162.31
2	74.11	212.74
...	...	...
4997	67.01	199.20
4998	71.56	185.91
4999	70.35	198.90

Range	All Distributions (via Chebyshev's inequality)	Normal Distribution
mean $\pm \ 1$ SD	$\geq 0\%$	$\approx 68\%$
mean $\pm \ 2$ SDs	$\geq 75\%$	$\approx 95\%$
mean $\pm \ 3$ SDs	$\geq 88.8\%$	$\approx 99.73\%$

Range	All Distributions (via Chebyshev's inequality)	Normal Distribution
mean $\pm \ 1$ SD	$\geq 0\%$	$\approx 68\%$
mean $\pm \ 2$ SDs	$\geq 75\%$	$\approx 95\%$
mean $\pm \ 3$ SDs	$\geq 88.8\%$	$\approx 99.73\%$

Lecture 16 – Standardization and the Normal Distribution

DSC 10, Spring 2025

Agenda

Chebyshev's inequality

Recap: variance and standard deviation

Standard deviation

What can we do with the standard deviation?

Chebyshev's inequality

Flight delays, revisited ✈️

Mean and standard deviation

Chebyshev's inequality provides lower bounds!

Activity

Standardization

Heights and weights 📏

Distributions of height and weight

Many normal distributions

Standard units

Standardization

The effect of standardization

Standardized histograms

The standard normal distribution

The standard normal distribution

The standard normal curve

Heights/weights are roughly normal

The standard normal distribution

Cumulative distribution functions

Areas under the standard normal curve

Areas under the standard normal curve

Areas under the standard normal curve

General strategy for finding area

Using the normal distribution

Standard units and the normal distribution

Example: Proportion of weights between 200 and 225 pounds

Checking the approximation

Warning: Standardization doesn't make a distribution normal!

Chebyshev's inequality and the normal distribution

68% of values are within 1 SD of the mean

95% of values are within 2 SDs of the mean

Recap: Proportion of values within $z$ SDs of the mean

Inflection points

Example: Inflection points

Summary, next time

Summary: Spread and Chebyshev's inequality

Summary: Standard units and the normal distribution

Next time