# Run this cell to set up packages for lecture.
from lec16_imports import *

# Read the salary file and keep only the TotalWages column. Having the whole
# population on hand is what lets us check our estimates later.
salaries_path = 'data/2022_salaries.csv'
population = bpd.read_csv(salaries_path).get(['TotalWages'])

# The parameter we will try to estimate from a sample.
population_median = population.get('TotalWages').median()
population_median  # In real life the population is unavailable, so this value is unknown.

78136.0

# Fix the seed so that this cell gives the same results every time it is run.
np.random.seed(38)

# Draw the one sample we pretend is all we have access to.
sample_size = 500
my_sample = population.sample(sample_size)
sample_median = my_sample.get('TotalWages').median()
sample_median

76237.0

np.random.seed(38)  # Fixed seed so that the resamples are the same on every run.

# Bootstrap the sample to get more sample medians.
n_resamples = 5000

# A bootstrap resample must have the same size as the original sample, so read
# the size from my_sample instead of repeating the literal 500.
resample_size = my_sample.shape[0]

# Collect all medians in a single pass. Growing an array with np.append inside
# a loop copies the entire array on every iteration, which is quadratic in
# n_resamples; the resamples are still drawn in the same order as before.
boot_medians = np.array(
    [
        my_sample.sample(resample_size, replace=True).get('TotalWages').median()
        for _ in range(n_resamples)
    ],
    dtype=float,  # matches the float array that np.array([]) + np.append produced
)

boot_medians

array([76896. , 72945. , 73555. , ..., 74431. , 75868. , 78601.5])

# Keep the middle 95% of the bootstrapped medians: cut 2.5% off each tail.
left, right = np.percentile(boot_medians, [2.5, 97.5])

# Therefore, our interval is:
[left, right]

[68469.0, 81253.5]

# Histogram of the bootstrapped medians with the interval and the true median overlaid.
median_bins = np.arange(63000, 88000, 1000)
boot_table = bpd.DataFrame().assign(BootstrapMedians=boot_medians)
boot_table.plot(kind='hist', density=True, bins=median_bins, ec='w', figsize=(10, 5))

# Thick gold bar along the x-axis from the left endpoint to the right endpoint.
plt.plot([left, right], [0, 0], color='gold', linewidth=12, label='95% confidence interval')

# Population median as a blue dot; its z-order is raised, presumably so that it
# is drawn on top of the gold bar.
true_median_marker = plt.scatter(population_median, 0.000004, color='blue', s=100, label='population median')
true_median_marker.set_zorder(3)
plt.legend();

# Embed the ring-toss clip.
# NOTE(review): Video is not defined in this file; presumably it arrives via the star import — confirm.
Video('data/ci-ring-toss.mp4', width=500)

# NOTE(review): opaque helper, also presumably from the star import; by its name it displays the
# confidence-interval slides — confirm.
show_confidence_interval_slides()

# Load the saved array of intervals; each row holds one interval's two endpoints.
cis_path = 'data/many_cis.npy'
many_cis = np.load(cis_path)
many_cis

array([[72881.5 , 85383.32],
       [66727.19, 81871.47],
       [65449.32, 82001.4 ],
       ...,
       [64915.5 , 81814.85],
       [66702.5 , 79711.  ],
       [67996.76, 82105.84]])

# Draw every interval as a horizontal gold segment, one row per interval.
plt.figure(figsize=(10, 6))
for row, (lower, upper) in enumerate(many_cis):
    plt.plot([lower, upper], [row, row], color='gold', linewidth=2)

# Vertical blue line at the population median.
plt.axvline(x=population_median, color='blue');

# Draw only the intervals that miss the population median, and count them.
plt.figure(figsize=(10, 6))
count_outside = 0
for row, (lower, upper) in enumerate(many_cis):
    # An interval misses when it lies entirely to the right or entirely to the left.
    misses_median = lower > population_median or upper < population_median
    if not misses_median:
        continue
    plt.plot([lower, upper], [row, row], color='gold', linewidth=2)
    count_outside += 1
plt.axvline(x=population_median, color='blue');

count_outside

11

# Our interval: the 2.5th and 97.5th percentiles of the bootstrapped medians.
[left, right]

[68469.0, 81253.5]

# Histogram of every wage in the population, with our interval drawn along the x-axis.
population.plot(
    kind='hist',
    y='TotalWages',
    density=True,
    ec='w',
    figsize=(10, 5),
)
interval_endpoints = [left, right]
plt.plot(interval_endpoints, [0, 0], color='gold', linewidth=12, label='95% confidence interval')
plt.legend();

# Histogram of the bootstrapped medians, with our interval drawn along the x-axis.
median_bins = np.arange(63000, 88000, 1000)
boot_table = bpd.DataFrame().assign(BootstrapMedians=boot_medians)
boot_table.plot(kind='hist', density=True, bins=median_bins, ec='w', figsize=(10, 5))
plt.plot([left, right], [0, 0], color='gold', linewidth=12, label='95% confidence interval')
plt.legend();

# Our interval: the 2.5th and 97.5th percentiles of the bootstrapped medians.
[left, right]

[68469.0, 81253.5]

# Bootstrap the sample again, this time recording the max of each resample.
n_resamples = 5000

# A bootstrap resample must have the same size as the original sample, so read
# the size from my_sample instead of repeating the literal 500.
resample_size = my_sample.shape[0]

# Collect all maxes in a single pass. Growing an array with np.append inside a
# loop copies the entire array on every iteration, which is quadratic in
# n_resamples; the resamples are still drawn in the same order as before.
boot_maxes = np.array(
    [
        my_sample.sample(resample_size, replace=True).get('TotalWages').max()
        for _ in range(n_resamples)
    ],
    dtype=float,  # np.array([]) + np.append produced floats; keep that dtype
)

boot_maxes

array([339416., 347755., 347755., ..., 257627., 339416., 339416.])

# Largest wage in the whole population: the parameter the bootstrapped maxes try to estimate.
all_wages = population.get('TotalWages')
population_max = all_wages.max()
population_max

384909

# Histogram of the bootstrapped maxes, with the population max marked as a blue dot.
boot_max_table = bpd.DataFrame().assign(BootstrapMax=boot_maxes)
boot_max_table.plot(kind='hist', density=True, bins=10, ec='w', figsize=(10, 5))
plt.scatter(population_max, 0.0000008, color='blue', s=100, label='population max')
plt.legend();

# Largest wage in the original sample. Every resample is drawn from my_sample,
# so no bootstrapped max can ever exceed this value.
my_sample.get('TotalWages').max()

347755

# Load the flight-delay table and draw the distribution of the Delay column.
delays = bpd.read_csv('data/united_summer2015.csv')

# 5-unit-wide bins from -20.5 up to (but not including) 210.
delay_bins = np.arange(-20.5, 210, 5)
delays.plot(kind='hist', y='Delay', bins=delay_bins, density=True, ec='w', figsize=(10, 5))
plt.title('Flight Delays')
plt.xlabel('Delay (minutes)');

# Mean of the Delay column (the histogram labels this axis in minutes).
delays.get('Delay').mean()

16.658155515370705

# Median of the Delay column, for comparison with the mean.
delays.get('Delay').median()

2.0

# Compute each statistic once and reuse it for every artist drawn below.
delay_column = delays.get('Delay')
avg_delay = delay_column.mean()
mid_delay = delay_column.median()

delays.plot(kind='hist', y='Delay', bins=np.arange(-20.5, 210, 5), density=True, ec='w', alpha=0.65, figsize=(10, 5))

# Vertical lines at the mean and the median; the y-limits set below clip them
# to the visible range.
plt.plot([avg_delay, avg_delay], [0, 1], color='green', label='Mean', linewidth=2)
# Triangle marker just below the x-axis at the mean.
plt.scatter([avg_delay], [-0.0017], color='green', marker='^', s=250)
plt.plot([mid_delay, mid_delay], [0, 1], color='purple', label='Median', linewidth=2)

plt.title('Flight Delays')
plt.xlabel('Delay (minutes)')
plt.ylim(-0.005, 0.065)
plt.legend();

# A tiny dataset for working through the standard deviation by hand.
raw_values = [2, 3, 3, 9]
data = np.array(raw_values)
data.mean()

4.25

# Deviation of each value from the mean: negative below the mean, positive above it.
deviations = data - np.mean(data)
deviations

array([-2.25, -1.25, -1.25,  4.75])

# The deviations from the mean average to 0 because the positive and negative
# ones cancel, so their plain average cannot measure spread.
np.mean(deviations)

0.0

# Square all the deviations, so that every term is non-negative and they no longer cancel:
deviations ** 2

array([ 5.06,  1.56,  1.56, 22.56])

# Variance: the average of the squared deviations from the mean.
variance = np.mean(deviations ** 2)
variance

7.6875

# Standard deviation (SD) is the square root of the variance.
# Taking the root brings the measure back to the same units as the data.
sd = variance ** 0.5
sd

2.7726341266023544

# Note that this evaluates to the same number we computed by hand above.
# By default np.std divides by n (ddof=0), which matches the
# average-of-squared-deviations definition used here.
np.std(data)

2.7726341266023544

# Redraw the delay histogram before measuring its center and spread.
delay_bins = np.arange(-20.5, 210, 5)
delays.plot(kind='hist', y='Delay', bins=delay_bins, density=True, ec='w', figsize=(10, 5), title='Flight Delays')
plt.xlabel('Delay (minutes)');

# Center of the delay distribution; reused below to build the mean ± k SD ranges.
delay_mean = delays.get('Delay').mean()
delay_mean

16.658155515370705

# Spread of the delay distribution; reused below to build the mean ± k SD ranges.
delay_std = np.std(delays.get('Delay')) # There is no .std() method in babypandas!
delay_std

39.480199851609314

# Endpoints of the range mean ± 2 SDs, shown as a (lower, upper) pair.
delay_mean - 2 * delay_std, delay_mean + 2 * delay_std

(-62.30224418784792, 95.61855521858934)

# Endpoints of the range mean ± 3 SDs, shown as a (lower, upper) pair.
delay_mean - 3 * delay_std, delay_mean + 3 * delay_std

(-101.78244403945723, 135.09875507019865)

# Delay histogram with vertical lines at the mean and at 2 and 3 SDs on either side of it.
two_sds = 2 * delay_std
three_sds = 3 * delay_std

delays.plot(kind='hist', y='Delay', bins=np.arange(-20.5, 210, 5), density=True, alpha=0.65, ec='w', figsize=(10, 5), title='Flight Delays')

# Only one line of each pair carries a label, so the legend lists each band once.
plt.axvline(delay_mean - two_sds, color='maroon', label='± 2 SD')
plt.axvline(delay_mean + two_sds, color='maroon')

plt.axvline(delay_mean + three_sds, color='blue', label='± 3 SD')
plt.axvline(delay_mean - three_sds, color='blue')

# Mean line, plus a triangle marker just below the x-axis at the mean.
plt.axvline(delay_mean, color='green', label='Mean')
plt.scatter([delay_mean], [-0.0017], color='green', marker='^', s=250)
plt.ylim(-0.0038, 0.06)
plt.legend();

# The mean ± 2 SDs range again; the next step counts the flights that fall inside it.
delay_mean - 2 * delay_std, delay_mean + 2 * delay_std

(-62.30224418784792, 95.61855521858934)

# Keep the flights whose delay lies within 2 SDs of the mean (endpoints included).
delay_minutes = delays.get('Delay')
lower_bound = delay_mean - 2 * delay_std
upper_bound = delay_mean + 2 * delay_std
at_least_lower = delay_minutes >= lower_bound
at_most_upper = delay_minutes <= upper_bound
within_2_sds = delays[at_least_lower & at_most_upper]

# Proportion of all flights that fall inside that range.
within_2_sds.shape[0] / delays.shape[0]

0.9560940325497288

Range	Proportion
mean ± 2 SDs	at least $1 - \frac{1}{4}$ (75%)
mean ± 3 SDs	at least $1 - \frac{1}{9}$ (≈ 88.89%)
mean ± 4 SDs	at least $1 - \frac{1}{16}$ (93.75%)
mean ± 5 SDs	at least $1 - \frac{1}{25}$ (96%)

Lecture 15 – Confidence Intervals, Center, and Spread¶

DSC 10, Summer 2024¶

Announcements¶

Agenda¶

Interpreting confidence intervals¶

Recap: City of San Diego employee salaries¶

Confidence intervals describe a guess for the value of an unknown parameter¶

Interpreting confidence intervals¶

Capturing the true value¶

Confidence intervals are like ring toss!¶

Many confidence intervals¶

Which confidence intervals don't contain the true parameter?¶

Confidence tradeoffs¶

Misinterpreting confidence intervals¶

Pitfalls of bootstrapping¶

Bootstrapping rules of thumb¶

Example: Estimating the max of a population¶

Visualize¶

Center and spread¶

Some questions¶

Central tendency¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Example: Flight delays ✈️¶

Comparing the mean and median¶

Standard deviation¶

Question: How "wide" is a distribution?¶

Deviations from the mean¶

Average squared deviation¶

Standard deviation¶

Standard deviation¶

Variance and standard deviation¶

What can we do with the standard deviation?¶

Chebyshev’s inequality¶

Flight delays, revisited¶

Mean and standard deviation¶

Chebyshev's inequality provides lower bounds!¶

Activity¶

Summary, next time¶

Summary: Bootstrapping and confidence intervals¶

Summary: Center, spread, and Chebyshev's inequality¶

Next time¶

Concept Check ✅ – Answer at cc.dsc10.com ¶