# Run this cell to set up packages for lecture.
from lec18_imports import *

delays = bpd.read_csv('data/united_summer2015.csv')
delays.plot(kind='hist', y='Delay', bins=np.arange(-20.5, 210, 5), density=True, ec='w', figsize=(10, 5), title='Population Distribution of Flight Delays')
plt.xlabel('Delay (minutes)');

delays.get('Delay').describe()

count    13825.00
mean        16.66
std         39.48
           ...   
50%          2.00
75%         18.00
max        580.00
Name: Delay, Length: 8, dtype: float64

sample_means = np.array([])
repetitions = 2000

for i in np.arange(repetitions):
    sample = delays.sample(500) # Not bootstrapping!
    sample_mean = sample.get('Delay').mean()
    sample_means = np.append(sample_means, sample_mean)
    
sample_means

array([16.43, 14.02, 13.26, ..., 15.32, 13.86, 17.99])

bpd.DataFrame().assign(sample_means=sample_means).plot(kind='hist', density=True, ec='w', alpha=0.65, bins=20, figsize=(10, 5));
plt.scatter([sample_means.mean()], [-0.005], marker='^', color='green', s=250)
plt.axvline(sample_means.mean(), color='green', label=f'mean={np.round(sample_means.mean(), 2)}', linewidth=4)
plt.xlim(5, 30)
plt.ylim(-0.013, 0.26)
plt.legend();

def sample_mean_delays(sample_size):
    sample_means = np.array([])
    for i in np.arange(2000):
        sample = delays.sample(sample_size)
        sample_mean = sample.get('Delay').mean()
        sample_means = np.append(sample_means, sample_mean)
    return sample_means

sample_means = {}
sample_sizes = [5, 10, 50, 100, 200, 400, 800, 1600]

for size in sample_sizes:
    sample_means[size] = sample_mean_delays(size)

plot_many_distributions(sample_sizes, sample_means)

# Compute the standard deviation of each distribution.
sds = np.array([])
for size in sample_sizes:
    sd = np.std(sample_means[size])
    sds = np.append(sds, sd)
sds

array([17.77, 12.57,  5.69,  3.88,  2.76,  1.98,  1.36,  0.93])

observed = bpd.DataFrame().assign(
    SampleSize=sample_sizes,
    StandardDeviation=sds
)

observed.plot(kind='scatter', x='SampleSize', y='StandardDeviation', s=70, title="Standard Deviation of the Distribution of the Sample Mean vs. Sample Size", figsize=(10, 5));

np.random.seed(42)
my_sample = delays.sample(500)
my_sample.get('Delay').describe()

count    500.00
mean      13.01
std       28.00
          ...  
50%        3.00
75%       16.00
max      209.00
Name: Delay, Length: 8, dtype: float64

resample_means = np.array([])
repetitions = 2000

for i in np.arange(repetitions):
    resample = my_sample.sample(500, replace=True) # Bootstrapping!
    resample_mean = resample.get('Delay').mean()
    resample_means = np.append(resample_means, resample_mean)
    
resample_means

array([12.65, 11.5 , 11.34, ..., 12.59, 11.89, 12.58])

bpd.DataFrame().assign(resample_means=resample_means).plot(kind='hist', density=True, ec='w', alpha=0.65, bins=20, figsize=(10, 5));
plt.scatter([resample_means.mean()], [-0.005], marker='^', color='green', s=250)
plt.axvline(resample_means.mean(), color='green', label=f'mean={np.round(resample_means.mean(), 2)}', linewidth=4)
plt.xlim(7, 20)
plt.ylim(-0.015, 0.35)
plt.legend();

sample_mean_mean = my_sample.get('Delay').mean()
sample_mean_mean

13.008

sample_mean_sd = np.std(my_sample.get('Delay')) / np.sqrt(my_sample.shape[0])
sample_mean_sd

1.2511114546674091

norm_x = np.linspace(7, 20)
norm_y = normal_curve(norm_x, mu=sample_mean_mean, sigma=sample_mean_sd)
bpd.DataFrame().assign(Bootstrapping=resample_means).plot(kind='hist', density=True, ec='w', alpha=0.65, bins=20, figsize=(10, 5));
plt.plot(norm_x, norm_y, color='black', linestyle='--', linewidth=4, label='CLT')
plt.title('Distribution of the Sample Mean, Using Two Methods')
plt.xlim(7, 20)
plt.legend();

show_clt_slides()

my_sample.get('Delay').describe()

count    500.00
mean      13.01
std       28.00
          ...  
50%        3.00
75%       16.00
max      209.00
Name: Delay, Length: 8, dtype: float64

resample_means = np.array([])
repetitions = 2000

for i in np.arange(repetitions):
    resample = my_sample.sample(500, replace=True)
    resample_mean = resample.get('Delay').mean()
    resample_means = np.append(resample_means, resample_mean)
    
resample_means

array([14.37, 13.93, 11.34, ..., 16.84, 14.46, 11.4 ])

bpd.DataFrame().assign(resample_means=resample_means).plot(kind='hist', density=True, ec='w', alpha=0.65, bins=20, figsize=(10, 5));
plt.scatter([resample_means.mean()], [-0.005], marker='^', color='green', s=250)
plt.axvline(resample_means.mean(), color='green', label=f'mean={np.round(resample_means.mean(), 2)}', linewidth=4)
plt.xlim(7, 20)
plt.ylim(-0.015, 0.35)
plt.legend();

left_boot = np.percentile(resample_means, 2.5)
right_boot = np.percentile(resample_means, 97.5)
[left_boot, right_boot]

[10.6359, 15.61205]

bpd.DataFrame().assign(resample_means=resample_means).plot(kind='hist', y='resample_means', alpha=0.65, bins=20, density=True, ec='w', figsize=(10, 5), title='Distribution of Bootstrapped Sample Means');
plt.plot([left_boot, right_boot], [0, 0], color='gold', linewidth=10, label='95% bootstrap-based confidence interval');
plt.xlim(7, 20);
plt.legend();

sample_mean_mean = my_sample.get('Delay').mean()
sample_mean_mean

13.008

sample_mean_sd = np.std(my_sample.get('Delay')) / np.sqrt(my_sample.shape[0])
sample_mean_sd

1.2511114546674091

plt.figure(figsize=(10, 5))
norm_x = np.linspace(7, 20)
norm_y = normal_curve(norm_x, mu=sample_mean_mean, sigma=sample_mean_sd)
plt.plot(norm_x, norm_y, color='black', linestyle='--', linewidth=4, label='Distribution of the Sample Mean (via the CLT)')
plt.xlim(7, 20)
plt.legend();

normal_area(-2, 2)

stats.norm.cdf(2) - stats.norm.cdf(-2)

0.9544997361036416

left_normal = sample_mean_mean - 2 * sample_mean_sd
right_normal = sample_mean_mean + 2 * sample_mean_sd
[left_normal, right_normal]

[10.50577709066518, 15.510222909334818]

plt.figure(figsize=(10, 5))
norm_x = np.linspace(7, 20)
norm_y = normal_curve(norm_x, mu=sample_mean_mean, sigma=sample_mean_sd)
plt.plot(norm_x, norm_y, color='black', linestyle='--', linewidth=4, label='Distribution of the Sample Mean (via the CLT)')
plt.xlim(7, 20)
plt.ylim(0, 0.41)
plt.plot([left_normal, right_normal], [0, 0], color='#8f6100', linewidth=10, label='95% CLT-based confidence interval')
plt.legend();

[left_boot, right_boot]

[10.6359, 15.61205]

[left_normal, right_normal]

[10.50577709066518, 15.510222909334818]

estimate_z()

HBox(children=(FloatSlider(value=2.0, description='z', max=4.0, step=0.05),))

Output()

Range	All Distributions (via Chebyshev's inequality)	Normal Distribution
mean $\pm \ 1$ SD	$\geq 0\%$	$\approx 68\%$
mean $\pm \ 2$ SDs	$\geq 75\%$	$\approx 95\%$
mean $\pm \ 3$ SDs	$\geq 88.8\%$	$\approx 99.73\%$

	Bootstrapping	CLT
Pro	Works for many sample statistics (mean, median, standard deviation).	Only requires 3 numbers – the sample mean, sample SD, and sample size.
Con	Very computationally expensive (requires drawing many, many samples from the original sample).	Only works for the sample mean (and sum).

Lecture 17 – The Central Limit Theorem¶

DSC 10, Summer 2024¶

Announcements¶

Agenda¶

Recap: Standard units¶

Standard units¶

Activity: SAT scores¶

The Central Limit Theorem¶

Back to flight delays ✈️¶

Empirical distribution of a sample statistic¶

Empirical distribution of the sample mean¶

The Central Limit Theorem¶

Characteristics of the distribution of the sample mean¶

Changing the sample size¶

Standard deviation of the distribution of the sample mean¶

Standard deviation of the distribution of the sample mean¶

Recap: Distribution of the sample mean¶

Bootstrapping vs. the CLT¶

Estimating the distribution of the sample mean by bootstrapping¶

Using the CLT with just a single sample¶

Using the CLT with just a single sample¶

Why?¶

Confidence intervals¶

Confidence intervals¶

Constructing a 95% confidence interval through bootstrapping¶

Middle 95% of a normal distribution¶

Recall: Proportion of values within $z$ SDs of the mean¶

Computing a 95% confidence interval using the CLT¶

Visualizing the CLT-based confidence interval¶

Comparing confidence intervals¶

Recap: Confidence intervals for the population mean¶

Bootstrapping vs. the CLT¶

Activity¶

Concept Check ✅ – Answer at cc.dsc10.com ¶

Summary, next time¶

Summary¶

Next time¶

Lecture 17 – The Central Limit Theorem¶

DSC 10, Summer 2024¶

Announcements¶

Agenda¶

Recap: Standard units¶

Standard units¶

Activity: SAT scores¶

The Central Limit Theorem¶

Back to flight delays ✈️¶

Empirical distribution of a sample statistic¶

Empirical distribution of the sample mean¶

The Central Limit Theorem¶

Characteristics of the distribution of the sample mean¶

Changing the sample size¶

Standard deviation of the distribution of the sample mean¶

Standard deviation of the distribution of the sample mean¶

Recap: Distribution of the sample mean¶

Bootstrapping vs. the CLT¶

Estimating the distribution of the sample mean by bootstrapping¶

Using the CLT with just a single sample¶

Using the CLT with just a single sample¶

Why?¶

Confidence intervals¶

Confidence intervals¶

Constructing a 95% confidence interval through bootstrapping¶

Middle 95% of a normal distribution¶

Recall: Proportion of values within $z$ SDs of the mean¶

Computing a 95% confidence interval using the CLT¶

Visualizing the CLT-based confidence interval¶

Comparing confidence intervals¶

Recap: Confidence intervals for the population mean¶

Bootstrapping vs. the CLT¶

Activity¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Summary, next time¶

Summary¶

Next time¶

Concept Check ✅ – Answer at cc.dsc10.com ¶