# Run this cell to set up packages for lecture.
from lec14_imports import *

# Load the 2023 salaries CSV into a DataFrame, then display it.
population = bpd.read_csv('data/2023_salaries.csv')
population

# List every column name in the salary data.
population.columns

Index(['Year', 'EmployerType', 'EmployerName', 'DepartmentOrSubdivision',
       'Position', 'ElectedOfficial', 'Judicial', 'OtherPositions',
       'MinPositionSalary', 'MaxPositionSalary', 'ReportedBaseWage',
       'RegularPay', 'OvertimePay', 'LumpSumPay', 'OtherPay', 'TotalWages',
       'DefinedBenefitPlanContribution', 'EmployeesRetirementCostCovered',
       'DeferredCompensationPlan', 'HealthDentalVision',
       'TotalRetirementAndHealthContribution', 'PensionFormula', 'EmployerURL',
       'EmployerPopulation', 'LastUpdatedDate', 'EmployerCounty',
       'SpecialDistrictActivities', 'IncludesUnfundedLiability',
       'SpecialDistrictType'],
      dtype='object')

# Keep only the TotalWages column. Passing a list to .get (instead of a bare
# string) keeps the result a one-column DataFrame, which the later .sample and
# .plot calls on `population` rely on.
population = population.get(['TotalWages'])
population

# Density histogram of TotalWages for the whole population, with bin edges
# every 10,000 starting at 0.
population.plot(
    kind='hist',
    bins=np.arange(0, 500_000, 10_000),
    density=True,
    ec='w',
    figsize=(10, 5),
    title='Distribution of Total Wages of San Diego City Employees in 2023',
);

# Median of TotalWages over the entire population DataFrame.
# .get with a string (not a list) returns the column as a Series.
population_median = population.get('TotalWages').median()
population_median

80492.0

# Fix the random seed so that the "random" sample drawn below is the same
# every time this code is run.
np.random.seed(38)

# Take a sample of size 500 from the population.
my_sample = population.sample(500)
my_sample

# Compute the sample median: the median of TotalWages among the 500 rows
# of my_sample.
sample_median = my_sample.get('TotalWages').median()
sample_median

82508.0

# Draw 1000 fresh samples of size 500 from the population and record the
# median of TotalWages in each one.
#
# The medians are gathered in a list and converted to an array once at the
# end. np.append returns a new copy of the array on every call, so using it
# inside the loop copies the growing array 1000 times for no benefit.
# The samples are drawn in the same order as before, so a seeded run
# produces the same medians.
sample_medians = np.array([
    population.sample(500).get('TotalWages').median()
    for _ in range(1000)
])
sample_medians

array([82603.5, 84498. , 77594.5, ..., 83471.5, 84897.5, 75602. ])

# Put the simulated medians in a one-column DataFrame so they can be drawn
# as a density histogram with 30 bins.
sample_medians_df = bpd.DataFrame().assign(SampleMedians=sample_medians)
sample_medians_df.plot(
    kind='hist',
    density=True,
    bins=30,
    ec='w',
    figsize=(8, 5),
    title='Distribution of the Sample Median of 1000 Samples from the Population\nSample Size = 500',
);

# Overlay the population and sample histograms on one set of axes, using the
# same bin edges and styling for both so the shapes are directly comparable.
fig, ax = plt.subplots(figsize=(10, 5))
bins = np.arange(10_000, 500_000, 10_000)
shared_hist_options = dict(kind='hist', y='TotalWages', ax=ax, density=True, alpha=.75, bins=bins, ec='w')
for wages_df in (population, my_sample):
    wages_df.plot(**shared_hist_options)
plt.legend(['Population', 'My Sample']);

# NOTE(review): helper not defined in this file; presumably provided by
# lec14_imports and displays the bootstrapping slides inline — confirm.
show_bootstrapping_slides()

# Resampling WITHOUT replacement: drawing 3 values from 3 without replacement
# only reorders the original values, so every resample has the same median.
original = [7, 9, 4]
for _ in range(10):
    resample = np.random.choice(original, size=3, replace=False)
    resample_median = np.median(resample)
    print("Resample: ", resample, "    Median: ", resample_median)

Resample:  [9 4 7]     Median:  7.0
Resample:  [7 4 9]     Median:  7.0
Resample:  [4 7 9]     Median:  7.0
Resample:  [4 9 7]     Median:  7.0
Resample:  [7 4 9]     Median:  7.0
Resample:  [9 7 4]     Median:  7.0
Resample:  [4 7 9]     Median:  7.0
Resample:  [7 9 4]     Median:  7.0
Resample:  [9 4 7]     Median:  7.0
Resample:  [4 9 7]     Median:  7.0

# Resampling WITH replacement: values may repeat or be left out, so the
# median can differ from one resample to the next.
original = [7, 9, 4]
for _ in range(10):
    resample = np.random.choice(original, size=3, replace=True)
    resample_median = np.median(resample)
    print("Resample: ", resample, "    Median: ", resample_median)

Resample:  [7 4 7]     Median:  7.0
Resample:  [9 4 9]     Median:  9.0
Resample:  [9 9 9]     Median:  9.0
Resample:  [9 4 9]     Median:  9.0
Resample:  [4 4 9]     Median:  4.0
Resample:  [9 9 7]     Median:  9.0
Resample:  [4 7 4]     Median:  4.0
Resample:  [9 4 7]     Median:  7.0
Resample:  [4 9 4]     Median:  4.0
Resample:  [7 4 4]     Median:  4.0

# Note that the population DataFrame, population, doesn't appear anywhere here.
# This is all based on one sample, my_sample.

# Fix the random seed so that we get the same results every time this code is run.
np.random.seed(38)

n_resamples = 5000

# Each resample has as many rows as my_sample itself. Reading the size from
# the sample (instead of hard-coding 500) keeps the two in sync if the sample
# size is ever changed.
sample_size = my_sample.shape[0]

# For each resample: draw from my_sample WITH REPLACEMENT, then compute the
# median of TotalWages. The medians are collected in a list and converted to
# an array once, because np.append would copy the whole array on every
# iteration.
boot_medians = np.array([
    my_sample.sample(sample_size, replace=True).get('TotalWages').median()
    for _ in range(n_resamples)
])

boot_medians

array([85751. , 76009. , 83106. , ..., 82760. , 83470.5, 82711. ])

# Density histogram of the bootstrapped medians, with the population median
# marked as a blue dot just above the x-axis.
boot_medians_df = bpd.DataFrame().assign(BootstrapMedians=boot_medians)
boot_medians_df.plot(
    kind='hist',
    density=True,
    bins=np.arange(65000, 95000, 1000),
    ec='w',
    figsize=(10, 5),
)
population_dot = plt.scatter(population_median, 0.000004, color='blue', s=100, label='population median')
population_dot.set_zorder(2)
plt.legend();

# Median of TotalWages in the original sample, my_sample.
my_sample.get('TotalWages').median()

82508.0

# Density histogram of the bootstrapped medians, using 1000-wide bins.
boot_hist_df = bpd.DataFrame().assign(BootstrapMedians=boot_medians)
boot_hist_df.plot(
    kind='hist',
    density=True,
    bins=np.arange(65000, 95000, 1000),
    ec='w',
    figsize=(10, 5),
)
plt.legend();

# 50th percentile of five values given in arbitrary order.
np.percentile([4, 6, 9, 2, 7], 50) # unsorted data

6.0

# 50th percentile of the values 2, 4, 6, 7, 9 given in increasing order.
np.percentile([2, 4, 6, 7, 9], 50) # sorted data

6.0

# Bootstrap distribution of the sample median, with the population median
# marked as a blue dot.
(bpd.DataFrame()
 .assign(BootstrapMedians=boot_medians)
 .plot(kind='hist', density=True, bins=np.arange(65000, 95000, 1000), ec='w', figsize=(10, 5))
)
(plt.scatter(population_median, 0.000004, color='blue', s=100, label='population median')
 .set_zorder(2))
plt.legend();

# Display the array of bootstrapped medians.
boot_medians

array([85751. , 76009. , 83106. , ..., 82760. , 83470.5, 82711. ])

# Left endpoint: the 2.5th percentile of the bootstrapped medians,
# i.e. the value that cuts off the lowest 2.5% of them.
left = np.percentile(boot_medians, 2.5)
left

70671.5

# Right endpoint: the 97.5th percentile of the bootstrapped medians,
# i.e. the value that cuts off the highest 2.5% of them.
right = np.percentile(boot_medians, 97.5)
right

86405.0

# Therefore, our interval runs from the left endpoint to the right endpoint:
[left, right]

[70671.5, 86405.0]

# Draw the bootstrap histogram, then the interval as a thick gold bar along
# the x-axis, then the population median as a blue dot.
# The zorder values stack them back to front: bars (1), interval (2), dot (3).
(bpd.DataFrame()
 .assign(BootstrapMedians=boot_medians)
 .plot(kind='hist', density=True, bins=np.arange(65000, 95000, 1000), ec='w', figsize=(10, 5), zorder=1)
)
plt.plot([left, right], [0, 0], color='gold', linewidth=12, label='95% confidence interval', zorder=2)
plt.scatter(population_median, 0.000004, color='blue', s=100, label='population median', zorder=3)
plt.legend();

# Report the interval's endpoints and its width (right endpoint minus left).
interval = [left, right]
width = right - left
print('Interval:', interval)
print('Width:', width)

Interval: [70671.5, 86405.0]
Width: 15733.5

	TotalWages
0	433011
1	416044
2	405315
...	...
13885	10
13886	8
13887	2

	TotalWages
4091	113944
2363	144835
3047	132502
...	...
4338	110628
9238	53840
4798	104600

Lecture 14 – Bootstrapping and Confidence Intervals¶

DSC 10, Spring 2025¶

Agenda¶

Recap: Statistical inference¶

City of San Diego employee salary data¶

The median salary¶

Let's be realistic...¶

Terminology recap¶

The sample median¶

How confident are we that this is a good estimate?¶

An impractical approach¶

The problem¶

Bootstrapping 🥾¶

Bootstrapping¶

To replace or not replace?¶

Bootstrapping the sample of salaries¶

Bootstrap distribution of the sample median¶

What's the point of bootstrapping?¶

Percentiles¶

Informal definition¶

Calculating percentiles¶

Confidence intervals¶

Using the bootstrapped distribution of sample medians¶

Using the bootstrapped distribution of sample medians¶

Endpoints of a 95% confidence interval¶

Finding the endpoints with np.percentile¶

Visualizing our 95% confidence interval¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Reflection¶

Summary, next time¶

Summary¶

Next time¶

	Year	EmployerType	EmployerName	DepartmentOrSubdivision	...	EmployerCounty	SpecialDistrictActivities	IncludesUnfundedLiability	SpecialDistrictType
0	2023	City	San Diego	Police	...	San Diego	NaN	False	NaN
1	2023	City	San Diego	Police	...	San Diego	NaN	False	NaN
2	2023	City	San Diego	Police	...	San Diego	NaN	False	NaN
...	...	...	...	...	...	...	...	...	...
13885	2023	City	San Diego	Transportation	...	San Diego	NaN	False	NaN
13886	2023	City	San Diego	Police	...	San Diego	NaN	False	NaN
13887	2023	City	San Diego	Public Utilities	...	San Diego	NaN	False	NaN

Lecture 14 – Bootstrapping and Confidence Intervals¶

DSC 10, Spring 2025¶

Agenda¶

Recap: Statistical inference¶

City of San Diego employee salary data¶

The median salary¶

Let's be realistic...¶

Terminology recap¶

The sample median¶

How confident are we that this is a good estimate?¶

An impractical approach¶

The problem¶

Bootstrapping 🥾¶

Bootstrapping¶

To replace or not replace?¶

Bootstrapping the sample of salaries¶

Bootstrap distribution of the sample median¶

What's the point of bootstrapping?¶

Percentiles¶

Informal definition¶

Calculating percentiles¶

Confidence intervals¶

Using the bootstrapped distribution of sample medians¶

Using the bootstrapped distribution of sample medians¶

Endpoints of a 95% confidence interval¶

Finding the endpoints with np.percentile¶

Visualizing our 95% confidence interval¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Reflection¶

Summary, next time¶

Summary¶

Next time¶

Concept Check ✅ – Answer at cc.dsc10.com¶