In [1]:

```
# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
set_matplotlib_formats("svg")
plt.style.use('ggplot')
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
```

- Homework 4 is due **tomorrow at 11:59PM**.
- The Midterm Project is due **Tuesday 11/1 at 11:59PM**. Use pair programming 👯. See this post for clarifications.
- The Midterm Exam is on **Friday 10/28 during lecture**. See this post for lots of details, including how to find your assigned seat, what to bring, and how to study.
- 10+ more weekly office hours are new this week!
- We take your responses to the Mid-Quarter Feedback Form seriously.

- Consider a random quantity with various possible values, each of which has some associated probability.

- A **probability distribution** is a description of:
  - All possible values of the quantity.
  - The theoretical probability of each value.

The distribution of a die roll is **uniform**, meaning that each outcome has the same probability of occurring.

In [2]:

```
die_faces = np.arange(1, 7, 1)
die = bpd.DataFrame().assign(face=die_faces)
die
```

Out[2]:

| | face |
|---|---|
| 0 | 1 |
| 1 | 2 |
| 2 | 3 |
| 3 | 4 |
| 4 | 5 |
| 5 | 6 |

In [3]:

```
bins = np.arange(0.5, 6.6, 1)
# Note that you can add titles to your visualizations, like this!
die.plot(kind='hist', y='face', bins=bins, density=True, ec='w',
         title='Probability Distribution of a Die Roll',
         figsize=(5, 3))
# You can also set the y-axis label with plt.ylabel
plt.ylabel('Probability');
```

- Unlike probability distributions, which are theoretical, **empirical distributions are based on observations**.

- Commonly, these observations are of repetitions of an experiment.

- An **empirical distribution** describes:
  - All observed values.
  - The proportion of observations in which each value occurred.

- Unlike probability distributions, empirical distributions represent what actually happened in practice.

- Let's simulate a roll by using `np.random.choice`.
  - Rolling a die = sampling with replacement.
  - If you roll a 4, you can roll a 4 again.

In [4]:

```
num_rolls = 25
many_rolls = np.random.choice(die_faces, num_rolls)
many_rolls
```

Out[4]:

array([5, 4, 3, ..., 1, 6, 1])

In [5]:

```
(bpd.DataFrame()
 .assign(face=many_rolls)
 .plot(kind='hist', y='face', bins=bins, density=True, ec='w',
       title=f'Empirical Distribution of {num_rolls} Dice Rolls',
       figsize=(5, 3))
)
plt.ylabel('Probability');
```

In [6]:

```
for num_rolls in [10, 50, 100, 500, 1000, 5000, 10000]:
    # Don't worry about how .sample works just yet – we'll cover it shortly
    (die.sample(n=num_rolls, replace=True)
     .plot(kind='hist', y='face', bins=bins, density=True, ec='w',
           title=f'Distribution of {num_rolls} Die Rolls',
           figsize=(8, 3))
    )
```

The **law of large numbers** states that if a chance experiment is repeated

- many times,

- independently, and

- under the same conditions,

then the **proportion** of times that an event occurs gets closer and closer to the **theoretical probability** of that event.

**For example:** As you roll a die repeatedly, the proportion of times you roll a 5 gets closer to $\frac{1}{6}$.
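We can see the law of large numbers in action with a quick simulation. This is a sketch using only NumPy; the exact proportions will vary from run to run, but the trend toward $\frac{1}{6}$ is reliable.

```python
import numpy as np

np.random.seed(42)  # for reproducibility; any seed works

# Roll a fair die more and more times, tracking the proportion of 5s.
for n in [100, 10_000, 1_000_000]:
    rolls = np.random.choice(np.arange(1, 7), n)
    prop = np.count_nonzero(rolls == 5) / n
    print(f'{n:>9} rolls: proportion of 5s = {prop:.4f}')

# As n grows, the proportion approaches the theoretical
# probability 1/6 ≈ 0.1667.
```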

- A **population** is the complete group of people, objects, or events that we want to learn something about.

- It's often infeasible to collect information about every member of a population.

- Instead, we can collect a **sample**, which is a subset of the population.

**Goal**: estimate the distribution of some numerical variable in the population, using only a sample.

- For example, say we want to know the number of credits each UCSD student is taking this quarter.
- It's too hard to get this information for every UCSD student – we can't find the **population distribution**.
- Instead, we can collect data from a subset of UCSD students, to compute a **sample distribution**.

**Question:** How do we collect a good sample, so that the sample distribution closely approximates the population distribution?

**Bad idea ❌:** Survey whoever you can get ahold of (e.g. internet survey, people in line at Panda Express at PC).

- Such a sample is known as a convenience sample.
- Convenience samples often contain hidden sources of **bias**.

- In order for a sample to be a probability sample, you **must be able to calculate the probability of selecting any subset of the population**.

- Not all individuals need to have an equal chance of being selected.

In [7]:

```
top = bpd.read_csv('data/top_movies.csv')
top
```

Out[7]:

| | Title | Studio | Gross | Gross (Adjusted) | Year |
|---|---|---|---|---|---|
| 0 | Star Wars: The Force Awakens | Buena Vista (Disney) | 906723418 | 906723400 | 2015 |
| 1 | Avatar | Fox | 760507625 | 846120800 | 2009 |
| 2 | Titanic | Paramount | 658672302 | 1178627900 | 1997 |
| ... | ... | ... | ... | ... | ... |
| 197 | Duel in the Sun | Selz. | 20408163 | 443877500 | 1946 |
| 198 | Sergeant York | Warner Bros. | 16361885 | 418671800 | 1941 |
| 199 | The Four Horsemen of the Apocalypse | MPC | 9183673 | 399489800 | 1921 |

200 rows × 5 columns

**Scheme:** Start with a random row number between 0 and 9, then take every tenth row thereafter. **This is a probability sample!**

- Any given row is equally likely to be picked, with probability $\frac{1}{10}$.
- It is **not** true that every subset of rows has the same probability of being selected.
  - There are only 10 possible samples: rows (0, 10, 20, 30, ..., 190), rows (1, 11, 21, ..., 191), and so on.
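We can verify the claim that each row is picked with probability $\frac{1}{10}$ by enumerating all 10 possible samples directly (a sketch using only NumPy; 200 is the number of rows in the table):

```python
import numpy as np

n_rows = 200  # size of the top_movies table

# Enumerate every possible systematic sample: one per starting row 0–9.
all_samples = [np.arange(start, n_rows, 10) for start in range(10)]

# Each sample contains 20 rows, and every row of the table appears in
# exactly one of the 10 samples – so each row is selected with
# probability 1/10.
assert all(len(s) == 20 for s in all_samples)
assert sorted(np.concatenate(all_samples)) == list(range(n_rows))
```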

In [8]:

```
start = np.random.choice(np.arange(10))
top.take(np.arange(start, 200, 10))
```

Out[8]:

| | Title | Studio | Gross | Gross (Adjusted) | Year |
|---|---|---|---|---|---|
| 7 | Star Wars | Fox | 460998007 | 1549640500 | 1977 |
| 17 | The Hunger Games | Lionsgate | 408010692 | 442510400 | 2012 |
| 27 | The Passion of the Christ | NM | 370782930 | 519432100 | 2004 |
| ... | ... | ... | ... | ... | ... |
| 177 | Cleopatra (1963) | Fox | 57777778 | 584496100 | 1963 |
| 187 | Swiss Family Robinson | Disney | 40356000 | 468129600 | 1960 |
| 197 | Duel in the Sun | Selz. | 20408163 | 443877500 | 1946 |

20 rows × 5 columns

- A **simple random sample (SRS)** is a sample drawn **uniformly** at random **without** replacement.

- In an SRS...
  - Every individual has the same chance of being selected.
  - Every pair has the same chance of being selected.
  - Every triplet has the same chance of being selected.
  - And so on...

- To perform an SRS from a list or array `options`, we use `np.random.choice(options, replace=False)`.
  - If we use `replace=True`, then we're **sampling uniformly at random with replacement** – there's no simpler term for this.

If we want to sample rows from a DataFrame, we can use the `.sample` method on a DataFrame. That is, `df.sample(n)` returns a random subset of `n` rows of `df`, drawn **without replacement** (i.e. the default is `replace=False`, unlike `np.random.choice`).

In [9]:

```
# Without replacement
top.sample(5)
```

Out[9]:

| | Title | Studio | Gross | Gross (Adjusted) | Year |
|---|---|---|---|---|---|
| 4 | Marvel's The Avengers | Buena Vista (Disney) | 623357910 | 668866600 | 2012 |
| 78 | Toy Story 2 | Buena Vista (Disney) | 245852179 | 416177700 | 1999 |
| 177 | Cleopatra (1963) | Fox | 57777778 | 584496100 | 1963 |
| 166 | Pinocchio | Disney | 84254167 | 586409000 | 1940 |
| 42 | Iron Man | Paramount | 318412101 | 385808100 | 2008 |

In [10]:

```
# With replacement
top.sample(5, replace=True)
```

Out[10]:

| | Title | Studio | Gross | Gross (Adjusted) | Year |
|---|---|---|---|---|---|
| 163 | Peter Pan | Disney | 87404651 | 396924700 | 1953 |
| 177 | Cleopatra (1963) | Fox | 57777778 | 584496100 | 1963 |
| 178 | 2001: A Space Odyssey | MGM | 56954992 | 377027700 | 1968 |
| 167 | M.A.S.H. | Fox | 81600000 | 467052600 | 1970 |
| 78 | Toy Story 2 | Buena Vista (Disney) | 245852179 | 416177700 | 1999 |

- The law of large numbers states that when we repeat a chance experiment more and more times, the empirical distribution will look more and more like the true probability distribution.

**Similarly, if we take a large simple random sample, then the sample distribution is likely to be a good approximation of the true population distribution.**

`united_full` contains information about all United flights leaving SFO between 6/1/15 and 8/31/15.

In [11]:

```
united_full = bpd.read_csv('data/united_summer2015.csv')
united_full
```

Out[11]:

| | Date | Flight Number | Destination | Delay |
|---|---|---|---|---|
| 0 | 6/1/15 | 73 | HNL | 257 |
| 1 | 6/1/15 | 217 | EWR | 28 |
| 2 | 6/1/15 | 237 | STL | -3 |
| ... | ... | ... | ... | ... |
| 13822 | 8/31/15 | 1994 | ORD | 3 |
| 13823 | 8/31/15 | 2000 | PHX | -1 |
| 13824 | 8/31/15 | 2013 | EWR | -2 |

13825 rows × 4 columns

We only need the `'Delay'`s, so let's select just that column.

In [12]:

```
united = united_full.get(['Delay'])
united
```

Out[12]:

| | Delay |
|---|---|
| 0 | 257 |
| 1 | 28 |
| 2 | -3 |
| ... | ... |
| 13822 | 3 |
| 13823 | -1 |
| 13824 | -2 |

13825 rows × 1 columns

In [13]:

```
bins = np.arange(-20, 300, 10)
united.plot(kind='hist', y='Delay', bins=bins, density=True, ec='w',
            title='Population Distribution of Flight Delays', figsize=(8, 3))
plt.ylabel('Proportion per minute');
```

Note that this distribution is **fixed** – nothing about it is random.

- The 13825 flight delays in `united` constitute our population.
- Normally, we won't have access to the entire population.
- To replicate a real-world scenario, we will sample from `united` **without replacement**.

In [14]:

```
# Sample distribution
sample_size = 100
(united
 .sample(sample_size)
 .plot(kind='hist', y='Delay', bins=bins, density=True, ec='w',
       title='Sample Distribution of Flight Delays',
       figsize=(8, 3))
);
```

Note that as we increase `sample_size`, the sample distribution of delays looks more and more like the true population distribution of delays.

**Statistical inference** is the practice of making conclusions about a population, using data from a random sample.

- **Parameter**: A number associated with the population.
  - Example: The population mean.

- **Statistic**: A number calculated from the sample.
  - Example: The sample mean.

- A statistic can be used as an **estimate** for a parameter.

*To remember: parameter and population both start with p, statistic and sample both start with s.*

**Question:** What is the average delay of United flights out of SFO? 🤔

- We'd love to know the **mean delay in the population (parameter)**, but in practice we'll only have a **sample**.

- How does the **mean delay in the sample (statistic)** compare to the **mean delay in the population (parameter)**?

The **population mean** is a **parameter**.

In [15]:

```
# Calculate the mean of the population
united_mean = united.get('Delay').mean()
united_mean
```

Out[15]:

16.658155515370705

This number (like the population distribution) is fixed, and is not random. In reality, we would not be able to see this number – we can only see it right now because this is a pedagogical demonstration!

The **sample mean** is a **statistic**. Since it depends on our sample, which was drawn at random, the sample mean is **also random**.

In [16]:

```
# Size 100
united.sample(100).get('Delay').mean()
```

Out[16]:

14.68

- Each time we run the cell above, we are:
  - Collecting a new sample of size 100 from the population, and
  - Computing the sample mean.

- We see a slightly different value on each run of the cell.
  - Sometimes, the sample mean is close to the population mean.
  - Sometimes, it's far away from the population mean.

What if we choose a larger sample size?

In [17]:

```
# Size 1000
united.sample(1000).get('Delay').mean()
```

Out[17]:

16.276

- Each time we run this cell, the result is still slightly different.
- However, the results seem to be much closer together – and much closer to the true population mean – than when we used a sample size of 100.
- **In general**, statistics computed on larger samples tend to be more accurate than statistics computed on smaller samples.
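The effect of sample size on accuracy can be quantified by comparing how spread out the sample means are for two different sample sizes. This sketch uses a synthetic, right-skewed population as a stand-in for the flight delays (the specific numbers are made up; only the comparison matters):

```python
import numpy as np

np.random.seed(0)  # for reproducibility

# A synthetic, right-skewed "population" of delays (a stand-in for the
# real flight data; the actual delays aren't needed to see the effect).
population = np.random.exponential(scale=17, size=13825)

reps = 500
for n in [100, 1000]:
    means = np.array([np.random.choice(population, n, replace=False).mean()
                      for _ in range(reps)])
    print(f'n = {n:>4}: sample means have standard deviation {means.std():.2f}')

# Larger samples → sample means cluster more tightly around the
# population mean (spread shrinks roughly by a factor of sqrt(10) here).
```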


- The value of a statistic, e.g. the sample mean, is random, because it depends on a random sample.

- Like other random quantities, we can study the "probability distribution" of the statistic (also known as its "sampling distribution").
  - This describes all possible values of the statistic and all the corresponding probabilities.
  - Why? **We want to know how different our statistic *could have* been**, had we collected a different sample.

- Unfortunately, this can be hard to calculate exactly.
  - Option 1: Do the math by hand.
  - Option 2: Generate **all** possible samples and calculate the statistic on each sample.

- So we'll use simulation again to approximate:
  - Generate **a lot of** possible samples and calculate the statistic on each sample.
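For a tiny population, Option 2 is actually feasible. As a toy illustration (hypothetical numbers, not the flight data), here is the exact sampling distribution of the mean of an SRS of size 2 from a 4-element population:

```python
from itertools import combinations
from collections import Counter

population = [1, 3, 5, 7]  # a toy population, small enough to enumerate
n = 2                      # sample size

# Every possible SRS of size 2 is equally likely, so the exact sampling
# distribution of the mean just counts each mean's share of the samples.
means = [sum(s) / n for s in combinations(population, n)]
dist = {m: c / len(means) for m, c in sorted(Counter(means).items())}
print(dist)  # e.g. a mean of 4 occurs in 2 of the 6 possible samples
```

With a population of 13825 flights and samples of size 1000, the number of possible samples is astronomically large, which is why we fall back on simulation.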

- The empirical distribution of a statistic is based on simulated values of the statistic. It describes
  - all the observed values of the statistic, and
  - the proportion of times each value appeared.

- The empirical distribution of a statistic can be a good approximation to the probability distribution of the statistic, **if the number of repetitions in the simulation is large**.

- Let's...
  - Repeatedly draw a bunch of samples.
  - Record the mean of each.
  - Draw a histogram of the resulting distribution.

- Try different sample sizes and look at the resulting histogram!

In [18]:

```
# Sample one thousand flights, two thousand times
sample_size = 1000
repetitions = 2000
sample_means = np.array([])

for n in np.arange(repetitions):
    m = united.sample(sample_size).get('Delay').mean()
    sample_means = np.append(sample_means, m)

bpd.DataFrame().assign(sample_means=sample_means) \
   .plot(kind='hist', bins=np.arange(10, 25, 0.5), density=True, ec='w',
         title=f'Distribution of Sample Mean with Sample Size {sample_size}',
         figsize=(10, 5));

plt.axvline(x=united_mean, c='black');
```

- In practice, we will only be able to collect one sample and calculate one statistic.
  - Sometimes, that sample will be very representative of the population, and the statistic will be very close to the parameter we are trying to estimate.
  - Other times, that sample will not be as representative of the population, and the statistic will not be very close to the parameter we are trying to estimate.

- The empirical distribution of the sample mean helps us answer the question "**what would the sample mean have looked like if we drew a different sample?**"

We just sampled **one thousand** flights, two thousand times. If we now sample **one hundred** flights, two thousand times, how will the histogram change?

- A. narrower
- B. wider
- C. shifted left
- D. shifted right
- E. unchanged

- So far, we've taken large simple random samples, **without replacement**, from the full population.
  - If the population is large enough, then it doesn't really matter if we sample with or without replacement.

- The sample mean, for samples like this, is a good approximation of the population mean.

- But this is not always the case if we sample differently.
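To see why replacement barely matters for a large population, we can compare the two sampling schemes side by side. This sketch uses a synthetic population (the numbers are made up; only the comparison matters):

```python
import numpy as np

np.random.seed(1)  # for reproducibility

# A large synthetic population (a stand-in for real data).
population = np.random.normal(loc=10, scale=4, size=100_000)

n = 1000
mean_without = np.random.choice(population, n, replace=False).mean()
mean_with = np.random.choice(population, n, replace=True).mean()

# With a population this large relative to the sample size, both
# schemes give nearly identical estimates of the population mean,
# because repeats are rare when sampling with replacement.
print(mean_without, mean_with)
```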

- The **probability distribution** of a random quantity describes the values it takes on, along with the probability of each value occurring.
- An **empirical distribution** describes the values and frequencies of the results of a random experiment.
  - With more trials of an experiment, the empirical distribution gets closer to the probability distribution.
- A **population distribution** describes the values and frequencies of some characteristic of a population.
- A **sample distribution** describes the values and frequencies of some characteristic of a sample, which is a subset of a population.
  - When we take a simple random sample, as we increase our sample size, the sample distribution gets closer and closer to the population distribution.
- A **parameter** is a number associated with a **population**, and a **statistic** is a number associated with a **sample**.
- We can use statistics calculated on random samples to **estimate** population parameters.
  - For example, to estimate the mean of a population, we can calculate the mean of the sample.
  - Larger samples tend to lead to better estimates.

Next, we'll start talking about statistical models, which will lead us towards **hypothesis testing**.