# Run this cell to set up packages for lecture.
from lec09_imports import *

# A six-sided die: its faces 1 through 6, stored in a one-column DataFrame.
die_faces = np.arange(1, 7)
die = bpd.DataFrame().assign(face=die_faces)
die

# Bin edges sit on the half-integers (0.5, 1.5, ..., 6.5), so every face
# gets its own bar of width 1.
bins = np.arange(7) + 0.5

# A title can be passed directly to .plot; the y-axis label is set
# afterwards with plt.ylabel.
die.plot(
    kind='hist',
    y='face',
    bins=bins,
    density=True,
    ec='w',
    figsize=(5, 3),
    title='Probability Distribution of a Die Roll',
)
plt.ylabel('Probability');

# Simulate rolling the die num_rolls times by drawing faces at random.
# np.random.choice samples with replacement by default, so the same face
# can come up more than once.
num_rolls = 25
many_rolls = np.random.choice(die_faces, size=num_rolls)
many_rolls

array([2, 1, 1, ..., 5, 1, 4])

# Histogram of the simulated rolls, drawn with the bins currently stored in
# `bins` so it lines up with the probability distribution plotted earlier.
rolls_df = bpd.DataFrame().assign(face=many_rolls)
rolls_df.plot(
    kind='hist', y='face', bins=bins, density=True, ec='w',
    figsize=(5, 3),
    title=f'Empirical Distribution of {num_rolls} Dice Rolls',
)
plt.ylabel('Probability');

# Plot the empirical distribution of the die's faces for increasingly large
# numbers of rolls, one figure per size.
#
# The loop variable is deliberately NOT named num_rolls: reusing that name
# would leave num_rolls at 10000 once the loop ends, so re-running the
# 25-roll histogram cell afterwards would print a title that no longer
# matches the data in many_rolls.
for roll_count in [10, 50, 100, 500, 1000, 5000, 10000]:
    # Don't worry about how .sample works just yet – we'll cover it shortly.
    (die.sample(n=roll_count, replace=True)
     .plot(kind='hist', y='face', bins=bins, density=True, ec='w',
           title=f'Distribution of {roll_count} Die Rolls',
           figsize=(8, 3))
    )
    # density=True makes bar heights proportions rather than counts, so label
    # the y-axis the same way as the other die histograms.
    plt.ylabel('Probability')

# The colleges to sample from, as an array of names.
college_names = (
    'Revelle',
    'John Muir',
    'Thurgood Marshall',
    'Earl Warren',
    'Eleanor Roosevelt',
    'Sixth',
    'Seventh',
    'Eighth',
)
colleges = np.array(college_names)

# Simple random sample of 3 colleges: drawn without replacement, so no
# college can be selected twice.
np.random.choice(colleges, size=3, replace=False)

array(['Eighth', 'Sixth', 'Earl Warren'], dtype='<U17')

# Load the full flights table from CSV and display it.
united_full = bpd.read_csv('data/united_summer2015.csv')
united_full

# General form of the call for sampling n rows from a DataFrame.
# NOTE(review): this looks like a syntax template rather than runnable code –
# no DataFrame named df is created anywhere in the visible code.
df.sample(n)

# 5 flights, chosen randomly without replacement.
united_full.sample(5)

# 5 flights, chosen randomly with replacement.
united_full.sample(5, replace=True)

# Keep only the 'Delay' column. Passing a list to .get gives back a
# one-column DataFrame, which is what the plots and samples below work on.
united = united_full.get(['Delay'])
united

# Bins 10 minutes wide, with edges running from -20 up to 290.
bins = np.arange(-20, 300, 10)

# Histogram of delays for every flight in the population.
united.plot(
    kind='hist', y='Delay', bins=bins, density=True, ec='w',
    figsize=(8, 3),
    title='Population Distribution of Flight Delays',
)
plt.ylabel('Proportion per minute');

# Histogram of delays in a single random sample, drawn with the same bins
# so it can be compared against the population histogram.
sample_size = 100 # Change this and see what happens!
delay_sample = united.sample(sample_size)
delay_sample.plot(
    kind='hist', y='Delay', bins=bins, density=True, ec='w',
    figsize=(8, 3),
    title=f'Distribution of Flight Delays in a Sample of Size {sample_size}',
);

# Calculate the mean of the population.
# .get('Delay') pulls the column out as a Series and .mean() averages it.
# The value is stored because it is drawn as a reference line on a later plot.
united_mean = united.get('Delay').mean()
united_mean

16.658155515370705

# Size 100.
# Mean delay in one random sample of 100 flights. No seed is set in this
# cell, so a different sample (and mean) is expected on each run.
united.sample(100).get('Delay').mean()

13.22

# Size 1000.
# Same computation as for size 100, but on a sample ten times larger, so the
# two sample means can be compared against the population mean.
united.sample(1000).get('Delay').mean()

15.624

%%capture
# The %%capture cell magic above must remain the first line of its cell; it
# suppresses the output this cell would otherwise display.
# sampling_animation is not defined in the visible code.
# NOTE(review): presumably it comes from the lecture imports and animates
# repeated samples of size 1000 drawn from united – confirm against the helper.
# It returns a pair; anim_means is not used anywhere else in the visible code.
anim, anim_means = sampling_animation(united, 1000);

# Convert the animation to JavaScript-backed HTML and display it.
HTML(anim.to_jshtml())

# Sample one thousand flights, two thousand times, recording the mean delay
# of each sample as we go.
sample_size = 1000
repetitions = 2000
sample_means = np.array([])

for _ in np.arange(repetitions):
    delays = united.sample(sample_size).get('Delay')
    sample_means = np.append(sample_means, delays.mean())

# Histogram of all the recorded sample means.
means_df = bpd.DataFrame().assign(sample_means=sample_means)
means_df.plot(
    kind='hist', bins=np.arange(10, 25, 0.5), density=True, ec='w',
    figsize=(10, 5),
    title=f'Distribution of Sample Mean with Sample Size {sample_size}',
);

# Mark the population mean with a thick vertical line for reference.
plt.axvline(x=united_mean, c='black', linewidth=4, label='population mean')
plt.legend();

# Load the salary records from CSV. This full table plays the role of the
# population in the rest of the file.
population = bpd.read_csv('data/2023_salaries.csv')
population

# List the column names to see which fields are available.
population.columns

Index(['Year', 'EmployerType', 'EmployerName', 'DepartmentOrSubdivision',
       'Position', 'ElectedOfficial', 'Judicial', 'OtherPositions',
       'MinPositionSalary', 'MaxPositionSalary', 'ReportedBaseWage',
       'RegularPay', 'OvertimePay', 'LumpSumPay', 'OtherPay', 'TotalWages',
       'DefinedBenefitPlanContribution', 'EmployeesRetirementCostCovered',
       'DeferredCompensationPlan', 'HealthDentalVision',
       'TotalRetirementAndHealthContribution', 'PensionFormula', 'EmployerURL',
       'EmployerPopulation', 'LastUpdatedDate', 'EmployerCounty',
       'SpecialDistrictActivities', 'IncludesUnfundedLiability',
       'SpecialDistrictType'],
      dtype='object')

# From here on only the TotalWages column is needed, so drop the rest.
population = population.get(['TotalWages'])
population

# Histogram of total wages across the whole population, in bins of 10,000.
population.plot(
    kind='hist', bins=np.arange(0, 500_000, 10_000), density=True, ec='w',
    title='Distribution of Total Wages of San Diego City Employees in 2023',
    figsize=(10, 5),
);

# The population parameter of interest: the median of TotalWages.
population_median = population.get('TotalWages').median()
population_median

80492.0

# NOTE(review): presumably .sample draws from NumPy's global random state,
# which is why seeding it here pins down the rows chosen below – confirm.
np.random.seed(38) # Magic to ensure that we get the same results every time this code is run.

# Take a sample of size 500.
# This single sample is what the bootstrap later in the file resamples from.
my_sample = population.sample(500)
my_sample

# Compute the sample median.
# This is the statistic: the sample's counterpart to population_median.
sample_median = my_sample.get('TotalWages').median()
sample_median

82508.0

# Draw 1000 fresh samples of size 500 straight from the population and
# record the median of each one.
sample_medians = np.array([])
for _ in np.arange(1000):
    wages = population.sample(500).get('TotalWages')
    sample_medians = np.append(sample_medians, wages.median())
sample_medians

array([82603.5, 84498. , 77594.5, ..., 83471.5, 84897.5, 75602. ])

# Histogram of the sample medians collected from the repeated samples.
medians_df = bpd.DataFrame().assign(SampleMedians=sample_medians)
medians_df.plot(
    kind='hist', density=True, bins=30, ec='w',
    figsize=(8, 5),
    title='Distribution of the Sample Median of 1000 Samples from the Population\nSample Size = 500',
);

# Overlay the population and the sample on one set of axes to compare shapes.
fig, ax = plt.subplots(figsize=(10, 5))
bins = np.arange(10_000, 300_000, 10_000)

# Both histograms use exactly the same settings; only the data differs.
# The per-plot legends are switched off and a single legend is added by hand.
shared_style = dict(
    kind='hist', y='TotalWages', ax=ax, density=True,
    alpha=.75, bins=bins, ec='w', legend=False,
)
population.plot(**shared_style)
my_sample.plot(**shared_style)
ax.legend(['Population', 'My Sample']);

# show_bootstrapping_slides is not defined in the visible code.
# NOTE(review): presumably it comes from the lecture imports and displays
# slides introducing bootstrapping – confirm against the helper.
show_bootstrapping_slides()

# Resample from the original values WITHOUT replacement, ten times over.
# Each resample is only a reordering of the same three values, so the
# median comes out the same every time.
original = [7, 9, 4]
for _ in np.arange(10):
    resample = np.random.choice(original, size=len(original), replace=False)
    resample_median = np.median(resample)
    print("Resample: ", resample, "    Median: ", resample_median)

Resample:  [9 4 7]     Median:  7.0
Resample:  [7 4 9]     Median:  7.0
Resample:  [4 7 9]     Median:  7.0
Resample:  [4 9 7]     Median:  7.0
Resample:  [7 4 9]     Median:  7.0
Resample:  [9 7 4]     Median:  7.0
Resample:  [4 7 9]     Median:  7.0
Resample:  [7 9 4]     Median:  7.0
Resample:  [9 4 7]     Median:  7.0
Resample:  [4 9 7]     Median:  7.0

# Resample from the original values WITH replacement, ten times over.
# Values can now repeat or be left out, so the median varies between
# resamples.
original = [7, 9, 4]
for _ in np.arange(10):
    resample = np.random.choice(original, size=len(original), replace=True)
    resample_median = np.median(resample)
    print("Resample: ", resample, "    Median: ", resample_median)

Resample:  [7 4 7]     Median:  7.0
Resample:  [9 4 9]     Median:  9.0
Resample:  [9 9 9]     Median:  9.0
Resample:  [9 4 9]     Median:  9.0
Resample:  [4 4 9]     Median:  4.0
Resample:  [9 9 7]     Median:  9.0
Resample:  [4 7 4]     Median:  4.0
Resample:  [9 4 7]     Median:  7.0
Resample:  [4 9 4]     Median:  4.0
Resample:  [7 4 4]     Median:  4.0

# Note that the population DataFrame, population, doesn't appear anywhere here.
# This is all based on one sample, my_sample.

np.random.seed(38) # Magic to ensure that we get the same results every time this code is run.

n_resamples = 5000
boot_medians = np.array([])

# A bootstrap resample must be exactly as large as the sample it is drawn
# from. Read that size from my_sample rather than hard-coding 500, so this
# cell stays correct if the size used to create my_sample is ever changed.
original_size = my_sample.shape[0]

# np.arange is used here to match every other loop in this file.
for i in np.arange(n_resamples):

    # Resample from my_sample WITH REPLACEMENT.
    resample = my_sample.sample(original_size, replace=True)

    # Compute the median.
    median = resample.get('TotalWages').median()

    # Store it in our array of medians.
    boot_medians = np.append(boot_medians, median)

boot_medians

array([85751. , 76009. , 83106. , ..., 82760. , 83470.5, 82711. ])

# Histogram of the bootstrapped medians.
boot_df = bpd.DataFrame().assign(BootstrapMedians=boot_medians)
boot_df.plot(
    kind='hist', density=True, bins=np.arange(65000, 95000, 1000),
    ec='w', figsize=(10, 5),
)

# Mark the population median with a dot just above the x-axis; raising its
# z-order is meant to keep it drawn on top of the bars.
median_dot = plt.scatter(population_median, 0.000004, color='blue', s=100, label='population median')
median_dot.set_zorder(2)
plt.legend();

# The median of the one original sample, recomputed here so it can be
# compared with the bootstrapped medians.
my_sample.get('TotalWages').median()

82508.0

# The same bootstrap histogram again, this time without the population
# median marked on it.
boot_df = bpd.DataFrame().assign(BootstrapMedians=boot_medians)
boot_df.plot(
    kind='hist', density=True, bins=np.arange(65000, 95000, 1000),
    ec='w', figsize=(10, 5),
)
plt.legend();

	Date	Flight Number	Destination	Delay
0	6/1/15	73	HNL	257
1	6/1/15	217	EWR	28
2	6/1/15	237	STL	-3
...	...	...	...	...
13822	8/31/15	1994	ORD	3
13823	8/31/15	2000	PHX	-1
13824	8/31/15	2013	EWR	-2

	Date	Flight Number	Destination	Delay
4078	6/28/15	390	ORD	1
10603	8/10/15	611	SEA	19
6627	7/15/15	884	ORD	1
8354	7/26/15	1670	HNL	17
12817	8/24/15	1994	ORD	14

	Date	Flight Number	Destination	Delay
2178	6/15/15	1257	ANC	85
8112	7/24/15	1958	EWR	7
12450	8/22/15	720	DEN	-2
1759	6/12/15	1563	ORD	59
12065	8/19/15	1641	IAD	32

	Delay
0	257
1	28
2	-3
...	...
13822	3
13823	-1
13824	-2

	TotalWages
0	433011
1	416044
2	405315
...	...
13885	10
13886	8
13887	2

	Year	EmployerType	EmployerName	DepartmentOrSubdivision	...	EmployerCounty	SpecialDistrictActivities	IncludesUnfundedLiability	SpecialDistrictType
0	2023	City	San Diego	Police	...	San Diego	NaN	False	NaN
1	2023	City	San Diego	Police	...	San Diego	NaN	False	NaN
2	2023	City	San Diego	Police	...	San Diego	NaN	False	NaN
...	...	...	...	...	...	...	...	...	...
13885	2023	City	San Diego	Transportation	...	San Diego	NaN	False	NaN
13886	2023	City	San Diego	Police	...	San Diego	NaN	False	NaN
13887	2023	City	San Diego	Public Utilities	...	San Diego	NaN	False	NaN

	TotalWages
4091	113944
2363	144835
3047	132502
...	...
4338	110628
9238	53840
4798	104600

	face
0	1
1	2
2	3
3	4
4	5
5	6

Lecture 9 – Distributions and Sampling¶

DSC 10, Summer 2025¶

Agenda¶

Probability distributions vs. empirical distributions¶

Probability distributions¶

Example: Probability distribution of a die roll 🎲¶

Empirical distributions¶

Example: Empirical distribution of a die roll 🎲¶

Many die rolls 🎲¶

Why does this happen? ⚖️¶

Sampling¶

Populations and samples¶

Sampling strategies¶

Simple random sample¶

Sampling from a list or array¶

Example: Distribution of flight delays ✈️¶

Sampling rows from a DataFrame¶

The effect of sample size¶

Population distribution of flight delays ✈️¶

Sample distribution of flight delays ✈️¶

Parameters and statistics¶

Terminology¶

Mean flight delay ✈️¶

Population mean¶

Sample mean¶

The effect of sample size¶

Probability distribution of a statistic¶

Empirical distribution of a statistic¶

Distribution of sample means¶

What's the point?¶

Does sample size matter?¶

Concept Check ✅ – Answer at cc.dsc10.com¶

How we sample matters!¶

In summary:¶

But how do we balance needing large samples with the challenges associated with collecting a very large sample?¶

Bootstrapping!¶

Example: City of San Diego employee salary data¶

The median salary¶

Let's be realistic...¶

Quick terminology recap:¶

The sample median¶

How confident are we that this is a good estimate?¶

An impractical approach¶

Problem: Drawing new samples like this is impractical – we usually can't just ask for new samples from the population.¶

Bootstrapping 🥾¶

To replace or not replace?¶

Bootstrapping the sample of salaries¶

Bootstrap distribution of the sample median¶

What's the point of bootstrapping?¶

Concept Check ✅ – Answer at cc.dsc10.com ¶