# Set up packages for lecture. Don't worry about understanding this code,
# but make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
set_matplotlib_formats("svg")
plt.style.use('ggplot')
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
# Animations
import ipywidgets as widgets
from IPython.display import display, HTML
delays = bpd.read_csv('data/delays.csv')
delays.plot(kind='hist', y='Delay', bins=np.arange(-20.5, 210, 5), density=True, ec='w', figsize=(10, 5))
plt.title('Flight Delays')
plt.xlabel('Delay (minutes)');
Question: Which is larger – the mean or the median?
delays.get('Delay').mean()
16.658155515370705
delays.get('Delay').median()
2.0
delays.plot(kind='hist', y='Delay', bins=np.arange(-20.5, 210, 5), density=True, ec='w', alpha=0.65, figsize=(10, 5))
plt.plot([delays.get('Delay').mean(), delays.get('Delay').mean()], [0, 1], color='green', label='Mean', linewidth=2)
plt.scatter([delays.get('Delay').mean()], [-0.0017], color='green', marker='^', s=250)
plt.plot([delays.get('Delay').median(), delays.get('Delay').median()], [0, 1], color='purple', label='Median', linewidth=2)
plt.title('Flight Delays')
plt.xlabel('Delay (minutes)')
plt.ylim(-0.005, 0.065)
plt.legend();
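The mean (about 16.7 minutes) is much larger than the median (2 minutes) because the distribution is skewed right: a handful of very long delays pull the mean up but leave the median unchanged. As a quick check, here is a sketch (using the same filtering pattern we'll use later in the lecture) of the proportion of flights delayed less than the mean; for a symmetric distribution this would be about 0.5, but here it is well above one half.
# Flights whose delay is below the mean delay.
below_mean = delays[delays.get('Delay') < delays.get('Delay').mean()]
below_mean.shape[0] / delays.shape[0]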
data = np.array([2, 3, 3, 9])
np.mean(data)
4.25
deviations = data - np.mean(data)
deviations
array([-2.25, -1.25, -1.25, 4.75])
Each entry in deviations describes how far the corresponding element in data is from 4.25 (the mean).
What is the average deviation?
np.mean(deviations)
0.0
# The average deviation is always 0, so it can't measure spread.
# Instead, square all the deviations:
deviations ** 2
array([ 5.06, 1.56, 1.56, 22.56])
variance = np.mean(deviations ** 2)
variance
7.6875
This quantity, the average squared deviation from the mean, is called the variance.
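As a sanity check, numpy's np.var function computes this same quantity directly (by default it divides by $n$, matching the calculation above).
# Should match the variance we computed by hand.
np.var(data)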
# Standard deviation (SD) is the square root of the variance.
sd = variance ** 0.5
sd
2.7726341266023544
numpy has a function, np.std, that calculates the standard deviation for us.
# Note that this evaluates to the same number we found on the previous slide.
np.std(data)
2.7726341266023544
To summarize:
$$\begin{align*}\text{variance} &= \text{average squared deviation from the mean}\\ &= \frac{(\text{value}_1 - \text{mean})^2 + ... + (\text{value}_n - \text{mean})^2}{n}\\ \text{standard deviation} &= \sqrt{\text{variance}} \end{align*}$$
where $n$ is the number of observations.
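As a sketch, these formulas translate directly into code. The helper names variance_of and standard_deviation_of below are chosen just for illustration; they should agree with numpy's np.var and np.std.
def variance_of(values):
    # Average squared deviation from the mean.
    deviations = values - np.mean(values)
    return np.mean(deviations ** 2)

def standard_deviation_of(values):
    # The square root of the variance.
    return variance_of(values) ** 0.5

# Should match np.std(data) from above.
standard_deviation_of(data)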
It turns out, in any numerical distribution, the bulk of the data are in the range “mean ± a few SDs”.
Let's make this more precise.
Fact (Chebyshev's inequality): In any numerical distribution, the proportion of values in the range “mean ± $z$ SDs” is at least
$$1 - \frac{1}{z^2}$$

| Range | Proportion |
| --- | --- |
| mean ± 2 SDs | at least $1 - \frac{1}{4}$ (75%) |
| mean ± 3 SDs | at least $1 - \frac{1}{9}$ (88.88..%) |
| mean ± 4 SDs | at least $1 - \frac{1}{16}$ (93.75%) |
| mean ± 5 SDs | at least $1 - \frac{1}{25}$ (96%) |
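The bounds in this table come straight from the formula; here is a quick sketch that recomputes them (chebyshev_bound is a helper name used just for illustration).
def chebyshev_bound(z):
    # Lower bound on the proportion of values within z SDs of the mean.
    return 1 - 1 / z ** 2

# Bounds for z = 2, 3, 4, 5; should match the table above.
[chebyshev_bound(z) for z in [2, 3, 4, 5]]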
delays.plot(kind='hist', y='Delay', bins=np.arange(-20.5, 210, 5), density=True, ec='w', figsize=(10, 5), title='Flight Delays')
plt.xlabel('Delay (minutes)');
delay_mean = delays.get('Delay').mean()
delay_mean
16.658155515370705
delay_std = np.std(delays.get('Delay')) # There is no .std() method in babypandas!
delay_std
39.480199851609314
Chebyshev's inequality tells us that at least 75% of delays fall within 2 SDs of the mean, and at least 88.88% fall within 3 SDs of the mean. Let's compute those intervals.
delay_mean - 2 * delay_std, delay_mean + 2 * delay_std
(-62.30224418784792, 95.61855521858934)
delay_mean - 3 * delay_std, delay_mean + 3 * delay_std
(-101.78244403945723, 135.09875507019865)
Let's visualize these intervals!
delays.plot(kind='hist', y='Delay', bins=np.arange(-20.5, 210, 5), density=True, alpha=0.65, ec='w', figsize=(10, 5), title='Flight Delays')
plt.axvline(delay_mean - 2 * delay_std, color='maroon', label='± 2 SD')
plt.axvline(delay_mean + 2 * delay_std, color='maroon')
plt.axvline(delay_mean + 3 * delay_std, color='blue', label='± 3 SD')
plt.axvline(delay_mean - 3 * delay_std, color='blue')
plt.axvline(delay_mean, color='green', label='Mean')
plt.scatter([delay_mean], [-0.0017], color='green', marker='^', s=250)
plt.ylim(-0.0038, 0.06)
plt.legend();
Remember, Chebyshev's inequality states that at least $1 - \frac{1}{z^2}$ of values are within $z$ SDs from the mean, for any numerical distribution.
For instance, it tells us that at least 75% of delays are in the following interval:
delay_mean - 2 * delay_std, delay_mean + 2 * delay_std
(-62.30224418784792, 95.61855521858934)
However, in this case, a much larger fraction of delays are in that interval.
within_2_sds = delays[(delays.get('Delay') >= delay_mean - 2 * delay_std) &
(delays.get('Delay') <= delay_mean + 2 * delay_std)]
within_2_sds.shape[0] / delays.shape[0]
0.9560940325497288
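We can repeat the same check for the ± 3 SD interval. Chebyshev only guarantees at least $\frac{8}{9} \approx 88.9\%$ here; the sketch below, following the same filtering pattern, computes the actual proportion, which again will be at least that large.
within_3_sds = delays[(delays.get('Delay') >= delay_mean - 3 * delay_std) &
                      (delays.get('Delay') <= delay_mean + 3 * delay_std)]
within_3_sds.shape[0] / delays.shape[0]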
If we know more about the shape of the distribution, we can provide better guarantees for the proportion of values within $z$ SDs of the mean.
For a particular set of data points, Chebyshev's inequality states that at least $\frac{8}{9}$ of the data points are between $-20$ and $40$. What is the standard deviation of the data?
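One way to reason through this (assuming, as the $\frac{8}{9}$ bound suggests, that the interval $[-20, 40]$ is exactly of the form mean ± $z$ SDs):

$$1 - \frac{1}{z^2} = \frac{8}{9} \implies \frac{1}{z^2} = \frac{1}{9} \implies z = 3$$

The interval is centered at $\frac{-20 + 40}{2} = 10$, so the mean is 10, and its half-width is $\frac{40 - (-20)}{2} = 30$. Since $30 = 3 \cdot \text{SD}$, the SD is 10.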
We'll work with a data set containing the heights and weights of 5000 adult males.
height_and_weight = bpd.read_csv('data/height_and_weight.csv')
height_and_weight
|  | Height | Weight |
| --- | --- | --- |
| 0 | 73.85 | 241.89 |
| 1 | 68.78 | 162.31 |
| 2 | 74.11 | 212.74 |
| ... | ... | ... |
| 4997 | 67.01 | 199.20 |
| 4998 | 71.56 | 185.91 |
| 4999 | 70.35 | 198.90 |

5000 rows × 2 columns
Let's look at the distributions of both numerical variables.
height_and_weight.plot(kind='hist', y='Height', density=True, ec='w', bins=30, alpha=0.8, figsize=(10, 5));
height_and_weight.plot(kind='hist', y='Weight', density=True, ec='w', bins=30, alpha=0.8, color='C1', figsize=(10, 5));
height_and_weight.plot(kind='hist', density=True, ec='w', bins=60, alpha=0.8, figsize=(10, 5));
Observation: The two distributions look like shifted and stretched versions of the same basic shape, called a bell curve 🔔.
Suppose $x$ is a numerical variable, and $x_i$ is one value of that variable. The function $$z(x_i) = \frac{x_i - \text{mean of $x$}}{\text{SD of $x$}}$$
converts $x_i$ to standard units; the result is the number of standard deviations that $x_i$ is above the mean.
Example: Suppose someone weighs 225 pounds. What is their weight in standard units?
weights = height_and_weight.get('Weight')
(225 - weights.mean()) / np.std(weights)
1.9201699181580782
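To convert from standard units back to the original units, reverse the process: multiply by the SD and add the mean. For example, a weight of about 1.92 standard units corresponds to roughly 225 pounds.
# Converting 1.92 standard units back to pounds; the result should be close to 225.
1.92 * np.std(weights) + weights.mean()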
The process of converting all values of a variable (i.e. a column) to standard units is known as standardization, and the resulting values are considered to be standardized.
def standard_units(col):
return (col - col.mean()) / np.std(col)
standardized_height = standard_units(height_and_weight.get('Height'))
standardized_height
0       1.68
1      -0.09
2       1.78
        ...
4997   -0.70
4998    0.88
4999    0.46
Name: Height, Length: 5000, dtype: float64
standardized_weight = standard_units(height_and_weight.get('Weight'))
standardized_weight
0       2.77
1      -1.25
2       1.30
        ...
4997    0.62
4998   -0.06
4999    0.60
Name: Weight, Length: 5000, dtype: float64
Standardized variables have a mean of 0 and an SD of 1.
We often standardize variables to bring them to the same scale.
Aside: To quickly see summary statistics for a numerical Series, use the .describe() Series method.
# Scientific notation like 1.49e-15 means 1.49 × 10^(-15), a very small number that is effectively zero.
standardized_height.describe()
count    5.00e+03
mean     1.49e-15
std      1.00e+00
           ...
50%      4.76e-04
75%      6.85e-01
max      3.48e+00
Name: Height, Length: 8, dtype: float64
standardized_weight.describe()
count    5.00e+03
mean     5.98e-16
std      1.00e+00
           ...
50%      6.53e-04
75%      6.74e-01
max      4.19e+00
Name: Weight, Length: 8, dtype: float64
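Since both variables are now in standard units, we can overlay their histograms on the same axes. The sketch below follows the same plotting pattern used earlier in this lecture; after standardization, the two distributions should nearly coincide.
# Collect the standardized columns in a new DataFrame and plot both histograms together.
bpd.DataFrame().assign(
    Height=standardized_height,
    Weight=standardized_weight
).plot(kind='hist', density=True, ec='w', bins=60, alpha=0.8, figsize=(10, 5))
plt.xlabel('Standard units');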
Let's look at how the process of standardization works visually.
HTML('data/height_anim.html')
HTML('data/weight_anim.html')