# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation, PillowWriter
from scipy import stats
set_matplotlib_formats("svg")
plt.style.use('ggplot')
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
# Animations
from IPython.display import display, HTML, IFrame, clear_output
import ipywidgets as widgets
import warnings
warnings.filterwarnings('ignore')
# Demonstration code
def r_scatter(r):
"Generate a scatter plot with a correlation approximately r"
x = np.random.normal(0, 1, 1000)
z = np.random.normal(0, 1, 1000)
y = r * x + (np.sqrt(1 - r ** 2)) * z
plt.scatter(x, y)
plt.xlim(-4, 4)
plt.ylim(-4, 4)
def show_scatter_grid():
plt.subplots(1, 4, figsize=(10, 2))
for i, r in enumerate([-1, -2/3, -1/3, 0]):
plt.subplot(1, 4, i+1)
r_scatter(r)
plt.title(f'r = {np.round(r, 2)}')
plt.show()
plt.subplots(1, 4, figsize=(10, 2))
for i, r in enumerate([1, 2/3, 1/3]):
plt.subplot(1, 4, i+1)
r_scatter(r)
plt.title(f'r = {np.round(r, 2)}')
plt.subplot(1, 4, 4)
plt.axis('off')
plt.show()
Every statistical test and simulation we've run in the second half of the class is related to one of the following four ideas. To solidify your understanding of what we've done, it's a good idea to review past lectures and assignments and see how what we did in each section relates to one of these four ideas.
hybrid = bpd.read_csv('data/hybrid.csv')
hybrid
vehicle | year | price | acceleration | mpg | class | |
---|---|---|---|---|---|---|
0 | Prius (1st Gen) | 1997 | 24509.74 | 7.46 | 41.26 | Compact |
1 | Tino | 2000 | 35354.97 | 8.20 | 54.10 | Compact |
2 | Prius (2nd Gen) | 2000 | 26832.25 | 7.97 | 45.23 | Compact |
... | ... | ... | ... | ... | ... | ... |
150 | C-Max Energi Plug-in | 2013 | 32950.00 | 11.76 | 43.00 | Midsize |
151 | Fusion Energi Plug-in | 2013 | 38700.00 | 11.76 | 43.00 | Midsize |
152 | Chevrolet Volt | 2013 | 39145.00 | 11.11 | 37.00 | Compact |
153 rows × 6 columns
'acceleration'
and 'price'
¶Is there an association between these two variables? If so, what kind?
hybrid.plot(kind='scatter', x='acceleration', y='price', figsize=(10, 5));
'mpg'
and 'price'
¶Is there an association between these two variables? If so, what kind?
hybrid.plot(kind='scatter', x='mpg', y='price', figsize=(10, 5));
Observations:
hybrid.assign(
km_per_liter=hybrid.get('mpg') * 0.425144,
yen=hybrid.get('price') * 140.34
).plot(kind='scatter', x='km_per_liter', y='yen', figsize=(10, 5));
def standard_units(any_numbers):
"Convert any array of numbers to standard units."
any_numbers = np.array(any_numbers)
return (any_numbers - any_numbers.mean()) / np.std(any_numbers)
def standardize(df):
"""Return a DataFrame in which all columns of df are converted to standard units."""
df_su = bpd.DataFrame()
for column in df.columns:
df_su = df_su.assign(**{column + ' (su)': standard_units(df.get(column))})
return df_su
For a given pair of variables:
hybrid_su = standardize(hybrid.get(['price', 'acceleration', 'mpg'])).assign(vehicle=hybrid.get('vehicle'))
hybrid_su
price (su) | acceleration (su) | mpg (su) | vehicle | |
---|---|---|---|---|
0 | -6.94e-01 | -1.54 | 0.59 | Prius (1st Gen) |
1 | -1.86e-01 | -1.28 | 1.76 | Tino |
2 | -5.85e-01 | -1.36 | 0.95 | Prius (2nd Gen) |
... | ... | ... | ... | ... |
150 | -2.98e-01 | -0.07 | 0.75 | C-Max Energi Plug-in |
151 | -2.90e-02 | -0.07 | 0.75 | Fusion Energi Plug-in |
152 | -8.17e-03 | -0.29 | 0.20 | Chevrolet Volt |
153 rows × 4 columns
'acceleration'
and 'price'
¶hybrid_su.plot(kind='scatter', x='acceleration (su)', y='price (su)', figsize=(10, 5));
Which cars have 'acceleration'
s and 'price'
s that are more than 2 SDs above average?
hybrid_su[(hybrid_su.get('acceleration (su)') > 2) &
(hybrid_su.get('price (su)') > 2)]
price (su) | acceleration (su) | mpg (su) | vehicle | |
---|---|---|---|---|
47 | 2.71 | 2.05 | -1.46 | ActiveHybrid X6 |
60 | 3.04 | 2.88 | -1.16 | ActiveHybrid 7 |
95 | 2.96 | 2.12 | -1.35 | ActiveHybrid 7i |
146 | 2.11 | 2.12 | -0.90 | ActiveHybrid 7L |
147 | 2.66 | 2.24 | -0.90 | Panamera S |
'mpg'
and 'price'
¶hybrid_su.plot(kind='scatter', x='mpg (su)', y='price (su)', figsize=(10, 5));
Which cars have close to average 'mpg'
s and close to average 'price'
s?
hybrid_su[(hybrid_su.get('mpg (su)') <= 0.3) &
(hybrid_su.get('mpg (su)') >= -0.3) &
(hybrid_su.get('price (su)') <= 0.3) &
(hybrid_su.get('price (su)') >= -0.3)]
price (su) | acceleration (su) | mpg (su) | vehicle | |
---|---|---|---|---|
10 | -1.24e-01 | -0.56 | -0.26 | Escape |
22 | -2.13e-01 | -1.02 | -0.17 | Mercury Mariner |
57 | -8.47e-02 | 0.72 | -0.11 | Audi Q5 |
... | ... | ... | ... | ... |
70 | -2.14e-01 | -0.07 | 0.02 | HS 250h |
102 | -2.69e-03 | -0.29 | 0.20 | Chevrolet Volt |
152 | -8.17e-03 | -0.29 | 0.20 | Chevrolet Volt |
8 rows × 4 columns
When there is a positive association, most data points fall in the lower left and upper right quadrants.
hybrid_su.plot(kind='scatter', x='acceleration (su)', y='price (su)', figsize=(10, 5))
plt.axvline(0, color='black');
plt.axhline(0, color='black');
When there is a negative association, most data points fall in the upper left and lower right quadrants.
hybrid_su.plot(kind='scatter', x='mpg (su)', y='price (su)', figsize=(10, 5))
plt.axvline(0, color='black');
plt.axhline(0, color='black');
The correlation coefficient $r$ of two variables $x$ and $y$ is defined as the
If x
and y
are two Series or arrays,
r = (x_su * y_su).mean()
where x_su
and y_su
are x
and y
converted to standard units.
Let's calculate $r$ for 'acceleration'
and 'price'
.
hybrid_su
price (su) | acceleration (su) | mpg (su) | vehicle | |
---|---|---|---|---|
0 | -6.94e-01 | -1.54 | 0.59 | Prius (1st Gen) |
1 | -1.86e-01 | -1.28 | 1.76 | Tino |
2 | -5.85e-01 | -1.36 | 0.95 | Prius (2nd Gen) |
... | ... | ... | ... | ... |
150 | -2.98e-01 | -0.07 | 0.75 | C-Max Energi Plug-in |
151 | -2.90e-02 | -0.07 | 0.75 | Fusion Energi Plug-in |
152 | -8.17e-03 | -0.29 | 0.20 | Chevrolet Volt |
153 rows × 4 columns
r_acc_price = (hybrid_su.get('acceleration (su)') * hybrid_su.get('price (su)')).mean()
r_acc_price
0.6955778996913982
hybrid_su.plot(kind='scatter', x='acceleration (su)', y='price (su)', figsize=(10, 5))
plt.axvline(0, color='black');
plt.axhline(0, color='black');
Note that the correlation is positive, and most data points fall in the lower left and upper right quadrants!
Let's now calculate $r$ for 'mpg'
and 'price'
.
hybrid_su
price (su) | acceleration (su) | mpg (su) | vehicle | |
---|---|---|---|---|
0 | -6.94e-01 | -1.54 | 0.59 | Prius (1st Gen) |
1 | -1.86e-01 | -1.28 | 1.76 | Tino |
2 | -5.85e-01 | -1.36 | 0.95 | Prius (2nd Gen) |
... | ... | ... | ... | ... |
150 | -2.98e-01 | -0.07 | 0.75 | C-Max Energi Plug-in |
151 | -2.90e-02 | -0.07 | 0.75 | Fusion Energi Plug-in |
152 | -8.17e-03 | -0.29 | 0.20 | Chevrolet Volt |
153 rows × 4 columns
r_mpg_price = (hybrid_su.get('mpg (su)') * hybrid_su.get('price (su)')).mean()
r_mpg_price
-0.5318263633683789
hybrid_su.plot(kind='scatter', x='mpg (su)', y='price (su)', figsize=(10, 5));
plt.axvline(0, color='black');
plt.axhline(0, color='black');
Note that the correlation is negative, and most data points fall in the upper left and lower right quadrants!
show_scatter_grid()