# Set up packages for lecture. Don't worry about understanding this code,
# but make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
set_matplotlib_formats("svg")
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (10, 5)
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
# Animations
from IPython.display import display
import ipywidgets as widgets
import warnings
warnings.filterwarnings('ignore')
# Demonstration code
def r_scatter(r):
"Generate a scatter plot with a correlation approximately r"
x = np.random.normal(0, 1, 1000)
z = np.random.normal(0, 1, 1000)
y = r * x + (np.sqrt(1 - r ** 2)) * z
plt.scatter(x, y)
plt.xlim(-4, 4)
plt.ylim(-4, 4)
def show_scatter_grid():
plt.subplots(1, 4, figsize=(10, 2))
for i, r in enumerate([-1, -2/3, -1/3, 0]):
plt.subplot(1, 4, i+1)
r_scatter(r)
plt.title(f'r = {np.round(r, 2)}')
plt.show()
plt.subplots(1, 4, figsize=(10, 2))
for i, r in enumerate([1, 2/3, 1/3]):
plt.subplot(1, 4, i+1)
r_scatter(r)
plt.title(f'r = {np.round(r, 2)}')
plt.subplot(1, 4, 4)
plt.axis('off')
plt.show()
Remember to review the end of Lecture 23 for a high-level summary of the second half of the class so far.
hybrid = bpd.read_csv('data/hybrid.csv')
hybrid
vehicle | year | price | acceleration | mpg | class | |
---|---|---|---|---|---|---|
0 | Prius (1st Gen) | 1997 | 24509.74 | 7.46 | 41.26 | Compact |
1 | Tino | 2000 | 35354.97 | 8.20 | 54.10 | Compact |
2 | Prius (2nd Gen) | 2000 | 26832.25 | 7.97 | 45.23 | Compact |
... | ... | ... | ... | ... | ... | ... |
150 | C-Max Energi Plug-in | 2013 | 32950.00 | 11.76 | 43.00 | Midsize |
151 | Fusion Energi Plug-in | 2013 | 38700.00 | 11.76 | 43.00 | Midsize |
152 | Chevrolet Volt | 2013 | 39145.00 | 11.11 | 37.00 | Compact |
153 rows × 6 columns
'price'
vs. 'acceleration'
¶Is there an association between these two variables? If so, what kind?
hybrid.plot(kind='scatter', x='acceleration', y='price');
'price'
vs. 'mpg'
¶Is there an association between these two variables? If so, what kind?
hybrid.plot(kind='scatter', x='mpg', y='price');
Observations:
hybrid.assign(
km_per_liter=hybrid.get('mpg') * 0.425144,
yen=hybrid.get('price') * 139.77
).plot(kind='scatter', x='km_per_liter', y='yen');
def standard_units(any_numbers):
"Convert any array of numbers to standard units."
any_numbers = np.array(any_numbers)
return (any_numbers - any_numbers.mean()) / np.std(any_numbers)
def standardize(df):
"""Return a DataFrame in which all columns of df are converted to standard units."""
df_su = bpd.DataFrame()
for column in df.columns:
df_su = df_su.assign(**{column + ' (su)': standard_units(df.get(column))})
return df_su
For a given pair of variables:
hybrid_su = standardize(hybrid.get(['price', 'acceleration', 'mpg'])).assign(vehicle=hybrid.get('vehicle'))
hybrid_su
price (su) | acceleration (su) | mpg (su) | vehicle | |
---|---|---|---|---|
0 | -6.94e-01 | -1.54 | 0.59 | Prius (1st Gen) |
1 | -1.86e-01 | -1.28 | 1.76 | Tino |
2 | -5.85e-01 | -1.36 | 0.95 | Prius (2nd Gen) |
... | ... | ... | ... | ... |
150 | -2.98e-01 | -0.07 | 0.75 | C-Max Energi Plug-in |
151 | -2.90e-02 | -0.07 | 0.75 | Fusion Energi Plug-in |
152 | -8.17e-03 | -0.29 | 0.20 | Chevrolet Volt |
153 rows × 4 columns
'price'
vs. 'acceleration'
¶hybrid_su.plot(kind='scatter', x='acceleration (su)', y='price (su)');
Which cars have 'acceleration'
s and 'price'
s that are more than 2 SDs above average?
hybrid_su[(hybrid_su.get('acceleration (su)') > 2) &
(hybrid_su.get('price (su)') > 2)]
price (su) | acceleration (su) | mpg (su) | vehicle | |
---|---|---|---|---|
47 | 2.71 | 2.05 | -1.46 | ActiveHybrid X6 |
60 | 3.04 | 2.88 | -1.16 | ActiveHybrid 7 |
95 | 2.96 | 2.12 | -1.35 | ActiveHybrid 7i |
146 | 2.11 | 2.12 | -0.90 | ActiveHybrid 7L |
147 | 2.66 | 2.24 | -0.90 | Panamera S |
'price'
vs. 'mpg'
¶hybrid_su.plot(kind='scatter', x='mpg (su)', y='price (su)');
Which cars have close to average 'mpg'
s and close to average 'price'
s?
hybrid_su[(hybrid_su.get('mpg (su)') <= 0.3) &
(hybrid_su.get('mpg (su)') >= -0.3) &
(hybrid_su.get('price (su)') <= 0.3) &
(hybrid_su.get('price (su)') >= -0.3)]
price (su) | acceleration (su) | mpg (su) | vehicle | |
---|---|---|---|---|
10 | -1.24e-01 | -0.56 | -0.26 | Escape |
22 | -2.13e-01 | -1.02 | -0.17 | Mercury Mariner |
57 | -8.47e-02 | 0.72 | -0.11 | Audi Q5 |
... | ... | ... | ... | ... |
70 | -2.14e-01 | -0.07 | 0.02 | HS 250h |
102 | -2.69e-03 | -0.29 | 0.20 | Chevrolet Volt |
152 | -8.17e-03 | -0.29 | 0.20 | Chevrolet Volt |
8 rows × 4 columns
The correlation coefficient $r$ of two variables $x$ and $y$ is defined as the
If x
and y
are two Series or arrays,
r = (x_su * y_su).mean()
where x_su
and y_su
are x
and y
converted to standard units.
def calculate_r(df, x, y):
x_su = df.get(x + ' (su)')
y_su = df.get(y + ' (su)')
return (x_su * y_su).mean()
Let's calculate $r$ for 'acceleration'
and 'price'
.
hybrid_su
price (su) | acceleration (su) | mpg (su) | vehicle | |
---|---|---|---|---|
0 | -6.94e-01 | -1.54 | 0.59 | Prius (1st Gen) |
1 | -1.86e-01 | -1.28 | 1.76 | Tino |
2 | -5.85e-01 | -1.36 | 0.95 | Prius (2nd Gen) |
... | ... | ... | ... | ... |
150 | -2.98e-01 | -0.07 | 0.75 | C-Max Energi Plug-in |
151 | -2.90e-02 | -0.07 | 0.75 | Fusion Energi Plug-in |
152 | -8.17e-03 | -0.29 | 0.20 | Chevrolet Volt |
153 rows × 4 columns
r_acc_price = calculate_r(hybrid_su, 'acceleration', 'price')
r_acc_price
0.6955778996913982
hybrid_su.plot(kind='scatter', x='acceleration (su)', y='price (su)')
plt.axvline(0, color='black');
plt.axhline(0, color='black');
Note that the correlation is positive, and most data points fall in the lower left and upper right quadrants!
Let's now calculate $r$ for 'mpg'
and 'price'
.
hybrid_su
price (su) | acceleration (su) | mpg (su) | vehicle | |
---|---|---|---|---|
0 | -6.94e-01 | -1.54 | 0.59 | Prius (1st Gen) |
1 | -1.86e-01 | -1.28 | 1.76 | Tino |
2 | -5.85e-01 | -1.36 | 0.95 | Prius (2nd Gen) |
... | ... | ... | ... | ... |
150 | -2.98e-01 | -0.07 | 0.75 | C-Max Energi Plug-in |
151 | -2.90e-02 | -0.07 | 0.75 | Fusion Energi Plug-in |
152 | -8.17e-03 | -0.29 | 0.20 | Chevrolet Volt |
153 rows × 4 columns
r_mpg_price = calculate_r(hybrid_su, 'mpg', 'price')
r_mpg_price
-0.5318263633683789
hybrid_su.plot(kind='scatter', x='mpg (su)', y='price (su)');
plt.axvline(0, color='black');
plt.axhline(0, color='black');
Note that the correlation is negative, and most data points fall in the upper left and lower right quadrants!
show_scatter_grid()