In [1]:
# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
from scipy import stats
set_matplotlib_formats("svg")
plt.style.use('ggplot')

np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)

from ipywidgets import widgets
from IPython.display import clear_output, display

import warnings
warnings.filterwarnings('ignore')

def standard_units(any_numbers):
    "Convert a sequence of numbers to standard units."
    return (any_numbers - any_numbers.mean()) / np.std(any_numbers)

def standardize(df):
    """Return a DataFrame in which all columns of df are converted to standard units."""
    df_su = bpd.DataFrame()
    for column in df.columns:
        df_su = df_su.assign(**{column + ' (su)': standard_units(df.get(column))})
    return df_su

def correlation(df, x, y):
    '''Computes the correlation between column x and column y of df.'''
    return (standard_units(df.get(x)) * standard_units(df.get(y))).mean()

def slope(df, x, y):
    '''Returns the slope of the regression line between columns x and y in df (in original units).'''
    r = correlation(df, x, y)
    return r * np.std(df.get(y)) / np.std(df.get(x))

def intercept(df, x, y):
    '''Returns the intercept of the regression line between columns x and y in df (in original units).'''
    return df.get(y).mean() - slope(df, x, y) * df.get(x).mean()

# All of the following code is for visualization.
def plot_regression_line(df, x, y, margin=.02, alpha=1):
    '''Computes the slope and intercept of the regression line between columns x and y in df (in original units) and plots it.'''
    m = slope(df, x, y)
    b = intercept(df, x, y)
    
    df.plot(kind='scatter', x=x, y=y, s=50, figsize=(10, 5), label='original data', alpha=alpha)
    left = df.get(x).min()*(1 - margin)
    right = df.get(x).max()*(1 + margin)
    domain = np.linspace(left, right, 10)
    plt.plot(domain, m*domain + b, color='orange', label='regression line', lw=4)
    plt.suptitle(format_equation(m, b), fontsize=18)
    plt.legend();

def format_equation(m, b):
    if b > 0:
        return r'$y = %.2fx + %.2f$' % (m, b)
    elif b == 0:
        return r'$y = %.2fx$' % m
    else:
        return r'$y = %.2fx %.2f$' % (m, b)

Lecture 25 – Residuals and Inference¶

DSC 10, Winter 2023¶

Announcements¶

  • The Final Project is due tomorrow at 11:59PM.
    • You can use slip days if needed.
    • If one or both partners have run out of slip days and you submit the project late, we will reallocate slip days towards the Final Project, away from lesser-weighted assignments. See the syllabus for details.
  • The Final Exam is this Saturday 3/18 from 3PM to 6PM.
    • More details coming shortly, but start studying!
  • Today is the last day of new material. The next two days are for review!

Agenda¶

  • Residuals.
  • Inference for regression.

Residuals¶

Quality of fit¶

  • The regression line describes the "best linear fit" for a given dataset.
  • The formulas for the slope and intercept (restated below for reference) work no matter what the shape of the data is.
  • However, the line is only meaningful if the relationship between $x$ and $y$ is roughly linear.
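For reference, the slope and intercept computed by the helper functions in the setup cell are

$$m = r \cdot \frac{\text{SD of } y}{\text{SD of } x} \qquad \text{and} \qquad b = \text{mean of } y - m \cdot \text{mean of } x$$

where $r$ is the correlation coefficient between $x$ and $y$.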

Example: Non-linear data¶

In [2]:
np.random.seed(23)
x2 = bpd.DataFrame().assign(
    x=np.arange(-6, 6.1, 0.5) + np.random.normal(size=25), 
    y=np.arange(-6, 6.1, 0.5)**2 + np.random.normal(size=25)
)
plot_regression_line(x2, 'x', 'y')

This line doesn't fit the data at all, despite being the "best" line for the data!

Residuals¶

  • Any set of predictions has errors.
$$\text{error} = \text{actual } y - \text{predicted } y$$
  • When using the regression line to make predictions, the errors are called residuals.
$$\text{residual} = \text{actual } y - \text{predicted } y \text{ by regression line}$$
  • There is one residual corresponding to each data point $(x, y)$ in the dataset.
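For example, if the regression line predicts a son's height to be 68 inches but his actual height is 70 inches, the residual for that data point is

$$\text{residual} = 70 - 68 = 2 \text{ inches}$$

A positive residual means the line under-predicted; a negative residual means it over-predicted.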

Calculating residuals¶

In [3]:
def predicted(df, x, y):
    m = slope(df, x, y)
    b = intercept(df, x, y)
    return m * df.get(x) + b

def residual(df, x, y):
    return df.get(y) - predicted(df, x, y)
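
As a quick sanity check, here's a hypothetical toy dataset in which $y = 2x + 1$ exactly. The regression line passes through every point, so every residual should be 0 (up to floating-point error):

# Hypothetical toy data: y is an exact linear function of x, so the
# regression line fits perfectly and every residual is (essentially) 0.
toy = bpd.DataFrame().assign(
    x=np.array([1.0, 2.0, 3.0, 4.0]),
    y=np.array([3.0, 5.0, 7.0, 9.0])  # y = 2x + 1
)
residual(toy, 'x', 'y')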

Example: Predicting a son's height from his mother's height 👵👨 📏¶

Is the association between 'mom' and 'son' linear?

  • If there is a linear association, is it strong?
    • We can answer this using the correlation coefficient.
    • Close to 0 = weak, close to -1/+1 = strong.
  • Is "linear" the best description of the association between 'mom' and 'son'?
    • We'll use residuals to answer this question.
In [4]:
galton = bpd.read_csv('data/galton.csv')

male_children = galton[galton.get('gender') == 'male']
mom_son = bpd.DataFrame().assign(mom = male_children.get('mother'), 
                                 son = male_children.get('childHeight'))

mom_son_predictions = mom_son.assign(
    predicted=predicted(mom_son, 'mom', 'son'),
    residuals=residual(mom_son, 'mom', 'son'),
)
In [5]:
plot_regression_line(mom_son_predictions, 'mom', 'son')

idx = np.random.randint(0, mom_son_predictions.shape[0], size=50)
for i, k in enumerate(idx):
    x = mom_son_predictions.get('mom').iloc[k]
    y = mom_son_predictions.get('son').iloc[k]
    p = mom_son_predictions.get('predicted').iloc[k]
    plt.plot([x,x], [y,p], linewidth=3, color='purple', label='residuals' if i==0 else None)
plt.legend();
print('Correlation:', correlation(mom_son, 'mom', 'son'))
Correlation: 0.32300498368490554

Residual plots¶

  • The residual plot of a regression line is the scatter plot with the $x$ variable on the $x$-axis and residuals on the $y$-axis.

    $$\text{residual} = \text{actual } y - \text{predicted } y \text{ by regression line}$$

  • Residual plots describe how the error in the regression line's predictions varies.
  • Key idea: If a linear fit is good, the residual plot should look like a patternless "cloud" ☁️.
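
The cells below build residual plots directly with .plot. If you find yourself making many of them, the same steps could be wrapped in a small helper (a hypothetical convenience, not part of the lecture's setup code) that reuses the residual function defined earlier:

def plot_residuals(df, x, y):
    '''Plots the residuals of the regression line between columns x and y of df against x.'''
    with_resid = df.assign(residuals=residual(df, x, y))
    with_resid.plot(kind='scatter', x=x, y='residuals', s=50, color='purple', figsize=(10, 5), label='residuals')
    plt.axhline(0, linewidth=3, color='k', label='y = 0')
    plt.title('Residual plot for predicting ' + y + ' based on ' + x)
    plt.legend()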
In [6]:
mom_son_predictions.plot(kind='scatter', x='mom', y='residuals', s=50, c='purple', figsize=(10, 5), label='residuals')
plt.axhline(0, linewidth=3, color='k', label='y = 0')
plt.title('Residual plot for predicting son\'s height based on mother\'s height')
plt.legend();

The residual plot for a non-linear association 🚗¶

  • Consider the hybrid cars dataset from earlier.
  • Let's look at a regression line that uses 'mpg' to predict 'price'.
In [7]:
hybrid = bpd.read_csv('data/hybrid.csv')
mpg_price = hybrid.assign(
    predicted=predicted(hybrid, 'mpg', 'price'),
    residuals=residual(hybrid, 'mpg', 'price')
)
mpg_price
Out[7]:
    | vehicle | year | price | acceleration | mpg | class | predicted | residuals
0   | Prius (1st Gen) | 1997 | 24509.74 | 7.46 | 41.26 | Compact | 32609.64 | -8099.90
1   | Tino | 2000 | 35354.97 | 8.20 | 54.10 | Compact | 19278.39 | 16076.58
2   | Prius (2nd Gen) | 2000 | 26832.25 | 7.97 | 45.23 | Compact | 28487.75 | -1655.50
... | ... | ... | ... | ... | ... | ... | ... | ...
150 | C-Max Energi Plug-in | 2013 | 32950.00 | 11.76 | 43.00 | Midsize | 30803.06 | 2146.94
151 | Fusion Energi Plug-in | 2013 | 38700.00 | 11.76 | 43.00 | Midsize | 30803.06 | 7896.94
152 | Chevrolet Volt | 2013 | 39145.00 | 11.11 | 37.00 | Compact | 37032.62 | 2112.38

153 rows × 8 columns

In [8]:
# Plot of the original data and regression line.
plot_regression_line(hybrid, 'mpg', 'price');
print('Correlation:', correlation(hybrid, 'mpg', 'price'))
Correlation: -0.5318263633683789
In [9]:
# Residual plot.
mpg_price.plot(kind='scatter', x='mpg', y='residuals', figsize=(10, 5), s=50, color='purple', label='residuals')
plt.axhline(0, linewidth=3, color='k', label='y = 0')
plt.title('Residual plot for regression between mpg and price')
plt.legend();

Note that as 'mpg' increases, the residuals go from being mostly large, to being mostly small, to being mostly large again. That's a pattern!

Issue: Patterns in the residual plot¶

  • Patterns in the residual plot imply that the relationship between $x$ and $y$ may not be linear.
    • While this can be spotted in the original scatter plot, it may be easier to identify in the residual plot.
  • In such cases, a curve may be a better choice than a line for prediction.
    • In future courses, you'll learn how to extend linear regression to work for polynomials and other types of mathematical functions; the sketch below gives a small preview.
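
As a small preview of what fitting a curve can look like (using np.polyfit, which goes beyond the tools covered in this course), the sketch below fits a degree-2 polynomial (a parabola) to 'mpg' and 'price' in the hybrid dataset:

# Preview only: fit a degree-2 polynomial (parabola) by least squares.
# np.polyfit returns the coefficients from the highest power down.
a, b, c = np.polyfit(hybrid.get('mpg'), hybrid.get('price'), 2)
hybrid.plot(kind='scatter', x='mpg', y='price', s=50, figsize=(10, 5), label='original data')
domain = np.linspace(hybrid.get('mpg').min(), hybrid.get('mpg').max(), 100)
plt.plot(domain, a * domain**2 + b * domain + c, color='orange', lw=4, label='degree-2 fit')
plt.legend();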

Another example: 'mpg' and 'acceleration' ⛽¶

  • Let's fit a regression line that predicts 'mpg' given 'acceleration'.
  • Let's then look at the resulting residual plot.
In [10]:
accel_mpg = hybrid.assign(
    predicted=predicted(hybrid, 'acceleration', 'mpg'),
    residuals=residual(hybrid, 'acceleration', 'mpg')
)
accel_mpg
Out[10]:
    | vehicle | year | price | acceleration | mpg | class | predicted | residuals
0   | Prius (1st Gen) | 1997 | 24509.74 | 7.46 | 41.26 | Compact | 43.29 | -2.03
1   | Tino | 2000 | 35354.97 | 8.20 | 54.10 | Compact | 41.90 | 12.20
2   | Prius (2nd Gen) | 2000 | 26832.25 | 7.97 | 45.23 | Compact | 42.33 | 2.90
... | ... | ... | ... | ... | ... | ... | ... | ...
150 | C-Max Energi Plug-in | 2013 | 32950.00 | 11.76 | 43.00 | Midsize | 35.17 | 7.83
151 | Fusion Energi Plug-in | 2013 | 38700.00 | 11.76 | 43.00 | Midsize | 35.17 | 7.83
152 | Chevrolet Volt | 2013 | 39145.00 | 11.11 | 37.00 | Compact | 36.40 | 0.60

153 rows × 8 columns

In [11]:
# Plot of the original data and regression line.
plot_regression_line(accel_mpg, 'acceleration', 'mpg')
print('Correlation:', correlation(accel_mpg, 'acceleration', 'mpg'))
Correlation: -0.5060703843771185
In [12]:
# Residual plot.
accel_mpg.plot(kind='scatter', x='acceleration', y='residuals', figsize=(10, 5), s=50, color='purple', label='residuals')
plt.axhline(0, linewidth=3, color='k', label='y = 0')
plt.title('Residual plot for regression between acceleration and mpg')
plt.legend();