In [1]:
# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
from scipy import stats
set_matplotlib_formats("svg")
plt.style.use('ggplot')

np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)

from ipywidgets import widgets
from IPython.display import clear_output, display

import warnings
warnings.filterwarnings('ignore')

# New minimize function (wrapper around scipy.optimize.minimize)
from inspect import signature
from scipy import optimize

def minimize(function):
    n_args = len(signature(function).parameters)
    initial = np.zeros(n_args)
    return optimize.minimize(lambda x: function(*x), initial).x

# All of the following code is for visualization.
def plot_regression_line(df, x, y, margin=.02):
    '''Computes the slope and intercept of the regression line between columns x and y in df (in original units) and plots it.'''
    m = slope(df, x, y)
    b = intercept(df, x, y)
    
    df.plot(kind='scatter', x=x, y=y, s=100, figsize=(10, 5), label='original data')
    left = df.get(x).min()*(1 - margin)
    right = df.get(x).max()*(1 + margin)
    domain = np.linspace(left, right, 10)
    plt.plot(domain, m*domain + b, color='orange', label='regression line', lw=4)
    plt.suptitle(format_equation(m, b), fontsize=18)
    plt.legend();
    
def format_equation(m, b):
    if b > 0:
        return r'$y = %.2fx + %.2f$' % (m, b)
    elif b == 0:
        return r'$y = %.2fx$' % m
    else:
        return r'$y = %.2fx %.2f$' % (m, b)
    
def plot_errors(df, m, b, ax=None):
    x = df.get('x')
    y = m * x + b
    df.plot(kind='scatter', x='x', y='y', s=100, label='original data', ax=ax, figsize=(10, 5) if ax is None else None)
    
    if ax:
        plotter = ax
    else:
        plotter = plt
    
    plotter.plot(x, y, color='orange', lw=4)
    
    for k in np.arange(df.shape[0]):
        xk = df.get('x').iloc[k]
        yk = np.asarray(y)[k]
        if k == df.shape[0] - 1:
            plotter.plot([xk, xk], [yk, df.get('y').iloc[k]], linestyle=(0, (1, 1)), c='r', lw=4, label='errors')
        else:
            plotter.plot([xk, xk], [yk, df.get('y').iloc[k]], linestyle=(0, (1, 1)), c='r', lw=4)
    
    plt.title(format_equation(m, b), fontsize=18)
    plt.xlim(50, 90)
    plt.ylim(40, 100)
    plt.legend();

Lecture 24 – Regression and Least Squares¶

DSC 10, Winter 2023¶

Announcements¶

  • Lab 7 is due tomorrow at 11:59 PM.
    • We drop your lowest lab score, so you may not need to submit this one if you've done well on all the others. However, it's the only assignment on regression, a topic that will be tested on the final exam, so it's a good idea to complete it at least for practice.
  • The Final Project is due on Tuesday 3/14 at 11:59 PM.

Check-in ✅ – Answer at cc.dsc10.com¶

The Final Project is due on Tuesday 3/14 at 11:59 PM and has 8 sections. How much progress have you made?

A. Not started or barely started ⏳
B. Finished 1 or 2 sections
C. Finished 3 or 4 sections ❤️
D. Finished 5 or 6 sections
E. Finished 7 or 8 sections 🤯

Agenda¶

  • The regression line, in standard units.
  • The regression line, in original units.
  • Outliers.
  • Errors in prediction.

The regression line, in standard units¶

Example: Predicting heights 👪 📏¶

Recall that in the last lecture, we aimed to use a mother's height to predict her adult son's height.

In [2]:
galton = bpd.read_csv('data/galton.csv')
male_children = galton[galton.get('gender') == 'male']
mom_son = bpd.DataFrame().assign(mom = male_children.get('mother'), 
                                 son = male_children.get('childHeight'))
mom_son
Out[2]:
mom son
0 67.0 73.2
4 66.5 73.5
5 66.5 72.5
... ... ...
925 60.0 66.0
929 66.0 64.0
932 63.0 66.5

481 rows × 2 columns

In [3]:
mom_son.plot(kind='scatter', x='mom', y='son', figsize=(10, 5));
[scatter plot: mom (x-axis) vs. son (y-axis)]

Correlation¶

Recall that the correlation coefficient $r$ of two variables $x$ and $y$ is defined as the

  • average value of the
  • product of $x$ and $y$
  • when both are measured in standard units.
In [4]:
def standard_units(any_numbers):
    "Convert a sequence of numbers to standard units."
    return (any_numbers - any_numbers.mean()) / np.std(any_numbers)

def correlation(df, x, y):
    "Computes the correlation between column x and column y of df."
    return (standard_units(df.get(x)) * standard_units(df.get(y))).mean()
In [5]:
r_mom_son = correlation(mom_son, 'mom', 'son')
r_mom_son
Out[5]:
0.32300498368490554

The regression line¶

  • The regression line is the line through $(0,0)$ with slope $r$, when both variables are measured in standard units.
  • We use the regression line to make predictions!

Making predictions in standard units¶

  • If $r = 0.32$, and the given $x$ is $2$ in standard units, then the prediction for $y$ is $0.64$ standard units.
    • The regression line predicts that a mother whose height is $2$ SDs above average has a son whose height is $0.64$ SDs above average.
  • We always predict that a son will be somewhat closer to average in height than his mother.
    • This is a consequence of the slope $r$ having magnitude less than 1.
    • This effect is called regression to the mean.
  • The regression line passes through the origin $(0, 0)$ in standard units. This means that, no matter what $r$ is, for an average $x$ value, we predict an average $y$ value.
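
As a quick numerical check of the first bullet (a minimal sketch – the names below are made up for this example):

r = 0.32        # correlation between mothers' and sons' heights (rounded)
x_su = 2        # a mother's height, in standard units
pred_y_su = r * x_su
pred_y_su       # 0.64: the son is predicted to be 0.64 SDs above average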

Making predictions in original units¶

Of course, we'd like to be able to predict a son's height in inches, not just in standard units. Given a mother's height in inches, here's how we'll predict her son's height in inches:

  1. Convert the mother's height from inches to standard units.
$$x_{i \: \text{(su)}} = \frac{x_i - \text{mean of $x$}}{\text{SD of $x$}}$$
  2. Multiply by the correlation coefficient to predict the son's height in standard units.
$$\text{predicted } y_{i \: \text{(su)}} = r \cdot x_{i \: \text{(su)}}$$
  3. Convert the son's predicted height from standard units back to inches.
$$\text{predicted } y_i = \text{predicted } y_{i \: \text{(su)}} \cdot \text{SD of $y$} + \text{mean of $y$}$$
In [6]:
mom_mean = mom_son.get('mom').mean()
mom_sd = np.std(mom_son.get('mom'))
son_mean = mom_son.get('son').mean()
son_sd = np.std(mom_son.get('son'))
In [7]:
def predict_with_r(mom):
    """Return a prediction for the height of a son whose mother has height mom, 
    using linear regression.
    """
    mom_su = (mom - mom_mean) / mom_sd
    son_su = r_mom_son * mom_su
    return son_su * son_sd + son_mean
In [8]:
predict_with_r(68)
Out[8]:
70.68219686848828
In [9]:
predict_with_r(60)
Out[9]:
67.76170758654767
In [10]:
preds = mom_son.assign(
    predicted_height=mom_son.get('mom').apply(predict_with_r)
)
ax = preds.plot(kind='scatter', x='mom', y='son', title='Regression line predictions, in original units', figsize=(10, 5), label='original data')
preds.plot(kind='line', x='mom', y='predicted_height', ax=ax, color='orange', label='regression line', lw=4);
plt.legend();
[scatter plot of mom vs. son with the orange regression line overlaid; title: "Regression line predictions, in original units"]

Concept Check ✅ – Answer at cc.dsc10.com¶

A course has a midterm (mean 80, standard deviation 15) and a really hard final (mean 50, standard deviation 12).

If the scatter plot comparing midterm & final scores for students looks linearly associated with correlation 0.75, then what is the predicted final exam score for a student who received a 90 on the midterm?

  • A. 54
  • B. 56
  • C. 58
  • D. 60
  • E. 62
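
After you've committed to an answer, here's one way to check it, following the same three-step method we used for heights (a sketch – the variable names below are made up for this example):

midterm_mean, midterm_sd = 80, 15
final_mean, final_sd = 50, 12
r_exams = 0.75

midterm_su = (90 - midterm_mean) / midterm_sd       # Step 1: convert 90 to standard units.
pred_final_su = r_exams * midterm_su                # Step 2: multiply by the correlation coefficient.
pred_final_su * final_sd + final_mean               # Step 3: convert back to points.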

The regression line, in original units¶

Reflection¶

Each time we wanted to predict the height of an adult son given the height of a mother, we had to:

  1. Convert the mother's height from inches to standard units.
  2. Multiply by the correlation coefficient to predict the son's height in standard units.
  3. Convert the son's predicted height from standard units back to inches.

This is inconvenient – wouldn't it be great if we could express the regression line itself in inches?

From standard units to original units¶

When $x$ and $y$ are in standard units, the regression line is given by

$$\text{predicted } y_{\text{(su)}} = r \cdot x_{\text{(su)}}$$

What is the regression line when $x$ and $y$ are in their original units?

The regression line in original units¶

  • We can work backwards from the relationship $$\text{predicted } y_{\text{(su)}} = r \cdot x_{\text{(su)}}$$ to find the line in original units.
$$\frac{\text{predicted } y - \text{mean of }y}{\text{SD of }y} = r \cdot \frac{x - \text{mean of } x}{\text{SD of }x}$$
  • Note that $r$, $\text{mean of } x$, $\text{mean of } y$, $\text{SD of } x$, and $\text{SD of } y$ are constants – if you have a DataFrame with two columns, you can determine all 5 values.
  • Re-arranging the above equation into the form $\text{predicted } y = mx + b$ yields the formulas:
$$m = r \cdot \frac{\text{SD of } y}{\text{SD of }x}, \: \: b = \text{mean of } y - m \cdot \text{mean of } x$$
  • $m$ is the slope of the regression line and $b$ is the intercept.
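
Where do these formulas come from? Multiplying both sides of the equation above by the SD of $y$, adding the mean of $y$, and distributing gives

$$\text{predicted } y = \underbrace{r \cdot \frac{\text{SD of } y}{\text{SD of } x}}_{m} \cdot \: x + \underbrace{\text{mean of } y - m \cdot \text{mean of } x}_{b}$$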

Let's implement these formulas in code and try them out.

In [11]:
def slope(df, x, y):
    "Returns the slope of the regression line between columns x and y in df (in original units)."
    r = correlation(df, x, y)
    return r * np.std(df.get(y)) / np.std(df.get(x))

def intercept(df, x, y):
    "Returns the intercept of the regression line between columns x and y in df (in original units)."
    return df.get(y).mean() - slope(df, x, y) * df.get(x).mean()

Below, we compute the slope and intercept of the regression line between mothers' heights and sons' heights (in inches).

In [12]:
m_heights = slope(mom_son, 'mom', 'son')
m_heights
Out[12]:
0.36506116024257595
In [13]:
b_heights = intercept(mom_son, 'mom', 'son')
b_heights
Out[13]:
45.85803797199311

So, the regression line is

$$\text{predicted son's height} = 0.365 \cdot \text{mother's height} + 45.858$$

Making predictions¶

In [14]:
def predict_son(mom):
    return m_heights * mom + b_heights

What's the predicted height of a son whose mother is 62 inches tall?

In [15]:
predict_son(62)
Out[15]:
68.49182990703282

What if the mother is 55 inches tall? 73 inches tall?

In [16]:
predict_son(55)
Out[16]:
65.9364017853348
In [17]:
predict_son(73)
Out[17]:
72.50750266970115
In [18]:
xs = np.arange(57, 72)
ys = predict_son(xs)
mom_son.plot(kind='scatter', x='mom', y='son', figsize=(10, 5), title='Regression line predictions, in original units', label='original data');
plt.plot(xs, ys, color='orange', lw=4, label='regression line')
plt.legend();
[scatter plot of mom vs. son with the orange regression line overlaid]

Outliers¶

The effect of outliers on correlation¶

Consider the dataset below. What is the correlation between $x$ and $y$?

In [19]:
outlier = bpd.read_csv('data/outlier.csv')
outlier.plot(kind='scatter', x='x', y='y', s=100, figsize=(10, 5));
[scatter plot: x vs. y for the outlier dataset]
In [20]:
correlation(outlier, 'x', 'y')
Out[20]:
-0.02793982443854457
In [21]:
plot_regression_line(outlier, 'x', 'y')
[scatter plot with the regression line; its equation is shown in the title]

Removing the outlier¶

In [22]:
without_outlier = outlier[outlier.get('y') > 40]
In [23]:
correlation(without_outlier, 'x', 'y')
Out[23]:
0.9851437295364016
In [24]:
plot_regression_line(without_outlier, 'x', 'y')
[scatter plot with the regression line, computed with the outlier removed]

Takeaway: Even a single outlier can have a massive impact on the correlation, and hence the regression line. Look for these before performing regression. Always visualize first!

Errors in prediction¶

Motivation¶

  • We've presented the regression line in standard units as the line through the origin with slope $r$, given by $\text{predicted } y_{\text{(su)}} = r \cdot x_{\text{(su)}}$. Then, we used this equation to find a formula for the regression line in original units.
  • In examples we've seen so far, the regression line seems to fit our data pretty well.
    • But how well?
    • What makes the regression line good?
    • Would another line be better?

Example: Without the outlier¶

In [25]:
without_outlier.plot(kind='scatter', x='x', y='y', s=100, figsize=(10, 5));
[scatter plot: x vs. y]
In [26]:
m_no_outlier = slope(without_outlier, 'x', 'y')
b_no_outlier = intercept(without_outlier, 'x', 'y')

m_no_outlier, b_no_outlier
Out[26]:
(0.975927715724588, 3.042337135297444)
In [27]:
plot_errors(without_outlier, m_no_outlier, b_no_outlier)
[scatter plot with the regression line and dotted red segments showing the errors]

Our regression line seems like a good fit because most data points fall close to it – the red error lines are quite short.

Measuring the error in prediction¶

$$\text{error} = \text{actual value} - \text{prediction}$$
  • Typically, some errors are positive and some negative.
    • What does a positive error mean? What about a negative error?
  • To measure the rough size of the errors, for a particular set of predictions:
    1. Square the errors so that they don't cancel each other out.
    2. Take the mean of the squared errors.
    3. Take the square root to fix the units.
  • This is called root mean squared error (RMSE).
    • Notice the similarities to computing the SD!
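
These three steps translate directly into code. Here's a small helper that computes the RMSE of any line's predictions (a sketch, assuming a DataFrame with columns 'x' and 'y' like the one above – the cells below carry out the same computation step by step):

def rmse(df, m, b):
    '''Root mean squared error of the predictions of the line y = mx + b
       for the 'x' and 'y' columns of df.'''
    errors = df.get('y') - (m * df.get('x') + b)   # actual value minus prediction
    return np.sqrt((errors ** 2).mean())           # square, take the mean, take the square root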

Root mean squared error (RMSE) of the regression line's predictions¶

In [28]:
predictions = without_outlier.assign(pred=m_no_outlier * without_outlier.get('x') + b_no_outlier)
predictions = predictions.assign(diffs=predictions.get('y') - predictions.get('pred'))
predictions = predictions.assign(sq_diffs=predictions.get('diffs') ** 2)
predictions
Out[28]:
x y pred diffs sq_diffs
0 50 53.53 51.84 1.69 2.86
1 55 54.21 56.72 -2.51 6.31
2 60 65.65 61.60 4.06 16.45
... ... ... ... ... ...
6 80 79.61 81.12 -1.51 2.27
7 85 88.17 86.00 2.18 4.74
8 90 91.05 90.88 0.18 0.03

9 rows × 5 columns

In [29]:
np.sqrt(predictions.get('sq_diffs').mean())
Out[29]:
2.19630831647554

The RMSE of the regression line's predictions is about 2.2. Is this big or small, relative to the predictions of other lines? 🤔
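
One way to find out is to compute the RMSE of a few other candidate lines and compare. A quick sketch, using the rmse helper defined earlier (the alternative slopes and intercepts below are arbitrary, chosen only for comparison):

for m, b in [(m_no_outlier, b_no_outlier), (1, 0), (0.8, 15), (1.2, -10)]:
    print('m =', round(m, 2), ' b =', round(b, 2), ' RMSE =', round(rmse(without_outlier, m, b), 2))

In fact, among all possible lines, the regression line is the one that minimizes RMSE – this is the sense in which it is the "least squares" line, as the lecture title suggests.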