# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats("svg")
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (10, 5)
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
from IPython.display import display, IFrame, YouTubeVideo
def show_grouping_animation():
src = "https://docs.google.com/presentation/d/e/2PACX-1vTgVlFngQcLMYHP-z1vq5lVXjsBgcHebc-3TX7SW6L_gjX6TD1gsflvVDQUpWiDdeEPqJASenUIfBVd/embed?start=false&loop=false&delayms=60000"
width = 960
height = 509
display(IFrame(src, width, height))
babypandas
notes. Review both of these materials to study.There are several keyboard shortcuts built into Jupyter Notebooks designed to help you save time. To see them, either click the keyboard button in the toolbar above or hit the H key on your keyboard (as long as you're not actively editing a cell).
Particularly useful shortcuts:
Action | Keyboard shortcut |
---|---|
Run cell + jump to next cell | SHIFT + ENTER |
Save the notebook | CTRL/CMD + S |
Create new cell above/below | A/B |
Delete cell | DD |
states = bpd.read_csv('data/states.csv').set_index('State')
states = states.assign(Density=states.get('Population') / states.get('Land Area'))
states
Region | Capital City | Population | Land Area | Party | Density | |
---|---|---|---|---|---|---|
State | ||||||
Alabama | South | Montgomery | 5024279 | 50645 | Republican | 99.21 |
Alaska | West | Juneau | 733391 | 570641 | Republican | 1.29 |
Arizona | West | Phoenix | 7151502 | 113594 | Republican | 62.96 |
... | ... | ... | ... | ... | ... | ... |
West Virginia | South | Charleston | 1793716 | 24038 | Republican | 74.62 |
Wisconsin | Midwest | Madison | 5893718 | 54158 | Republican | 108.82 |
Wyoming | West | Cheyenne | 576851 | 97093 | Republican | 5.94 |
50 rows × 6 columns
We can find the total population of any one region using the tools we already have.
states[states.get('Region') == 'West'].get('Population').sum()
78588572
states[states.get('Region') == 'Midwest'].get('Population').sum()
68985454
But can we find the total population of every region all at the same time, without writing very similar code multiple times? Yes, there is a better way!
.groupby
¶Observe what happens when we use the .groupby
method on states
with the argument 'Region'
.
states.groupby('Region').sum()
Population | Land Area | Density | |
---|---|---|---|
Region | |||
Midwest | 68985454 | 750524 | 1298.78 |
Northeast | 57609148 | 161912 | 4957.49 |
South | 125576562 | 868356 | 3189.37 |
West | 78588572 | 1751054 | 881.62 |
These populations (for the 'West'
and 'Midwest'
) match the ones we found on the previous slide, except now we get the populations for all regions at the same time. What just happened? 🤯
Consider the DataFrame pets
, shown below.
Species | Color | Weight | Age | |
---|---|---|---|---|
0 | dog | black | 40 | 5.0 |
1 | cat | golden | 15 | 8.0 |
2 | cat | black | 20 | 9.0 |
3 | dog | white | 80 | 2.0 |
4 | dog | golden | 25 | 0.5 |
5 | hamster | golden | 1 | 3.0 |
Let's see what happens under the hood when we run pets.groupby('Species').mean()
.
show_grouping_animation()
pets = bpd.DataFrame().assign(
Species=['dog', 'cat', 'cat', 'dog', 'dog', 'hamster'],
Color=['black', 'golden', 'black', 'white', 'golden', 'golden'],
Weight=[40, 15, 20, 80, 25, 1],
Age=[5, 8, 9, 2, 0.5, 3]
)
pets
Species | Color | Weight | Age | |
---|---|---|---|---|
0 | dog | black | 40 | 5.0 |
1 | cat | golden | 15 | 8.0 |
2 | cat | black | 20 | 9.0 |
3 | dog | white | 80 | 2.0 |
4 | dog | golden | 25 | 0.5 |
5 | hamster | golden | 1 | 3.0 |
pets.groupby('Species').mean()
Weight | Age | |
---|---|---|
Species | ||
cat | 17.50 | 8.5 |
dog | 48.33 | 2.5 |
hamster | 1.00 | 3.0 |
It takes several steps to go from the original pets
DataFrame to this grouped DataFrame, but we don't get to see any of Python's inner workings, just the final output.
states.groupby('Region').sum()
Population | Land Area | Density | |
---|---|---|---|
Region | |||
Midwest | 68985454 | 750524 | 1298.78 |
Northeast | 57609148 | 161912 | 4957.49 |
South | 125576562 | 868356 | 3189.37 |
West | 78588572 | 1751054 | 881.62 |
# Note the use of .index – remember, the index isn't a column!
(
states
.groupby('Region')
.sum()
.sort_values(by='Population', ascending=False)
.index[0]
)
'South'
.groupby
in general¶In short, .groupby
aggregates (collects) all rows with the same value in a specified column (e.g. 'Region'
) into a single row in the resulting DataFrame, using an aggregation method (e.g. .sum()
) to combine values from different rows with the same value in the specified column.
To use .groupby
:
.groupby(column_name)
will gather rows which have the same value in the specified column (column_name
)..count()
, .sum()
, .mean()
, .median()
, .max()
, and .min()
.states
Region | Capital City | Population | Land Area | Party | Density | |
---|---|---|---|---|---|---|
State | ||||||
Alabama | South | Montgomery | 5024279 | 50645 | Republican | 99.21 |
Alaska | West | Juneau | 733391 | 570641 | Republican | 1.29 |
Arizona | West | Phoenix | 7151502 | 113594 | Republican | 62.96 |
... | ... | ... | ... | ... | ... | ... |
West Virginia | South | Charleston | 1793716 | 24038 | Republican | 74.62 |
Wisconsin | Midwest | Madison | 5893718 | 54158 | Republican | 108.82 |
Wyoming | West | Cheyenne | 576851 | 97093 | Republican | 5.94 |
50 rows × 6 columns
states.groupby('Region').sum()
Population | Land Area | Density | |
---|---|---|---|
Region | |||
Midwest | 68985454 | 750524 | 1298.78 |
Northeast | 57609148 | 161912 | 4957.49 |
South | 125576562 | 868356 | 3189.37 |
West | 78588572 | 1751054 | 881.62 |
states.groupby('Region').max()
Capital City | Population | Land Area | Party | Density | |
---|---|---|---|---|---|
Region | |||||
Midwest | Topeka | 12812508 | 81759 | Republican | 288.77 |
Northeast | Trenton | 20201249 | 47126 | Republican | 1263.12 |
South | Tallahassee | 29145505 | 261232 | Republican | 636.37 |
West | Santa Fe | 39538223 | 570641 | Republican | 253.81 |
12812508 / 81759 == 288.77
False
.count()
aggregation method.states.groupby('Region').count()
Capital City | Population | Land Area | Party | Density | |
---|---|---|---|---|---|
Region | |||||
Midwest | 12 | 12 | 12 | 12 | 12 |
Northeast | 9 | 9 | 9 | 9 | 9 |
South | 16 | 16 | 16 | 16 | 16 |
West | 13 | 13 | 13 | 13 | 13 |
Consider dropping unneeded columns and renaming columns as follows:
.assign
to create a new column containing the same values as the old column(s)..drop(columns=list_of_column_labels)
to drop the old column(s). .get(list_of_column_labels)
to keep only the columns in the given list. The columns will appear in the order you specify, so this is also useful for reordering columns!states_by_region = states.groupby('Region').count()
states_by_region = states_by_region.assign(
States=states_by_region.get('Capital City')
).get(['States'])
states_by_region
States | |
---|---|
Region | |
Midwest | 12 |
Northeast | 9 |
South | 16 |
West | 13 |
In Lecture 1, we were able to answer questions about the plot of Little Women without having to read the novel and without having to understand Python code. Some of those questions included:
We answered these questions from a data visualization alone!
bpd.read_csv('data/lw_counts.csv').plot(x='Chapter');
There are two main types of variables:
Note that here, "variable" does not mean a variable in Python, but rather it means a column in a DataFrame.
Which of these is not a numerical variable?
A. Fuel economy in miles per gallon.
B. Number of quarters at UCSD.
C. College at UCSD (Sixth, Seventh, etc).
D. Bank account number.
E. More than one of these are not numerical variables.
The type of visualization we create depends on the kinds of variables we're visualizing.
We may interchange the words "plot", "chart", and "graph"; they all mean the same thing.
An exoplanet is a planet outside our solar system. NASA has discovered over 5,000 exoplanets so far in its search for signs of life beyond Earth. 👽
Column | Contents |
---|---|
'Distance'
| Distance from Earth, in light years.
'Magnitude'
| Apparent magnitude, which measures brightness in such a way that brighter objects have lower values.
'Type'
| Categorization of planet based on its composition and size.
'Year'
| When the planet was discovered.
'Detection'
| The method of detection used to discover the planet.
'Mass'
| The ratio of the planet's mass to Earth's mass.
'Radius'
| The ratio of the planet's radius to Earth's radius.
exo = bpd.read_csv('data/exoplanets.csv').set_index('Name')
exo
Distance | Magnitude | Type | Year | Detection | Mass | Radius | |
---|---|---|---|---|---|---|---|
Name | |||||||
11 Comae Berenices b | 304.0 | 4.72 | Gas Giant | 2007 | Radial Velocity | 6165.90 | 11.88 |
11 Ursae Minoris b | 409.0 | 5.01 | Gas Giant | 2009 | Radial Velocity | 4684.81 | 11.99 |
14 Andromedae b | 246.0 | 5.23 | Gas Giant | 2008 | Radial Velocity | 1525.58 | 12.65 |
... | ... | ... | ... | ... | ... | ... | ... |
YZ Ceti b | 12.0 | 12.07 | Terrestrial | 2017 | Radial Velocity | 0.70 | 0.91 |
YZ Ceti c | 12.0 | 12.07 | Super Earth | 2017 | Radial Velocity | 1.14 | 1.05 |
YZ Ceti d | 12.0 | 12.07 | Super Earth | 2017 | Radial Velocity | 1.09 | 1.03 |
5043 rows × 7 columns
'Distance'
and 'Magnitude'
?exo.plot(kind='scatter', x='Distance', y='Magnitude');