# Set up packages for lecture. Don't worry about understanding this code,
# but make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_inline.backend_inline import set_matplotlib_formats
# Render inline matplotlib figures as SVG.
set_matplotlib_formats("svg")
# Use the 'ggplot' style sheet for every plot.
plt.style.use('ggplot')
# Default figure size: 10 wide by 5 tall.
plt.rcParams['figure.figsize'] = (10, 5)
# NumPy printing: summarize arrays longer than 20 elements, show 2 decimal
# places, and suppress scientific notation for small numbers.
np.set_printoptions(threshold=20, precision=2, suppress=True)
# pandas display: show at most 7 rows and 8 columns before truncating,
# and print floats with 2 decimal places.
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
from IPython.display import display, IFrame
def binning_animation():
    """Display the embedded Google Slides presentation inline in the notebook.

    Builds an ``IFrame`` (900 wide, 270 tall) pointing at the published
    presentation URL and passes it to ``display``. Returns ``None``.
    """
    src = "https://docs.google.com/presentation/d/e/2PACX-1vTnRGwEnKP2V-Z82DlxW1b1nMb2F0zWyrXIzFSpQx_8Wd3MFaf56y2_u3JrLwZ5SjWmfapL5BJLfsDG/embed?start=false&loop=false&delayms=60000&rm=minimal"
    width = 900
    height = 270
    display(IFrame(src, width, height))
import warnings
# Ignore all warnings so they don't show up in the notebook output.
warnings.simplefilter('ignore')
str.contains().
Today's material is quite theoretical – make sure to go to discussion this week!
The type of visualization we create depends on the kinds of variables we're visualizing.
We may interchange the words "plot", "chart", and "graph"; they all mean the same thing.
How often does a variable take on a certain value?
The distribution of a categorical variable can be displayed as a table or bar chart, among other ways! For example, let's look at the colleges of students enrolled in DSC 10 this quarter.
# Load the per-college enrollment counts into a DataFrame.
colleges = bpd.read_csv('data/colleges-sp23.csv')
# A bare name on the last line of a cell displays the DataFrame.
colleges
College | # Students | |
---|---|---|
0 | Sixth | 66 |
1 | Warren | 47 |
2 | Seventh | 40 |
3 | Marshall | 37 |
4 | Revelle | 35 |
5 | ERC | 28 |
6 | Muir | 20 |
# Horizontal bar chart ('barh'): one bar per 'College', sized by '# Students'.
# The trailing semicolon keeps the notebook from printing the plot object.
colleges.plot(kind='barh', x='College', y='# Students');
# The same data as a vertical bar chart ('bar').
colleges.plot(kind='bar', x='College', y='# Students');
# Read the daily chart CSV and index the rows by chart position ('rank').
charts = bpd.read_csv('data/regional-us-daily-2023-04-13.csv').set_index('rank')
# Passing a list to .get keeps just these four columns, as a DataFrame.
charts = charts.get(['track_name', 'artist_names', 'streams', 'uri'])
charts
track_name | artist_names | streams | uri | |
---|---|---|---|---|
rank | ||||
1 | Last Night | Morgan Wallen | 1801636 | spotify:track:7K3BhSpAxZBznislvUMVtn |
2 | Search & Rescue | Drake | 1515162 | spotify:track:7aRCf5cLOFN1U7kvtChY1G |
3 | Kill Bill | SZA | 1412326 | spotify:track:1Qrg8KqiBpW07V7PNxwwwL |
... | ... | ... | ... | ... |
198 | Redbone | Childish Gambino | 291222 | spotify:track:0wXuerDYiBnERgIpbb3JBR |
199 | You're On Your Own, Kid | Taylor Swift | 290995 | spotify:track:4D7BCuvgdJlYvlX5WlN54t |
200 | Fall In Love | Bailey Zimmerman | 290535 | spotify:track:5gVCfYmQRPy1QJifP8f5gg |
200 rows × 4 columns
That is, how many songs does the artist with the most songs have? What about the artist with the second most songs?
First, let's create a DataFrame with a single column that describes the number of songs in the top 200 per artist. This involves using .groupby
with .count()
. Since we want one row per artist, we will group by 'artist_names'
.
# Redisplay charts as a reminder of its columns before grouping.
charts
track_name | artist_names | streams | uri | |
---|---|---|---|---|
rank | ||||
1 | Last Night | Morgan Wallen | 1801636 | spotify:track:7K3BhSpAxZBznislvUMVtn |
2 | Search & Rescue | Drake | 1515162 | spotify:track:7aRCf5cLOFN1U7kvtChY1G |
3 | Kill Bill | SZA | 1412326 | spotify:track:1Qrg8KqiBpW07V7PNxwwwL |
... | ... | ... | ... | ... |
198 | Redbone | Childish Gambino | 291222 | spotify:track:0wXuerDYiBnERgIpbb3JBR |
199 | You're On Your Own, Kid | Taylor Swift | 290995 | spotify:track:4D7BCuvgdJlYvlX5WlN54t |
200 | Fall In Love | Bailey Zimmerman | 290535 | spotify:track:5gVCfYmQRPy1QJifP8f5gg |
200 rows × 4 columns
# One row per distinct 'artist_names' value; .count() fills each remaining
# column with the number of rows (songs) in that group.
songs_per_artist = charts.groupby('artist_names').count()
songs_per_artist
track_name | streams | uri | |
---|---|---|---|
artist_names | |||
21 Savage | 1 | 1 | 1 |
21 Savage, Metro Boomin | 2 | 2 | 2 |
Arctic Monkeys | 2 | 2 | 2 |
... | ... | ... | ... |
Yng Lvcas, Peso Pluma | 1 | 1 | 1 |
Zach Bryan | 2 | 2 | 2 |
d4vd | 2 | 2 | 2 |
136 rows × 3 columns
Using .assign, we'll create a column named 'count' that contains the same information that the other 3 columns contain, and then .get only that column (or equivalently, .drop the other 3 columns).
# Every column holds the same per-artist count, so copy one of them
# ('streams') into a new column named 'count'.
songs_per_artist = songs_per_artist.assign(count=songs_per_artist.get('streams'))
# Giving .get a list (rather than a single label) returns a DataFrame
# instead of a Series, so the result is a one-column DataFrame.
songs_per_artist = songs_per_artist.get(['count'])
songs_per_artist
count | |
---|---|
artist_names | |
21 Savage | 1 |
21 Savage, Metro Boomin | 2 |
Arctic Monkeys | 2 |
... | ... |
Yng Lvcas, Peso Pluma | 1 |
Zach Bryan | 2 |
d4vd | 2 |
136 rows × 1 columns
Let's try and create a bar chart directly.
# Horizontal bar chart straight from songs_per_artist: one bar per row,
# i.e. one bar per 'artist_names' value.
songs_per_artist.plot(kind='barh', y='count');
That's hard to read! There are 136 bars, since there are 136 rows in songs_per_artist
. To keep things concise, let's just look at the artists with at least 3 songs on the charts.
# Keep only the artists with at least 3 songs on the chart, sort the
# remaining rows by 'count', then draw a horizontal bar chart of the counts.
(
songs_per_artist[songs_per_artist.get('count') >= 3]
.sort_values('count')
.plot(kind='barh', y='count')
);
Better!
# Instead of raw stream counts, we'll look at millions of streams:
# divide 'streams' by 1,000,000 and round to 2 decimal places.
charts = charts.assign(million_streams=np.round(charts.get('streams') / 1000000, 2))
charts
track_name | artist_names | streams | uri | million_streams | |
---|---|---|---|---|---|
rank | |||||
1 | Last Night | Morgan Wallen | 1801636 | spotify:track:7K3BhSpAxZBznislvUMVtn | 1.80 |
2 | Search & Rescue | Drake | 1515162 | spotify:track:7aRCf5cLOFN1U7kvtChY1G | 1.52 |
3 | Kill Bill | SZA | 1412326 | spotify:track:1Qrg8KqiBpW07V7PNxwwwL | 1.41 |
... | ... | ... | ... | ... | ... |
198 | Redbone | Childish Gambino | 291222 | spotify:track:0wXuerDYiBnERgIpbb3JBR | 0.29 |
199 | You're On Your Own, Kid | Taylor Swift | 290995 | spotify:track:4D7BCuvgdJlYvlX5WlN54t | 0.29 |
200 | Fall In Love | Bailey Zimmerman | 290535 | spotify:track:5gVCfYmQRPy1QJifP8f5gg | 0.29 |
200 rows × 5 columns
To see the distribution of the number of streams, we need to group by the 'million_streams'
column.
# Group by the rounded 'million_streams' value: one row per distinct value,
# with .count() giving the number of songs in each group.
stream_counts = charts.groupby('million_streams').count()
# Copy one of the (identical) count columns into a column named 'count' ...
stream_counts = stream_counts.assign(count=stream_counts.get('track_name'))
# ... and keep only that column; the list argument makes .get return a DataFrame.
stream_counts = stream_counts.get(['count'])
stream_counts
count | |
---|---|
million_streams | |
0.29 | 4 |
0.30 | 16 |
0.31 | 11 |
... | ... |
1.41 | 1 |
1.52 | 1 |
1.80 | 1 |
55 rows × 1 columns
# Vertical bar chart with one bar per distinct 'million_streams' value,
# drawn on a wider 15x5 figure. Each index value is treated as a category.
stream_counts.plot(kind='bar', y='count', figsize=(15,5));
The horizontal axis should be numerical (like a number line), not categorical.