def ggplot2_defaults(ax=None):
    """Style a Matplotlib axes to resemble ggplot2's default theme.

    Hides every spine, draws a white grid beneath the plotted artists, and
    fills the axes background with 'gainsboro'.

    Parameters
    ----------
    ax : optional
        Axes-like object to style. When omitted, the current axes returned by
        ``plt.gca()`` is used, so the function can still be called with no
        arguments right after ``fig, ax = plt.subplots()``.
    """
    if ax is None:
        ax = plt.gca()
    ## hide all four spines (top, bottom, left, right) in one pass
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.set_axisbelow(True)  ## draws the grid below the bars
    ax.grid(visible = True, color = "white")  ## turns on grid and sets lines to white
    ax.patch.set_facecolor('gainsboro')  ## sets grid fill color
A Guide to Visualizing Data with Matplotlib
Prerequisite
This guide assumes that you have Python 3 installed as well as the pandas, numpy, and matplotlib packages.
Setup
The Python packages necessary for this guide are:
- pandas
- numpy
- matplotlib
- palmerpenguins
- rdatasets
These can be installed via pip.
We’ll begin by loading these packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from rdatasets import data
from palmerpenguins import load_penguins
and then reading in the data
## load the Palmer penguins data and preview the first rows
penguins = load_penguins()
penguins.head()
 | species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | year |
---|---|---|---|---|---|---|---|---|
0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | male | 2007 |
1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | female | 2007 |
2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | female | 2007 |
3 | Adelie | Torgersen | NaN | NaN | NaN | NaN | NaN | 2007 |
4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | female | 2007 |
Visualizations
Categorical Variables
Barchart
## number of penguins observed for each species
count_by_species = penguins.groupby("species").size()

fig, ax = plt.subplots()
## the trailing semicolon suppresses the notebook's echo of the returned artists
ax.bar(x = count_by_species.index, height = count_by_species.values, color = "dimgray");
ax.set_ylabel("count")
ax.set_xlabel("species")
ggplot2_defaults()
## sort the bars (largest count first); modifies count_by_species in place
count_by_species.sort_values(ascending = False, inplace = True)

fig, ax = plt.subplots()
ax.bar(x = count_by_species.index, height = count_by_species.values, color = "dimgray");
ax.set_ylabel("count")
ax.set_xlabel("species")

ggplot2_defaults()
Side-by-Side Barchart
## counts per (species, year) as a flat data frame with a 'value' column
count_by_species_and_year = penguins.groupby(["species", "year"]).size()
count_by_species_and_year = count_by_species_and_year.reset_index()
count_by_species_and_year.rename(columns = {0:'value'}, inplace = True)

width = 0.25 # the width of the bars
x = np.arange(count_by_species_and_year["species"].nunique())  # one slot per species
multiplier = 0  # index of the current year within each species group
fig, ax = plt.subplots()

for yr in count_by_species_and_year["year"].unique():
    ht = count_by_species_and_year[count_by_species_and_year["year"] == yr]["value"]
    offset = width * multiplier  # shift each year's bars one bar-width to the right
    ax.bar(x + offset, ht, width, label = yr)
    multiplier += 1

ax.set_ylabel('count')
ax.set_xlabel('species')
## x + width places the tick under the second bar of each group
## (the middle one, assuming three years -- TODO confirm)
ax.set_xticks(x + width, count_by_species_and_year["species"].unique())
ax.legend(ncols=3)
ax.set_ylim(0, 60)

ggplot2_defaults()
Stacked Barchart
width = 0.5

fig, ax = plt.subplots()
bottom = np.zeros(3)  # running height of each stack; assumes three species -- TODO confirm

for yr in count_by_species_and_year["year"].unique():
    tmp = count_by_species_and_year[count_by_species_and_year["year"] == yr]
    species = np.array(tmp["species"])
    value = np.array(tmp["value"])
    ax.bar(species, value, width, label=yr, bottom=bottom)
    bottom += value  # the next year's bars start where this year's end

ax.set_ylabel('count')
ax.set_xlabel('species')
ax.set_xticks(count_by_species_and_year["species"].unique())
ax.legend(ncols=3)
ax.set_ylim(0, 160)

ggplot2_defaults()
Mosaic Plot
## per-species totals, merged back so each (species, year) row carries its
## species total ('tot') and its within-species proportion ('prop')
tot_by_grp = count_by_species_and_year.groupby(["species"])["value"].sum().reset_index()
tot_by_grp.rename(columns = {"value":"tot"}, inplace = True)
count_by_species_and_year = count_by_species_and_year.merge(tot_by_grp, on = "species", how = "left")
count_by_species_and_year["prop"] = count_by_species_and_year["value"] / count_by_species_and_year["tot"]

## bar widths are the species totals scaled so the smallest species has width 1
totals = count_by_species_and_year["tot"].unique()
scaled_totals = totals / totals.min()

width = scaled_totals
fig, ax = plt.subplots()
bottom = np.zeros(3)  # running height of each stack; assumes three species -- TODO confirm
## bar centers: each bar sits next to the previous one with a 0.005 gap
species_loc = [0, (width[0]/2 + width[1]/2) + .005, (width[0]/2 + width[1] + .005 + width[2]/2) + .005]

for yr in count_by_species_and_year["year"].unique():
    tmp = count_by_species_and_year[count_by_species_and_year["year"] == yr]
    prop = np.array(tmp["prop"])
    ax.bar(species_loc, prop, width, label=yr, bottom=bottom, edgecolor = "white")
    bottom += prop  # the next year's segment starts where this one ends

ax.set_ylabel('proportion')
ax.set_xlabel('species')

## shrink the axes to 80% of its width to leave room for the legend on the right
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])

ax.set_xticks(ticks = species_loc, labels = count_by_species_and_year["species"].unique())
ax.legend(ncol = 1, loc = "center left", bbox_to_anchor=(1, 0.5))

ggplot2_defaults()
Piecharts
fig, ax = plt.subplots()
## one wedge per species, sized by its count; the first wedge starts at 90 degrees
ax.pie(x = count_by_species.values, labels = count_by_species.index, startangle = 90);
Numerical Variables
Histograms
fig, ax = plt.subplots()
## white bin edges visually separate adjacent bars
ax.hist(penguins["bill_length_mm"], edgecolor = "white", color = "#cc4778");

ax.set_ylabel('count')
ax.set_xlabel('bill length (mm)')

ggplot2_defaults()
Boxplots
fig, ax = plt.subplots()

## missing values are dropped before the data is passed to boxplot
ax.boxplot(penguins["bill_length_mm"].dropna(),
           patch_artist = True,  # draw the box as a filled patch so facecolor applies
           medianprops = {"color": "black"},
           boxprops = {"facecolor": "white"})
plt.xticks([]);  # a single box needs no x tick
ggplot2_defaults()
Boxplot with points
fig, ax = plt.subplots()
n = len(penguins["bill_length_mm"].dropna())  # number of non-missing observations

ax.boxplot(penguins["bill_length_mm"].dropna(),
           patch_artist = True,
           medianprops = {"color": "black"},
           boxprops = {"fill": None})
## overlay the raw observations at x = 1 with a small
## horizontal jitter (sd = 0.01) so overlapping points stay visible
ax.scatter(y = penguins["bill_length_mm"].dropna(), x = [1] * n + np.random.normal(size = n, scale = 1/100), color = "black", alpha = 1/4)
plt.xticks([]);  # a single box needs no x tick
ggplot2_defaults()