import altair as alt
import numpy as np
import pandas as pd
alt.renderers.enable('html')
RendererRegistry.enable('html')
Explore how to use Altair for data exploration, creating interactive visualizations to make data insights more accessible and engaging.
Jacopo Repossi
March 4, 2022
exploratory data analysis in python, interactive visualization, interactive dataviz in python
This notebook collects some explorations of Altair’s most interesting features, using the data from Kaggle’s House Prices competition.
For a basic tutorial on Altair, I created a notebook with the Titanic dataset!
RendererRegistry.enable('html')
A basic bar chart with bars highlighted based on the percentage of missing values.
| | Column | Count missing |
|---|---|---|
| 0 | LotFrontage | 3.718593 |
| 1 | Alley | 19.655420 |
| 2 | MasVnrType | 0.114860 |
| 3 | MasVnrArea | 0.114860 |
| 4 | BsmtQual | 0.531228 |
alt.Chart(missing).mark_bar().encode(
x=alt.X('Column', sort='-y'),
y='Count missing',
color=alt.condition(
alt.datum['Count missing'] >10, # If count missing is > 10%, returns True,
alt.value('orange'), # which sets the bar orange.
alt.value('steelblue') # And if it's not true it sets the bar steelblue.
)
).properties(
width=500,
height=300
).configure_axis(
grid=False
)Creation of a basic boxplot using .mark_boxplot() method
Creation of a basic heatmap using .mark_rect() method.
Here you can select a KitchenQual value from a dropdown menu and see how the graph changes color!
# Dropdown offering every KitchenQual category found in the training data.
# The label names the field the dropdown actually filters on.
input_dropdown = alt.binding_select(
    options=list(train['KitchenQual'].unique()),
    name='KitchenQual ',
)

# Point selection on KitchenQual, driven by the dropdown.
# selection_point replaces selection_single, deprecated since Altair 5.0.0.
selection = alt.selection_point(fields=['KitchenQual'], bind=input_dropdown)

# Selected points keep their KitchenQual color; the rest turn light gray.
color = alt.condition(
    selection,
    alt.Color('KitchenQual:N', legend=None),
    alt.value('lightgray'),
)

# add_params replaces add_selection, deprecated since Altair 5.0.0.
alt.Chart(train).mark_point().encode(
    x='GrLivArea',
    y='SalePrice',
    color=color,
).properties(
    width=500,
    height=300,
).add_params(
    selection
).configure_axis(
    grid=False
)
In this more advanced example, I use the ExterQual feature as a filter for a binned heatmap.
Click on the bar chart bars to change the heatmap!
# Point selection on the x encoding; it is attached to the bar chart below,
# so clicking a bar selects that ExterQual value.
# selection_point replaces alt.selection(type="single"), deprecated since
# Altair 5.0.0.
pts = alt.selection_point(encodings=['x'])

# Binned heatmap of average SalePrice, restricted to the rows matching the
# current selection.
rect = alt.Chart(train).mark_rect().encode(
    x=alt.X('GrLivArea', bin=alt.Bin(maxbins=40)),
    y=alt.Y('GarageArea', bin=alt.Bin(maxbins=40)),
    color='average(SalePrice)',
).properties(
    width=500,
    height=300,
).transform_filter(
    pts
)

# Bar chart of row counts per ExterQual; it owns the selection and greys out
# the bars that are not selected.
# add_params replaces add_selection, deprecated since Altair 5.0.0.
bar = alt.Chart(train).mark_bar().encode(
    x='ExterQual:N',
    y='count()',
    color=alt.condition(pts, alt.ColorValue("steelblue"), alt.ColorValue("grey")),
).properties(
    width=550,
    height=200,
).add_params(
    pts
)

# Stack the heatmap above the bar chart, each with its own legend.
alt.vconcat(
    rect,
    bar,
).resolve_legend(
    color="independent",
    size="independent",
).configure_axis(
    grid=False
)
A Dot Dash Plot is basically a scatter plot with both axes removed and replaced with barcode plots (aka strip plots), which allow you to see the distribution of values of each measure used in the scatter plot.
# Configure the options common to all layers.
# selection_interval / add_params replace alt.selection(type='interval') /
# add_selection, both deprecated since Altair 5.0.0.
brush = alt.selection_interval()
base = alt.Chart(train).add_params(brush)

# Configure the points: brushed points keep their KitchenQual color,
# the others turn grey.
points = base.mark_point().encode(
    x=alt.X('GrLivArea', title=''),
    y=alt.Y('SalePrice', title=''),
    color=alt.condition(brush, 'KitchenQual', alt.value('grey')),
)

# Configure the ticks: the strip plots hide axis labels, domain line and
# tick marks so only the tick marks of the data remain.
tick_axis = alt.Axis(labels=False, domain=False, ticks=False)

x_ticks = base.mark_tick().encode(
    alt.X('GrLivArea', axis=tick_axis),
    alt.Y('KitchenQual', title='', axis=tick_axis),
    color=alt.condition(brush, 'KitchenQual', alt.value('lightgrey')),
)

y_ticks = base.mark_tick().encode(
    alt.X('KitchenQual', title='', axis=tick_axis),
    alt.Y('SalePrice', axis=tick_axis),
    color=alt.condition(brush, 'KitchenQual', alt.value('lightgrey')),
)

# Build the chart: y strip plot on the left, scatter stacked above the
# x strip plot on the right.
(
    y_ticks | (points & x_ticks)
).configure_axis(
    grid=False
)
C:\Users\jrepossi\AppData\Local\Temp\ipykernel_26264\2246844401.py:29: UserWarning: Automatically deduplicated selection parameter with identical configuration. If you want independent parameters, explicitly name them differently (e.g., name='param1', name='param2'). See https://github.com/vega/altair/issues/3891
y_ticks | (points & x_ticks)
Let’s create a scatter plot with multiple feature encodings.
With .interactive() you can zoom in. You can also click on the legend to select specific KitchenQual values.
# Point selection on KitchenQual, bound to the legend so that clicking a
# legend entry selects that category.
# selection_point replaces selection_multi, deprecated since Altair 5.0.0.
selection = alt.selection_point(fields=['KitchenQual'], bind='legend')

# Scatter plot encoding four features: position (GrLivArea, GarageArea),
# color (KitchenQual) and size (binned SalePrice). Unselected categories
# are faded to 0.2 opacity.
# add_params replaces add_selection, deprecated since Altair 5.0.0.
alt.Chart(train).mark_circle().encode(
    alt.X('GrLivArea', scale=alt.Scale(zero=False)),
    alt.Y('GarageArea', scale=alt.Scale(zero=False, padding=1)),
    color='KitchenQual',
    size=alt.Size('SalePrice', bin=alt.Bin(maxbins=10), title='SalePrice'),
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
).properties(
    width=500,
    height=500,
).add_params(
    selection
).configure_axis(
    grid=False
).interactive()
Scatter matrices are one of the most common graphs you’ll see on Kaggle. A scatter matrix consists of several pairwise scatter plots of variables presented in a matrix format, useful for visualizing the relationships between multiple pairs of variables.
In Altair this can be achieved using a RepeatChart, let’s see how!
# Fields placed along the rows and the columns of the scatter matrix.
row_fields = ['GrLivArea', 'GarageArea', 'TotalBsmtSF']
column_fields = ['TotalBsmtSF', 'GarageArea', 'GrLivArea']

# Template for a single cell: x and y are placeholders that the repeat
# operator fills in with the column and row field of each cell.
pair_scatter = alt.Chart(train).mark_circle().encode(
    alt.X(alt.repeat("column"), type='quantitative'),
    alt.Y(alt.repeat("row"), type='quantitative'),
    color='KitchenQual',
).properties(width=300, height=300)

# Repeat the template for every row-column pair; the final expression is
# what the notebook renders.
pair_scatter.repeat(
    row=row_fields,
    column=column_fields,
).configure_axis(grid=False)