%matplotlib inline
import os
import numpy as np
import calendar
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from cycler import cycler
import pooch # download data / avoid re-downloading
from IPython import get_ipython
import lzma # to process zip file
import plotly.express as px
# Use a colorblind-friendly default palette for all seaborn figures.
sns.set_palette("colorblind")
# 12-color cyclic palette; NOTE(review): not referenced later in this chunk —
# presumably intended for per-month coloring, to confirm.
palette = sns.color_palette("twilight", n_colors=12)
pd.options.display.max_rows = 8

Bike accident dataset
Disclaimer: this course is adapted from the Pandas tutorial by Joris Van den Bossche. R users might also want to read Pandas: Comparison with R / R libraries for a smooth start in Pandas.
We start by importing the necessary libraries:
References:
Data loading and preprocessing
Data loading
# Fetch the xz-compressed bicycle accident dataset; pooch caches the file
# locally so repeated runs do not re-download it.
# url = "https://koumoul.com/s/data-fair/api/v1/datasets/accidents-velos/raw"
url_db = "https://github.com/josephsalmon/HAX712X/raw/main/Data/accidents-velos_2022.csv.xz"
path_target = "./bicycle_db.csv.xz"
path = os.path.dirname(path_target)
fname = os.path.basename(path_target)
pooch.retrieve(url_db, path=path, fname=fname, known_hash=None)
# Decompress the .xz archive and write the plain CSV next to it.
with lzma.open(path_target) as f:
    file_content = f.read().decode('utf-8')
# write the string file_content to a file named fname_uncompressed
with open("./bicycle_db.csv", 'w') as f:
    f.write(file_content)
# Read the CSV, keeping date/time/department columns as strings.
# Fix: the original dtype dict used the key 'data', a typo for the actual
# column name 'date' (pandas silently ignores unknown dtype keys).
df_bikes = pd.read_csv("bicycle_db.csv", na_values="", low_memory=False,
                       dtype={'date': str, 'heure': str, 'departement': str})
# In June 2023, the author decided to change the name of the columns, hence
# a dictionary (defined next) maps them back to the legacy names.
# Map the provider's new (post-June-2023) column names back to the legacy
# names used throughout this notebook.
new2old = {
    "hrmn": "heure",
    "secuexist": "existence securite",
    "grav": "gravite accident",
    "dep": "departement",
}
df_bikes.rename(columns=new2old, inplace=True)
# Quick look at the raw file, then widen the pandas column display.
get_ipython().system('head -5 ./bicycle_db.csv')
pd.options.display.max_columns = 40
df_bikes.head()| identifiant accident | date | mois | jour | heure | departement | commune | lat | lon | en agglomeration | type intersection | type collision | luminosite | conditions atmosperiques | type route | circulation | nb voies | profil long route | trace plan route | largeur TPC | largeur route | etat surface | amenagement | situation | categorie usager | gravite accident | sexe | age | motif deplacement | existence securite | usage securite | obstacle fixe heurte | obstacle mobile heurte | localisation choc | manoeuvre avant accident | identifiant vehicule | type autres vehicules | manoeuvre autres vehicules | nombre autres vehicules | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 201100000004 | 2011-09-22 | 09 - septembre | 3 - jeudi | 15 | 59 | 59011 | 50.51861 | 2.93043 | oui | Hors intersection | Trois véhicules et plus - collisions multiples | Plein jour | Normale | Route Départementale | NaN | NaN | Plat | Partie rectiligne | NaN | 58.0 | normale | NaN | Sur chaussée | Conducteur | 1 - Blessé léger | M | 39-40 | Promenade - loisirs | Casque | Oui | NaN | Véhicule | NaN | Sans changement de direction | 201100000004A01 | VL seul | Tournant à gauche | 1.0 |
| 1 | 201100000004 | 2011-09-22 | 09 - septembre | 3 - jeudi | 15 | 59 | 59011 | 50.51861 | 2.93043 | oui | Hors intersection | Trois véhicules et plus - collisions multiples | Plein jour | Normale | Route Départementale | NaN | NaN | Plat | Partie rectiligne | NaN | 58.0 | normale | NaN | Sur chaussée | Conducteur | 2 - Blessé hospitalisé | M | 30-31 | Promenade - loisirs | Casque | Non | NaN | Véhicule | NaN | Sans changement de direction | 201100000004B02 | VL seul | Tournant à gauche | 1.0 |
| 2 | 201100000006 | 2011-11-14 | 11 - novembre | 0 - lundi | 17 | 59 | 59011 | 50.52684 | 2.93423 | oui | Hors intersection | Deux véhicules - par l’arrière | Nuit avec éclairage public allumé | Normale | Route Départementale | NaN | NaN | NaN | Partie rectiligne | NaN | NaN | normale | NaN | Sur chaussée | Conducteur | 2 - Blessé hospitalisé | M | 22-23 | Domicile - travail | Casque | Oui | NaN | NaN | Avant | NaN | 201100000006A01 | VL seul | Arrêté (hors stationnement) | 1.0 |
| 3 | 201100000020 | 2011-01-27 | 01 - janvier | 3 - jeudi | 18 | 59 | 59256 | 50.55818 | 3.13667 | oui | Hors intersection | Deux véhicules - par le coté | Nuit avec éclairage public allumé | Normale | Voie Communale | NaN | 2.0 | Plat | Partie rectiligne | NaN | 60.0 | normale | NaN | Sur chaussée | Conducteur | 2 - Blessé hospitalisé | F | 36-37 | Autre | Equipement réfléchissant | Oui | NaN | Véhicule | Avant droit | Sans changement de direction | 201100000020B02 | VL seul | En stationnement (avec occupants) | 1.0 |
| 4 | 201100000022 | 2011-09-01 | 09 - septembre | 3 - jeudi | 19 | 59 | 59256 | 50.55448 | 3.12660 | oui | Hors intersection | Deux véhicules - frontale | Crépuscule ou aube | Normale | Hors réseau public | NaN | 2.0 | Plat | Partie rectiligne | NaN | 60.0 | normale | NaN | Sur chaussée | Conducteur | 2 - Blessé hospitalisé | M | 15-16 | Promenade - loisirs | Ceinture | NaN | NaN | Véhicule | Avant gauche | Sans changement de direction | 201100000022B02 | VL seul | Sans changement de direction | 1.0 |
df_bikes['existence securite'].unique()
array(['Casque', 'Equipement réfléchissant', 'Ceinture', 'Autre', nan,
'Dispositif enfants'], dtype=object)
df_bikes['gravite accident'].unique()
array(['1 - Blessé léger', '2 - Blessé hospitalisé', '3 - Tué',
'0 - Indemne'], dtype=object)
Handle missing values
df_bikes['date'].hasnans
False
df_bikes['heure'].hasnans
True
So arbitrarily we fill missing values with 0 (since apparently there is no time 0 reported…to double check in the source.)
df_bikes.fillna({'heure':'0'}, inplace=True)

Date and time processing
Check the date/time format:
df_bikes['date'] + ' ' + df_bikes['heure']
0        2011-09-22 15
1 2011-09-22 15
2 2011-11-14 17
3 2011-01-27 18
...
35330 2018-03-21 18
35331 2018-03-31 17
35332 2018-03-31 17
35333 2018-07-31 11
Length: 35334, dtype: object
# Parse "<date> <hour>" into a single datetime; unparsable rows become NaT.
time_improved = pd.to_datetime(df_bikes["date"] + " " + df_bikes["heure"],
                               format="%Y-%m-%d %H", errors="coerce")
df_bikes["Time"] = time_improved
# Discard the rows whose timestamp could not be parsed (NaT).
df_bikes.dropna(subset=["Time"], inplace=True)
# Promote the timestamp to the index for time-series operations.
df_bikes.set_index("Time", inplace=True)
# remove useless columns
df_bikes.drop(columns=["heure", "date"], inplace=True)
df_bikes.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 27570 entries, 2011-09-22 15:00:00 to 2018-07-31 11:00:00
Data columns (total 37 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 identifiant accident 27570 non-null int64
1 mois 27570 non-null object
2 jour 27570 non-null object
3 departement 27570 non-null object
4 commune 27570 non-null object
5 lat 27570 non-null float64
6 lon 27570 non-null float64
7 en agglomeration 27570 non-null object
8 type intersection 27569 non-null object
9 type collision 27567 non-null object
10 luminosite 27570 non-null object
11 conditions atmosperiques 27569 non-null object
12 type route 27563 non-null object
13 circulation 110 non-null object
14 nb voies 24173 non-null float64
15 profil long route 25489 non-null object
16 trace plan route 24630 non-null object
17 largeur TPC 2097 non-null float64
18 largeur route 14460 non-null float64
19 etat surface 26268 non-null object
20 amenagement 2967 non-null object
21 situation 25539 non-null object
22 categorie usager 27570 non-null object
23 gravite accident 27570 non-null object
24 sexe 27570 non-null object
25 age 27562 non-null object
26 motif deplacement 21457 non-null object
27 existence securite 27123 non-null object
28 usage securite 26216 non-null object
29 obstacle fixe heurte 646 non-null object
30 obstacle mobile heurte 22338 non-null object
31 localisation choc 23548 non-null object
32 manoeuvre avant accident 24492 non-null object
33 identifiant vehicule 27570 non-null object
34 type autres vehicules 23689 non-null object
35 manoeuvre autres vehicules 21859 non-null object
36 nombre autres vehicules 23689 non-null float64
dtypes: float64(6), int64(1), object(30)
memory usage: 8.0+ MB
# Restrict to the columns needed for the severity / safety-equipment analysis.
df_bike2 = df_bikes.loc[
    :, ["gravite accident", "existence securite", "age", "sexe"]
]
# Assign the result back instead of replace(..., inplace=True) on a column
# selection: that chained form is unreliable (SettingWithCopy) and stops
# working under pandas Copy-on-Write.
df_bike2["existence securite"] = df_bike2["existence securite"].replace(
    {"Inconnu": np.nan}
)
df_bike2.dropna(inplace=True)| gravite accident | 0 - Indemne | 1 - Blessé léger | 2 - Blessé hospitalisé | 3 - Tué |
|---|---|---|---|---|
| existence securite | ||||
| Autre | 6.767309 | 61.829776 | 29.333680 | 2.069235 |
| Casque | 7.466130 | 54.049577 | 34.526415 | 3.957877 |
| Ceinture | 3.432836 | 14.626866 | 69.552239 | 12.388060 |
| Dispositif enfants | 7.575758 | 63.636364 | 25.757576 | 3.030303 |
| Equipement réfléchissant | 5.412946 | 57.310268 | 32.812500 | 4.464286 |
| gravite accident | 0 - Indemne | 1 - Blessé léger | 2 - Blessé hospitalisé | 3 - Tué |
|---|---|---|---|---|
| existence securite | ||||
| Autre | 6.767309 | 61.829776 | 29.333680 | 2.069235 |
| Casque | 7.466130 | 54.049577 | 34.526415 | 3.957877 |
| Ceinture | 3.432836 | 14.626866 | 69.552239 | 12.388060 |
| Dispositif enfants | 7.575758 | 63.636364 | 25.757576 | 3.030303 |
| Equipement réfléchissant | 5.412946 | 57.310268 | 32.812500 | 4.464286 |
sexe
F 0.233188
M 0.766812
dtype: float64
| gravite accident | 0 - Indemne | 1 - Blessé léger | 2 - Blessé hospitalisé | 3 - Tué | All |
|---|---|---|---|---|---|
| sexe | |||||
| F | 13.057158 | 27.135645 | 19.64851 | 14.70292 | 23.160612 |
| M | 86.942842 | 72.864355 | 80.35149 | 85.29708 | 76.839388 |
Data visualization
Note that in the dataset, the information on the level of bike practice by gender is missing.
Time series visualization
# Day of the week of each accident, derived from the DatetimeIndex.
df_bikes["weekday"] = df_bikes.index.day_of_week # Monday=0, Sunday=6
df_bikes.groupby(['weekday', df_bikes.index.hour])['sexe'].count()
weekday  Time
0 10 244
11 247
12 277
13 272
...
6 20 122
21 73
22 42
23 32
Name: sexe, Length: 98, dtype: int64
df_bikes.groupby(['weekday', df_bikes.index.hour])['age'].count()
weekday  Time
0 10 244
11 246
12 277
13 272
...
6 20 122
21 73
22 42
23 32
Name: age, Length: 98, dtype: int64
The last two results are identical, whether you choose the 'age' or the 'sexe' variable.
Create a daily profile per day of the week:
# Accident counts per (weekday, hour-of-day). Any fully-observed column
# would give the same counts; 'sexe' is used here.
df_polar = (
    df_bikes.groupby(["weekday", df_bikes.index.hour])["sexe"]
    .count()
    .reset_index()
)
# Plotly needs a categorical (string) angular axis.
df_polar = df_polar.astype({"Time": str}, copy=False)
df_polar["weekday"] = df_polar["weekday"].map(lambda d: calendar.day_abbr[d])
df_polar.rename(columns={"sexe": "accidents"}, inplace=True)

Display these daily profiles
n_colors = 8  # 7 days; sampling 8 colors keeps the weekend hues adjacent
colors = px.colors.sample_colorscale(
    "mrybm", [i / (n_colors - 1) for i in range(n_colors)]
)
# Polar line plot: one trace per weekday, hours of the day around the circle.
fig = px.line_polar(
    df_polar,
    r="accidents",
    theta="Time",
    color="weekday",
    line_close=True,
    range_r=[0, 600],
    start_angle=0,
    color_discrete_sequence=colors,
    template="seaborn",
    title="Daily accident profile: weekday effect?",
)
fig.show()
fig.show()

In plotly the figure is interactive. If you click on the legend on the right, you can select the day you want to see. It is very convenient to compare days two by two for instance.
Create a daily profile per month:
# pandas DatetimeIndex.month is 1-based: January=1, ..., December=12
# (the original comment wrongly claimed 0-based months; calendar.month_abbr
# below is indexed 1..12 accordingly, with index 0 being the empty string).
df_bikes["month"] = df_bikes.index.month
# df_bikes['month'] = df_bikes['month'].apply(lambda x: calendar.month_abbr[x])
df_bikes.head()
# Hour-of-day profile per month, one column per month.
df_bikes_month = (
    df_bikes.groupby(["month", df_bikes.index.hour])["age"]
    .count()
    .unstack(level=0)
)
# Long-format counts per (month, hour); any fully-observed column works,
# 'sexe' is used here to count accidents.
df_polar2 = (
    df_bikes.groupby(["month", df_bikes.index.hour])["sexe"]
    .count()
    .reset_index()
)
# Plotly needs a categorical (string) angular axis.
df_polar2 = df_polar2.astype({"Time": str}, copy=False)
df_polar2.rename(columns={"sexe": "accidents"}, inplace=True)
df_polar2["month"] = df_polar2["month"].apply(lambda x: calendar.month_abbr[x])

Display these daily profiles:
# create a cyclical color scale for 12 values:
n_colors = 12
colors = px.colors.sample_colorscale(
    "mrybm", [n / (n_colors - 1) for n in range(n_colors)]
)
# One polar trace per month.
# Fix: the title previously read "weekday effect?" — copy-pasted from the
# weekday figure; this plot colors by month.
fig = px.line_polar(
    df_polar2,
    r="accidents",
    theta="Time",
    color="month",
    line_close=True,
    range_r=[0, 410],
    start_angle=0,
    color_discrete_sequence=colors,
    template="seaborn",
    title="Daily accident profile: month effect?",
)
fig.show()
fig.show()

Geographic visualization
In this part, we will use the geopandas library to visualize the data on a map, along with plotly for interactivity.
# Per-department population table (2018), retrieved from opendatasoft.
url = "https://public.opendatasoft.com/explore/dataset/population-francaise-par-departement-2018/download/?format=csv&timezone=Europe/Berlin&lang=en&use_labels_for_header=true&csv_separator=%3B"
path_target = "./dpt_population.csv"
path, fname = os.path.split(path_target)
pooch.retrieve(url, path=path, fname=fname, known_hash=None)
'/home/bcharlier/enseignement/HAB796B9/2023-24/HAB796B9/Courses/Pandas/dpt_population.csv'
df_dtp_pop = pd.read_csv("dpt_population.csv", sep=";", low_memory=False)
df_dtp_pop["Code Département"].replace("2A", "20A", inplace=True)
df_dtp_pop["Code Département"].replace("2B", "20B", inplace=True)
df_dtp_pop.sort_values(by=["Code Département"], inplace=True)
df_bikes["departement"].replace("2A", "20A", inplace=True)
df_bikes["departement"].replace("2B", "20B", inplace=True)
df_bikes.sort_values(by=["departement"], inplace=True)
# Clean extra departements
df_bikes = df_bikes.loc[
df_bikes["departement"].isin(df_dtp_pop["Code Département"].unique())
]
gd = df_bikes.groupby(["departement"], as_index=True, sort=True).size()
data = {"code": gd.index, "# Accidents per million": gd.values}
df = pd.DataFrame(data)
df["# Accidents per million"] = (
df["# Accidents per million"].values
* 10000.0
/ df_dtp_pop["Population"].values
)

We now need to download the .geojson file containing the geographic information for each department. We will use the pooch library to download the file and store it locally.
# GeoJSON with (simplified) department boundaries, metropolitan France only.
# url = "https://raw.githubusercontent.com/gregoiredavid/france-geojson/master/departements-avec-outre-mer.geojson"
url = "https://raw.githubusercontent.com/gregoiredavid/france-geojson/master/departements-version-simplifiee.geojson"
path_target = "./departements.geojson"
path = os.path.dirname(path_target)
fname = os.path.basename(path_target)
pooch.retrieve(url, path=path, fname=fname, known_hash=None)
'/home/bcharlier/enseignement/HAB796B9/2023-24/HAB796B9/Courses/Pandas/departements.geojson'
First, you have to handle Corsican departments, which are not in the same format as the others.
import plotly.express as px
import geopandas

departement = geopandas.read_file("departements.geojson")
# Recode Corsica so its codes sort at the slot of former department 20,
# matching the ordering used for df above. Explicit assignment instead of
# replace(..., inplace=True) on a column selection (chained assignment;
# breaks under pandas Copy-on-Write).
departement["code"] = departement["code"].replace({"2A": "20A", "2B": "20B"})
departement.sort_values(by=["code"], inplace=True)
# Strip the leading zero from codes "01"-"09" so both tables agree.
dict_replace = {f"0{i}": str(i) for i in range(1, 10)}
departement["code"] = departement["code"].replace(dict_replace)
df["code"] = df["code"].replace(dict_replace)
# Restore the official Corsican department codes.
departement["code"] = departement["code"].replace({"20A": "2A", "20B": "2B"})
df["code"] = df["code"].replace({"20A": "2A", "20B": "2B"})
departement.set_index("code", inplace=True)
print(departement['nom'].head(22))
code
1 Ain
2 Aisne
3 Allier
4 Alpes-de-Haute-Provence
...
19 Corrèze
2A Corse-du-Sud
2B Haute-Corse
21 Côte-d'Or
Name: nom, Length: 22, dtype: object
Once this is done, you can plot the data on a map.
# Choropleth of accident rates by department on a blank mapbox background.
fig = px.choropleth_mapbox(
    df,
    geojson=departement,
    locations="code",
    color="# Accidents per million",
    range_color=(0, df["# Accidents per million"].max()),
    color_continuous_scale="rdbu",
    center={"lat": 44, "lon": 2},
    zoom=3.55,
    mapbox_style="white-bg",
)
# NOTE(review): this update_traces call sets no properties — apparently a
# leftover; kept as-is to preserve behavior.
fig.update_traces(selector=dict(type="choroplethmapbox"))
# Horizontal colorbar below the map, plus the figure title.
fig.update_layout(
    coloraxis_colorbar=dict(thickness=20, orientation="h", y=0.051, x=0.5),
    title_text="Accidents per million inhabitants by department",
)
fig.show()

References
- Other interactive tools for data visualization: Altair, Bokeh. See comparisons by Aarron Geller: link
- An interesting tutorial on Altair: Altair introduction