Training EDA: Analyzing SNCF Station Ridership from 2019 to 2025¶
Guillaume EGU
Overview¶
This project focuses on an Exploratory Data Analysis (EDA) and a baseline predictive modeling task using passenger count data from the Transilien train network (SNCF).
The dataset contains daily records of passenger traffic across different stations, lines, and time slots.
The goal is to reveal clear temporal patterns and to understand the dataset as deeply as possible.
Contents¶
- Import Libraries
- Functions
- Data Loading & Overview
- Data Exploration
- Feature Engineering
- Prepare data for modelling
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score
def rename_col(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize column names (strip BOM and whitespace, lowercase, snake_case) and parse the date column."""
    df = df.copy()
    df.columns = (
        df.columns
        .str.replace('\ufeff', '', regex=False)  # drop the UTF-8 BOM sometimes prepended to the first header
        .str.strip()
        .str.lower()
        .str.replace(' ', '_')
    )
    if 'date' in df.columns:
        df['date'] = pd.to_datetime(df['date'], errors='coerce')
    return df
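A quick sanity check of rename_col on a tiny made-up frame (the raw column names below are invented for the demo):

demo = pd.DataFrame({'\ufeffNom Gare': ['BEYNES'], ' Date ': ['2024-01-20']})
demo = rename_col(demo)
print(demo.columns.tolist())   # expected: ['nom_gare', 'date']
print(demo['date'].dtype)      # expected: datetime64[ns]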
def parse_tranche_horaire_to_hours(s: str):
"""
Converts a textual time slot into approximate (start_hour, end_hour) values.
Expected examples from the dataset:
"Avant 6h" → (0, 6)
"De 6h à 10h" → (6, 10)
"De 10h à 16h" → (10, 16)
"De 16h à 20h" → (16, 20)
"Après 20h" → (20, 24)
"""
if not isinstance(s, str) or not s:
return (np.nan, np.nan)
s = s.lower().strip()
if 'avant' in s and '6' in s:
return (0, 6)
if 'après' in s or 'apres' in s:
num = ''.join(ch for ch in s if ch.isdigit())
h = int(num) if num else 20
return (h, 24)
if 'de' in s and 'à' in s:
parts = s.replace('à', 'a').split()
nums = [int(''.join(ch for ch in p if ch.isdigit())) for p in parts if any(ch.isdigit() for ch in p)]
if len(nums) == 2:
return (nums[0], nums[1])
return (np.nan, np.nan)
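To confirm the parser behaves as its docstring promises, a quick pass over representative labels:

for label in ["Avant 6h", "De 6h à 10h", "Après 20h", ""]:
    print(repr(label), '→', parse_tranche_horaire_to_hours(label))
# expected: (0, 6), (6, 10), (20, 24), (nan, nan)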
def daily_station_aggregate(df: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregates data by (date, nom_gare, ligne) to prepare for temporal analyses
    and potential regression models (daily traffic).
    """
    grp_cols = [c for c in ['date', 'nom_gare', 'ligne'] if c in df.columns]
    out = (df
           .groupby(grp_cols, dropna=False)['somme_de_montants']
           .sum()
           .reset_index()
           .rename(columns={'somme_de_montants': 'trafic_journalier'}))
    return out
# Read the data and take a look at the first rows
df = pd.read_csv("comptage-voyageurs-trains-transilien.csv", sep=';')
df = rename_col(df)
df.head()
| | nom_gare | code_gare | type_jour | date | annee | ligne | axe | tranche_horaire | somme_de_montants |
|---|---|---|---|---|---|---|---|---|---|
| 0 | ACHERES-VILLE | 87381657 | DIM | 2022-11-20 | 2022 | A | A | Avant 6h | 39 |
| 1 | BEYNES | 87393363 | SAM | 2024-01-20 | 2024 | N | N | De 10h à 16h | 94 |
| 2 | CHAMP-DE-COURSES-D'ENGHIEN | 87276030 | DIM | 2024-01-21 | 2024 | H | H | De 10h à 16h | 727 |
| 3 | CHAMP-DE-COURSES-D'ENGHIEN | 87276030 | SAM | 2024-01-20 | 2024 | H | H | Avant 6h | 47 |
| 4 | CHAMP-DE-MARS-TOUR-EIFFEL | 87393058 | DIM | 2025-09-02 | 2025 | C | C | Avant 6h | 8 |
# Inspect the columns and dtypes to see what preprocessing is needed
print("Dimensions :", df.shape)
print(df.info())
Dimensions : (7328, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7328 entries, 0 to 7327
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   nom_gare           7328 non-null   object
 1   code_gare          7328 non-null   int64
 2   type_jour          7328 non-null   object
 3   date               7328 non-null   datetime64[ns]
 4   annee              7328 non-null   int64
 5   ligne              7328 non-null   object
 6   axe                7328 non-null   object
 7   tranche_horaire    7328 non-null   object
 8   somme_de_montants  7328 non-null   int64
dtypes: datetime64[ns](1), int64(3), object(5)
memory usage: 515.4+ KB
None
# Let's inspect the object columns and confirm they hold plain strings
df_objects_columns = df.select_dtypes(include=['object'])
print(df_objects_columns.dtypes)
print(df_objects_columns['nom_gare'].apply(type).unique())
print(df_objects_columns['type_jour'].apply(type).unique())
print(df_objects_columns['ligne'].apply(type).unique())
print(df_objects_columns['axe'].apply(type).unique())
print(df_objects_columns['tranche_horaire'].apply(type).unique())
nom_gare           object
type_jour          object
ligne              object
axe                object
tranche_horaire    object
dtype: object
[<class 'str'>]
[<class 'str'>]
[<class 'str'>]
[<class 'str'>]
[<class 'str'>]
# Let's check unique values to detect any errors in the dataset
#Class, Number = np.unique(df['nom_gare'], return_counts=True)
Class, Number = np.unique(df['type_jour'], return_counts=True)
#Class, Number = np.unique(df['ligne'], return_counts=True)
#Class, Number = np.unique(df['axe'], return_counts=True)
#Class, Number = np.unique(df['tranche_horaire'], return_counts=True)
print(Class, Number)
['DIM' 'JOB' 'SAM'] [2393 2502 2433]
# It seems there are no errors in the dataset
# Let's now check the overall statistics
df.describe(include="all")
| | nom_gare | code_gare | type_jour | date | annee | ligne | axe | tranche_horaire | somme_de_montants |
|---|---|---|---|---|---|---|---|---|---|
| count | 7328 | 7.328000e+03 | 7328 | 7328 | 7328.000000 | 7328 | 7328 | 7328 | 7328.000000 |
| unique | 434 | NaN | 3 | NaN | NaN | 17 | 21 | 5 | NaN |
| top | PARIS-SAINT-LAZARE | NaN | JOB | NaN | NaN | C | C | Après 20h | NaN |
| freq | 74 | NaN | 2502 | NaN | NaN | 1223 | 1118 | 1524 | NaN |
| mean | NaN | 8.740263e+07 | NaN | 2023-12-16 20:05:34.061135360 | 2023.620087 | NaN | NaN | NaN | 893.398062 |
| min | NaN | 8.700148e+07 | NaN | 2019-03-16 00:00:00 | 2019.000000 | NaN | NaN | NaN | 0.000000 |
| 25% | NaN | 8.727611e+07 | NaN | 2023-11-19 00:00:00 | 2023.000000 | NaN | NaN | NaN | 40.000000 |
| 50% | NaN | 8.738237e+07 | NaN | 2024-01-20 00:00:00 | 2024.000000 | NaN | NaN | NaN | 181.000000 |
| 75% | NaN | 8.754524e+07 | NaN | 2024-04-23 00:00:00 | 2024.000000 | NaN | NaN | NaN | 702.000000 |
| max | NaN | 8.798872e+07 | NaN | 2025-11-02 00:00:00 | 2025.000000 | NaN | NaN | NaN | 47213.000000 |
| std | NaN | 1.855614e+05 | NaN | NaN | 1.258481 | NaN | NaN | NaN | 2495.150207 |
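One thing the summary makes obvious is the spread of somme_de_montants: the median is 181 but the max is 47,213, so the distribution has a heavy right tail. A quick sketch to quantify it:

# How skewed is the traffic distribution?
print("Skewness:", df['somme_de_montants'].skew())
print(df['somme_de_montants'].quantile([0.90, 0.99]))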
# Let's check missing values
missing = df.isna().mean().sort_values(ascending=False)
display(missing.to_frame("missing_ratio").head(20))
| | missing_ratio |
|---|---|
| nom_gare | 0.0 |
| code_gare | 0.0 |
| type_jour | 0.0 |
| date | 0.0 |
| annee | 0.0 |
| ligne | 0.0 |
| axe | 0.0 |
| tranche_horaire | 0.0 |
| somme_de_montants | 0.0 |
# No missing values
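Another cheap integrity check is counting fully duplicated rows; a minimal sketch:

# Any exact duplicate records?
print("Duplicated rows:", df.duplicated().sum())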
Data Exploration¶
Explore traffic distributions by station, line, day of week, and time slot.
Identify the most frequented stations and analyze temporal trends (e.g., weekday vs. weekend, annual patterns).
"""
Add derived temporal features from date and time slot.
"""
df = df.copy()
if 'date' in df.columns:
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['weekday'] = df['date'].dt.weekday # 0=lundi
df['is_weekend'] = df['weekday'].isin([5, 6]).astype(int)
if 'tranche_horaire' in df.columns:
hours = df['tranche_horaire'].apply(parse_tranche_horaire_to_hours)
df['start_hour'] = hours.apply(lambda x: x[0])
df['end_hour'] = hours.apply(lambda x: x[1])
df['hour_bucket'] = (df['start_hour'] + df['end_hour']) / 2.0
print(df.head())
                     nom_gare  code_gare type_jour       date  annee ligne  \
0               ACHERES-VILLE   87381657       DIM 2022-11-20   2022     A
1                      BEYNES   87393363       SAM 2024-01-20   2024     N
2  CHAMP-DE-COURSES-D'ENGHIEN   87276030       DIM 2024-01-21   2024     H
3  CHAMP-DE-COURSES-D'ENGHIEN   87276030       SAM 2024-01-20   2024     H
4   CHAMP-DE-MARS-TOUR-EIFFEL   87393058       DIM 2025-09-02   2025     C

  axe tranche_horaire  somme_de_montants  year  month  day  weekday  \
0   A        Avant 6h                 39  2022     11   20        6
1   N    De 10h à 16h                 94  2024      1   20        5
2   H    De 10h à 16h                727  2024      1   21        6
3   H        Avant 6h                 47  2024      1   20        5
4   C        Avant 6h                  8  2025      9    2        1

   is_weekend  start_hour  end_hour  hour_bucket
0           1           0         6          3.0
1           1          10        16         13.0
2           1          10        16         13.0
3           1           0         6          3.0
4           0           0         6          3.0
# Top stations by total traffic
if 'nom_gare' in df.columns:
top_gares = df.groupby('nom_gare')['somme_de_montants'].sum()
s = top_gares.sort_values(ascending=False).head(20)
plt.figure(figsize=(9, min(0.4*len(s), 10)))
sns.barplot(x=s.values, y=s.index)
plt.title("Top gares (total traffic)")
plt.xlabel('Total')
plt.ylabel('')
plt.tight_layout()
plt.show()
# Traffic by line
if 'ligne' in df.columns:
trafic_lignes = df.groupby('ligne')['somme_de_montants'].sum().sort_values(ascending=False)
fig = px.bar(trafic_lignes, title="Total traffic by line")
fig.update_layout(xaxis_title='Line', yaxis_title='Total traffic')
fig.show()
# Distribution by time slot (ordered approximately by midpoint hour)
if 'tranche_horaire' in df.columns:
ordre = (df[['tranche_horaire','hour_bucket']]
.drop_duplicates()
.sort_values('hour_bucket'))['tranche_horaire'].tolist()
trafic_tranches = (df
.groupby('tranche_horaire')['somme_de_montants']
.sum()
.reindex(ordre))
fig = px.bar(trafic_tranches, title="Total traffic by time slot")
fig.update_layout(xaxis_title='Time slot', yaxis_title='Total traffic')
fig.show()
# Weekend vs weekday traffic
if 'is_weekend' in df.columns:
    agg = df.groupby('is_weekend')['somme_de_montants'].sum()
    labels = ['Weekday (0)', 'Weekend (1)']
    fig = px.pie(values=agg.values, names=[labels[i] for i in agg.index], title="Traffic share: Weekday vs Weekend")
fig.show()
# Global daily traffic
daily = df.groupby('date')['somme_de_montants'].sum().sort_index()
plt.figure(figsize=(12,4))
daily.plot()
plt.title("Daily Traffic (all stations))")
plt.ylabel("Travellers")
plt.tight_layout()
plt.show()
# Heatmap: day of the week × time slot (total traffic)
if {'weekday','tranche_horaire','somme_de_montants'}.issubset(df.columns):
    pivot = (df
             .groupby(['weekday','tranche_horaire'])['somme_de_montants']
             .sum()
             .reset_index())
    pivot['weekday_name'] = pivot['weekday'].map({0:'Lun/Mon',1:'Mar/Tue',2:'Mer/Wed',3:'Jeu/Thu',4:'Ven/Fri',5:'Sam/Sat',6:'Dim/Sun'})
    heat = pivot.pivot_table(index='weekday_name', columns='tranche_horaire', values='somme_de_montants', aggfunc='sum')
    # order columns by the midpoint hour of each time slot
    ordre_tranches = (
        df[['tranche_horaire','hour_bucket']]
        .drop_duplicates()
        .sort_values('hour_bucket')['tranche_horaire']
        .tolist()
    )
    heat = heat[ordre_tranches]
    # order rows Monday → Sunday (pivot_table sorts the index alphabetically otherwise)
    heat = heat.reindex(['Lun/Mon','Mar/Tue','Mer/Wed','Jeu/Thu','Ven/Fri','Sam/Sat','Dim/Sun'])
    plt.figure(figsize=(12,4))
    sns.heatmap(heat, annot=False)
    plt.title("Heatmap traffic: day of the week × time slot")
    plt.xlabel("Time slot")
    plt.ylabel("Day")
    plt.tight_layout()
    plt.show()
Feature Engineering¶
Objective: build a clean training dataset (e.g., daily traffic per station) enriched with derived features such as day, month, weekend indicator, encoded line, and simple lag/rolling statistics.
# Aggregated table: daily traffic by (station, line)
df_day = daily_station_aggregate(df)
# Adding temporal features from date
df_day['year'] = df_day['date'].dt.year
df_day['month'] = df_day['date'].dt.month
df_day['day'] = df_day['date'].dt.day
df_day['weekday'] = df_day['date'].dt.weekday
df_day['is_weekend'] = df_day['weekday'].isin([5,6]).astype(int)
# Lags / moving averages (by station)
df_day = df_day.sort_values(['nom_gare', 'date'])
for lag in [1,7]:
df_day[f'lag_{lag}'] = df_day.groupby('nom_gare')['trafic_journalier'].shift(lag)
df_day['roll7_mean'] = (df_day
.groupby('nom_gare')['trafic_journalier']
.transform(lambda x: x.shift(1).rolling(7, min_periods=3).mean()))
# Drop the first rows, which have no lag values yet
df_model = df_day.dropna(subset=['lag_1','lag_7','roll7_mean']).reset_index(drop=True)
print("Training table:", df_model.shape)
display(df_model.head())
Training table: (55, 12)
| | date | nom_gare | ligne | trafic_journalier | year | month | day | weekday | is_weekend | lag_1 | lag_7 | roll7_mean |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2023-10-10 | AULNAY-SOUS-BOIS | B | 35819 | 2023 | 10 | 10 | 1 | 0 | 4326.0 | 20023.0 | 7260.571429 |
| 1 | 2024-01-16 | AULNAY-SOUS-BOIS | K | 3778 | 2024 | 1 | 16 | 1 | 0 | 35819.0 | 7902.0 | 9517.142857 |
| 2 | 2024-01-20 | AULNAY-SOUS-BOIS | K | 738 | 2024 | 1 | 20 | 5 | 1 | 3778.0 | 5725.0 | 8928.000000 |
| 3 | 2024-01-21 | AULNAY-SOUS-BOIS | K | 225 | 2024 | 1 | 21 | 6 | 1 | 738.0 | 2256.0 | 8215.571429 |
| 4 | 2024-08-06 | CHAVILLE-RIVE-DROITE | U | 101 | 2024 | 8 | 6 | 1 | 0 | 370.0 | 3126.0 | 1152.571429 |
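A note on roll7_mean above: the shift(1) keeps the current day out of its own rolling window, so the feature only uses past information. A minimal sketch on a made-up series shows the behavior:

s = pd.Series([10, 20, 30, 40, 50, 60, 70, 80])
print(s.shift(1).rolling(7, min_periods=3).mean())
# NaN until 3 past values exist, then 20.0, 25.0, 30.0, 35.0, 40.0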
Prepare Data for Modelling¶
Here we prepare an X/y dataset for a regression task (e.g., predicting a station's daily traffic on a given day using temporal features, lags, and line information). We keep it simple and modular.
target = 'trafic_journalier'
numeric_features = ['year','month','day','weekday','is_weekend','lag_1','lag_7','roll7_mean']
categorical_features = []
if 'ligne' in df_model.columns:
categorical_features.append('ligne')
if 'nom_gare' in df_model.columns:
categorical_features.append('nom_gare')
X = df_model[numeric_features + categorical_features]
y = df_model[target]
# Preprocessing for model
preprocess = ColumnTransformer(
transformers=[
('num', 'passthrough', numeric_features),
('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
],
remainder='drop'
)
# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)
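One caveat: because lag_1, lag_7, and roll7_mean encode a station's recent past, a shuffled split lets information from the test period leak into training. A chronological split is the safer evaluation; a minimal sketch (not used for the baseline below):

# Hypothetical time-based split: train on the earliest ~80% of dates, test on the rest
cutoff = df_model['date'].sort_values().iloc[int(0.8 * len(df_model))]
train_mask = df_model['date'] <= cutoff
X_train_t, X_test_t = X[train_mask], X[~train_mask]
y_train_t, y_test_t = y[train_mask], y[~train_mask]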
# Simple baseline model
from sklearn.linear_model import Ridge
model = Pipeline(steps=[
('preprocess', preprocess),
('estimator', Ridge(alpha=1.0, random_state=42))
])
model.fit(X_train, y_train)
pred = model.predict(X_test)
mae = mean_absolute_error(y_test, pred)
r2 = r2_score(y_test, pred)
print(f"Baseline – MAE: {mae:,.2f} | R²: {r2:,.3f}")
Baseline – MAE: 9,451.77 | R²: 0.313
MAE (Mean Absolute Error) ≈ 9,452 passengers
→ On average, the model is off by about 9,500 passengers per observation (station × day).
→ For a station with around 50,000 passengers/day, an error of 9,500 represents roughly a 20% deviation.
R² (Coefficient of Determination) = 0.313
→ The model explains about 31% of the total variance in traffic.
→ In other words, it captures part of the structure, but there is still a large amount of unexplained variability.
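For reference, the two metrics are defined as:

$$\mathrm{MAE} = \frac{1}{n}\sum_{i=1}^{n}\left|y_i - \hat{y}_i\right| \qquad R^2 = 1 - \frac{\sum_{i}(y_i - \hat{y}_i)^2}{\sum_{i}(y_i - \bar{y})^2}$$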
This result suggests that passenger traffic is influenced by additional factors not included in the current dataset (e.g., weather, holidays, special events). While more advanced models could be tested to improve performance, exploring those is beyond the scope of today’s exercise.
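As an illustration of the first of those factors, a public-holiday flag could be derived with the third-party holidays package; a sketch, assuming the package is installed (pip install holidays):

# Hypothetical enrichment: flag French public holidays
import holidays

fr_holidays = holidays.France()
df_model['is_holiday'] = df_model['date'].dt.date.map(lambda d: int(d in fr_holidays))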