Model Validation

Comprehensive toolkit for PD model validation per EBA GL/2017/16.

Discrimination

from creditriskengine.validation.discrimination import auroc, gini_coefficient, ks_statistic

auc = auroc(y_true, y_score)
gini = gini_coefficient(y_true, y_score)
ks = ks_statistic(y_true, y_score)
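
Both arguments are NumPy arrays; a minimal self-contained sketch with made-up data:

import numpy as np

y_true = np.array([0, 0, 1, 1])             # 1 = default, 0 = non-default
y_score = np.array([0.1, 0.4, 0.35, 0.8])   # higher score = riskier

auc = auroc(y_true, y_score)                # 0.75
gini = gini_coefficient(y_true, y_score)    # 0.50
ks = ks_statistic(y_true, y_score)          # 0.50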

Calibration

from creditriskengine.validation.calibration import binomial_test, hosmer_lemeshow_test

result = binomial_test(n_defaults=15, n_observations=1000, predicted_pd=0.02)
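
The result is a plain dict; its keys match the binomial_test reference below:

result["reject_h0"]   # True if observed defaults significantly exceed the predicted PD
result["p_value"]     # one-sided (upper-tail) p-value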

Stability

from creditriskengine.validation.stability import population_stability_index

psi = population_stability_index(current_distribution, base_distribution)
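
A self-contained sketch with synthetic score samples (the distributions and the shift below are illustrative only):

import numpy as np

rng = np.random.default_rng(0)
base_distribution = rng.normal(650.0, 50.0, 10_000)     # reference window
current_distribution = rng.normal(660.0, 55.0, 10_000)  # monitoring window

psi = population_stability_index(current_distribution, base_distribution)
# compare psi against the interpretation thresholds documented below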

creditriskengine.validation

Model validation and backtesting framework.

Provides discrimination, calibration, stability, and backtesting metrics for credit risk model validation per SR 11-7, EBA GL/2017/16.

pd_backtest_full(predicted_pds, observed_defaults, rating_grades, confidence=0.99)

Full PD backtest: binomial, traffic-light, and Jeffreys per rating grade.

Runs a comprehensive backtesting suite for each rating grade, combining frequentist (binomial), regulatory (traffic-light), and Bayesian (Jeffreys) approaches as recommended in BCBS WP14 and EBA GL/2017/16.
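
A usage sketch on a synthetic portfolio (random inputs are illustrative; the import path mirrors the source location shown below, and attribute access assumes FullBacktestResult exposes its constructor fields):

import numpy as np
from creditriskengine.validation.backtesting import pd_backtest_full

rng = np.random.default_rng(7)
predicted_pds = rng.uniform(0.005, 0.05, 1_000)
observed_defaults = rng.binomial(1, predicted_pds)   # simulate defaults at the predicted PDs
rating_grades = rng.choice(["A", "B", "C"], 1_000)

result = pd_backtest_full(predicted_pds, observed_defaults, rating_grades, confidence=0.99)
print(result.overall_traffic_light)   # "green", "yellow", or "red"
print(result.overall_assessment)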

Parameters:

predicted_pds (ndarray): Predicted PDs per exposure. Required.
observed_defaults (ndarray): Binary default indicator (0/1). Required.
rating_grades (ndarray): Rating grade labels per exposure (same length). Required.
confidence (float): Confidence level for binomial and Jeffreys tests. Default: 0.99.

Returns:

FullBacktestResult: Per-grade detail and overall assessment.

Raises:

ValueError: If input arrays have mismatched lengths.

Source code in creditriskengine/validation/backtesting.py
def pd_backtest_full(
    predicted_pds: np.ndarray,
    observed_defaults: np.ndarray,
    rating_grades: np.ndarray,
    confidence: float = 0.99,
) -> FullBacktestResult:
    """Full PD backtest: binomial, traffic-light, and Jeffreys per rating grade.

    Runs a comprehensive backtesting suite for each rating grade, combining
    frequentist (binomial), regulatory (traffic-light), and Bayesian (Jeffreys)
    approaches as recommended in BCBS WP14 and EBA GL/2017/16.

    Args:
        predicted_pds: Predicted PDs per exposure.
        observed_defaults: Binary default indicator (0/1).
        rating_grades: Rating grade labels per exposure (same length).
        confidence: Confidence level for binomial and Jeffreys tests.

    Returns:
        FullBacktestResult with per-grade detail and overall assessment.

    Raises:
        ValueError: If input arrays have mismatched lengths.
    """
    predicted_pds = np.asarray(predicted_pds, dtype=np.float64)
    observed_defaults = np.asarray(observed_defaults, dtype=np.int64)
    rating_grades = np.asarray(rating_grades)

    if not (len(predicted_pds) == len(observed_defaults) == len(rating_grades)):
        raise ValueError(
            "predicted_pds, observed_defaults, and rating_grades must have the same length."
        )

    summary = pd_backtest_summary(predicted_pds, observed_defaults, rating_grades)

    unique_grades = np.unique(rating_grades)
    grade_results: list[GradeBacktestResult] = []
    traffic_light_colours: list[str] = []

    for grade in unique_grades:
        mask = rating_grades == grade
        n_obs = int(np.sum(mask))
        n_def = int(np.sum(observed_defaults[mask]))
        avg_pd = float(np.mean(predicted_pds[mask]))
        obs_dr = n_def / n_obs if n_obs > 0 else 0.0

        binom_result = binomial_test(n_def, n_obs, avg_pd, confidence=confidence)
        tl_result = traffic_light_test(n_def, n_obs, avg_pd)
        jeff_result = jeffreys_test(n_def, n_obs, avg_pd, confidence=confidence)

        traffic_light_colours.append(tl_result)

        grade_results.append(
            GradeBacktestResult(
                grade=str(grade),
                n_observations=n_obs,
                n_defaults=n_def,
                predicted_pd=avg_pd,
                observed_dr=obs_dr,
                binomial=binom_result,
                traffic_light=tl_result,
                jeffreys=jeff_result,
            )
        )
        logger.debug(
            "Grade %s: n=%d, defaults=%d, TL=%s, binom_reject=%s",
            grade,
            n_obs,
            n_def,
            tl_result,
            binom_result.get("reject_h0"),
        )

    # Overall traffic light is the worst across grades
    _tl_order = {"green": 0, "yellow": 1, "red": 2}
    overall_tl = (
        max(traffic_light_colours, key=lambda c: _tl_order.get(c, 0))
        if traffic_light_colours
        else "green"
    )

    # Derive human-readable assessment
    n_red = sum(1 for c in traffic_light_colours if c == "red")
    n_yellow = sum(1 for c in traffic_light_colours if c == "yellow")
    if n_red > 0:
        assessment = (
            f"FAIL: {n_red} grade(s) in red zone — material PD under-estimation detected. "
            "Recalibration required per EBA GL/2017/16 Art. 58."
        )
    elif n_yellow > 0:
        assessment = (
            f"WARNING: {n_yellow} grade(s) in yellow zone — potential PD under-estimation. "
            "Monitor closely; recalibration may be warranted."
        )
    else:
        assessment = "PASS: All grades in green zone — PD calibration adequate."

    logger.info("PD backtest overall: %s (traffic_light=%s)", assessment, overall_tl)

    return FullBacktestResult(
        summary=summary,
        grade_results=grade_results,
        overall_traffic_light=overall_tl,
        overall_assessment=assessment,
    )

pd_backtest_summary(predicted_pds, observed_defaults, rating_grades=None)

Summary statistics for PD backtesting.
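
Reusing the arrays from the sketch above, the returned dict can be inspected directly (keys as in the source below):

summary = pd_backtest_summary(predicted_pds, observed_defaults)
summary["observed_default_rate"]        # realised default rate
summary["ratio_observed_to_predicted"]  # > 1 indicates PD under-estimation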

Parameters:

predicted_pds (ndarray): Predicted PDs per exposure. Required.
observed_defaults (ndarray): Binary default indicator (0/1). Required.
rating_grades (ndarray | None): Optional rating grade labels for grouping. Default: None.

Returns:

dict[str, float]: Dict with overall metrics.

Source code in creditriskengine/validation/backtesting.py
def pd_backtest_summary(
    predicted_pds: np.ndarray,
    observed_defaults: np.ndarray,
    rating_grades: np.ndarray | None = None,
) -> dict[str, float]:
    """Summary statistics for PD backtesting.

    Args:
        predicted_pds: Predicted PDs per exposure.
        observed_defaults: Binary default indicator (0/1).
        rating_grades: Optional rating grade labels for grouping.

    Returns:
        Dict with overall metrics.
    """
    predicted_pds = np.asarray(predicted_pds, dtype=np.float64)
    observed_defaults = np.asarray(observed_defaults, dtype=np.int64)

    n = len(predicted_pds)
    n_defaults = int(np.sum(observed_defaults))
    avg_pd = float(np.mean(predicted_pds))
    observed_dr = n_defaults / n if n > 0 else 0.0

    return {
        "n_observations": n,
        "n_defaults": n_defaults,
        "average_predicted_pd": avg_pd,
        "observed_default_rate": observed_dr,
        "ratio_observed_to_predicted": observed_dr / avg_pd if avg_pd > 0 else 0.0,
    }

binomial_test(n_defaults, n_observations, predicted_pd, confidence=0.99)

Binomial test for PD calibration.

Tests if observed defaults are consistent with predicted PD, assuming independent Bernoulli trials.

H0: true PD = predicted_pd
z = (d - N*PD) / sqrt(N*PD*(1-PD))
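
For the quick-start numbers (15 defaults in 1,000 observations against a 2% PD): expected defaults N*PD = 20, std = sqrt(1000 * 0.02 * 0.98) ≈ 4.43, so z = (15 - 20) / 4.43 ≈ -1.13 with an upper-tail p-value of about 0.87; this is well below the 99% critical value of 2.33, so H0 is not rejected.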

Parameters:

n_defaults (int): Observed number of defaults. Required.
n_observations (int): Total number of observations. Required.
predicted_pd (float): Predicted (average) PD. Required.
confidence (float): Confidence level for the test. Default: 0.99.

Returns:

dict[str, float | bool]: Dict with z_stat, p_value, critical_value, reject_h0.

Source code in creditriskengine/validation/calibration.py
def binomial_test(
    n_defaults: int,
    n_observations: int,
    predicted_pd: float,
    confidence: float = 0.99,
) -> dict[str, float | bool]:
    """Binomial test for PD calibration.

    Tests if observed defaults are consistent with predicted PD,
    assuming independent Bernoulli trials.

    H0: true PD = predicted_pd
    z = (d - N*PD) / sqrt(N*PD*(1-PD))

    Args:
        n_defaults: Observed number of defaults.
        n_observations: Total number of observations.
        predicted_pd: Predicted (average) PD.
        confidence: Confidence level for the test.

    Returns:
        Dict with z_stat, p_value, critical_value, reject_h0.
    """
    if n_observations == 0:
        return {"z_stat": 0.0, "p_value": 1.0, "critical_value": 0.0, "reject_h0": False}

    expected = n_observations * predicted_pd
    std = (n_observations * predicted_pd * (1.0 - predicted_pd)) ** 0.5

    if std < 1e-15:
        return {"z_stat": 0.0, "p_value": 1.0, "critical_value": 0.0, "reject_h0": False}

    z = (n_defaults - expected) / std
    p_value = 1.0 - stats.norm.cdf(z)  # One-sided (upper tail)
    critical = stats.norm.ppf(confidence)

    return {
        "z_stat": float(z),
        "p_value": float(p_value),
        "critical_value": float(critical),
        "reject_h0": z > critical,
    }

brier_score(y_true, y_pred)

Brier Score for probability calibration.

BS = (1/N) * Sum(PD_i - D_i)^2

Lower is better. For a perfectly calibrated model, the expected score equals the mean of PD*(1-PD).
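
For example, predictions (0.1, 0.8, 0.2) against outcomes (0, 1, 0) give BS = (0.01 + 0.04 + 0.04) / 3 = 0.03.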

Parameters:

y_true (ndarray): Binary outcomes (0/1). Required.
y_pred (ndarray): Predicted probabilities. Required.

Returns:

float: Brier score.

Source code in creditriskengine/validation/calibration.py
def brier_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Brier Score for probability calibration.

    BS = (1/N) * Sum(PD_i - D_i)^2

    Lower is better. For a perfectly calibrated model, the expected
    score equals the mean of PD*(1-PD).

    Args:
        y_true: Binary outcomes (0/1).
        y_pred: Predicted probabilities.

    Returns:
        Brier score.
    """
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    return float(np.mean((y_pred - y_true) ** 2))

hosmer_lemeshow_test(observed_defaults, predicted_pds, group_counts, n_groups=10)

Hosmer-Lemeshow goodness-of-fit test.

H-L = Sum(i=1..g) [(O_i - E_i)^2 / (N_i * pi_i * (1-pi_i))]
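
A minimal sketch with hypothetical decile-level inputs (all numbers illustrative):

import numpy as np
from creditriskengine.validation.calibration import hosmer_lemeshow_test

group_counts = np.full(10, 100)              # 100 exposures per decile
predicted_pds = np.linspace(0.01, 0.10, 10)  # average PD per decile
observed_defaults = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])  # defaults per decile

res = hosmer_lemeshow_test(observed_defaults, predicted_pds, group_counts, n_groups=10)
res["reject_h0"]  # True if the fit is rejected at the 5% level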

Parameters:

observed_defaults (ndarray): Observed defaults per group. Required.
predicted_pds (ndarray): Average predicted PD per group. Required.
group_counts (ndarray): Number of observations per group. Required.
n_groups (int): Number of groups (for degrees of freedom). Default: 10.

Returns:

dict[str, float | bool]: Dict with hl_stat, p_value, df, reject_h0 (at 5% level).

Source code in creditriskengine/validation/calibration.py
def hosmer_lemeshow_test(
    observed_defaults: np.ndarray,
    predicted_pds: np.ndarray,
    group_counts: np.ndarray,
    n_groups: int = 10,
) -> dict[str, float | bool]:
    """Hosmer-Lemeshow goodness-of-fit test.

    H-L = Sum(i=1..g) [(O_i - E_i)^2 / (N_i * pi_i * (1-pi_i))]

    Args:
        observed_defaults: Observed defaults per group.
        predicted_pds: Average predicted PD per group.
        group_counts: Number of observations per group.
        n_groups: Number of groups (for degrees of freedom).

    Returns:
        Dict with hl_stat, p_value, df, reject_h0 (at 5% level).
    """
    expected = group_counts * predicted_pds
    variance = group_counts * predicted_pds * (1.0 - predicted_pds)

    # Avoid division by zero
    mask = variance > 1e-15
    hl = float(np.sum((observed_defaults[mask] - expected[mask]) ** 2 / variance[mask]))

    df = max(n_groups - 2, 1)
    p_value = 1.0 - stats.chi2.cdf(hl, df)

    return {
        "hl_stat": hl,
        "p_value": float(p_value),
        "df": df,
        "reject_h0": p_value < 0.05,
    }

traffic_light_test(n_defaults, n_observations, predicted_pd)

Basel Committee Traffic Light approach.

Reference: BCBS WP14 (May 2005) — "Studies on the Validation of Internal Rating Systems".

Green: observed defaults <= 95th percentile of the binomial distribution
Yellow: between the 95th and 99.99th percentiles
Red: above the 99.99th percentile
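
Continuing the quick-start numbers (15 defaults, 1,000 observations, 2% PD): the 95th percentile of Binomial(1000, 0.02) is roughly 27 defaults, so 15 observed defaults lands in the green zone.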

Parameters:

n_defaults (int): Observed defaults. Required.
n_observations (int): Total observations. Required.
predicted_pd (float): Predicted PD. Required.

Returns:

str: "green", "yellow", or "red".

Source code in creditriskengine/validation/calibration.py
def traffic_light_test(
    n_defaults: int,
    n_observations: int,
    predicted_pd: float,
) -> str:
    """Basel Committee Traffic Light approach.

    Reference: BCBS WP14 (May 2005) — "Studies on the Validation
    of Internal Rating Systems".

    Green: observed <= 95th percentile of binomial
    Yellow: between the 95th and 99.99th percentiles
    Red: above the 99.99th percentile

    Args:
        n_defaults: Observed defaults.
        n_observations: Total observations.
        predicted_pd: Predicted PD.

    Returns:
        "green", "yellow", or "red".
    """
    if n_observations == 0:
        return "green"

    # Binomial percentiles
    p_95 = stats.binom.ppf(0.95, n_observations, predicted_pd)
    p_9999 = stats.binom.ppf(0.9999, n_observations, predicted_pd)

    if n_defaults <= p_95:
        return "green"
    elif n_defaults <= p_9999:
        return "yellow"
    else:
        return "red"

auroc(y_true, y_score)

Area Under the Receiver Operating Characteristic curve.

Uses the Mann-Whitney U-statistic formulation for efficiency.
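
For the toy arrays in the quick start, auroc(np.array([0, 0, 1, 1]), np.array([0.1, 0.4, 0.35, 0.8])) returns 0.75: three of the four (defaulter, non-defaulter) pairs are ranked correctly, and 3/4 = 0.75.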

Parameters:

y_true (ndarray): Binary labels (1=default, 0=non-default). Required.
y_score (ndarray): Predicted scores/PDs (higher = riskier). Required.

Returns:

float: AUROC value in [0, 1].

Source code in creditriskengine/validation/discrimination.py
def auroc(y_true: np.ndarray, y_score: np.ndarray) -> float:
    """Area Under the Receiver Operating Characteristic curve.

    Uses the Mann-Whitney U-statistic formulation for efficiency.

    Args:
        y_true: Binary labels (1=default, 0=non-default).
        y_score: Predicted scores/PDs (higher = riskier).

    Returns:
        AUROC value in [0, 1].
    """
    y_true = np.asarray(y_true, dtype=np.int64)
    y_score = np.asarray(y_score, dtype=np.float64)

    n_pos = np.sum(y_true == 1)
    n_neg = np.sum(y_true == 0)

    if n_pos == 0 or n_neg == 0:
        return 0.5

    # Sort by score ascending
    order = np.argsort(y_score)
    y_sorted = y_true[order]

    # Mann-Whitney U: for each positive, count negatives ranked below it
    # (ties in score are broken arbitrarily by argsort, not midranked)
    cum_neg = np.cumsum(1 - y_sorted)
    auc = float(np.sum(cum_neg[y_sorted == 1])) / float(n_pos * n_neg)
    return auc

gini_coefficient(y_true, y_score)

Gini coefficient (Accuracy Ratio).

Formula: Gini = 2 * AUC - 1

Standard metric per SR 11-7, ECB Guide.
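
An AUROC of 0.75, for instance, corresponds to a Gini of 2 * 0.75 - 1 = 0.5.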

Parameters:

y_true (ndarray): Binary labels. Required.
y_score (ndarray): Predicted scores/PDs. Required.

Returns:

float: Gini coefficient in [-1, 1].

Source code in creditriskengine/validation/discrimination.py
def gini_coefficient(y_true: np.ndarray, y_score: np.ndarray) -> float:
    """Gini coefficient (Accuracy Ratio).

    Formula: Gini = 2 * AUC - 1

    Standard metric per SR 11-7, ECB Guide.

    Args:
        y_true: Binary labels.
        y_score: Predicted scores/PDs.

    Returns:
        Gini coefficient in [-1, 1].
    """
    return 2.0 * auroc(y_true, y_score) - 1.0

ks_statistic(y_true, y_score)

Kolmogorov-Smirnov statistic.

Maximum separation between cumulative distributions of defaulters and non-defaulters.
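
For the quick-start toy arrays, the defaulter scores {0.35, 0.8} and non-defaulter scores {0.1, 0.4} give KS = 0.5: the two empirical CDFs differ by 0.5 both between 0.1 and 0.35 and between 0.4 and 0.8.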

Parameters:

y_true (ndarray): Binary labels. Required.
y_score (ndarray): Predicted scores/PDs. Required.

Returns:

float: KS statistic in [0, 1].

Source code in creditriskengine/validation/discrimination.py
def ks_statistic(y_true: np.ndarray, y_score: np.ndarray) -> float:
    """Kolmogorov-Smirnov statistic.

    Maximum separation between cumulative distributions
    of defaulters and non-defaulters.

    Args:
        y_true: Binary labels.
        y_score: Predicted scores/PDs.

    Returns:
        KS statistic in [0, 1].
    """
    y_true = np.asarray(y_true, dtype=np.int64)
    y_score = np.asarray(y_score, dtype=np.float64)

    defaults = y_score[y_true == 1]
    non_defaults = y_score[y_true == 0]

    if len(defaults) == 0 or len(non_defaults) == 0:
        return 0.0

    ks_stat, _ = stats.ks_2samp(defaults, non_defaults)
    return float(ks_stat)

characteristic_stability_index(actual, expected, bins=10)

Characteristic Stability Index (CSI).

Same formula as PSI but applied at individual feature level.

Parameters:

Name Type Description Default
actual ndarray

Actual feature distribution.

required
expected ndarray

Expected/reference feature distribution.

required
bins int

Number of bins.

10

Returns:

float: CSI value (same interpretation as PSI).

Source code in creditriskengine/validation/stability.py
def characteristic_stability_index(
    actual: np.ndarray,
    expected: np.ndarray,
    bins: int = 10,
) -> float:
    """Characteristic Stability Index (CSI).

    Same formula as PSI but applied at individual feature level.

    Args:
        actual: Actual feature distribution.
        expected: Expected/reference feature distribution.
        bins: Number of bins.

    Returns:
        CSI value (same interpretation as PSI).
    """
    return population_stability_index(actual, expected, bins)

population_stability_index(actual, expected, bins=10, precomputed=False)

Population Stability Index (PSI).

PSI = Sum [(%actual_i - %expected_i) * ln(%actual_i / %expected_i)]

Interpretation:

- PSI < 0.10: no significant change
- 0.10 <= PSI < 0.25: moderate shift
- PSI >= 0.25: significant shift
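
A minimal worked case with pre-binned proportions (illustrative numbers):

import numpy as np
from creditriskengine.validation.stability import population_stability_index

# Two bins whose population shares moved from 50/50 to 60/40.
psi = population_stability_index(
    np.array([0.6, 0.4]),   # actual bin proportions
    np.array([0.5, 0.5]),   # expected bin proportions
    precomputed=True,
)
# (0.6-0.5)*ln(1.2) + (0.4-0.5)*ln(0.8) ≈ 0.041 -> below 0.10: no significant change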

Parameters:

actual (ndarray): Actual distribution values (raw or pre-binned proportions). Required.
expected (ndarray): Expected/reference distribution (raw or pre-binned proportions). Required.
bins (int): Number of bins (if not precomputed). Default: 10.
precomputed (bool): If True, actual/expected are already bin proportions. Default: False.

Returns:

float: PSI value.

Source code in creditriskengine/validation/stability.py
def population_stability_index(
    actual: np.ndarray,
    expected: np.ndarray,
    bins: int = 10,
    precomputed: bool = False,
) -> float:
    """Population Stability Index (PSI).

    PSI = Sum [(%actual_i - %expected_i) * ln(%actual_i / %expected_i)]

    Interpretation:
    - PSI < 0.10: no significant change
    - 0.10 <= PSI < 0.25: moderate shift
    - PSI >= 0.25: significant shift

    Args:
        actual: Actual distribution values (raw or pre-binned proportions).
        expected: Expected/reference distribution (raw or pre-binned proportions).
        bins: Number of bins (if not precomputed).
        precomputed: If True, actual/expected are already bin proportions.

    Returns:
        PSI value.
    """
    if precomputed:
        pct_actual = np.asarray(actual, dtype=np.float64)
        pct_expected = np.asarray(expected, dtype=np.float64)
    else:
        actual = np.asarray(actual, dtype=np.float64)
        expected = np.asarray(expected, dtype=np.float64)

        # Create bins from expected distribution
        bin_edges = np.percentile(expected, np.linspace(0, 100, bins + 1))
        bin_edges = np.unique(bin_edges)
        bin_edges[0] = -np.inf
        bin_edges[-1] = np.inf

        actual_counts = np.histogram(actual, bins=bin_edges)[0]
        expected_counts = np.histogram(expected, bins=bin_edges)[0]

        pct_actual = actual_counts / max(len(actual), 1)
        pct_expected = expected_counts / max(len(expected), 1)

    # Replace zeros to avoid log(0)
    pct_actual = np.maximum(pct_actual, 1e-10)
    pct_expected = np.maximum(pct_expected, 1e-10)

    psi = float(np.sum((pct_actual - pct_expected) * np.log(pct_actual / pct_expected)))
    return psi