#!/usr/bin/env python3
"""
Statistical tests for hominin–stability congruence.

Inputs:
- homo_site_metrics.csv
- civilization_site_metrics.csv

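Tests performed (two-sided Mann–Whitney U and two-sample Kolmogorov–Smirnov
on each metric):
1. Homo vs Civilization stability-gradient comparison
2. Homo vs Civilization mean distance to equilibrium margin
3. Homo vs Civilization variance of the zero-contour distance
4. Effect sizes (Cliff's delta)
5. ECDF-ready outputs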
"""

import numpy as np
import pandas as pd
from scipy.stats import mannwhitneyu, ks_2samp

# ============================================================
# LOAD DATA
# ============================================================

# Read each per-site metrics table and discard incomplete rows right away.
# NOTE: dropna() without a subset removes a row when ANY of its columns is
# missing, not only the metric columns analysed further down.
homo = pd.read_csv("homo_site_metrics.csv").dropna()
civ = pd.read_csv("civilization_site_metrics.csv").dropna()

# ============================================================
# HELPER: CLIFF'S DELTA (NON-PARAMETRIC EFFECT SIZE)
# ============================================================

def cliffs_delta(x, y):
    """
    Cliff's delta effect size for two independent samples.

    Computed over every (x_i, y_j) pair as

        (count(x_i > y_j) - count(x_i < y_j)) / (len(x) * len(y))

    Tied pairs add to neither count but still count in the denominator.
    NaN compares false in both directions, so pairs involving NaN behave
    like ties.

    Parameters
    ----------
    x, y : array-like
        The two samples (expected to be one-dimensional).

    Returns
    -------
    float
        Value in [-1, 1]; positive when values in ``x`` tend to exceed
        values in ``y``. NaN when either sample is empty, because the
        statistic is undefined without any pairs.
    """
    x = np.asarray(x)
    y = np.asarray(y)

    n_pairs = x.size * y.size
    if n_pairs == 0:
        # No pairs to compare: report "undefined" instead of dividing by zero.
        return float("nan")

    # Vectorised pairwise comparison; each call builds a len(x)-by-len(y)
    # boolean matrix, replacing the nested Python-level loops.
    gt = np.count_nonzero(np.greater.outer(x, y))
    lt = np.count_nonzero(np.less.outer(x, y))
    return (gt - lt) / n_pairs

# ============================================================
# METRICS
# ============================================================

# Display label -> column name; the same column is read from both tables.
metrics = {
    "Stability gradient": "stability_gradient",
    "Mean distance to zero contour": "mean_zero_contour_distance_deg",
    "Variance of zero-contour distance": "var_zero_contour_distance_deg2",
}

# ============================================================
# RUN TESTS
# ============================================================

results = []

for label, col in metrics.items():
    homo_values = homo[col].values
    civ_values = civ[col].values

    # Two-sided Mann–Whitney U: sensitive to a shift in location.
    mw = mannwhitneyu(homo_values, civ_values, alternative="two-sided")

    # Two-sample Kolmogorov–Smirnov: sensitive to any distributional change.
    ks = ks_2samp(homo_values, civ_values)

    # One summary row per metric; key order fixes the CSV column order.
    row = {
        "metric": label,
        "homo_median": np.median(homo_values),
        "civilization_median": np.median(civ_values),
        "mannwhitney_U": mw.statistic,
        "mannwhitney_p": mw.pvalue,
        "ks_stat": ks.statistic,
        "ks_p": ks.pvalue,
        "cliffs_delta": cliffs_delta(homo_values, civ_values),
    }
    results.append(row)

# ============================================================
# OUTPUT RESULTS
# ============================================================

# Persist the summary table, then echo the same table to the console.
results_df = pd.DataFrame(results)
results_df.to_csv("hominin_stability_statistics.csv", index=False)

print("\n=== HOMININ–STABILITY STATISTICAL TESTS ===\n")
print(results_df.to_string(index=False))

# Static legend for reading the table; emitted one entry per line.
print(
    "\nInterpretation guide:",
    "• Mann–Whitney p < 0.05 → median shift is significant",
    "• KS p < 0.05 → distributions differ",
    "• Cliff's delta:",
    "    |δ| < 0.147  → negligible",
    "    |δ| < 0.33   → small",
    "    |δ| < 0.474  → medium",
    "    |δ| ≥ 0.474  → large",
    sep="\n",
)

# ============================================================
# OPTIONAL: SAVE ECDF DATA FOR PLOTTING
# ============================================================

def ecdf(data):
    """
    Empirical CDF coordinates for a sample.

    Returns a pair ``(values, fractions)``: the input sorted ascending, and
    the matching cumulative fractions 1/n, 2/n, ..., 1 where n is the
    number of values.
    """
    ordered = np.sort(data)
    n = len(ordered)
    fractions = np.arange(1, n + 1) / n
    return ordered, fractions

# Write one ECDF table per group and metric column, ready for plotting.
for col in metrics.values():
    homo_x, homo_ecdf = ecdf(homo[col].values)
    civ_x, civ_ecdf = ecdf(civ[col].values)

    homo_curve = pd.DataFrame({"homo_x": homo_x, "homo_ecdf": homo_ecdf})
    homo_curve.to_csv(f"ecdf_homo_{col}.csv", index=False)

    civ_curve = pd.DataFrame(
        {"civilization_x": civ_x, "civilization_ecdf": civ_ecdf}
    )
    civ_curve.to_csv(f"ecdf_civilization_{col}.csv", index=False)

print("\n✓ ECDF CSVs written for plotting")
