# nn_noisy.py
#
# What happens when we train a neural network on noisy data?
# This script adds Gaussian noise to the Cp data, trains with a
# train/validation split, and plots both loss curves to show overfitting.
#
# CHEG 667-013
# E. M. Furst

import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt

# ── Load data ─────────────────────────────────────────────────
# Columns: temperature, Cp (units per the axis labels below: K and
# kJ/kg/K). skiprows=1 drops the CSV header row.
data = np.loadtxt("data/n2_cp.csv", delimiter=",", skiprows=1)
T_raw = data[:, 0]
Cp_raw = data[:, 1]

# ── Add noise ─────────────────────────────────────────────────
noise_scale = 0.02  # kJ/kg/K — try 0.01, 0.02, 0.05, 0.1
# Seeded generator so the "noisy dataset" is identical on every run;
# only the noise level is meant to vary between experiments.
rng = np.random.default_rng(seed=42)
Cp_noisy = Cp_raw + rng.normal(scale=noise_scale, size=Cp_raw.size)

# ── Train/validation split ────────────────────────────────────
#
# Hold out every 4th point for validation. This gives us 26 training
# points and 9 validation points — enough to see the overfitting signal.
# (The 26/9 count presumes 35 rows in the CSV — verify against the data.)
val_mask = np.zeros(len(T_raw), dtype=bool)
val_mask[::4] = True
train_mask = ~val_mask
T_train, Cp_train = T_raw[train_mask], Cp_noisy[train_mask]
T_val, Cp_val = T_raw[val_mask], Cp_noisy[val_mask]

# ── Normalize to [0, 1] using training set statistics ─────────
# Statistics come from the TRAINING set only, so no information about
# the validation points leaks into the scaling.
T_min, T_max = T_train.min(), T_train.max()
Cp_min, Cp_max = Cp_train.min(), Cp_train.max()


def normalize_T(T):
    """Scale temperature to [0, 1] using the training-set min/max."""
    return (T - T_min) / (T_max - T_min)


def normalize_Cp(Cp):
    """Scale heat capacity to [0, 1] using the training-set min/max."""
    return (Cp - Cp_min) / (Cp_max - Cp_min)


def denormalize_Cp(Cp_norm):
    """Invert normalize_Cp: map a normalized value back to physical units."""
    return Cp_norm * (Cp_max - Cp_min) + Cp_min


# Column vectors of shape (N, 1): one input feature, one target.
X_train = torch.tensor(normalize_T(T_train), dtype=torch.float32).reshape(-1, 1)
Y_train = torch.tensor(normalize_Cp(Cp_train), dtype=torch.float32).reshape(-1, 1)
X_val = torch.tensor(normalize_T(T_val), dtype=torch.float32).reshape(-1, 1)
Y_val = torch.tensor(normalize_Cp(Cp_val), dtype=torch.float32).reshape(-1, 1)

# ── Define the network ────────────────────────────────────────
H = 10  # try 10, 20, 50 — watch what happens

# Single hidden layer, tanh activation: 1 -> H -> 1.
model = nn.Sequential(
    nn.Linear(1, H),
    nn.Tanh(),
    nn.Linear(H, 1),
)
# Total trainable parameters: (1*H + H) + (H*1 + 1) = 3H + 1.
n_params = sum(p.numel() for p in model.parameters())

print(f"Network: 1 -> {H} (tanh) -> 1")
print(f"Parameters: {n_params}")
print(f"Training points: {len(T_train)}")
print(f"Validation points: {len(T_val)}")
print(f"Noise scale: {noise_scale} kJ/kg/K\n")

# ── Training ──────────────────────────────────────────────────
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.MSELoss()

epochs = 10000
log_interval = 1000

# Per-epoch loss histories for the loss-curve plots below.
train_losses = []
val_losses = []
best_val_loss = float('inf')
best_epoch = 0

for epoch in range(epochs):
    # --- Training step ---
    # Full-batch: every epoch uses all training points at once.
    model.train()
    Y_pred = model(X_train)
    train_loss = loss_fn(Y_pred, Y_train)
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    # --- Validation step (no gradient computation) ---
    model.eval()
    with torch.no_grad():
        val_pred = model(X_val)
        val_loss = loss_fn(val_pred, Y_val)

    train_losses.append(train_loss.item())
    val_losses.append(val_loss.item())

    # Track the best validation loss — same idea as nanoGPT's train.py
    # NOTE(review): only the epoch number is recorded; the weights are
    # not checkpointed, so the plots below use the LAST-epoch model
    # (which is the point — it shows the overfit fit).
    if val_loss.item() < best_val_loss:
        best_val_loss = val_loss.item()
        best_epoch = epoch

    if epoch % log_interval == 0 or epoch == epochs - 1:
        print(f"Epoch {epoch:5d} Train: {train_loss.item():.6f} "
              f"Val: {val_loss.item():.6f}")

print(f"\nBest validation loss: {best_val_loss:.6f} at epoch {best_epoch}")

# ── Results ───────────────────────────────────────────────────
# Dense grid over the normalized input range [0, 1] for a smooth curve.
T_fine = torch.linspace(0, 1, 200).reshape(-1, 1)
model.eval()
with torch.no_grad():
    Cp_pred_norm = model(T_fine)

# Undo the normalization so the fit is plotted in physical units.
T_fine_K = T_fine.numpy() * (T_max - T_min) + T_min
Cp_pred = denormalize_Cp(Cp_pred_norm.numpy())

# ── Plot ──────────────────────────────────────────────────────
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Left: the fit
ax = axes[0]
ax.plot(T_train, Cp_train, 'ko', markersize=6, label='Train (noisy)')
ax.plot(T_val, Cp_val, 'bs', markersize=6, label='Validation (noisy)')
ax.plot(T_raw, Cp_raw, 'g--', linewidth=1, alpha=0.7, label='True (NIST)')
ax.plot(T_fine_K, Cp_pred, 'r-', linewidth=2, label=f'NN ({H} neurons)')
# Finish the left panel (the fit itself, plotted just above).
ax.set_ylabel('$C_p$ (kJ/kg/K)')
ax.set_xlabel('Temperature (K)')
ax.set_title(f'Noisy $C_p(T)$ — noise = {noise_scale}')
ax.legend(fontsize=8)

# Middle panel: training loss alone, on a log scale.
ax = axes[1]
ax.semilogy(train_losses, label='Train loss')
ax.set_title('Training Loss')
ax.set_ylabel('MSE')
ax.set_xlabel('Epoch')
ax.legend()

# Right panel: train vs. validation loss overlaid; the dashed vertical
# line marks where the validation loss bottomed out.
ax = axes[2]
ax.semilogy(train_losses, label='Train loss')
ax.semilogy(val_losses, label='Validation loss')
ax.axvline(best_epoch, color='gray', linestyle='--', alpha=0.5,
           label=f'Best val (epoch {best_epoch})')
ax.set_title('Train vs. Validation Loss')
ax.set_ylabel('MSE')
ax.set_xlabel('Epoch')
ax.legend(fontsize=8)

# Write the figure to disk, then pop the interactive window.
plt.tight_layout()
plt.savefig('nn_fit_noisy.png', dpi=150)
plt.show()