# nn_numpy.py
#
# A neural network with one hidden layer, built from scratch using numpy.
# Fits Cp(T) data for nitrogen gas at 1 bar (NIST WebBook).
#
# This demonstrates the core mechanics of a neural network:
#   - Forward pass: input -> hidden layer -> activation -> output
#   - Loss calculation (mean squared error)
#   - Backpropagation: computing gradients of the loss w.r.t. each weight
#   - Gradient descent: updating weights to minimize loss
#
# CHEG 667-013
# E. M. Furst

import numpy as np
import matplotlib.pyplot as plt

# ── Load and prepare data ──────────────────────────────────────
data = np.loadtxt("data/n2_cp.csv", delimiter=",", skiprows=1)
T_raw = data[:, 0]   # Temperature (K)
Cp_raw = data[:, 1]  # Heat capacity (kJ/kg/K)

# Min-max scale both axes onto [0, 1]; gradient descent behaves
# much better when inputs and targets are O(1) and non-extreme.
T_min, T_max = T_raw.min(), T_raw.max()
Cp_min, Cp_max = Cp_raw.min(), Cp_raw.max()
T = (T_raw - T_min) / (T_max - T_min)       # shape: (N,)
Cp = (Cp_raw - Cp_min) / (Cp_max - Cp_min)  # shape: (N,)

# One sample per row, so the entire dataset flows through the
# network as a single matrix product.
X = T.reshape(-1, 1)   # (N, 1) -- input matrix
Y = Cp.reshape(-1, 1)  # (N, 1) -- target matrix
N = X.shape[0]         # number of data points

# ── Network architecture ───────────────────────────────────────
#
#   Input (1) --> Hidden (H neurons, tanh) --> Output (1)
#
# Each hidden neuron computes
#   z = w * x + b       (weighted sum)
#   a = tanh(z)         (activation -- introduces nonlinearity)
# and the output layer is a linear combination of the activations:
#   y_pred = W2 @ a + b2
H = 10  # number of neurons in the hidden layer

# Small random initial weights break the symmetry between neurons;
# the fixed seed makes every run reproducible.
#   W1: (1, H) -- connects input to each hidden neuron
#   b1: (1, H) -- one bias per hidden neuron
#   W2: (H, 1) -- connects hidden neurons to output
#   b2: (1, 1) -- output bias
np.random.seed(42)
W1 = np.random.randn(1, H) * 0.5
b1 = np.zeros((1, H))
W2 = np.random.randn(H, 1) * 0.5
b2 = np.zeros((1, 1))

# ── Training parameters ────────────────────────────────────────
learning_rate = 0.01
epochs = 5000
log_interval = 500

# ── Training loop ──────────────────────────────────────────────
losses = []
for epoch in range(epochs):
    # Forward pass: push all N samples through at once.
    Z1 = X @ W1 + b1       # (N, H) hidden pre-activation
    A1 = np.tanh(Z1)       # (N, H) hidden activation
    Y_pred = A1 @ W2 + b2  # (N, 1) linear output (no activation)

    # Mean squared error over the batch.
    error = Y_pred - Y     # (N, 1)
    loss = np.mean(error ** 2)
    losses.append(loss)

    # Backpropagation: apply the chain rule, working backward
    # from the loss toward each weight.
    dL_dYpred = 2 * error / N                          # (N, 1)

    # Output-layer gradients.
    dL_dW2 = A1.T @ dL_dYpred                          # (H, 1)
    dL_db2 = np.sum(dL_dYpred, axis=0, keepdims=True)  # (1, 1)

    # Gradient flowing back into the hidden layer; note that
    # d/dz tanh(z) = 1 - tanh(z)^2 and A1 already holds tanh(z).
    dL_dA1 = dL_dYpred @ W2.T                          # (N, H)
    dL_dZ1 = dL_dA1 * (1 - A1 ** 2)                    # (N, H)

    # Hidden-layer gradients.
    dL_dW1 = X.T @ dL_dZ1                              # (1, H)
    dL_db1 = np.sum(dL_dZ1, axis=0, keepdims=True)     # (1, H)

    # Gradient descent: step every parameter downhill.
    W2 -= learning_rate * dL_dW2
    b2 -= learning_rate * dL_db2
    W1 -= learning_rate * dL_dW1
    b1 -= learning_rate * dL_db1

    if epoch % log_interval == 0 or epoch == epochs - 1:
        print(f"Epoch {epoch:5d} Loss: {loss:.6f}")

# ── Results ────────────────────────────────────────────────────
# Evaluate the trained network on a dense grid for a smooth curve.
T_fine = np.linspace(0, 1, 200).reshape(-1, 1)
A1_fine = np.tanh(T_fine @ W1 + b1)
Cp_pred_norm = A1_fine @ W2 + b2

# Undo the min-max scaling to recover physical units.
T_fine_K = T_fine * (T_max - T_min) + T_min
Cp_pred = Cp_pred_norm * (Cp_max - Cp_min) + Cp_min

# ── Plot ───────────────────────────────────────────────────────
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: measured data vs. the fitted curve.
ax1.plot(T_raw, Cp_raw, 'ko', markersize=6, label='NIST data')
ax1.plot(T_fine_K, Cp_pred, 'r-', linewidth=2, label=f'NN ({H} neurons)')
ax1.set_xlabel('Temperature (K)')
ax1.set_ylabel('$C_p$ (kJ/kg/K)')
ax1.set_title('$C_p(T)$ for N$_2$ at 1 bar')
ax1.legend()

# Right panel: loss history on a log scale.
ax2.semilogy(losses)
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Mean Squared Error')
ax2.set_title('Training Loss')

plt.tight_layout()
plt.savefig('nn_fit.png', dpi=150)
plt.show()

print(f"\nFinal loss: {losses[-1]:.6f}")
print(f"Network: {1} input -> {H} hidden (tanh) -> {1} output")
print(f"Total parameters: {W1.size + b1.size + W2.size + b2.size}")