Add validation set, and gridsearch functionality

This commit is contained in:
Murtadha 2024-09-26 19:58:47 -04:00
parent f7b126c679
commit 99640d88e8

View file

@ -0,0 +1,301 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from itertools import product"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Load the CSV and continue with a plain NumPy array\n",
"data = np.array(pd.read_csv('data/bel_data_test.csv'))\n",
"\n",
"# Column 0 becomes the integer target vector Y; every other column is a feature.\n",
"# X is transposed so that each column of X holds one sample.\n",
"Y = data[:, 0].astype(int)\n",
"X = data[:, 1:].T\n",
"\n",
"# The first 1000 samples are held out as the test set,\n",
"# everything after them is used for training and validation\n",
"X_test, X_remain = X[:, :1000], X[:, 1000:]\n",
"Y_test, Y_remain = Y[:1000], Y[1000:]\n",
"\n",
"# train_test_split works on rows, so samples are transposed into rows for the\n",
"# split (80% train / 20% validation, fixed seed) and transposed back afterwards\n",
"X_train, X_val, Y_train, Y_val = train_test_split(\n",
"    X_remain.T,\n",
"    Y_remain,\n",
"    test_size=0.2,\n",
"    random_state=42,\n",
")\n",
"X_train = X_train.T\n",
"X_val = X_val.T"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sizes of the first and last network layers, taken from the data\n",
"input_size = X_train.shape[0]  # number of rows of X_train (features per sample)\n",
"# NOTE(review): this is one less than the number of distinct values in Y --\n",
"# confirm the label set really contains a value that needs no output unit.\n",
"output_size = np.unique(Y).size - 1\n",
"\n",
"print(f\"Input layer size: {input_size}\")\n",
"print(f\"Output layer size: {output_size}\")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"def init_params(layer_dims):\n",
"    \"\"\"Create the initial weights and biases for a fully connected network.\n",
"\n",
"    Args:\n",
"        layer_dims: sequence of layer sizes, input layer first, output layer last.\n",
"\n",
"    Returns:\n",
"        Dict with keys ``W1, b1, W2, b2, ...``. ``Wl`` has shape\n",
"        ``(layer_dims[l], layer_dims[l-1])`` and is drawn from a standard normal\n",
"        scaled by ``sqrt(2 / layer_dims[l-1])``; ``bl`` is a zero column vector.\n",
"    \"\"\"\n",
"    params = {}\n",
"    size_pairs = zip(layer_dims[:-1], layer_dims[1:])\n",
"    for idx, (fan_in, fan_out) in enumerate(size_pairs, start=1):\n",
"        scale = np.sqrt(2. / fan_in)\n",
"        params[f'W{idx}'] = np.random.randn(fan_out, fan_in) * scale\n",
"        params[f'b{idx}'] = np.zeros((fan_out, 1))\n",
"    return params"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"def ReLU(Z):\n",
"    \"\"\"Apply the rectified linear unit element-wise (entries below 0 become 0).\"\"\"\n",
"    rectified = np.maximum(Z, 0)\n",
"    return rectified\n",
"\n",
"def softmax(Z):\n",
"    \"\"\"Return the softmax of Z taken along axis 0, so each column sums to 1.\n",
"\n",
"    The per-column maximum is subtracted before exponentiating. This leaves the\n",
"    result mathematically unchanged but keeps np.exp from overflowing to inf\n",
"    (and the division from producing NaN) when the inputs are large.\n",
"\n",
"    Args:\n",
"        Z: array of pre-activations; axis 0 indexes the classes.\n",
"\n",
"    Returns:\n",
"        Array of the same shape as Z holding the normalized exponentials.\n",
"    \"\"\"\n",
"    shifted = Z - np.max(Z, axis=0, keepdims=True)\n",
"    exp_Z = np.exp(shifted)\n",
"    # np.sum with an explicit axis replaces the builtin sum(), which iterated\n",
"    # over the rows of the array in Python\n",
"    return exp_Z / np.sum(exp_Z, axis=0, keepdims=True)\n",
"\n",
"def forward_prop(X, params):\n",
"    \"\"\"Run a forward pass: ReLU on every hidden layer, softmax on the last layer.\n",
"\n",
"    Args:\n",
"        X: network input; it is multiplied from the left by ``W1``.\n",
"        params: dict holding ``W1, b1, ..., WL, bL``.\n",
"\n",
"    Returns:\n",
"        Tuple ``(AL, caches)``: the softmax output and, for each layer in order,\n",
"        a tuple ``(layer input, W, b, Z)`` for use in the backward pass.\n",
"    \"\"\"\n",
"    num_layers = len(params) // 2  # one W and one b per layer\n",
"    caches = []\n",
"    activation = X\n",
"\n",
"    # Hidden layers 1 .. L-1: affine transform followed by ReLU\n",
"    for idx in range(1, num_layers):\n",
"        weights, bias = params[f'W{idx}'], params[f'b{idx}']\n",
"        pre_activation = np.dot(weights, activation) + bias\n",
"        caches.append((activation, weights, bias, pre_activation))\n",
"        activation = ReLU(pre_activation)\n",
"\n",
"    # Output layer L: affine transform followed by softmax\n",
"    out_weights, out_bias = params[f'W{num_layers}'], params[f'b{num_layers}']\n",
"    out_pre_activation = np.dot(out_weights, activation) + out_bias\n",
"    output = softmax(out_pre_activation)\n",
"    caches.append((activation, out_weights, out_bias, out_pre_activation))\n",
"\n",
"    return output, caches\n",
"\n",
"def ReLU_deriv(Z):\n",
"    \"\"\"Return a boolean mask that is True where Z is strictly positive.\"\"\"\n",
"    positive_mask = np.greater(Z, 0)\n",
"    return positive_mask\n",
"\n",
"def one_hot(Y, num_classes=None):\n",
"    \"\"\"One-hot encode integer labels, one column per label.\n",
"\n",
"    Args:\n",
"        Y: 1-D integer array of class labels.\n",
"        num_classes: number of rows of the result. When omitted it is inferred\n",
"            as ``Y.max() + 1``, which is too small whenever the highest class\n",
"            does not occur in ``Y``; pass it explicitly to match a fixed\n",
"            network output size.\n",
"\n",
"    Returns:\n",
"        Float array of shape ``(num_classes, Y.size)`` with a 1 at\n",
"        ``[Y[i], i]`` and 0 everywhere else.\n",
"    \"\"\"\n",
"    if num_classes is None:\n",
"        # Previous behaviour: derive the class count from the labels present\n",
"        num_classes = Y.max() + 1\n",
"    one_hot_Y = np.zeros((num_classes, Y.size))\n",
"    # Row = class, column = sample\n",
"    one_hot_Y[Y, np.arange(Y.size)] = 1\n",
"    return one_hot_Y\n",
"\n",
"def backward_prop(AL, Y, caches):\n",
"    \"\"\"Compute the gradients of all weights and biases by backpropagation.\n",
"\n",
"    Args:\n",
"        AL: output of the forward pass; axis 1 indexes the samples.\n",
"        Y: integer labels, one-hot encoded internally via ``one_hot``.\n",
"        caches: per-layer tuples ``(layer input, W, b, Z)`` from ``forward_prop``.\n",
"\n",
"    Returns:\n",
"        Dict with keys ``dW1, db1, ..., dWL, dbL``, each averaged over the samples.\n",
"    \"\"\"\n",
"    grads = {}\n",
"    num_layers = len(caches)\n",
"    m = AL.shape[1]\n",
"\n",
"    # Output layer: the error signal is the prediction minus the one-hot target\n",
"    dZ = AL - one_hot(Y)\n",
"    A_prev, W, _, _ = caches[num_layers - 1]\n",
"    inv_m = 1 / m\n",
"    grads[f\"dW{num_layers}\"] = inv_m * np.dot(dZ, A_prev.T)\n",
"    grads[f\"db{num_layers}\"] = inv_m * np.sum(dZ, axis=1, keepdims=True)\n",
"    dA = np.dot(W.T, dZ)\n",
"\n",
"    # Hidden layers, walking from layer L-1 down to layer 1\n",
"    for idx in range(num_layers - 1, 0, -1):\n",
"        A_prev, W, _, Z = caches[idx - 1]\n",
"        dZ = dA * ReLU_deriv(Z)\n",
"        grads[f\"dW{idx}\"] = inv_m * np.dot(dZ, A_prev.T)\n",
"        grads[f\"db{idx}\"] = inv_m * np.sum(dZ, axis=1, keepdims=True)\n",
"        # Nothing sits below layer 1, so the error is not propagated further\n",
"        if idx > 1:\n",
"            dA = np.dot(W.T, dZ)\n",
"\n",
"    return grads\n",
"\n",
"def update_params(params, grads, alpha):\n",
"    \"\"\"Take one gradient-descent step on every layer.\n",
"\n",
"    The arrays stored in ``params`` are modified in place; the same dict is\n",
"    returned for convenience.\n",
"\n",
"    Args:\n",
"        params: dict holding ``W1, b1, ..., WL, bL``.\n",
"        grads: dict holding the matching ``dW1, db1, ..., dWL, dbL``.\n",
"        alpha: learning rate.\n",
"    \"\"\"\n",
"    num_layers = len(params) // 2\n",
"    for idx in range(1, num_layers + 1):\n",
"        weight_key, bias_key = f\"W{idx}\", f\"b{idx}\"\n",
"        params[weight_key] -= alpha * grads[f\"dW{idx}\"]\n",
"        params[bias_key] -= alpha * grads[f\"db{idx}\"]\n",
"    return params\n",
"\n",
"def get_predictions(AL):\n",
"    \"\"\"Return, for each column of AL, the row index of its largest entry.\"\"\"\n",
"    predicted = np.argmax(AL, axis=0)\n",
"    return predicted\n",
"\n",
"def get_accuracy(predictions, Y):\n",
"    \"\"\"Return the fraction of entries of ``predictions`` that equal ``Y``.\"\"\"\n",
"    num_correct = np.sum(predictions == Y)\n",
"    return num_correct / Y.size"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"def gradient_descent(X_train, Y_train, X_val, Y_val, layer_dims, alpha, iterations, early_stop_patience=10):\n",
"    \"\"\"Train a network with full-batch gradient descent and early stopping.\n",
"\n",
"    Every 100 iterations the training and validation accuracy are printed and\n",
"    recorded. The parameters with the highest validation accuracy seen so far\n",
"    are kept, and training stops once ``early_stop_patience`` consecutive\n",
"    evaluations bring no improvement.\n",
"\n",
"    Args:\n",
"        X_train, Y_train: training inputs and integer labels.\n",
"        X_val, Y_val: validation inputs and integer labels.\n",
"        layer_dims: layer sizes handed to ``init_params``.\n",
"        alpha: learning rate.\n",
"        iterations: maximum number of gradient steps.\n",
"        early_stop_patience: number of evaluations (not iterations) without\n",
"            improvement that are tolerated before stopping.\n",
"\n",
"    Returns:\n",
"        Tuple ``(best_params, best_val_accuracy, acc_store)``; ``acc_store`` is a\n",
"        list of ``(train_accuracy, val_accuracy)`` pairs, one per evaluation.\n",
"    \"\"\"\n",
"    params = init_params(layer_dims)\n",
"    # update_params changes the weight arrays in place, so the snapshot must own\n",
"    # copies of the arrays: a shallow dict copy would keep following the live\n",
"    # weights. Starting from the initial weights also guarantees a defined\n",
"    # return value when the validation accuracy never rises above 0.\n",
"    best_params = {name: value.copy() for name, value in params.items()}\n",
"    best_val_accuracy = 0\n",
"    patience_counter = 0\n",
"    acc_store = []\n",
"\n",
"    for i in range(iterations):\n",
"        AL, caches = forward_prop(X_train, params)\n",
"        grads = backward_prop(AL, Y_train, caches)\n",
"        params = update_params(params, grads, alpha)\n",
"\n",
"        # Only evaluate every 100th iteration\n",
"        if i % 100 != 0:\n",
"            continue\n",
"\n",
"        # AL stems from before this iteration's update, whereas the validation\n",
"        # pass below already uses the updated weights\n",
"        train_accuracy = get_accuracy(get_predictions(AL), Y_train)\n",
"\n",
"        val_AL, _ = forward_prop(X_val, params)\n",
"        val_accuracy = get_accuracy(get_predictions(val_AL), Y_val)\n",
"\n",
"        print(f\"Iteration {i}: Train Accuracy: {train_accuracy:.4f}, Validation Accuracy: {val_accuracy:.4f}\")\n",
"        acc_store.append((train_accuracy, val_accuracy))\n",
"\n",
"        if val_accuracy > best_val_accuracy:\n",
"            best_val_accuracy = val_accuracy\n",
"            best_params = {name: value.copy() for name, value in params.items()}\n",
"            patience_counter = 0\n",
"        else:\n",
"            patience_counter += 1\n",
"\n",
"        if patience_counter >= early_stop_patience:\n",
"            print(\"Early stopping triggered.\")\n",
"            break\n",
"\n",
"    return best_params, best_val_accuracy, acc_store"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"def grid_search(X_train, Y_train, X_val, Y_val, layer_configs, alpha, iterations):\n",
"    \"\"\"Train one network per hidden-layer configuration and rank the results.\n",
"\n",
"    The full layer list of each network is the global ``input_size``, then the\n",
"    hidden sizes of the configuration, then the global ``output_size``.\n",
"\n",
"    Args:\n",
"        X_train, Y_train, X_val, Y_val: data forwarded to ``gradient_descent``.\n",
"        layer_configs: iterable of hidden-layer size sequences.\n",
"        alpha: learning rate used for every configuration.\n",
"        iterations: iteration budget used for every configuration.\n",
"\n",
"    Returns:\n",
"        List of ``(layer_config, accuracy, best_params, acc_store)`` tuples,\n",
"        highest validation accuracy first.\n",
"    \"\"\"\n",
"    results = []\n",
"\n",
"    for layer_config in layer_configs:\n",
"        layer_dims = [input_size, *layer_config, output_size]\n",
"        print(f\"Training architecture: {layer_dims}\")\n",
"        best_params, accuracy, acc_store = gradient_descent(\n",
"            X_train, Y_train, X_val, Y_val, layer_dims, alpha, iterations\n",
"        )\n",
"        results.append((layer_config, accuracy, best_params, acc_store))\n",
"        print(f\"Architecture {layer_dims}: Best Validation Accuracy: {accuracy:.4f}\\n\")\n",
"\n",
"    # Rank by validation accuracy, best first\n",
"    results.sort(key=lambda entry: entry[1], reverse=True)\n",
"    return results"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# def predict(X, parameters):\n",
"# AL, _ = forward_propagation(X, parameters)\n",
"# predictions = (AL > 0.5) # Classify as 1 if greater than 0.5\n",
"# return predictions"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Search space: number of hidden layers and the candidate width of each layer\n",
"hidden_layers = [1, 2]\n",
"neurons_per_layer = [64, 128, 256]\n",
"\n",
"# Build every width combination for EVERY requested depth. Using only\n",
"# max(hidden_layers) produced the 2-layer networks alone and silently skipped\n",
"# the 1-layer architectures listed in hidden_layers.\n",
"layer_configs = [\n",
"    config\n",
"    for depth in hidden_layers\n",
"    for config in product(neurons_per_layer, repeat=depth)\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Run the architecture search over all candidate configurations\n",
"print(\"Performing grid search...\")\n",
"best_configs = grid_search(\n",
"    X_train, Y_train, X_val, Y_val, layer_configs, alpha=0.01, iterations=4000\n",
")\n",
"\n",
"# grid_search returns its results best-first, so the head of the list is the leaderboard\n",
"print(\"\\nTop 5 Architectures:\")\n",
"for config, accuracy, _, _ in best_configs[:5]:\n",
"    print(f\"Hidden Layers: {config}, Validation Accuracy: {accuracy:.4f}\")\n",
"\n",
"# The winning configuration is the first entry\n",
"best_config, best_accuracy, best_params, best_acc_store = best_configs[0]\n",
"best_layer_dims = [input_size, *best_config, output_size]\n",
"\n",
"print(f\"\\nBest architecture: {best_layer_dims}\")\n",
"print(f\"Best validation accuracy: {best_accuracy:.4f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create the output folders first: without them the writes below fail and the\n",
"# outcome of the (long) grid search would be lost to a missing directory\n",
"Path('results').mkdir(parents=True, exist_ok=True)\n",
"Path('weights').mkdir(parents=True, exist_ok=True)\n",
"\n",
"# Save the accuracy data for the best model\n",
"df = pd.DataFrame(best_acc_store, columns=['Train Accuracy', 'Validation Accuracy'])\n",
"df.to_csv('results/bel_acc.csv', index=False)\n",
"\n",
"# Save the weights of the best model (one array per W/b key)\n",
"np.savez(\"weights/bel_weights\", **best_params)\n",
"\n",
"# Evaluate on the held-out test set\n",
"test_AL, _ = forward_prop(X_test, best_params)\n",
"test_predictions = get_predictions(test_AL)\n",
"test_accuracy = get_accuracy(test_predictions, Y_test)\n",
"print(f\"Test Accuracy: {test_accuracy:.4f}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "semantics",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}