diff --git a/bel_semantics.ipynb b/bel_semantics.ipynb
index 4083ea5..9500c3c 100644
--- a/bel_semantics.ipynb
+++ b/bel_semantics.ipynb
@@ -22,9 +22,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Using device: cuda\n"
+     ]
+    }
+   ],
    "source": [
     "# Check if CUDA is available\n",
     "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
@@ -35,18 +43,40 @@
    "cell_type": "code",
    "execution_count": 3,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Train set shape: (2860, 1024)\n",
+      "Validation set shape: (715, 1024)\n",
+      "Test set shape: (1000, 1024)\n"
+     ]
+    }
+   ],
    "source": [
     "data = pd.read_csv('data/bel_data_test.csv')\n",
-    "# Load the data\n",
     "data = np.array(data)\n",
     "\n",
     "# Split features and labels\n",
     "X = data[:, 1:]  # All columns except the first one\n",
     "y = data[:, 0].astype(int)  # First column as labels\n",
     "\n",
-    "# Split the data into training and testing sets\n",
-    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
+    "# Hold out the first thousand rows as the test set\n",
+    "n_test = 1000\n",
+    "X_test, y_test = X[:n_test], y[:n_test]\n",
+    "\n",
+    "# Everything after the held-out rows is shared by training and validation\n",
+    "X_remaining, y_remaining = X[n_test:], y[n_test:]\n",
+    "\n",
+    "# 80/20 train/validation split of the remainder, seeded for repeatability\n",
+    "X_train, X_val, y_train, y_val = train_test_split(\n",
+    "    X_remaining, y_remaining, test_size=0.2, random_state=42\n",
+    ")\n",
+    "\n",
+    "print(f\"Train set shape: {X_train.shape}\")\n",
+    "print(f\"Validation set shape: {X_val.shape}\")\n",
+    "print(f\"Test set shape: {X_test.shape}\")"
    ]
   },
   {
@@ -55,18 +85,18 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Convert to PyTorch tensors\n",
-    "X_train_tensor = torch.FloatTensor(X_train)\n",
-    "y_train_tensor = torch.LongTensor(y_train)\n",
-    "X_test_tensor = torch.FloatTensor(X_test)\n",
-    "y_test_tensor = torch.LongTensor(y_test)\n",
+    "# # Convert to PyTorch tensors\n",
+    "# X_train_tensor = torch.FloatTensor(X_train)\n",
+    "# y_train_tensor = torch.LongTensor(y_train)\n",
+    "# X_test_tensor = torch.FloatTensor(X_test)\n",
+    "# y_test_tensor = torch.LongTensor(y_test)\n",
     "\n",
-    "# Create DataLoader objects\n",
-    "train_dataset = TensorDataset(X_train_tensor, y_train_tensor)\n",
-    "test_dataset = TensorDataset(X_test_tensor, y_test_tensor)\n",
+    "# # Create DataLoader objects\n",
+    "# train_dataset = TensorDataset(X_train_tensor, y_train_tensor)\n",
+    "# test_dataset = TensorDataset(X_test_tensor, y_test_tensor)\n",
     "\n",
-    "train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)\n",
-    "test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)"
+    "# train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)\n",
+    "# test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)"
    ]
   },
   {
@@ -75,45 +105,50 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "class SemanticsMLP(nn.Module):\n",
+    "class SemanticsMLP:\n",
     "    def __init__(self, input_size=1024, hidden_sizes=[512, 256, 128], num_classes=62):\n",
-    "        super(SemanticsMLP, self).__init__()\n",
     "        self.input_size = input_size\n",
     "        self.hidden_sizes = hidden_sizes\n",
     "        self.num_classes = num_classes\n",
     "\n",
-    "        # Encoder (feature extractor)\n",
-    "        self.encoder_layers = nn.ModuleList()\n",
+    "        # Initialize weights and biases\n",
+    "        self.encoder_weights = []\n",
+    "        self.encoder_biases = []\n",
     "        prev_size = input_size\n",
     "        for hidden_size in hidden_sizes:\n",
-    "            self.encoder_layers.append(nn.Linear(prev_size, hidden_size))\n",
+    "            self.encoder_weights.append(np.random.randn(prev_size, hidden_size) * np.sqrt(2. / prev_size))\n",
+    "            self.encoder_biases.append(np.zeros(hidden_size))\n",
     "            prev_size = hidden_size\n",
     "\n",
-    "        # Classifier\n",
-    "        self.classifier = nn.Linear(hidden_sizes[-1], num_classes)\n",
+    "        self.classifier_weight = np.random.randn(hidden_sizes[-1], num_classes) * np.sqrt(2. / hidden_sizes[-1])\n",
+    "        self.classifier_bias = np.zeros(num_classes)\n",
     "\n",
-    "        # Decoder\n",
-    "        self.decoder_layers = nn.ModuleList()\n",
+    "        self.decoder_weights = []\n",
+    "        self.decoder_biases = []\n",
     "        reversed_hidden_sizes = list(reversed(hidden_sizes))\n",
     "        prev_size = hidden_sizes[-1]\n",
     "        for hidden_size in reversed_hidden_sizes[1:] + [input_size]:\n",
-    "            self.decoder_layers.append(nn.Linear(prev_size, hidden_size))\n",
+    "            self.decoder_weights.append(np.random.randn(prev_size, hidden_size) * np.sqrt(2. / prev_size))\n",
+    "            self.decoder_biases.append(np.zeros(hidden_size))\n",
     "            prev_size = hidden_size\n",
     "\n",
+    "    def relu(self, x):\n",
+    "        return np.maximum(0, x)\n",
+    "\n",
     "    def encode(self, x):\n",
-    "        for layer in self.encoder_layers:\n",
-    "            x = F.relu(layer(x))\n",
+    "        for weight, bias in zip(self.encoder_weights, self.encoder_biases):\n",
+    "            x = self.relu(np.dot(x, weight) + bias)\n",
     "        return x\n",
     "\n",
     "    def decode(self, x):\n",
-    "        for layer in self.decoder_layers[:-1]:\n",
-    "            x = F.relu(layer(x))\n",
-    "        x = self.decoder_layers[-1](x)  # No activation on the final layer\n",
+    "        for weight, bias in zip(self.decoder_weights[:-1], self.decoder_biases[:-1]):\n",
+    "            x = self.relu(np.dot(x, weight) + bias)\n",
+    "        x = np.dot(x, self.decoder_weights[-1]) + self.decoder_biases[-1]  # No activation on the final layer\n",
     "        return x\n",
     "\n",
     "    def forward(self, x):\n",
     "        encoded = self.encode(x)\n",
-    "        logits = self.classifier(encoded)\n",
+    "        logits = np.dot(encoded, self.classifier_weight) + self.classifier_bias\n",
     "        reconstructed = self.decode(encoded)\n",
     "        return logits, reconstructed"
    ]
@@ -123,6 +158,72 @@
    "execution_count": 6,
    "metadata": {},
    "outputs": [],
+   "source": [
+    "def softmax(x):  # row-wise softmax: each row of x is normalized along axis 1\n",
+    "    weights = np.exp(x - np.amax(x, axis=1, keepdims=True))  # shifting by the row max keeps every exponent <= 0\n",
+    "    return weights / weights.sum(axis=1, keepdims=True)\n",
+    "\n",
+    "def cross_entropy_loss(y_pred, y_true):\n",
+    "    \"\"\"Return the mean softmax cross-entropy of logits y_pred (one row per sample) against integer labels y_true.\"\"\"\n",
+    "    m = y_true.shape[0]\n",
+    "    p_true = softmax(y_pred)[range(m), y_true]  # probability assigned to each sample's true class\n",
+    "    # Floor the probabilities so one that underflowed to 0 gives a large finite loss instead of inf\n",
+    "    return -np.sum(np.log(np.clip(p_true, 1e-12, None))) / m\n",
+    "\n",
+    "def cross_entropy_gradient(y_pred, y_true):\n",
+    "    # Gradient of the batch-mean cross-entropy w.r.t. the logits: (softmax - one_hot) / batch size\n",
+    "    n_samples = y_true.shape[0]\n",
+    "    probs = softmax(y_pred)\n",
+    "    probs[np.arange(n_samples), y_true] -= 1  # subtract the one-hot target in place\n",
+    "    return probs / n_samples\n",
+    "\n",
+    "def mse_loss(y_pred, y_true):  # squared error averaged over every element (all rows and columns)\n",
+    "    return np.mean(np.square(y_pred - y_true))\n",
+    "\n",
+    "def mse_gradient(y_pred, y_true):  # gradient of mse_loss w.r.t. y_pred; the mean spans every element, so scale by 1/size\n",
+    "    return 2 * (y_pred - y_true) / y_true.size\n",
+    "\n",
+    "def train_step(model, X, y, learning_rate):\n",
+    "    \"\"\"Run one SGD step on the classifier and last decoder layer; return (ce_loss, mse_loss) measured before the update.\"\"\"\n",
+    "    # Forward pass, keeping the activations that feed the two trained layers\n",
+    "    encoded = model.encode(X)\n",
+    "    logits = np.dot(encoded, model.classifier_weight) + model.classifier_bias\n",
+    "    hidden = encoded\n",
+    "    for weight, bias in zip(model.decoder_weights[:-1], model.decoder_biases[:-1]):\n",
+    "        hidden = model.relu(np.dot(hidden, weight) + bias)\n",
+    "    reconstructed = np.dot(hidden, model.decoder_weights[-1]) + model.decoder_biases[-1]\n",
+    "\n",
+    "    # Compute losses from the pre-update outputs\n",
+    "    ce_loss = cross_entropy_loss(logits, y)\n",
+    "    mse_loss_val = mse_loss(reconstructed, X)\n",
+    "\n",
+    "    # Gradients of each loss w.r.t. the logits / the reconstruction\n",
+    "    ce_grad = cross_entropy_gradient(logits, y)\n",
+    "    mse_grad = mse_gradient(reconstructed, X)\n",
+    "\n",
+    "    # Update classifier (its input is the encoder output)\n",
+    "    model.classifier_weight -= learning_rate * np.dot(encoded.T, ce_grad)\n",
+    "    model.classifier_bias -= learning_rate * np.sum(ce_grad, axis=0)\n",
+    "\n",
+    "    # Update the last decoder layer only; its input is `hidden`, not `encoded`,\n",
+    "    # which differ in width whenever the decoder has more than one layer\n",
+    "    model.decoder_weights[-1] -= learning_rate * np.dot(hidden.T, mse_grad)\n",
+    "    model.decoder_biases[-1] -= learning_rate * np.sum(mse_grad, axis=0)\n",
+    "    return ce_loss, mse_loss_val\n",
+    "\n",
+    "def evaluate(model, X, y):\n",
+    "    # Forward pass only: returns (cross-entropy, reconstruction MSE, top-1 accuracy) on (X, y)\n",
+    "    logits, reconstructed = model.forward(X)\n",
+    "    predicted = np.argmax(logits, axis=1)\n",
+    "    accuracy = np.mean(predicted == y)\n",
+    "    return cross_entropy_loss(logits, y), mse_loss(reconstructed, X), accuracy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "def show_image_comparison(original, reconstructed, label, prediction):\n",
     "    \"\"\"\n",
@@ -160,14 +261,64 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
-    "model = SemanticsMLP(input_size=1024, hidden_sizes=[10], num_classes=62).to(device)\n",
-    "criterion = nn.CrossEntropyLoss()\n",
-    "reconstruction_criterion = nn.MSELoss()\n",
-    "optimizer = optim.Adam(model.parameters(), lr=0.001)"
+    "input_size = X_train.shape[1]  # feature count per sample\n",
+    "num_classes = int(y.max()) + 1  # labels index the logit columns directly, so size by the largest label, not the count of distinct labels\n",
+    "model = SemanticsMLP(input_size=input_size, hidden_sizes=[512, 256, 128], num_classes=num_classes)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ValueError",
+     "evalue": "operands could not be broadcast together with shapes (512,1024) (128,1024) (512,1024) ",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[9], line 17\u001b[0m\n\u001b[1;32m     14\u001b[0m     X_batch \u001b[38;5;241m=\u001b[39m X_train[i:i\u001b[38;5;241m+\u001b[39mbatch_size]\n\u001b[1;32m     15\u001b[0m     y_batch \u001b[38;5;241m=\u001b[39m y_train[i:i\u001b[38;5;241m+\u001b[39mbatch_size]\n\u001b[0;32m---> 17\u001b[0m     ce_loss, mse_loss_val \u001b[38;5;241m=\u001b[39m \u001b[43mtrain_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX_batch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_batch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlearning_rate\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     19\u001b[0m \u001b[38;5;66;03m# Evaluate on validation set (using test set as validation for simplicity)\u001b[39;00m\n\u001b[1;32m     20\u001b[0m val_ce_loss, val_mse_loss, val_accuracy \u001b[38;5;241m=\u001b[39m evaluate(model, X_test, y_test)\n",
+      "Cell \u001b[0;32mIn[6], line 41\u001b[0m, in \u001b[0;36mtrain_step\u001b[0;34m(model, X, y, learning_rate)\u001b[0m\n\u001b[1;32m     38\u001b[0m model\u001b[38;5;241m.\u001b[39mclassifier_bias \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m=\u001b[39m learning_rate \u001b[38;5;241m*\u001b[39m np\u001b[38;5;241m.\u001b[39msum(ce_grad, axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m)\n\u001b[1;32m     40\u001b[0m \u001b[38;5;66;03m# Update decoder (last layer only for simplicity)\u001b[39;00m\n\u001b[0;32m---> 41\u001b[0m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecoder_weights\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mlearning_rate\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdot\u001b[49m\u001b[43m(\u001b[49m\u001b[43mencoded\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mT\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmse_grad\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     42\u001b[0m model\u001b[38;5;241m.\u001b[39mdecoder_biases[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m] \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m=\u001b[39m learning_rate \u001b[38;5;241m*\u001b[39m np\u001b[38;5;241m.\u001b[39msum(mse_grad, axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m)\n\u001b[1;32m     44\u001b[0m \u001b[38;5;66;03m# Compute loss\u001b[39;00m\n",
+      "\u001b[0;31mValueError\u001b[0m: operands could not be broadcast together with shapes (512,1024) (128,1024) (512,1024) "
+     ]
+    }
+   ],
+   "source": [
+    "# Training hyperparameters\n",
+    "epochs = 100\n",
+    "batch_size = 32\n",
+    "learning_rate = 0.001\n",
+    "\n",
+    "for epoch in range(epochs):\n",
+    "    # Reshuffle the training set at the start of every epoch\n",
+    "    indices = np.arange(X_train.shape[0])\n",
+    "    np.random.shuffle(indices)\n",
+    "    X_train, y_train = X_train[indices], y_train[indices]\n",
+    "\n",
+    "    # One SGD step per mini-batch (the last batch may be smaller)\n",
+    "    for i in range(0, X_train.shape[0], batch_size):\n",
+    "        batch = slice(i, i + batch_size)\n",
+    "        X_batch, y_batch = X_train[batch], y_train[batch]\n",
+    "        try:\n",
+    "            ce_loss, mse_loss_val = train_step(model, X_batch, y_batch, learning_rate)\n",
+    "        except ValueError as e:\n",
+    "            # Report the failing batch and its shapes, then let the error propagate\n",
+    "            print(f\"Error in batch {i // batch_size}: {e}\")\n",
+    "            print(f\"X_batch shape: {X_batch.shape}\")\n",
+    "            print(f\"y_batch shape: {y_batch.shape}\")\n",
+    "            raise\n",
+    "\n",
+    "    # Score the validation split after every epoch\n",
+    "    val_ce_loss, val_mse_loss, val_accuracy = evaluate(model, X_val, y_val)\n",
+    "    # Log only every tenth epoch\n",
+    "    if epoch % 10 == 0:\n",
+    "        print(f\"Epoch {epoch}, Val CE Loss: {val_ce_loss:.4f}, Val MSE Loss: {val_mse_loss:.4f}, Val Accuracy: {val_accuracy:.4f}\")\n",
+    "\n"
    ]
   },
   {
@@ -176,64 +327,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "num_epochs = 250\n",
-    "for epoch in range(num_epochs):\n",
-    "    model.train()\n",
-    "    running_loss = 0.0\n",
-    "    \n",
-    "    with tqdm(train_loader, unit=\"batch\") as tepoch:\n",
-    "        for images, labels in tepoch:\n",
-    "            tepoch.set_description(f\"Epoch {epoch+1}\")\n",
-    "            \n",
-    "            images, labels = images.to(device), labels.to(device)\n",
-    "            \n",
-    "            optimizer.zero_grad()\n",
-    "            \n",
-    "            logits, reconstructed = model(images)\n",
-    "            \n",
-    "            classification_loss = criterion(logits, labels)\n",
-    "            reconstruction_loss = reconstruction_criterion(reconstructed, images)\n",
-    "            total_loss = classification_loss + reconstruction_loss\n",
-    "            \n",
-    "            total_loss.backward()\n",
-    "            optimizer.step()\n",
-    "            \n",
-    "            running_loss += total_loss.item()\n",
-    "            \n",
-    "            tepoch.set_postfix(loss=total_loss.item())\n",
-    "    \n",
-    "    epoch_loss = running_loss / len(train_loader)\n",
-    "    # print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "model.eval()\n",
-    "with torch.no_grad():\n",
-    "    images, labels = next(iter(test_loader))\n",
-    "    images, labels = images.to(device), labels.to(device)\n",
-    "    \n",
-    "    logits, reconstructed = model(images)\n",
-    "    \n",
-    "    _, predicted = torch.max(logits.data, 1)\n",
-    "    \n",
-    "    num_images_to_show = min(5, len(images))\n",
-    "    for i in range(num_images_to_show):\n",
-    "        show_image_comparison(\n",
-    "            images[i], \n",
-    "            reconstructed[i], \n",
-    "            labels[i].item(), \n",
-    "            predicted[i].item()\n",
-    "        )\n",
-    "    \n",
-    "    correct = (predicted == labels).sum().item()\n",
-    "    total = labels.size(0)\n",
-    "    accuracy = 100 * correct / total\n",
-    "    print(f'Test Accuracy: {accuracy:.2f}%')"
+    "test_ce_loss, test_mse_loss, test_accuracy = evaluate(model, X_test, y_test)  # score the model on the held-out test split\n",
+    "print(f\"Final Test CE Loss: {test_ce_loss:.4f}, Test MSE Loss: {test_mse_loss:.4f}, Test Accuracy: {test_accuracy:.4f}\")"
    ]
   },
   {
diff --git a/weights/bel_weights.npz b/weights/bel_weights.npz
index 23ab97c..27346cf 100644
Binary files a/weights/bel_weights.npz and b/weights/bel_weights.npz differ