Narrow hyperparams

This commit is contained in:
Andras Schmelczer 2024-06-04 07:50:02 +01:00
parent 6c2a20c73b
commit cb8a791477
No known key found for this signature in database
GPG key ID: FC8F2C3D3D1A718C
2 changed files with 85 additions and 251 deletions

View file

@ -15,13 +15,13 @@ MODELS = {
# "v1": v1,
"SimpleCNN": SimpleCNN,
"Residual": Residual,
"NormalisedCNN": NormalisedCNN,
"SmartRes": SmartRes,
# "NormalisedCNN": NormalisedCNN,
# "SmartRes": SmartRes,
# "AttentionNet": AttentionNet,
"attention2": EnhancedAestheticHistogramNet,
"advanced_attention": advanced_attention,
"Res2": Res2,
"attention1": PhotoEnhanceNetAdvanced,
# "attention2": EnhancedAestheticHistogramNet,
# "advanced_attention": advanced_attention,
# "Res2": Res2,
# "attention1": PhotoEnhanceNetAdvanced,
}

View file

@ -2,25 +2,18 @@
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {
"metadata": {}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-06-03 07:46:08,999 - INFO - PyTorch version: 2.2.2\n"
]
},
{
"data": {
"text/plain": [
"'Using device cuda:0'"
]
},
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
@ -51,17 +44,17 @@
"metadata": {},
"outputs": [],
"source": [
"from scipy.stats import loguniform, uniform\n",
"from scipy.stats import loguniform, uniform, randint\n",
"from editor.models import MODELS\n",
"\n",
"common_hyperparameters = {\n",
" \"batch_size\": [16, 32, 64],\n",
" \"edit_count\": [8, 16],\n",
" \"bin_count\": [16, 32],\n",
" \"bin_count\": [16, 24, 32],\n",
" \"clip_gradients\": [True, False],\n",
" \"learning_rate\": loguniform(0.0001, 0.005),\n",
" \"scheduler_gamma\": uniform(0.1, 0.9),\n",
" \"num_epochs\": [5],\n",
" \"learning_rate\": loguniform(0.00001, 0.01),\n",
" \"scheduler_gamma\": uniform(0, 1),\n",
" \"num_epochs\": randint(5, 10),\n",
" \"model_type\": list(MODELS.keys()),\n",
"}\n",
"hyperparameters = [\n",
@ -82,30 +75,7 @@
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing model SimpleCNN\n",
"Test passed! Output shape matches input shape.\n",
"Testing model Residual\n",
"Test passed! Output shape matches input shape.\n",
"Testing model NormalisedCNN\n",
"Test passed! Output shape matches input shape.\n",
"Testing model SmartRes\n",
"Test passed! Output shape matches input shape.\n",
"Testing model attention2\n",
"Test passed! Output shape matches input shape.\n",
"Testing model advanced_attention\n",
"Test passed! Output shape matches input shape.\n",
"Testing model Res2\n",
"Test passed! Output shape matches input shape.\n",
"Testing model attention1\n",
"Test passed! Output shape matches input shape.\n"
]
}
],
"outputs": [],
"source": [
"from typing import Any, Dict\n",
"from torch.utils.tensorboard import SummaryWriter\n",
@ -120,8 +90,10 @@
"from editor.models import create_model, test_models\n",
"from config import DATA, MODELS_PATH\n",
"from datetime import timedelta, datetime\n",
"import json\n",
"\n",
"test_models()\n",
"\n",
"# test_models()\n",
"\n",
"\n",
"def train(\n",
@ -129,6 +101,9 @@
") -> Path:\n",
" start_time = datetime.now()\n",
" model_path = (MODELS_PATH / get_next_run_name(Path(\"runs\"))).with_suffix(\".pth\")\n",
" params_path = (MODELS_PATH / get_next_run_name(Path(\"runs\"))).with_suffix(\".json\")\n",
" with open(params_path, \"w\") as f:\n",
" json.dump(hyperparameters, f, indent=2)\n",
"\n",
" log_dir = Path(\"runs\") / get_next_run_name(Path(\"runs\"))\n",
" with SummaryWriter(log_dir) as writer:\n",
@ -222,6 +197,7 @@
" global_step=epoch,\n",
" run_name=log_dir.absolute(),\n",
" )\n",
" logging.info(f\"Epoch {epoch} loss: {epoch_loss}\")\n",
" with torch.no_grad():\n",
" model.eval()\n",
" loader = iter(test_data_loader)\n",
@ -251,6 +227,7 @@
" except Exception as e:\n",
" raise\n",
" finally:\n",
" logging.info(f\"Saving model to {model_path}\")\n",
" torch.save(model.state_dict(), model_path)\n",
" del model\n",
" torch.cuda.empty_cache()\n",
@ -289,226 +266,88 @@
"name": "stderr",
"output_type": "stream",
"text": [
"2024-06-02 21:42:49,762 - INFO - Starting run_51 with hparams {\n",
"2024-06-03 22:46:07,734 - INFO - Starting run_96 with hparams {\n",
" \"batch_size\": 16,\n",
" \"bin_count\": 64,\n",
" \"clip_gradients\": true,\n",
" \"edit_count\": 16,\n",
" \"learning_rate\": 0.0019018860481580008,\n",
" \"loss\": \"kl\",\n",
" \"model_type\": \"Residual\",\n",
" \"num_epochs\": 10,\n",
" \"scheduler_gamma\": 0.5124233085818609\n",
"}\n",
"2024-06-02 21:42:49,787 - INFO - Loaded 359668 training images and 39964 test images\n",
"2024-06-02 23:43:03,467 - WARNING - Timeout, aborting experiment\n",
"2024-06-02 23:43:03,698 - INFO - Starting run_52 with hparams {\n",
" \"batch_size\": 16,\n",
" \"bin_count\": 16,\n",
" \"bin_count\": 32,\n",
" \"clip_gradients\": false,\n",
" \"edit_count\": 8,\n",
" \"learning_rate\": 2.9976475506468536e-05,\n",
" \"loss\": \"kl\",\n",
" \"model_type\": \"SmartRes\",\n",
" \"num_epochs\": 10,\n",
" \"scheduler_gamma\": 0.8138813825657673\n",
" \"edit_count\": 16,\n",
" \"learning_rate\": 3.291322467520231e-05,\n",
" \"loss\": \"progressive\",\n",
" \"loss_damping\": 1.1967321790868395,\n",
" \"loss_sizes\": [\n",
" 8,\n",
" 32\n",
" ],\n",
" \"model_type\": \"attention2\",\n",
" \"num_epochs\": 5,\n",
" \"scheduler_gamma\": 0.4573812925553331\n",
"}\n",
"2024-06-02 23:43:03,991 - INFO - Loaded 179834 training images and 19982 test images\n",
"2024-06-03 22:46:07,762 - INFO - Loaded 359668 training images and 39964 test images\n",
"2024-06-04 00:08:08,259 - INFO - Epoch 0 loss: 38.42179161275271\n",
"/home/andras/miniconda3/envs/bipolaroid/lib/python3.12/site-packages/matplotlib/collections.py:996: RuntimeWarning: invalid value encountered in sqrt\n",
" scale = np.sqrt(self._sizes) * dpi / 72.0 * self._factor\n",
"2024-06-02 23:52:17,393 - INFO - Starting run_53 with hparams {\n",
" \"batch_size\": 8,\n",
" \"bin_count\": 32,\n",
" \"clip_gradients\": false,\n",
" \"edit_count\": 8,\n",
" \"learning_rate\": 0.0002765101396434423,\n",
" \"loss\": \"kl\",\n",
" \"model_type\": \"SmartRes\",\n",
" \"num_epochs\": 10,\n",
" \"scheduler_gamma\": 0.8393595799921102\n",
"}\n",
"2024-06-02 23:52:17,413 - INFO - Loaded 179834 training images and 19982 test images\n",
"2024-06-03 00:48:49,485 - INFO - Starting run_54 with hparams {\n",
" \"batch_size\": 16,\n",
" \"bin_count\": 16,\n",
" \"clip_gradients\": false,\n",
" \"edit_count\": 16,\n",
" \"learning_rate\": 0.00040493280785202865,\n",
" \"loss\": \"kl\",\n",
" \"model_type\": \"SmartRes\",\n",
" \"num_epochs\": 10,\n",
" \"scheduler_gamma\": 0.6647838946959123\n",
"}\n",
"2024-06-03 00:48:49,509 - INFO - Loaded 359668 training images and 39964 test images\n",
"2024-06-03 01:10:40,678 - INFO - Starting run_55 with hparams {\n",
" \"batch_size\": 32,\n",
" \"bin_count\": 16,\n",
" \"clip_gradients\": true,\n",
" \"edit_count\": 16,\n",
" \"learning_rate\": 0.000989324245186775,\n",
" \"loss\": \"kl\",\n",
" \"model_type\": \"SmartRes\",\n",
" \"num_epochs\": 10,\n",
" \"scheduler_gamma\": 0.6779989111474544\n",
"}\n",
"2024-06-03 01:10:40,704 - INFO - Loaded 359668 training images and 39964 test images\n",
"2024-06-03 01:26:06,028 - INFO - Starting run_56 with hparams {\n",
" \"batch_size\": 8,\n",
" \"bin_count\": 16,\n",
" \"clip_gradients\": false,\n",
" \"edit_count\": 16,\n",
" \"learning_rate\": 1.0695951486573912e-05,\n",
" \"loss\": \"kl\",\n",
" \"model_type\": \"Residual\",\n",
" \"num_epochs\": 10,\n",
" \"scheduler_gamma\": 0.3619561054933521\n",
"}\n",
"2024-06-03 01:26:06,052 - INFO - Loaded 359668 training images and 39964 test images\n",
"2024-06-03 02:03:39,558 - INFO - Starting run_57 with hparams {\n",
" \"batch_size\": 32,\n",
" \"bin_count\": 64,\n",
" \"clip_gradients\": false,\n",
" \"edit_count\": 16,\n",
" \"learning_rate\": 0.00024721579172106914,\n",
" \"loss\": \"kl\",\n",
" \"model_type\": \"attention1\",\n",
" \"num_epochs\": 10,\n",
" \"scheduler_gamma\": 0.7999479970967494\n",
"}\n",
"2024-06-03 02:03:39,585 - INFO - Loaded 359668 training images and 39964 test images\n",
"2024-06-03 02:05:40,747 - ERROR - Error with hparams {'batch_size': 32, 'edit_count': 16, 'bin_count': 64, 'clip_gradients': False, 'learning_rate': 0.00024721579172106914, 'scheduler_gamma': 0.7999479970967494, 'num_epochs': 10, 'model_type': 'attention1', 'loss': 'kl'}:\n",
"\tCUDA out of memory. Tried to allocate 6.00 GiB. GPU 0 has a total capacity of 15.99 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 39.04 GiB is allocated by PyTorch, and 2.02 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)\n",
"Stack (most recent call last):\n",
" File \"<frozen runpy>\", line 198, in _run_module_as_main\n",
" File \"<frozen runpy>\", line 88, in _run_code\n",
" File \"/home/andras/miniconda3/envs/bipolaroid/lib/python3.12/site-packages/ipykernel_launcher.py\", line 18, in <module>\n",
" app.launch_new_instance()\n",
" File \"/home/andras/miniconda3/envs/bipolaroid/lib/python3.12/site-packages/traitlets/config/application.py\", line 1075, in launch_instance\n",
" app.start()\n",
" File \"/home/andras/miniconda3/envs/bipolaroid/lib/python3.12/site-packages/ipykernel/kernelapp.py\", line 739, in start\n",
" self.io_loop.start()\n",
" File \"/home/andras/miniconda3/envs/bipolaroid/lib/python3.12/site-packages/tornado/platform/asyncio.py\", line 195, in start\n",
" self.asyncio_loop.run_forever()\n",
" File \"/home/andras/miniconda3/envs/bipolaroid/lib/python3.12/asyncio/base_events.py\", line 639, in run_forever\n",
" self._run_once()\n",
" File \"/home/andras/miniconda3/envs/bipolaroid/lib/python3.12/asyncio/base_events.py\", line 1985, in _run_once\n",
" handle._run()\n",
" File \"/home/andras/miniconda3/envs/bipolaroid/lib/python3.12/asyncio/events.py\", line 88, in _run\n",
" self._context.run(self._callback, *self._args)\n",
" File \"/home/andras/miniconda3/envs/bipolaroid/lib/python3.12/site-packages/ipykernel/kernelbase.py\", line 545, in dispatch_queue\n",
" await self.process_one()\n",
" File \"/home/andras/miniconda3/envs/bipolaroid/lib/python3.12/site-packages/ipykernel/kernelbase.py\", line 534, in process_one\n",
" await dispatch(*args)\n",
" File \"/home/andras/miniconda3/envs/bipolaroid/lib/python3.12/site-packages/ipykernel/kernelbase.py\", line 437, in dispatch_shell\n",
" await result\n",
" File \"/home/andras/miniconda3/envs/bipolaroid/lib/python3.12/site-packages/ipykernel/ipkernel.py\", line 359, in execute_request\n",
" await super().execute_request(stream, ident, parent)\n",
" File \"/home/andras/miniconda3/envs/bipolaroid/lib/python3.12/site-packages/ipykernel/kernelbase.py\", line 778, in execute_request\n",
" reply_content = await reply_content\n",
" File \"/home/andras/miniconda3/envs/bipolaroid/lib/python3.12/site-packages/ipykernel/ipkernel.py\", line 446, in do_execute\n",
" res = shell.run_cell(\n",
" File \"/home/andras/miniconda3/envs/bipolaroid/lib/python3.12/site-packages/ipykernel/zmqshell.py\", line 549, in run_cell\n",
" return super().run_cell(*args, **kwargs)\n",
" File \"/home/andras/miniconda3/envs/bipolaroid/lib/python3.12/site-packages/IPython/core/interactiveshell.py\", line 3075, in run_cell\n",
" result = self._run_cell(\n",
" File \"/home/andras/miniconda3/envs/bipolaroid/lib/python3.12/site-packages/IPython/core/interactiveshell.py\", line 3130, in _run_cell\n",
" result = runner(coro)\n",
" File \"/home/andras/miniconda3/envs/bipolaroid/lib/python3.12/site-packages/IPython/core/async_helpers.py\", line 129, in _pseudo_sync_runner\n",
" coro.send(None)\n",
" File \"/home/andras/miniconda3/envs/bipolaroid/lib/python3.12/site-packages/IPython/core/interactiveshell.py\", line 3334, in run_cell_async\n",
" has_raised = await self.run_ast_nodes(code_ast.body, cell_name,\n",
" File \"/home/andras/miniconda3/envs/bipolaroid/lib/python3.12/site-packages/IPython/core/interactiveshell.py\", line 3517, in run_ast_nodes\n",
" if await self.run_code(code, result, async_=asy):\n",
" File \"/home/andras/miniconda3/envs/bipolaroid/lib/python3.12/site-packages/IPython/core/interactiveshell.py\", line 3577, in run_code\n",
" exec(code_obj, self.user_global_ns, self.user_ns)\n",
" File \"/tmp/ipykernel_141525/1542138470.py\", line 28, in <module>\n",
" logging.error(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CUDA out of memory. Tried to allocate 6.00 GiB. GPU 0 has a total capacity of 15.99 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 39.04 GiB is allocated by PyTorch, and 2.02 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)\n",
"Error occurs, No graph saved\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-06-03 02:05:41,071 - INFO - Starting run_58 with hparams {\n",
"2024-06-04 01:30:02,938 - INFO - Epoch 1 loss: 34.078268383513205\n",
"2024-06-04 02:46:08,066 - INFO - Saving model to /home/andras/projects/bipolaroid/models/run_96.pth\n",
"2024-06-04 02:46:08,182 - WARNING - Timeout, aborting experiment\n",
"2024-06-04 02:46:08,479 - INFO - Starting run_97 with hparams {\n",
" \"batch_size\": 64,\n",
" \"bin_count\": 16,\n",
" \"clip_gradients\": false,\n",
" \"edit_count\": 16,\n",
" \"learning_rate\": 5.8262398455352215e-05,\n",
" \"loss\": \"kl\",\n",
" \"bin_count\": 32,\n",
" \"clip_gradients\": true,\n",
" \"edit_count\": 8,\n",
" \"learning_rate\": 5.96886240713341e-05,\n",
" \"loss\": \"progressive\",\n",
" \"loss_damping\": 2.8893045711729517,\n",
" \"loss_sizes\": [\n",
" 4,\n",
" 8,\n",
" 16,\n",
" 32\n",
" ],\n",
" \"model_type\": \"attention2\",\n",
" \"num_epochs\": 10,\n",
" \"scheduler_gamma\": 0.17181073763193916\n",
" \"scheduler_gamma\": 0.9315193474157711\n",
"}\n",
"2024-06-03 02:05:41,262 - INFO - Loaded 359668 training images and 39964 test images\n",
"2024-06-03 03:49:02,268 - INFO - Starting run_59 with hparams {\n",
" \"batch_size\": 16,\n",
" \"bin_count\": 16,\n",
" \"clip_gradients\": false,\n",
" \"edit_count\": 32,\n",
" \"learning_rate\": 0.00017213076448986518,\n",
" \"loss\": \"kl\",\n",
" \"model_type\": \"NormalisedCNN\",\n",
" \"num_epochs\": 10,\n",
" \"scheduler_gamma\": 0.1302383221350669\n",
"}\n",
"2024-06-03 03:49:02,397 - INFO - Loaded 719337 training images and 79927 test images\n",
"2024-06-03 04:28:45,612 - INFO - Starting run_60 with hparams {\n",
" \"batch_size\": 16,\n",
" \"bin_count\": 16,\n",
" \"clip_gradients\": false,\n",
" \"edit_count\": 32,\n",
" \"learning_rate\": 0.00010975854085067054,\n",
" \"loss\": \"kl\",\n",
" \"model_type\": \"NormalisedCNN\",\n",
" \"num_epochs\": 10,\n",
" \"scheduler_gamma\": 0.5457536006732233\n",
"}\n",
"2024-06-03 04:28:45,645 - INFO - Loaded 719337 training images and 79927 test images\n",
"2024-06-03 05:07:36,501 - INFO - Starting run_61 with hparams {\n",
" \"batch_size\": 16,\n",
" \"bin_count\": 32,\n",
" \"clip_gradients\": false,\n",
" \"edit_count\": 16,\n",
" \"learning_rate\": 7.977966217588004e-05,\n",
" \"loss\": \"kl\",\n",
" \"model_type\": \"Res2\",\n",
" \"num_epochs\": 10,\n",
" \"scheduler_gamma\": 0.5539449021909474\n",
"}\n",
"2024-06-03 05:07:36,526 - INFO - Loaded 359668 training images and 39964 test images\n",
"2024-06-03 06:53:48,871 - INFO - Starting run_62 with hparams {\n",
"2024-06-04 02:46:08,500 - INFO - Loaded 179834 training images and 19982 test images\n",
"2024-06-04 06:46:16,877 - INFO - Saving model to /home/andras/projects/bipolaroid/models/run_97.pth\n",
"2024-06-04 06:46:28,422 - WARNING - Timeout, aborting experiment\n",
"2024-06-04 06:46:28,437 - INFO - Starting run_98 with hparams {\n",
" \"batch_size\": 64,\n",
" \"bin_count\": 16,\n",
" \"clip_gradients\": true,\n",
" \"edit_count\": 32,\n",
" \"learning_rate\": 0.0014725778411180288,\n",
" \"clip_gradients\": false,\n",
" \"edit_count\": 16,\n",
" \"learning_rate\": 0.0019552772361485543,\n",
" \"loss\": \"kl\",\n",
" \"model_type\": \"SimpleCNN\",\n",
" \"num_epochs\": 10,\n",
" \"scheduler_gamma\": 0.981077298963819\n",
" \"num_epochs\": 5,\n",
" \"scheduler_gamma\": 0.022346077394851838\n",
"}\n",
"2024-06-03 06:53:49,078 - INFO - Loaded 719337 training images and 79927 test images\n",
"2024-06-03 07:26:40,577 - INFO - Starting run_63 with hparams {\n",
"2024-06-04 06:46:28,475 - INFO - Loaded 359668 training images and 39964 test images\n",
"2024-06-04 06:48:11,407 - INFO - Epoch 0 loss: 20430.093976140022\n",
"2024-06-04 06:49:52,288 - INFO - Epoch 1 loss: 14717.722860097885\n",
"2024-06-04 06:51:32,993 - INFO - Epoch 2 loss: 13855.800803661346\n",
"2024-06-04 06:53:13,588 - INFO - Epoch 3 loss: 13853.357687234879\n",
"2024-06-04 06:54:54,389 - INFO - Epoch 4 loss: 13853.240978479385\n",
"2024-06-04 06:54:56,519 - INFO - Saving model to /home/andras/projects/bipolaroid/models/run_98.pth\n",
"2024-06-04 06:54:57,057 - INFO - Starting run_99 with hparams {\n",
" \"batch_size\": 32,\n",
" \"bin_count\": 64,\n",
" \"bin_count\": 32,\n",
" \"clip_gradients\": false,\n",
" \"edit_count\": 16,\n",
" \"learning_rate\": 0.0002723042772767375,\n",
" \"loss\": \"kl\",\n",
" \"learning_rate\": 0.00041782149104212284,\n",
" \"loss\": \"progressive\",\n",
" \"loss_damping\": 2.393572363792762,\n",
" \"loss_sizes\": [\n",
" 8,\n",
" 16,\n",
" 32\n",
" ],\n",
" \"model_type\": \"attention2\",\n",
" \"num_epochs\": 10,\n",
" \"scheduler_gamma\": 0.9651950429647194\n",
" \"scheduler_gamma\": 0.3478968531660309\n",
"}\n",
"2024-06-03 07:26:40,602 - INFO - Loaded 359668 training images and 39964 test images\n"
"2024-06-04 06:54:57,082 - INFO - Loaded 359668 training images and 39964 test images\n",
"2024-06-04 07:28:59,180 - INFO - Saving model to /home/andras/projects/bipolaroid/models/run_99.pth\n",
"2024-06-04 07:28:59,341 - INFO - Interrupted, stopping\n"
]
}
],
@ -518,22 +357,17 @@
"import json\n",
"\n",
"\n",
"tried = set()\n",
"\n",
"for _ in count():\n",
" current_hyperparameters = {\n",
" k: v.rvs() if hasattr(v, \"rvs\") else choice(v)\n",
" for k, v in choice(hyperparameters).items()\n",
" }\n",
" key = json.dumps(current_hyperparameters, indent=2, sort_keys=True)\n",
" if key in tried:\n",
" continue\n",
" tried.add(key)\n",
" logging.info(\n",
" f\"Starting {get_next_run_name(Path(\"runs\"))} with hparams {key}\"\n",
" )\n",
" try:\n",
" train(current_hyperparameters, max_duration=timedelta(hours=2), use_tqdm=False)\n",
" train(current_hyperparameters, max_duration=timedelta(hours=4), use_tqdm=False)\n",
" except KeyboardInterrupt as e:\n",
" logging.info(\"Interrupted, stopping\")\n",
" break\n",