diff --git a/src/config.py b/src/config.py
index af21908..93b2cce 100644
--- a/src/config.py
+++ b/src/config.py
@@ -9,7 +9,7 @@ TRAIN_SIZE = 0.95
CACHE_PATH = Path("/mnt/wsl/PHYSICALDRIVE2/data/cache")
MODELS_PATH = Path("/home/andras/projects/bipolaroid/saved_models")
LOGS_PATH = Path("/home/andras/projects/bipolaroid/logs")
-RUNS_PATH = Path("/home/andras/projects/bipolaroid/runs")
+RUNS_PATH = Path("/home/andras/projects/bipolaroid/runs2")
for path in [CACHE_PATH, MODELS_PATH, LOGS_PATH, RUNS_PATH]:
diff --git a/src/train.ipynb b/src/train.ipynb
index d2acf1f..91cf5fb 100644
--- a/src/train.ipynb
+++ b/src/train.ipynb
@@ -11,7 +11,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "2024-09-03 22:20:43,878\tINFO worker.py:1774 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n"
+ "2024-09-05 22:18:20,653\tINFO worker.py:1774 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://172.29.235.222:8265 \u001b[39m\u001b[22m\n"
]
},
{
@@ -27,16 +27,11 @@
],
"source": [
"import os\n",
- "import matplotlib\n",
"\n",
"os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = (\n",
" \"expandable_segments:True\" # avoid fragmented CUDA memory\n",
")\n",
"\n",
- "matplotlib.use(\n",
- " \"agg\"\n",
- ") # avoid \"UserWarning: Starting a Matplotlib GUI outside of the main thread will likely fail\" warnings\n",
- "\n",
"from config import LOGS_PATH, RUNS_PATH, TRAIN_DATA, TEST_DATA\n",
"from utils import set_up_logging\n",
"\n",
@@ -50,9 +45,9 @@
"from ray.air import RunConfig\n",
"\n",
"\n",
- "TRIAL_COUNT = 100\n",
- "CHUNK_COUNT = 40\n",
- "EPOCH_COUNT = 2\n",
+ "TRIAL_COUNT = 50\n",
+ "EPOCH_COUNT = 4\n",
+ "CHUNK_COUNT = EPOCH_COUNT * 40\n",
"\n",
"ray.init(include_dashboard=True, dashboard_host=\"0.0.0.0\")\n",
"\n",
@@ -74,29 +69,72 @@
"
Tune Status
\n",
" \n",
"\n",
- "| Current time: | 2024-09-01 22:06:10 |
\n",
- "| Running for: | 00:02:06.64 |
\n",
- "| Memory: | 22.4/47.0 GiB |
\n",
+ "| Current time: | 2024-09-05 22:23:47 |
\n",
+ "| Running for: | 00:05:24.00 |
\n",
+ "| Memory: | 22.7/54.9 GiB |
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
"
System Info
\n",
- " Using AsyncHyperBand: num_stopped=1
Bracket: Iter 2.000: -5.516964137554169
Logical resource usage: 32.0/32 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:G)\n",
+ " Using AsyncHyperBand: num_stopped=0
Bracket: Iter 128.000: None | Iter 32.000: None | Iter 8.000: None | Iter 2.000: None
Logical resource usage: 32.0/32 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:G)\n",
" \n",
- " \n",
+ " \n",
+ "\n",
+ "
Messages
\n",
+ " \n",
+ " ... 30 more trials not shown (30 PENDING)\n",
+ " \n",
+ "\n",
+ "\n",
+ "\n",
" \n",
" \n",
" \n",
"
Trial Status
\n",
"
\n",
"\n",
- "| Trial name | status | loc | batch_size | dropout_prob | elu_alpha | features | kernel_size | leaky_relu_alpha | leaky_relu_slope | learning_rate | model_type | scheduler_gamma | use_elu | use_residual | iter | total time (s) | chunk_test_loss | chunk_training_loss |
\n",
+ "| Trial name | status | loc | dropout_prob | features | kernel_size | leaky_relu_alpha | leaky_relu_slope | learning_rate | scheduler_gamma |
\n",
"\n",
"\n",
- "| train_with_ray_b7d3c_00000 | TERMINATED | 172.29.235.222:1134109 | 32 | 0.00395892 | 1.35107 | [8, 16, 32] | 5 | 0.0448715 | 0.00664526 | 0.0029226 | HistogramNet | 0.816752 | True | True | 3 | 70.652 | 5.22149 | 56.5472 |
\n",
- "| train_with_ray_b7d3c_00001 | TERMINATED | 172.29.235.222:1140440 | 32 | 0.0439061 | 1.74642 | [8, 8, 8, 8, 8,_c140 | 3 | 0.00579656 | 0.0125715 | 0.00155182 | HistogramNet | 0.898059 | True | False | 2 | 45.4228 | 5.79311 | 64.0481 |
\n",
+ "| train_with_ray_61b01_00000 | RUNNING | 172.29.235.222:3274755 | 0.0240677 | [16, 32, 64, 12_25c0 | 3 | 0.0147726 | 0.0164835 | 0.00409419 | 0.977701 |
\n",
+ "| train_with_ray_61b01_00001 | PENDING | | 0.0252525 | [16, 32, 64, 12_9940 | 3 | 0.0607274 | 0.00769014 | 0.00175452 | 0.971626 |
\n",
+ "| train_with_ray_61b01_00002 | PENDING | | 0.0492015 | [16, 32, 64, 12_0940 | 5 | 0.0522751 | 0.00532775 | 0.000102449 | 0.985839 |
\n",
+ "| train_with_ray_61b01_00003 | PENDING | | 0.0091843 | [16, 32, 64, 12_8e80 | 5 | 0.0215584 | 0.00986109 | 0.0046811 | 0.969517 |
\n",
+ "| train_with_ray_61b01_00004 | PENDING | | 0.0200707 | [16, 32, 64, 12_e200 | 5 | 0.00839403 | 0.0236732 | 0.00456458 | 0.949592 |
\n",
+ "| train_with_ray_61b01_00005 | PENDING | | 0.071518 | [16, 32, 64, 12_4200 | 5 | 0.00955745 | 0.000338297 | 0.0012211 | 0.959114 |
\n",
+ "| train_with_ray_61b01_00006 | PENDING | | 0.00712943 | [16, 32, 64, 12_f8c0 | 3 | 0.0128474 | 0.0214861 | 0.000319318 | 0.954027 |
\n",
+ "| train_with_ray_61b01_00007 | PENDING | | 0.0554306 | [16, 32, 64, 12_1900 | 3 | 0.0686442 | 0.0103842 | 0.00856337 | 0.966035 |
\n",
+ "| train_with_ray_61b01_00008 | PENDING | | 0.0637642 | [16, 32, 64, 12_8f80 | 5 | 0.0125659 | 0.00386933 | 0.00135947 | 0.955525 |
\n",
+ "| train_with_ray_61b01_00009 | PENDING | | 0.0526107 | [16, 32, 64, 12_16c0 | 5 | 0.00576232 | 0.0120612 | 0.000143394 | 0.983664 |
\n",
+ "| train_with_ray_61b01_00010 | PENDING | | 0.0321631 | [16, 32, 64, 12_fc00 | 3 | 0.0100027 | 0.016943 | 0.000461478 | 0.981847 |
\n",
+ "| train_with_ray_61b01_00011 | PENDING | | 0.015057 | [16, 32, 64, 12_f080 | 3 | 0.0599078 | 0.0293855 | 0.00182771 | 0.965469 |
\n",
+ "| train_with_ray_61b01_00012 | PENDING | | 0.0995986 | [16, 32, 64, 12_d240 | 3 | 0.0349584 | 0.000219269 | 0.000288736 | 0.966034 |
\n",
+ "| train_with_ray_61b01_00013 | PENDING | | 0.0227281 | [16, 32, 64, 12_c600 | 5 | 0.0324662 | 0.0125853 | 0.000173035 | 0.948412 |
\n",
+ "| train_with_ray_61b01_00014 | PENDING | | 0.0307992 | [16, 32, 64, 12_d040 | 5 | 0.062771 | 0.0198744 | 0.000639558 | 0.94965 |
\n",
+ "| train_with_ray_61b01_00015 | PENDING | | 0.0896525 | [16, 32, 64, 12_2e80 | 3 | 0.0150036 | 0.0171698 | 0.00206888 | 0.99602 |
\n",
+ "| train_with_ray_61b01_00016 | PENDING | | 0.0932697 | [16, 32, 64, 12_6540 | 5 | 0.0699308 | 0.0154819 | 0.00384655 | 0.954913 |
\n",
+ "| train_with_ray_61b01_00017 | PENDING | | 0.00240956 | [16, 32, 64, 12_8640 | 3 | 0.0318032 | 0.00959876 | 0.000135257 | 0.952462 |
\n",
+ "| train_with_ray_61b01_00018 | PENDING | | 0.0883309 | [16, 32, 64, 12_b800 | 3 | 0.0455664 | 0.00565 | 0.00066994 | 0.997372 |
\n",
+ "| train_with_ray_61b01_00019 | PENDING | | 0.0500287 | [16, 32, 64, 12_8440 | 3 | 0.0674193 | 0.0220701 | 0.000987128 | 0.94602 |
\n",
"\n",
"
\n",
"
\n",
@@ -140,33 +178,21 @@
"output_type": "display_data"
},
{
- "name": "stderr",
+ "name": "stdout",
"output_type": "stream",
"text": [
- "\u001b[36m(train_with_ray pid=1134109)\u001b[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/andras/projects/bipolaroid/runs5/tune/train_with_ray_b7d3c_00000_0_batch_size=32,dropout_prob=0.0040,elu_alpha=1.3511,features=8_16_32,kernel_size=5,leaky_relu_alpha=0._2024-09-01_22-04-03/checkpoint_000000)\n",
- "\u001b[36m(train_with_ray pid=1134109)\u001b[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/andras/projects/bipolaroid/runs5/tune/train_with_ray_b7d3c_00000_0_batch_size=32,dropout_prob=0.0040,elu_alpha=1.3511,features=8_16_32,kernel_size=5,leaky_relu_alpha=0._2024-09-01_22-04-03/checkpoint_000001)\n",
- "\u001b[36m(train_with_ray pid=1134109)\u001b[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/andras/projects/bipolaroid/runs5/tune/train_with_ray_b7d3c_00000_0_batch_size=32,dropout_prob=0.0040,elu_alpha=1.3511,features=8_16_32,kernel_size=5,leaky_relu_alpha=0._2024-09-01_22-04-03/checkpoint_000002)\n",
- "\u001b[36m(train_with_ray pid=1140440)\u001b[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/andras/projects/bipolaroid/runs5/tune/train_with_ray_b7d3c_00001_1_batch_size=32,dropout_prob=0.0439,elu_alpha=1.7464,features=8_8_8_8_8_8_8,kernel_size=3,leaky_relu_al_2024-09-01_22-04-03/checkpoint_000000)\n",
- "2024-09-01 22:06:10,132\tINFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/home/andras/projects/bipolaroid/runs5/tune' in 0.0020s.\n",
- "2024-09-01 22:06:10,135\tINFO tune.py:1041 -- Total run time: 126.67 seconds (126.64 seconds for the tuning loop).\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[36m(train_with_ray pid=1140440)\u001b[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/andras/projects/bipolaroid/runs5/tune/train_with_ray_b7d3c_00001_1_batch_size=32,dropout_prob=0.0439,elu_alpha=1.7464,features=8_8_8_8_8_8_8,kernel_size=3,leaky_relu_al_2024-09-01_22-04-03/checkpoint_000001)\n"
+ "\u001b[33m(raylet)\u001b[0m Warning: The actor ImplicitFunc is very large (26 MiB). Check that its definition is not implicitly capturing a large array or other object in scope. Tip: use ray.put() to put large objects in the Ray object store.\n"
]
}
],
"source": [
"config = {\n",
- " \"batch_size\": 32,\n",
+ " \"batch_size\": 48,\n",
" \"edit_count\": EPOCH_COUNT,\n",
" \"bin_count\": 32,\n",
- " \"learning_rate\": tune.loguniform(5e-4, 5e-3),\n",
- " \"scheduler_gamma\": tune.uniform(0.8, 0.95),\n",
- " \"elu_alpha\": tune.uniform(0.5, 2),\n",
+ " \"learning_rate\": tune.loguniform(1e-4, 1e-2),\n",
+ " \"scheduler_gamma\": tune.uniform(0.94, 0.9999),\n",
+ " # \"elu_alpha\": tune.uniform(0.5, 2),\n",
" \"leaky_relu_slope\": tune.uniform(0, 0.03),\n",
" \"dropout_prob\": tune.uniform(0, 0.1),\n",
" \"chunk_count\": CHUNK_COUNT,\n",
@@ -174,25 +200,29 @@
" [\n",
" [16, 32, 64],\n",
" [16, 32, 64, 128],\n",
+ " [16, 32, 64, 128, 256],\n",
+ " [16, 32, 32, 32, 64],\n",
" [32, 64],\n",
" [32, 128],\n",
- " [8, 16, 32],\n",
- " [8, 8, 8, 8, 8],\n",
- " [8, 8, 8, 8, 8, 8, 8],\n",
- " [16, 16, 16],\n",
+ " [32, 64, 128],\n",
+ " [32, 64, 128, 256],\n",
" [16, 16, 16, 16, 16],\n",
+ " [16, 16, 16, 16, 16, 16, 16, 16],\n",
+ " [16, 16, 16, 16, 16, 16, 16, 16, 16, 16],\n",
" [32, 32, 32],\n",
" [32, 32, 32, 32],\n",
- " [64, 64],\n",
" [64, 64, 64],\n",
+ " [64, 64, 64, 64],\n",
+ " [64, 64, 64, 64, 64],\n",
+ " [256, 64, 256],\n",
" ]\n",
" ),\n",
- " \"use_residual\": tune.choice([True, False]),\n",
+ " \"use_residual\": True,\n",
" \"kernel_size\": tune.choice([3, 5]),\n",
- " \"model_type\": tune.choice([\"HistogramNet\"]),\n",
+ " \"model_type\": \"HistogramNet\",\n",
" \"use_instance_norm\": True,\n",
- " \"use_elu\": tune.choice([True, False]),\n",
- " \"leaky_relu_alpha\": tune.uniform(0, 0.05),\n",
+ " \"use_elu\": False,\n",
+ " \"leaky_relu_alpha\": tune.uniform(0, 0.07),\n",
"}\n",
"\n",
"tuner = tune.Tuner(\n",
@@ -223,7 +253,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
"metadata": {},
"outputs": [
{
@@ -248,7 +278,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
diff --git a/src/training/train_with_ray.py b/src/training/train_with_ray.py
index d7a3f46..727bcd1 100644
--- a/src/training/train_with_ray.py
+++ b/src/training/train_with_ray.py
@@ -15,6 +15,7 @@ import logging
from more_itertools import divide
EPSILON = 1e-5
+EXAMPLE_COUNT = 5
def train_with_ray_factory(
@@ -30,6 +31,12 @@ def train_with_ray_factory(
chunk_count: int,
**_,
) -> torch.nn.Module:
+ import matplotlib
+
+ matplotlib.use(
+ "agg"
+ ) # avoid "UserWarning: Starting a Matplotlib GUI outside of the main thread will likely fail" warnings
+
model, optimizer, scheduler, start_chunk_id, run_name = (
load_or_create_state(
device=device,
@@ -48,7 +55,7 @@ def train_with_ray_factory(
test_data_loader = get_data_loader(
test_data_paths, **{**hyperparameters, "edit_count": 1}
)
- examples = next(iter(test_data_loader))
+ examples = next(iter(test_data_loader))[:EXAMPLE_COUNT]
with SummaryWriter(log_dir=log_dir / run_name) as writer:
writer.add_graph(model, examples[0].to(device))