diff --git a/src/config.py b/src/config.py index af21908..93b2cce 100644 --- a/src/config.py +++ b/src/config.py @@ -9,7 +9,7 @@ TRAIN_SIZE = 0.95 CACHE_PATH = Path("/mnt/wsl/PHYSICALDRIVE2/data/cache") MODELS_PATH = Path("/home/andras/projects/bipolaroid/saved_models") LOGS_PATH = Path("/home/andras/projects/bipolaroid/logs") -RUNS_PATH = Path("/home/andras/projects/bipolaroid/runs") +RUNS_PATH = Path("/home/andras/projects/bipolaroid/runs2") for path in [CACHE_PATH, MODELS_PATH, LOGS_PATH, RUNS_PATH]: diff --git a/src/train.ipynb b/src/train.ipynb index d2acf1f..91cf5fb 100644 --- a/src/train.ipynb +++ b/src/train.ipynb @@ -11,7 +11,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2024-09-03 22:20:43,878\tINFO worker.py:1774 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n" + "2024-09-05 22:18:20,653\tINFO worker.py:1774 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://172.29.235.222:8265 \u001b[39m\u001b[22m\n" ] }, { @@ -27,16 +27,11 @@ ], "source": [ "import os\n", - "import matplotlib\n", "\n", "os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = (\n", " \"expandable_segments:True\" # avoid fragmented CUDA memory\n", ")\n", "\n", - "matplotlib.use(\n", - " \"agg\"\n", - ") # avoid \"UserWarning: Starting a Matplotlib GUI outside of the main thread will likely fail\" warnings\n", - "\n", "from config import LOGS_PATH, RUNS_PATH, TRAIN_DATA, TEST_DATA\n", "from utils import set_up_logging\n", "\n", @@ -50,9 +45,9 @@ "from ray.air import RunConfig\n", "\n", "\n", - "TRIAL_COUNT = 100\n", - "CHUNK_COUNT = 40\n", - "EPOCH_COUNT = 2\n", + "TRIAL_COUNT = 50\n", + "EPOCH_COUNT = 4\n", + "CHUNK_COUNT = EPOCH_COUNT * 40\n", "\n", "ray.init(include_dashboard=True, dashboard_host=\"0.0.0.0\")\n", "\n", @@ -74,29 +69,72 @@ "

Tune Status

\n", " \n", "\n", - "\n", - "\n", - "\n", + "\n", + "\n", + "\n", "\n", "
Current time:2024-09-01 22:06:10
Running for: 00:02:06.64
Memory: 22.4/47.0 GiB
Current time:2024-09-05 22:23:47
Running for: 00:05:24.00
Memory: 22.7/54.9 GiB
\n", " \n", "
\n", "
\n", "

System Info

\n", - " Using AsyncHyperBand: num_stopped=1
Bracket: Iter 2.000: -5.516964137554169
Logical resource usage: 32.0/32 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:G)\n", + " Using AsyncHyperBand: num_stopped=0
Bracket: Iter 128.000: None | Iter 32.000: None | Iter 8.000: None | Iter 2.000: None
Logical resource usage: 32.0/32 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:G)\n", "
\n", - " \n", + "
\n", + "
\n", + "

Messages

\n", + " \n", + " ... 30 more trials not shown (30 PENDING)\n", + " \n", + "
\n", + "\n", + "\n", " \n", "
\n", "
\n", "

Trial Status

\n", " \n", "\n", - "\n", + "\n", "\n", "\n", - "\n", - "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", "\n", "
Trial name status loc batch_size dropout_prob elu_alphafeatures kernel_size leaky_relu_alpha leaky_relu_slope learning_ratemodel_type scheduler_gammause_elu use_residual iter total time (s) chunk_test_loss chunk_training_loss
Trial name status loc dropout_probfeatures kernel_size leaky_relu_alpha leaky_relu_slope learning_rate scheduler_gamma
train_with_ray_b7d3c_00000TERMINATED172.29.235.222:1134109 32 0.00395892 1.35107[8, 16, 32] 5 0.0448715 0.00664526 0.0029226 HistogramNet 0.816752True True 3 70.652 5.22149 56.5472
train_with_ray_b7d3c_00001TERMINATED172.29.235.222:1140440 32 0.0439061 1.74642[8, 8, 8, 8, 8,_c140 3 0.00579656 0.0125715 0.00155182HistogramNet 0.898059True False 2 45.4228 5.79311 64.0481
train_with_ray_61b01_00000RUNNING 172.29.235.222:3274755 0.0240677 [16, 32, 64, 12_25c0 3 0.0147726 0.0164835 0.00409419 0.977701
train_with_ray_61b01_00001PENDING 0.0252525 [16, 32, 64, 12_9940 3 0.0607274 0.00769014 0.00175452 0.971626
train_with_ray_61b01_00002PENDING 0.0492015 [16, 32, 64, 12_0940 5 0.0522751 0.00532775 0.000102449 0.985839
train_with_ray_61b01_00003PENDING 0.0091843 [16, 32, 64, 12_8e80 5 0.0215584 0.00986109 0.0046811 0.969517
train_with_ray_61b01_00004PENDING 0.0200707 [16, 32, 64, 12_e200 5 0.00839403 0.0236732 0.00456458 0.949592
train_with_ray_61b01_00005PENDING 0.071518 [16, 32, 64, 12_4200 5 0.00955745 0.000338297 0.0012211 0.959114
train_with_ray_61b01_00006PENDING 0.00712943[16, 32, 64, 12_f8c0 3 0.0128474 0.0214861 0.000319318 0.954027
train_with_ray_61b01_00007PENDING 0.0554306 [16, 32, 64, 12_1900 3 0.0686442 0.0103842 0.00856337 0.966035
train_with_ray_61b01_00008PENDING 0.0637642 [16, 32, 64, 12_8f80 5 0.0125659 0.00386933 0.00135947 0.955525
train_with_ray_61b01_00009PENDING 0.0526107 [16, 32, 64, 12_16c0 5 0.00576232 0.0120612 0.000143394 0.983664
train_with_ray_61b01_00010PENDING 0.0321631 [16, 32, 64, 12_fc00 3 0.0100027 0.016943 0.000461478 0.981847
train_with_ray_61b01_00011PENDING 0.015057 [16, 32, 64, 12_f080 3 0.0599078 0.0293855 0.00182771 0.965469
train_with_ray_61b01_00012PENDING 0.0995986 [16, 32, 64, 12_d240 3 0.0349584 0.000219269 0.000288736 0.966034
train_with_ray_61b01_00013PENDING 0.0227281 [16, 32, 64, 12_c600 5 0.0324662 0.0125853 0.000173035 0.948412
train_with_ray_61b01_00014PENDING 0.0307992 [16, 32, 64, 12_d040 5 0.062771 0.0198744 0.000639558 0.94965
train_with_ray_61b01_00015PENDING 0.0896525 [16, 32, 64, 12_2e80 3 0.0150036 0.0171698 0.00206888 0.99602
train_with_ray_61b01_00016PENDING 0.0932697 [16, 32, 64, 12_6540 5 0.0699308 0.0154819 0.00384655 0.954913
train_with_ray_61b01_00017PENDING 0.00240956[16, 32, 64, 12_8640 3 0.0318032 0.00959876 0.000135257 0.952462
train_with_ray_61b01_00018PENDING 0.0883309 [16, 32, 64, 12_b800 3 0.0455664 0.00565 0.00066994 0.997372
train_with_ray_61b01_00019PENDING 0.0500287 [16, 32, 64, 12_8440 3 0.0674193 0.0220701 0.000987128 0.94602
\n", "
\n", @@ -140,33 +178,21 @@ "output_type": "display_data" }, { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "\u001b[36m(train_with_ray pid=1134109)\u001b[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/andras/projects/bipolaroid/runs5/tune/train_with_ray_b7d3c_00000_0_batch_size=32,dropout_prob=0.0040,elu_alpha=1.3511,features=8_16_32,kernel_size=5,leaky_relu_alpha=0._2024-09-01_22-04-03/checkpoint_000000)\n", - "\u001b[36m(train_with_ray pid=1134109)\u001b[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/andras/projects/bipolaroid/runs5/tune/train_with_ray_b7d3c_00000_0_batch_size=32,dropout_prob=0.0040,elu_alpha=1.3511,features=8_16_32,kernel_size=5,leaky_relu_alpha=0._2024-09-01_22-04-03/checkpoint_000001)\n", - "\u001b[36m(train_with_ray pid=1134109)\u001b[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/andras/projects/bipolaroid/runs5/tune/train_with_ray_b7d3c_00000_0_batch_size=32,dropout_prob=0.0040,elu_alpha=1.3511,features=8_16_32,kernel_size=5,leaky_relu_alpha=0._2024-09-01_22-04-03/checkpoint_000002)\n", - "\u001b[36m(train_with_ray pid=1140440)\u001b[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/andras/projects/bipolaroid/runs5/tune/train_with_ray_b7d3c_00001_1_batch_size=32,dropout_prob=0.0439,elu_alpha=1.7464,features=8_8_8_8_8_8_8,kernel_size=3,leaky_relu_al_2024-09-01_22-04-03/checkpoint_000000)\n", - "2024-09-01 22:06:10,132\tINFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/home/andras/projects/bipolaroid/runs5/tune' in 0.0020s.\n", - "2024-09-01 22:06:10,135\tINFO tune.py:1041 -- Total run time: 126.67 seconds (126.64 seconds for the tuning loop).\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[36m(train_with_ray pid=1140440)\u001b[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/andras/projects/bipolaroid/runs5/tune/train_with_ray_b7d3c_00001_1_batch_size=32,dropout_prob=0.0439,elu_alpha=1.7464,features=8_8_8_8_8_8_8,kernel_size=3,leaky_relu_al_2024-09-01_22-04-03/checkpoint_000001)\n" + "\u001b[33m(raylet)\u001b[0m Warning: The actor ImplicitFunc is very large (26 MiB). Check that its definition is not implicitly capturing a large array or other object in scope. Tip: use ray.put() to put large objects in the Ray object store.\n" ] } ], "source": [ "config = {\n", - " \"batch_size\": 32,\n", + " \"batch_size\": 48,\n", " \"edit_count\": EPOCH_COUNT,\n", " \"bin_count\": 32,\n", - " \"learning_rate\": tune.loguniform(5e-4, 5e-3),\n", - " \"scheduler_gamma\": tune.uniform(0.8, 0.95),\n", - " \"elu_alpha\": tune.uniform(0.5, 2),\n", + " \"learning_rate\": tune.loguniform(1e-4, 1e-2),\n", + " \"scheduler_gamma\": tune.uniform(0.94, 0.9999),\n", + " # \"elu_alpha\": tune.uniform(0.5, 2),\n", " \"leaky_relu_slope\": tune.uniform(0, 0.03),\n", " \"dropout_prob\": tune.uniform(0, 0.1),\n", " \"chunk_count\": CHUNK_COUNT,\n", @@ -174,25 +200,29 @@ " [\n", " [16, 32, 64],\n", " [16, 32, 64, 128],\n", + " [16, 32, 64, 128, 256],\n", + " [16, 32, 32, 32, 64],\n", " [32, 64],\n", " [32, 128],\n", - " [8, 16, 32],\n", - " [8, 8, 8, 8, 8],\n", - " [8, 8, 8, 8, 8, 8, 8],\n", - " [16, 16, 16],\n", + " [32, 64, 128],\n", + " [32, 64, 128, 256],\n", " [16, 16, 16, 16, 16],\n", + " [16, 16, 16, 16, 16, 16, 16, 16],\n", + " [16, 16, 16, 16, 16, 16, 16, 16, 16, 16],\n", " [32, 32, 32],\n", " [32, 32, 32, 32],\n", - " [64, 64],\n", " [64, 64, 64],\n", + " [64, 64, 64, 64],\n", + " [64, 64, 64, 64, 64],\n", + " [256, 64, 256],\n", " ]\n", " ),\n", - " \"use_residual\": tune.choice([True, False]),\n", + " \"use_residual\": True,\n", " \"kernel_size\": tune.choice([3, 5]),\n", - " \"model_type\": tune.choice([\"HistogramNet\"]),\n", + " \"model_type\": \"HistogramNet\",\n", " \"use_instance_norm\": True,\n", - " \"use_elu\": tune.choice([True, False]),\n", - " \"leaky_relu_alpha\": tune.uniform(0, 0.05),\n", + " \"use_elu\": False,\n", + " \"leaky_relu_alpha\": tune.uniform(0, 0.07),\n", "}\n", "\n", "tuner = tune.Tuner(\n", @@ -223,7 +253,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -248,7 +278,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ diff --git a/src/training/train_with_ray.py b/src/training/train_with_ray.py index d7a3f46..727bcd1 100644 --- a/src/training/train_with_ray.py +++ b/src/training/train_with_ray.py @@ -15,6 +15,7 @@ import logging from more_itertools import divide EPSILON = 1e-5 +EXAMPLE_COUNT = 5 def train_with_ray_factory( @@ -30,6 +31,12 @@ def train_with_ray_factory( chunk_count: int, **_, ) -> torch.nn.Module: + import matplotlib + + matplotlib.use( + "agg" + ) # avoid "UserWarning: Starting a Matplotlib GUI outside of the main thread will likely fail" warnings + model, optimizer, scheduler, start_chunk_id, run_name = ( load_or_create_state( device=device, @@ -48,7 +55,7 @@ def train_with_ray_factory( test_data_loader = get_data_loader( test_data_paths, **{**hyperparameters, "edit_count": 1} ) - examples = next(iter(test_data_loader)) + examples = next(iter(test_data_loader))[:EXAMPLE_COUNT] with SummaryWriter(log_dir=log_dir / run_name) as writer: writer.add_graph(model, examples[0].to(device))