diff --git a/src/config.py b/src/config.py index 2a26230..af61e3b 100644 --- a/src/config.py +++ b/src/config.py @@ -1,7 +1,9 @@ import random from pathlib import Path -DATA = sorted(Path("/mnt/wsl/PHYSICALDRIVE1/data/unsplash").glob("*.jpg")) +# DATA = sorted(Path("/mnt/wsl/PHYSICALDRIVE0p1/downloaded-unsplash").glob("*")) +DATA = sorted(Path("/mnt/wsl/PHYSICALDRIVE0p1/featured").glob("*")) + TRAIN_SIZE = 0.9 CACHE_PATH = Path("/mnt/wsl/PHYSICALDRIVE1/data/cache2") diff --git a/src/create_photo_set.ipynb b/src/create_photo_set.ipynb new file mode 100644 index 0000000..ac9ce8d --- /dev/null +++ b/src/create_photo_set.ipynb @@ -0,0 +1,186 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Selected 294403 photos as mask'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pathlib import Path\n", + "from utils import set_up_logging\n", + "import os\n", + "import polars as pl\n", + "import shutil\n", + "from tqdm import tqdm\n", + "\n", + "set_up_logging(Path(\"../logs\"))\n", + "\n", + "SOURCE_PATH = Path(\"/mnt/wsl/PHYSICALDRIVE0p1/downloaded-unsplash\")\n", + "TARGET_PATH = Path(\"/mnt/wsl/PHYSICALDRIVE0p1/featured\")\n", + "\n", + "SOURCE_PATH = SOURCE_PATH.resolve()\n", + "assert SOURCE_PATH.exists()\n", + "shutil.rmtree(TARGET_PATH, ignore_errors=True)\n", + "TARGET_PATH.mkdir(exist_ok=True)\n", + "\n", + "photos = (\n", + " pl.scan_csv(\n", + " \"../data/unsplash-full/photos.tsv000\",\n", + " separator=\"\\t\",\n", + " infer_schema_length=100000,\n", + " )\n", + " .filter(pl.col(\"photo_featured\") == \"t\")\n", + " .sort(\"photo_id\")\n", + " .select(\"photo_id\", \"photo_image_url\")\n", + " .collect()\n", + ")\n", + "\n", + "f\"Selected {len(photos)} photos as mask\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'1927632 downloaded photos found'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_paths = list(SOURCE_PATH.glob(\"*\"))\n", + "f\"{len(all_paths)} downloaded photos found\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| photo_id | path |
|---|---|
| str | str |
| "HjVrVy3KTPM" | "/mnt/wsl/PHYSICALDRIVE0p1/down… |
| "0tQX63dH_oU" | "/mnt/wsl/PHYSICALDRIVE0p1/down… |
| "HFOjFLIgE64" | "/mnt/wsl/PHYSICALDRIVE0p1/down… |
| "1TiRQeEgzlM" | "/mnt/wsl/PHYSICALDRIVE0p1/down… |
| "85HrJ4N00dg" | "/mnt/wsl/PHYSICALDRIVE0p1/down… |
| "5ouomWc_8kY" | "/mnt/wsl/PHYSICALDRIVE0p1/down… |
| "1Zo2W5tW-VU" | "/mnt/wsl/PHYSICALDRIVE0p1/down… |
| "DG7Bv6V6-2Q" | "/mnt/wsl/PHYSICALDRIVE0p1/down… |
| "GLmohhX8vVQ" | "/mnt/wsl/PHYSICALDRIVE0p1/down… |
| "8Un0gXhEL0c" | "/mnt/wsl/PHYSICALDRIVE0p1/down… |