From ead6498c1c92b55453d5e6889bbdd555fc5c8620 Mon Sep 17 00:00:00 2001 From: Andras Schmelczer Date: Sun, 25 Aug 2024 22:14:53 +0100 Subject: [PATCH] Add photo subset creator --- src/config.py | 4 +- src/create_photo_set.ipynb | 186 +++++++++++++++++++++++++++++++++++++ 2 files changed, 189 insertions(+), 1 deletion(-) create mode 100644 src/create_photo_set.ipynb diff --git a/src/config.py b/src/config.py index 2a26230..af61e3b 100644 --- a/src/config.py +++ b/src/config.py @@ -1,7 +1,9 @@ import random from pathlib import Path -DATA = sorted(Path("/mnt/wsl/PHYSICALDRIVE1/data/unsplash").glob("*.jpg")) +# DATA = sorted(Path("/mnt/wsl/PHYSICALDRIVE0p1/downloaded-unsplash").glob("*")) +DATA = sorted(Path("/mnt/wsl/PHYSICALDRIVE0p1/featured").glob("*")) + TRAIN_SIZE = 0.9 CACHE_PATH = Path("/mnt/wsl/PHYSICALDRIVE1/data/cache2") diff --git a/src/create_photo_set.ipynb b/src/create_photo_set.ipynb new file mode 100644 index 0000000..ac9ce8d --- /dev/null +++ b/src/create_photo_set.ipynb @@ -0,0 +1,186 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Selected 294403 photos as mask'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pathlib import Path\n", + "from utils import set_up_logging\n", + "import os\n", + "import polars as pl\n", + "import shutil\n", + "from tqdm import tqdm\n", + "\n", + "set_up_logging(Path(\"../logs\"))\n", + "\n", + "SOURCE_PATH = Path(\"/mnt/wsl/PHYSICALDRIVE0p1/downloaded-unsplash\")\n", + "TARGET_PATH = Path(\"/mnt/wsl/PHYSICALDRIVE0p1/featured\")\n", + "\n", + "SOURCE_PATH = SOURCE_PATH.resolve()\n", + "assert SOURCE_PATH.exists()\n", + "shutil.rmtree(TARGET_PATH, ignore_errors=True)\n", + "TARGET_PATH.mkdir(exist_ok=True)\n", + "\n", + "photos = (\n", + " pl.scan_csv(\n", + " \"../data/unsplash-full/photos.tsv000\",\n", + " separator=\"\\t\",\n", + " infer_schema_length=100000,\n", + " )\n", + " .filter(pl.col(\"photo_featured\") == \"t\")\n", + " .sort(\"photo_id\")\n", + " .select(\"photo_id\", \"photo_image_url\")\n", + " .collect()\n", + ")\n", + "\n", + "f\"Selected {len(photos)} photos as mask\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'1927632 downloaded photos found'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_paths = list(SOURCE_PATH.glob(\"*\"))\n", + "f\"{len(all_paths)} downloaded photos found\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (10, 2)
photo_idpath
strstr
"HjVrVy3KTPM""/mnt/wsl/PHYSICALDRIVE0p1/down…
"0tQX63dH_oU""/mnt/wsl/PHYSICALDRIVE0p1/down…
"HFOjFLIgE64""/mnt/wsl/PHYSICALDRIVE0p1/down…
"1TiRQeEgzlM""/mnt/wsl/PHYSICALDRIVE0p1/down…
"85HrJ4N00dg""/mnt/wsl/PHYSICALDRIVE0p1/down…
"5ouomWc_8kY""/mnt/wsl/PHYSICALDRIVE0p1/down…
"1Zo2W5tW-VU""/mnt/wsl/PHYSICALDRIVE0p1/down…
"DG7Bv6V6-2Q""/mnt/wsl/PHYSICALDRIVE0p1/down…
"GLmohhX8vVQ""/mnt/wsl/PHYSICALDRIVE0p1/down…
"8Un0gXhEL0c""/mnt/wsl/PHYSICALDRIVE0p1/down…
" + ], + "text/plain": [ + "shape: (10, 2)\n", + "┌─────────────┬─────────────────────────────────┐\n", + "│ photo_id ┆ path │\n", + "│ --- ┆ --- │\n", + "│ str ┆ str │\n", + "╞═════════════╪═════════════════════════════════╡\n", + "│ HjVrVy3KTPM ┆ /mnt/wsl/PHYSICALDRIVE0p1/down… │\n", + "│ 0tQX63dH_oU ┆ /mnt/wsl/PHYSICALDRIVE0p1/down… │\n", + "│ HFOjFLIgE64 ┆ /mnt/wsl/PHYSICALDRIVE0p1/down… │\n", + "│ 1TiRQeEgzlM ┆ /mnt/wsl/PHYSICALDRIVE0p1/down… │\n", + "│ 85HrJ4N00dg ┆ /mnt/wsl/PHYSICALDRIVE0p1/down… │\n", + "│ 5ouomWc_8kY ┆ /mnt/wsl/PHYSICALDRIVE0p1/down… │\n", + "│ 1Zo2W5tW-VU ┆ /mnt/wsl/PHYSICALDRIVE0p1/down… │\n", + "│ DG7Bv6V6-2Q ┆ /mnt/wsl/PHYSICALDRIVE0p1/down… │\n", + "│ GLmohhX8vVQ ┆ /mnt/wsl/PHYSICALDRIVE0p1/down… │\n", + "│ 8Un0gXhEL0c ┆ /mnt/wsl/PHYSICALDRIVE0p1/down… │\n", + "└─────────────┴─────────────────────────────────┘" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "files = pl.DataFrame(\n", + " {\n", + " \"photo_id\": [path.name.split(\".\")[0] for path in all_paths],\n", + " \"path\": [str(path) for path in all_paths],\n", + " }\n", + ")\n", + "files.limit(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'294367 photos found in the mask locally downloaded'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected = files.join(photos, on=\"photo_id\", how=\"inner\")\n", + "f\"{len(selected)} photos found in the mask locally downloaded\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 294367/294367 [30:04<00:00, 163.09it/s] \n" + ] + } + ], + "source": [ + "for path in tqdm(selected[\"path\"]):\n", + " os.symlink(path, TARGET_PATH / path.split(\"/\")[-1])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "bipolaroid", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}