# %% Imports and run configuration
import logging
from pathlib import Path

import polars as pl

from utils import delete_corrupt_images, set_up_logging

# Hand the log directory (relative to the notebook's working directory) to the
# project helper. Its exact behavior lives in utils and is not shown here.
set_up_logging(Path("../logs"))

# Tunables for the download step. They are not referenced by the cells visible
# here; presumably the later download cell consumes them — verify there.
RETRY_COUNT = 10
WORKER_COUNT = 16
HTTP_TIMEOUT = 120  # presumably seconds — TODO confirm against the HTTP client

# Destination directory for the images, normalised to an absolute path.
TARGET_PATH = Path("/mnt/wsl/PHYSICALDRIVE0p1/downloaded-unsplash").resolve()

# Fail fast if the target directory is missing (e.g. the drive is not mounted).
assert TARGET_PATH.exists()

# %% Clean up unusable files before computing what is still missing
# Every direct child of TARGET_PATH is passed as a list to the project helper.
# NOTE(review): judging by its name it removes unreadable images — confirm in utils.
delete_corrupt_images([*TARGET_PATH.glob("*")])
\n", "shape: (10, 2)
photo_idphoto_image_url
strstr
"---jvVJZ34o""https://images.unsplash.com/ph…
"--0-I4GpLZU""https://images.unsplash.com/ph…
"--2IBUMom1I""https://images.unsplash.com/ph…
"--2sDoKRgCg""https://images.unsplash.com/ph…
"--5QEAiAfgE""https://images.unsplash.com/ph…
"--6JlGcHl-w""https://images.unsplash.com/ph…
"--6sqOMUDs8""https://images.unsplash.com/ph…
"--97ozlPF1A""https://images.unsplash.com/ph…
"--D4Gg8RhIk""https://images.unsplash.com/ph…
"--EUYLhCTdc""https://images.unsplash.com/ph…
# %% Load the featured-photo index
# Lazily scan the Unsplash photo TSV, keep only rows whose `photo_featured`
# column equals "t", order them by `photo_id`, and materialise just the id and
# URL columns.
photos = (
    pl.scan_csv(
        "../data/unsplash-full/photos.tsv000",
        separator="\t",
        infer_schema_length=100_000,  # rows sampled for schema inference
    )
    .filter(pl.col("photo_featured").eq("t"))
    .sort("photo_id")
    .select(["photo_id", "photo_image_url"])
    .collect()
)

# Preview the first ten rows as the cell's rich output.
photos.head(10)

# %% Keep only the photos that are not on disk yet
# A file's key is the part of its name before the first dot, which is compared
# against `photo_id`.
keys = {entry.name.partition(".")[0] for entry in TARGET_PATH.glob("*")}

# Drop every row whose id already has a file in TARGET_PATH.
photos = photos.filter(pl.col("photo_id").is_in(keys).not_())
logging.info(f"Found {len(photos)} missing photos")