bipolaroidbipolaroid/src/fetch_from_unsplash.ipynb

208 lines
7.4 KiB
Text

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import polars as pl\n",
"from pathlib import Path\n",
"import logging\n",
"from utils import set_up_logging, delete_corrupt_images\n",
"\n",
"# set_up_logging comes from utils (not shown here); presumably it configures\n",
"# logging to write under ../logs - confirm against utils.\n",
"set_up_logging(Path(\"../logs\"))\n",
"\n",
"# Download settings read by the download cell further down.\n",
"RETRY_COUNT = 10  # attempts per photo\n",
"WORKER_COUNT = 16  # max_workers of the download thread pool\n",
"HTTP_TIMEOUT = 120  # timeout handed to requests.get\n",
"\n",
"# Directory the images are written to; it has to exist before anything runs.\n",
"TARGET_PATH = Path(\"/mnt/wsl/PHYSICALDRIVE0p1/downloaded-unsplash\").resolve()\n",
"assert TARGET_PATH.exists()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Hand every entry of the download directory to delete_corrupt_images\n",
"# (defined in utils; presumably it removes files that cannot be read as\n",
"# images - confirm against utils).\n",
"delete_corrupt_images([*TARGET_PATH.glob(\"*\")])"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div><style>\n",
".dataframe > thead > tr,\n",
".dataframe > tbody > tr {\n",
" text-align: right;\n",
" white-space: pre-wrap;\n",
"}\n",
"</style>\n",
"<small>shape: (10, 2)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>photo_id</th><th>photo_image_url</th></tr><tr><td>str</td><td>str</td></tr></thead><tbody><tr><td>&quot;---jvVJZ34o&quot;</td><td>&quot;https://images.unsplash.com/ph…</td></tr><tr><td>&quot;--0-I4GpLZU&quot;</td><td>&quot;https://images.unsplash.com/ph…</td></tr><tr><td>&quot;--2IBUMom1I&quot;</td><td>&quot;https://images.unsplash.com/ph…</td></tr><tr><td>&quot;--2sDoKRgCg&quot;</td><td>&quot;https://images.unsplash.com/ph…</td></tr><tr><td>&quot;--5QEAiAfgE&quot;</td><td>&quot;https://images.unsplash.com/ph…</td></tr><tr><td>&quot;--6JlGcHl-w&quot;</td><td>&quot;https://images.unsplash.com/ph…</td></tr><tr><td>&quot;--6sqOMUDs8&quot;</td><td>&quot;https://images.unsplash.com/ph…</td></tr><tr><td>&quot;--97ozlPF1A&quot;</td><td>&quot;https://images.unsplash.com/ph…</td></tr><tr><td>&quot;--D4Gg8RhIk&quot;</td><td>&quot;https://images.unsplash.com/ph…</td></tr><tr><td>&quot;--EUYLhCTdc&quot;</td><td>&quot;https://images.unsplash.com/ph…</td></tr></tbody></table></div>"
],
"text/plain": [
"shape: (10, 2)\n",
"┌─────────────┬─────────────────────────────────┐\n",
"│ photo_id ┆ photo_image_url │\n",
"│ --- ┆ --- │\n",
"│ str ┆ str │\n",
"╞═════════════╪═════════════════════════════════╡\n",
"│ ---jvVJZ34o ┆ https://images.unsplash.com/ph… │\n",
"│ --0-I4GpLZU ┆ https://images.unsplash.com/ph… │\n",
"│ --2IBUMom1I ┆ https://images.unsplash.com/ph… │\n",
"│ --2sDoKRgCg ┆ https://images.unsplash.com/ph… │\n",
"│ --5QEAiAfgE ┆ https://images.unsplash.com/ph… │\n",
"│ --6JlGcHl-w ┆ https://images.unsplash.com/ph… │\n",
"│ --6sqOMUDs8 ┆ https://images.unsplash.com/ph… │\n",
"│ --97ozlPF1A ┆ https://images.unsplash.com/ph… │\n",
"│ --D4Gg8RhIk ┆ https://images.unsplash.com/ph… │\n",
"│ --EUYLhCTdc ┆ https://images.unsplash.com/ph… │\n",
"└─────────────┴─────────────────────────────────┘"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Lazily scan the tab-separated photo table, keep the rows whose\n",
"# photo_featured column equals \"t\", and materialise only the id and URL\n",
"# columns ordered by photo_id.\n",
"photos = (\n",
"    pl.scan_csv(\n",
"        \"../data/unsplash-full/photos.tsv000\",\n",
"        separator=\"\\t\",\n",
"        infer_schema_length=100000,\n",
"    )\n",
"    .filter(pl.col(\"photo_featured\") == \"t\")\n",
"    .select(\"photo_id\", \"photo_image_url\")\n",
"    .sort(\"photo_id\")\n",
"    .collect()\n",
")\n",
"\n",
"# Preview the first ten rows.\n",
"photos.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-07-08 19:43:32,887 - INFO - Found 273293 missing photos\n"
]
}
],
"source": [
"# Ids that already have a file in TARGET_PATH: the file name up to its first \".\".\n",
"keys = {entry.name.partition(\".\")[0] for entry in TARGET_PATH.glob(\"*\")}\n",
"# Keep only the rows whose photo_id is not among those ids.\n",
"photos = photos.filter(~pl.col(\"photo_id\").is_in(keys))\n",
"logging.info(f\"Found {len(photos)} missing photos\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/273293 [00:00<?, ?it/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 70/273293 [00:18<30:01:41, 2.53it/s]"
]
}
],
"source": [
"import concurrent.futures\n",
"import requests\n",
"from tqdm import tqdm\n",
"from typing import List\n",
"from time import sleep\n",
"\n",
"# Shared progress bar; download_image advances it once per photo that was\n",
"# downloaded successfully.\n",
"progress = tqdm(total=len(photos))\n",
"\n",
"\n",
"def download_image(row):\n",
"    \"\"\"Download one photo into TARGET_PATH, retrying when an attempt fails.\n",
"\n",
"    Args:\n",
"        row: Mapping with the keys \"photo_id\" and \"photo_image_url\".\n",
"\n",
"    Returns:\n",
"        None. On success the response body is written to\n",
"        TARGET_PATH / \"<photo_id>.<extension>\" and the shared progress bar\n",
"        advances by one. When all RETRY_COUNT attempts fail, the failure is\n",
"        logged and the function returns without raising.\n",
"\n",
"    Raises:\n",
"        KeyError: If row lacks one of the two expected keys.\n",
"    \"\"\"\n",
"    # Read the row outside the retry loop so the error message below can\n",
"    # always reference both values.\n",
"    url = row[\"photo_image_url\"]\n",
"    photo_id = row[\"photo_id\"]\n",
"\n",
"    for retry_count in range(RETRY_COUNT):\n",
"        try:\n",
"            logging.debug(f\"Downloading {photo_id} from {url}\")\n",
"            response = requests.get(url, timeout=HTTP_TIMEOUT)\n",
"            response.raise_for_status()\n",
"            # Use the media subtype as the file extension. Anything after \";\"\n",
"            # (for example \"; charset=...\") is dropped so that header\n",
"            # parameters do not end up in the file name.\n",
"            media_type = response.headers[\"Content-Type\"].split(\";\")[0].strip()\n",
"            extension = media_type.split(\"/\")[-1]\n",
"            filename = TARGET_PATH / f\"{photo_id}.{extension}\"\n",
"            with open(filename, \"wb\") as f:\n",
"                f.write(response.content)\n",
"            logging.debug(f\"Downloaded {photo_id} to {filename}\")\n",
"            with progress.get_lock():\n",
"                progress.update(1)\n",
"            return\n",
"        except Exception as e:\n",
"            logging.error(\n",
"                f\"Error downloading {photo_id} from {url} (retry {retry_count}): {e}\",\n",
"                exc_info=True,\n",
"                stack_info=True,\n",
"            )\n",
"            # Linear backoff; there is nothing to wait for after the last attempt.\n",
"            if retry_count < RETRY_COUNT - 1:\n",
"                sleep(retry_count * 0.5)\n",
"\n",
"    # Every attempt failed: record it explicitly instead of returning silently.\n",
"    logging.error(\n",
"        f\"Giving up on {photo_id} from {url} after {RETRY_COUNT} attempts\"\n",
"    )\n",
"\n",
"\n",
"# Fan the rows out over a thread pool; every row is handled by download_image.\n",
"with concurrent.futures.ThreadPoolExecutor(max_workers=WORKER_COUNT) as executor:\n",
"    futures: List[concurrent.futures.Future] = [\n",
"        executor.submit(download_image, row) for row in photos.to_dicts()\n",
"    ]\n",
"\n",
"    progress.display()\n",
"    # Block until every submitted download has finished.\n",
"    concurrent.futures.wait(futures)\n",
"progress.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Second pass after the downloads: hand every entry of the download directory\n",
"# to delete_corrupt_images (defined in utils; presumably it removes files that\n",
"# cannot be read as images - confirm against utils).\n",
"delete_corrupt_images([*TARGET_PATH.glob(\"*\")])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "bipolaroid",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}