208 lines
7.4 KiB
Text
208 lines
7.4 KiB
Text
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import polars as pl\n",
|
|
"from pathlib import Path\n",
|
|
"import logging\n",
|
|
"from utils import set_up_logging, delete_corrupt_images\n",
|
|
"\n",
|
|
"set_up_logging(Path(\"../logs\"))\n",
|
|
"\n",
|
|
"RETRY_COUNT = 10\n",
|
|
"WORKER_COUNT = 16\n",
|
|
"HTTP_TIMEOUT = 120\n",
|
|
"TARGET_PATH = Path(\"/mnt/wsl/PHYSICALDRIVE0p1/downloaded-unsplash\")\n",
|
|
"\n",
|
|
"TARGET_PATH = TARGET_PATH.resolve()\n",
|
|
"assert TARGET_PATH.exists()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"delete_corrupt_images(list(TARGET_PATH.glob(\"*\")))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div><style>\n",
|
|
".dataframe > thead > tr,\n",
|
|
".dataframe > tbody > tr {\n",
|
|
" text-align: right;\n",
|
|
" white-space: pre-wrap;\n",
|
|
"}\n",
|
|
"</style>\n",
|
|
"<small>shape: (10, 2)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>photo_id</th><th>photo_image_url</th></tr><tr><td>str</td><td>str</td></tr></thead><tbody><tr><td>"---jvVJZ34o"</td><td>"https://images.unsplash.com/ph…</td></tr><tr><td>"--0-I4GpLZU"</td><td>"https://images.unsplash.com/ph…</td></tr><tr><td>"--2IBUMom1I"</td><td>"https://images.unsplash.com/ph…</td></tr><tr><td>"--2sDoKRgCg"</td><td>"https://images.unsplash.com/ph…</td></tr><tr><td>"--5QEAiAfgE"</td><td>"https://images.unsplash.com/ph…</td></tr><tr><td>"--6JlGcHl-w"</td><td>"https://images.unsplash.com/ph…</td></tr><tr><td>"--6sqOMUDs8"</td><td>"https://images.unsplash.com/ph…</td></tr><tr><td>"--97ozlPF1A"</td><td>"https://images.unsplash.com/ph…</td></tr><tr><td>"--D4Gg8RhIk"</td><td>"https://images.unsplash.com/ph…</td></tr><tr><td>"--EUYLhCTdc"</td><td>"https://images.unsplash.com/ph…</td></tr></tbody></table></div>"
|
|
],
|
|
"text/plain": [
|
|
"shape: (10, 2)\n",
|
|
"┌─────────────┬─────────────────────────────────┐\n",
|
|
"│ photo_id ┆ photo_image_url │\n",
|
|
"│ --- ┆ --- │\n",
|
|
"│ str ┆ str │\n",
|
|
"╞═════════════╪═════════════════════════════════╡\n",
|
|
"│ ---jvVJZ34o ┆ https://images.unsplash.com/ph… │\n",
|
|
"│ --0-I4GpLZU ┆ https://images.unsplash.com/ph… │\n",
|
|
"│ --2IBUMom1I ┆ https://images.unsplash.com/ph… │\n",
|
|
"│ --2sDoKRgCg ┆ https://images.unsplash.com/ph… │\n",
|
|
"│ --5QEAiAfgE ┆ https://images.unsplash.com/ph… │\n",
|
|
"│ --6JlGcHl-w ┆ https://images.unsplash.com/ph… │\n",
|
|
"│ --6sqOMUDs8 ┆ https://images.unsplash.com/ph… │\n",
|
|
"│ --97ozlPF1A ┆ https://images.unsplash.com/ph… │\n",
|
|
"│ --D4Gg8RhIk ┆ https://images.unsplash.com/ph… │\n",
|
|
"│ --EUYLhCTdc ┆ https://images.unsplash.com/ph… │\n",
|
|
"└─────────────┴─────────────────────────────────┘"
|
|
]
|
|
},
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"photos = (\n",
|
|
" pl.scan_csv(\n",
|
|
" \"../data/unsplash-full/photos.tsv000\",\n",
|
|
" separator=\"\\t\",\n",
|
|
" infer_schema_length=100000,\n",
|
|
" )\n",
|
|
" .filter(pl.col(\"photo_featured\") == \"t\")\n",
|
|
" .sort(\"photo_id\")\n",
|
|
" .select(\"photo_id\", \"photo_image_url\")\n",
|
|
" .collect()\n",
|
|
")\n",
|
|
"\n",
|
|
"photos.limit(10)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"2024-07-08 19:43:32,887 - INFO - Found 273293 missing photos\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"keys = {path.name.split(\".\")[0] for path in TARGET_PATH.glob(\"*\")}\n",
|
|
"photos = photos.filter(~pl.col(\"photo_id\").is_in(keys))\n",
|
|
"logging.info(f\"Found {len(photos)} missing photos\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" 0%| | 0/273293 [00:00<?, ?it/s]"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" 0%| | 70/273293 [00:18<30:01:41, 2.53it/s]"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import concurrent.futures\n",
|
|
"import requests\n",
|
|
"from tqdm import tqdm\n",
|
|
"from typing import List\n",
|
|
"from time import sleep\n",
|
|
"\n",
|
|
"progress = tqdm(total=len(photos))\n",
|
|
"\n",
|
|
"\n",
|
|
"def download_image(row):\n",
|
|
" for retry_count in range(RETRY_COUNT):\n",
|
|
" try:\n",
|
|
" url = row[\"photo_image_url\"]\n",
|
|
" photo_id = row[\"photo_id\"]\n",
|
|
" logging.debug(f\"Downloading {photo_id} from {url}\")\n",
|
|
" response = requests.get(url, timeout=HTTP_TIMEOUT)\n",
|
|
" response.raise_for_status()\n",
|
|
" extension = response.headers[\"Content-Type\"].split(\"/\")[-1]\n",
|
|
" filename = TARGET_PATH / f\"{photo_id}.{extension}\"\n",
|
|
" with open(filename, \"wb\") as f:\n",
|
|
" f.write(response.content)\n",
|
|
" logging.debug(f\"Downloaded {photo_id} to {filename}\")\n",
|
|
" with progress.get_lock():\n",
|
|
" progress.update(1)\n",
|
|
" return\n",
|
|
" except Exception as e:\n",
|
|
" logging.error(\n",
|
|
" f\"Error downloading {photo_id} from {url} (retry {retry_count}): {e}\",\n",
|
|
" exc_info=True,\n",
|
|
" stack_info=True,\n",
|
|
" )\n",
|
|
" sleep(retry_count * 0.5)\n",
|
|
"\n",
|
|
"\n",
|
|
"with concurrent.futures.ThreadPoolExecutor(max_workers=WORKER_COUNT) as executor:\n",
|
|
" futures: List[concurrent.futures.Future] = []\n",
|
|
" for row in photos.to_dicts():\n",
|
|
" future = executor.submit(download_image, row)\n",
|
|
" futures.append(future)\n",
|
|
"\n",
|
|
" progress.display()\n",
|
|
" concurrent.futures.wait(futures)\n",
|
|
"progress.close()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"delete_corrupt_images(list(TARGET_PATH.glob(\"*\")))"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "bipolaroid",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.12.2"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|