This commit is contained in:
Andras Schmelczer 2024-07-08 20:56:01 +01:00
parent a7cdd22619
commit 28425d53af
No known key found for this signature in database
GPG key ID: FC8F2C3D3D1A718C

View file

@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@ -19,15 +19,60 @@
"TARGET_PATH = Path(\"/bulk2/downloaded-unsplash\")\n",
"\n",
"TARGET_PATH = TARGET_PATH.resolve()\n",
"assert TARGET_PATH.exists()\n",
"assert TARGET_PATH.exists()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"delete_corrupt_images(list(TARGET_PATH.glob(\"*\")))"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/html": [
"<div><style>\n",
".dataframe > thead > tr,\n",
".dataframe > tbody > tr {\n",
" text-align: right;\n",
" white-space: pre-wrap;\n",
"}\n",
"</style>\n",
"<small>shape: (10, 2)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>photo_id</th><th>photo_image_url</th></tr><tr><td>str</td><td>str</td></tr></thead><tbody><tr><td>&quot;---jvVJZ34o&quot;</td><td>&quot;https://images.unsplash.com/ph…</td></tr><tr><td>&quot;--0-I4GpLZU&quot;</td><td>&quot;https://images.unsplash.com/ph…</td></tr><tr><td>&quot;--2IBUMom1I&quot;</td><td>&quot;https://images.unsplash.com/ph…</td></tr><tr><td>&quot;--2sDoKRgCg&quot;</td><td>&quot;https://images.unsplash.com/ph…</td></tr><tr><td>&quot;--5QEAiAfgE&quot;</td><td>&quot;https://images.unsplash.com/ph…</td></tr><tr><td>&quot;--6JlGcHl-w&quot;</td><td>&quot;https://images.unsplash.com/ph…</td></tr><tr><td>&quot;--6sqOMUDs8&quot;</td><td>&quot;https://images.unsplash.com/ph…</td></tr><tr><td>&quot;--97ozlPF1A&quot;</td><td>&quot;https://images.unsplash.com/ph…</td></tr><tr><td>&quot;--D4Gg8RhIk&quot;</td><td>&quot;https://images.unsplash.com/ph…</td></tr><tr><td>&quot;--EUYLhCTdc&quot;</td><td>&quot;https://images.unsplash.com/ph…</td></tr></tbody></table></div>"
],
"text/plain": [
"shape: (10, 2)\n",
"┌─────────────┬─────────────────────────────────┐\n",
"│ photo_id ┆ photo_image_url │\n",
"│ --- ┆ --- │\n",
"│ str ┆ str │\n",
"╞═════════════╪═════════════════════════════════╡\n",
"│ ---jvVJZ34o ┆ https://images.unsplash.com/ph… │\n",
"│ --0-I4GpLZU ┆ https://images.unsplash.com/ph… │\n",
"│ --2IBUMom1I ┆ https://images.unsplash.com/ph… │\n",
"│ --2sDoKRgCg ┆ https://images.unsplash.com/ph… │\n",
"│ --5QEAiAfgE ┆ https://images.unsplash.com/ph… │\n",
"│ --6JlGcHl-w ┆ https://images.unsplash.com/ph… │\n",
"│ --6sqOMUDs8 ┆ https://images.unsplash.com/ph… │\n",
"│ --97ozlPF1A ┆ https://images.unsplash.com/ph… │\n",
"│ --D4Gg8RhIk ┆ https://images.unsplash.com/ph… │\n",
"│ --EUYLhCTdc ┆ https://images.unsplash.com/ph… │\n",
"└─────────────┴─────────────────────────────────┘"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"photos = (\n",
" pl.scan_csv(\n",
@ -37,7 +82,7 @@
" )\n",
" .filter(pl.col(\"photo_featured\") == \"t\")\n",
" .sort(\"photo_id\")\n",
" .select(\"photo_id\", \"photo_url\")\n",
" .select(\"photo_id\", \"photo_image_url\")\n",
" .collect()\n",
")\n",
"\n",
@ -46,9 +91,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-07-08 19:43:32,887 - INFO - Found 273293 missing photos\n"
]
}
],
"source": [
"keys = {path.name.split(\".\")[0] for path in TARGET_PATH.glob(\"*\")}\n",
"photos = photos.filter(~pl.col(\"photo_id\").is_in(keys))\n",
@ -57,9 +110,24 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/273293 [00:00<?, ?it/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 70/273293 [00:18<30:01:41, 2.53it/s]"
]
}
],
"source": [
"import concurrent.futures\n",
"import requests\n",
@ -69,23 +137,28 @@
"\n",
"progress = tqdm(total=len(photos))\n",
"\n",
"\n",
"def download_image(row):\n",
" for retry_count in range(RETRY_COUNT):\n",
" try:\n",
" logging.debug(f\"Downloading {row['photo_id']} from {row['photo_url']}\")\n",
" response = requests.get(row[\"photo_image_url\"], timeout=HTTP_TIMEOUT)\n",
" url = row[\"photo_image_url\"]\n",
" photo_id = row[\"photo_id\"]\n",
" logging.debug(f\"Downloading {photo_id} from {url}\")\n",
" response = requests.get(url, timeout=HTTP_TIMEOUT)\n",
" response.raise_for_status()\n",
" extension = response.headers[\"Content-Type\"].split(\"/\")[-1]\n",
" filename = TARGET_PATH / f\"{row['photo_id']}.{extension}\"\n",
" filename = TARGET_PATH / f\"{photo_id}.{extension}\"\n",
" with open(filename, \"wb\") as f:\n",
" f.write(response.content)\n",
" logging.debug(f\"Downloaded {row['photo_id']} to {filename}\")\n",
" logging.debug(f\"Downloaded {photo_id} to {filename}\")\n",
" with progress.get_lock():\n",
" progress.update(1)\n",
" return\n",
" except Exception as e:\n",
" logging.error(\n",
" f\"Error downloading {row['photo_id']} from {row['photo_url']} (retry {retry_count}): {e}\"\n",
" f\"Error downloading {photo_id} from {url} (retry {retry_count}): {e}\",\n",
" exc_info=True,\n",
" stack_info=True,\n",
" )\n",
" sleep(retry_count * 0.5)\n",
"\n",