diff --git a/src/fetch_from_unsplash.ipynb b/src/fetch_from_unsplash.ipynb
index c7bbfef..4052391 100644
--- a/src/fetch_from_unsplash.ipynb
+++ b/src/fetch_from_unsplash.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -19,15 +19,60 @@
"TARGET_PATH = Path(\"/bulk2/downloaded-unsplash\")\n",
"\n",
"TARGET_PATH = TARGET_PATH.resolve()\n",
- "assert TARGET_PATH.exists()\n",
+ "assert TARGET_PATH.exists()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
"delete_corrupt_images(list(TARGET_PATH.glob(\"*\")))"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "
shape: (10, 2)| photo_id | photo_image_url |
|---|
| str | str |
| "---jvVJZ34o" | "https://images.unsplash.com/ph… |
| "--0-I4GpLZU" | "https://images.unsplash.com/ph… |
| "--2IBUMom1I" | "https://images.unsplash.com/ph… |
| "--2sDoKRgCg" | "https://images.unsplash.com/ph… |
| "--5QEAiAfgE" | "https://images.unsplash.com/ph… |
| "--6JlGcHl-w" | "https://images.unsplash.com/ph… |
| "--6sqOMUDs8" | "https://images.unsplash.com/ph… |
| "--97ozlPF1A" | "https://images.unsplash.com/ph… |
| "--D4Gg8RhIk" | "https://images.unsplash.com/ph… |
| "--EUYLhCTdc" | "https://images.unsplash.com/ph… |
"
+ ],
+ "text/plain": [
+ "shape: (10, 2)\n",
+ "┌─────────────┬─────────────────────────────────┐\n",
+ "│ photo_id ┆ photo_image_url │\n",
+ "│ --- ┆ --- │\n",
+ "│ str ┆ str │\n",
+ "╞═════════════╪═════════════════════════════════╡\n",
+ "│ ---jvVJZ34o ┆ https://images.unsplash.com/ph… │\n",
+ "│ --0-I4GpLZU ┆ https://images.unsplash.com/ph… │\n",
+ "│ --2IBUMom1I ┆ https://images.unsplash.com/ph… │\n",
+ "│ --2sDoKRgCg ┆ https://images.unsplash.com/ph… │\n",
+ "│ --5QEAiAfgE ┆ https://images.unsplash.com/ph… │\n",
+ "│ --6JlGcHl-w ┆ https://images.unsplash.com/ph… │\n",
+ "│ --6sqOMUDs8 ┆ https://images.unsplash.com/ph… │\n",
+ "│ --97ozlPF1A ┆ https://images.unsplash.com/ph… │\n",
+ "│ --D4Gg8RhIk ┆ https://images.unsplash.com/ph… │\n",
+ "│ --EUYLhCTdc ┆ https://images.unsplash.com/ph… │\n",
+ "└─────────────┴─────────────────────────────────┘"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"photos = (\n",
" pl.scan_csv(\n",
@@ -37,7 +82,7 @@
" )\n",
" .filter(pl.col(\"photo_featured\") == \"t\")\n",
" .sort(\"photo_id\")\n",
- " .select(\"photo_id\", \"photo_url\")\n",
+ " .select(\"photo_id\", \"photo_image_url\")\n",
" .collect()\n",
")\n",
"\n",
@@ -46,9 +91,17 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2024-07-08 19:43:32,887 - INFO - Found 273293 missing photos\n"
+ ]
+ }
+ ],
"source": [
"keys = {path.name.split(\".\")[0] for path in TARGET_PATH.glob(\"*\")}\n",
"photos = photos.filter(~pl.col(\"photo_id\").is_in(keys))\n",
@@ -57,9 +110,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 0%| | 0/273293 [00:00<?, ?it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 0%| | 70/273293 [00:18<30:01:41, 2.53it/s]"
+ ]
+ }
+ ],
"source": [
"import concurrent.futures\n",
"import requests\n",
@@ -69,23 +137,28 @@
"\n",
"progress = tqdm(total=len(photos))\n",
"\n",
+ "\n",
"def download_image(row):\n",
+ "    url = row[\"photo_image_url\"]  # bound before the retry loop so the except handler can always log it\n",
+ "    photo_id = row[\"photo_id\"]\n",
"    for retry_count in range(RETRY_COUNT):\n",
"        try:\n",
- "            logging.debug(f\"Downloading {row['photo_id']} from {row['photo_url']}\")\n",
- "            response = requests.get(row[\"photo_image_url\"], timeout=HTTP_TIMEOUT)\n",
+ "            logging.debug(f\"Downloading {photo_id} from {url}\")\n",
+ "            response = requests.get(url, timeout=HTTP_TIMEOUT)\n",
"            response.raise_for_status()\n",
"            extension = response.headers[\"Content-Type\"].split(\"/\")[-1]\n",
- "            filename = TARGET_PATH / f\"{row['photo_id']}.{extension}\"\n",
+ "            filename = TARGET_PATH / f\"{photo_id}.{extension}\"\n",
"            with open(filename, \"wb\") as f:\n",
"                f.write(response.content)\n",
- "            logging.debug(f\"Downloaded {row['photo_id']} to {filename}\")\n",
+ "            logging.debug(f\"Downloaded {photo_id} to {filename}\")\n",
"            with progress.get_lock():\n",
"                progress.update(1)\n",
"            return\n",
"        except Exception as e:\n",
"            logging.error(\n",
- "                f\"Error downloading {row['photo_id']} from {row['photo_url']} (retry {retry_count}): {e}\"\n",
+ "                f\"Error downloading {photo_id} from {url} (retry {retry_count}): {e}\",\n",
+ "                exc_info=True,\n",
+ "                stack_info=True,\n",
"            )\n",
"            sleep(retry_count * 0.5)\n",
"\n",