Improve
This commit is contained in:
parent
a7cdd22619
commit
28425d53af
1 changed files with 87 additions and 14 deletions
|
|
@ -2,7 +2,7 @@
|
|||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
|
@ -19,15 +19,60 @@
|
|||
"TARGET_PATH = Path(\"/bulk2/downloaded-unsplash\")\n",
|
||||
"\n",
|
||||
"TARGET_PATH = TARGET_PATH.resolve()\n",
|
||||
"assert TARGET_PATH.exists()\n",
|
||||
"assert TARGET_PATH.exists()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"delete_corrupt_images(list(TARGET_PATH.glob(\"*\")))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div><style>\n",
|
||||
".dataframe > thead > tr,\n",
|
||||
".dataframe > tbody > tr {\n",
|
||||
" text-align: right;\n",
|
||||
" white-space: pre-wrap;\n",
|
||||
"}\n",
|
||||
"</style>\n",
|
||||
"<small>shape: (10, 2)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>photo_id</th><th>photo_image_url</th></tr><tr><td>str</td><td>str</td></tr></thead><tbody><tr><td>"---jvVJZ34o"</td><td>"https://images.unsplash.com/ph…</td></tr><tr><td>"--0-I4GpLZU"</td><td>"https://images.unsplash.com/ph…</td></tr><tr><td>"--2IBUMom1I"</td><td>"https://images.unsplash.com/ph…</td></tr><tr><td>"--2sDoKRgCg"</td><td>"https://images.unsplash.com/ph…</td></tr><tr><td>"--5QEAiAfgE"</td><td>"https://images.unsplash.com/ph…</td></tr><tr><td>"--6JlGcHl-w"</td><td>"https://images.unsplash.com/ph…</td></tr><tr><td>"--6sqOMUDs8"</td><td>"https://images.unsplash.com/ph…</td></tr><tr><td>"--97ozlPF1A"</td><td>"https://images.unsplash.com/ph…</td></tr><tr><td>"--D4Gg8RhIk"</td><td>"https://images.unsplash.com/ph…</td></tr><tr><td>"--EUYLhCTdc"</td><td>"https://images.unsplash.com/ph…</td></tr></tbody></table></div>"
|
||||
],
|
||||
"text/plain": [
|
||||
"shape: (10, 2)\n",
|
||||
"┌─────────────┬─────────────────────────────────┐\n",
|
||||
"│ photo_id ┆ photo_image_url │\n",
|
||||
"│ --- ┆ --- │\n",
|
||||
"│ str ┆ str │\n",
|
||||
"╞═════════════╪═════════════════════════════════╡\n",
|
||||
"│ ---jvVJZ34o ┆ https://images.unsplash.com/ph… │\n",
|
||||
"│ --0-I4GpLZU ┆ https://images.unsplash.com/ph… │\n",
|
||||
"│ --2IBUMom1I ┆ https://images.unsplash.com/ph… │\n",
|
||||
"│ --2sDoKRgCg ┆ https://images.unsplash.com/ph… │\n",
|
||||
"│ --5QEAiAfgE ┆ https://images.unsplash.com/ph… │\n",
|
||||
"│ --6JlGcHl-w ┆ https://images.unsplash.com/ph… │\n",
|
||||
"│ --6sqOMUDs8 ┆ https://images.unsplash.com/ph… │\n",
|
||||
"│ --97ozlPF1A ┆ https://images.unsplash.com/ph… │\n",
|
||||
"│ --D4Gg8RhIk ┆ https://images.unsplash.com/ph… │\n",
|
||||
"│ --EUYLhCTdc ┆ https://images.unsplash.com/ph… │\n",
|
||||
"└─────────────┴─────────────────────────────────┘"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"photos = (\n",
|
||||
" pl.scan_csv(\n",
|
||||
|
|
@ -37,7 +82,7 @@
|
|||
" )\n",
|
||||
" .filter(pl.col(\"photo_featured\") == \"t\")\n",
|
||||
" .sort(\"photo_id\")\n",
|
||||
" .select(\"photo_id\", \"photo_url\")\n",
|
||||
" .select(\"photo_id\", \"photo_image_url\")\n",
|
||||
" .collect()\n",
|
||||
")\n",
|
||||
"\n",
|
||||
|
|
@ -46,9 +91,17 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2024-07-08 19:43:32,887 - INFO - Found 273293 missing photos\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"keys = {path.name.split(\".\")[0] for path in TARGET_PATH.glob(\"*\")}\n",
|
||||
"photos = photos.filter(~pl.col(\"photo_id\").is_in(keys))\n",
|
||||
|
|
@ -57,9 +110,24 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 0%| | 0/273293 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 0%| | 70/273293 [00:18<30:01:41, 2.53it/s]"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import concurrent.futures\n",
|
||||
"import requests\n",
|
||||
|
|
@ -69,23 +137,28 @@
|
|||
"\n",
|
||||
"progress = tqdm(total=len(photos))\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def download_image(row):\n",
|
||||
" for retry_count in range(RETRY_COUNT):\n",
|
||||
" try:\n",
|
||||
" logging.debug(f\"Downloading {row['photo_id']} from {row['photo_url']}\")\n",
|
||||
" response = requests.get(row[\"photo_image_url\"], timeout=HTTP_TIMEOUT)\n",
|
||||
" url = row[\"photo_image_url\"]\n",
|
||||
" photo_id = row[\"photo_id\"]\n",
|
||||
" logging.debug(f\"Downloading {photo_id} from {url}\")\n",
|
||||
" response = requests.get(url, timeout=HTTP_TIMEOUT)\n",
|
||||
" response.raise_for_status()\n",
|
||||
" extension = response.headers[\"Content-Type\"].split(\"/\")[-1]\n",
|
||||
" filename = TARGET_PATH / f\"{row['photo_id']}.{extension}\"\n",
|
||||
" filename = TARGET_PATH / f\"{photo_id}.{extension}\"\n",
|
||||
" with open(filename, \"wb\") as f:\n",
|
||||
" f.write(response.content)\n",
|
||||
" logging.debug(f\"Downloaded {row['photo_id']} to {filename}\")\n",
|
||||
" logging.debug(f\"Downloaded {photo_id} to {filename}\")\n",
|
||||
" with progress.get_lock():\n",
|
||||
" progress.update(1)\n",
|
||||
" return\n",
|
||||
" except Exception as e:\n",
|
||||
" logging.error(\n",
|
||||
" f\"Error downloading {row['photo_id']} from {row['photo_url']} (retry {retry_count}): {e}\"\n",
|
||||
" f\"Error downloading {photo_id} from {url} (retry {retry_count}): {e}\",\n",
|
||||
" exc_info=True,\n",
|
||||
" stack_info=True,\n",
|
||||
" )\n",
|
||||
" sleep(retry_count * 0.5)\n",
|
||||
"\n",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue