From 28425d53af7a7acb93504a2cd2b2edc38b09a2e7 Mon Sep 17 00:00:00 2001 From: Andras Schmelczer Date: Mon, 8 Jul 2024 20:56:01 +0100 Subject: [PATCH] Improve --- src/fetch_from_unsplash.ipynb | 101 +++++++++++++++++++++++++++++----- 1 file changed, 87 insertions(+), 14 deletions(-) diff --git a/src/fetch_from_unsplash.ipynb b/src/fetch_from_unsplash.ipynb index c7bbfef..4052391 100644 --- a/src/fetch_from_unsplash.ipynb +++ b/src/fetch_from_unsplash.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -19,15 +19,60 @@ "TARGET_PATH = Path(\"/bulk2/downloaded-unsplash\")\n", "\n", "TARGET_PATH = TARGET_PATH.resolve()\n", - "assert TARGET_PATH.exists()\n", + "assert TARGET_PATH.exists()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ "delete_corrupt_images(list(TARGET_PATH.glob(\"*\")))" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (10, 2)
photo_idphoto_image_url
strstr
"---jvVJZ34o""https://images.unsplash.com/ph…
"--0-I4GpLZU""https://images.unsplash.com/ph…
"--2IBUMom1I""https://images.unsplash.com/ph…
"--2sDoKRgCg""https://images.unsplash.com/ph…
"--5QEAiAfgE""https://images.unsplash.com/ph…
"--6JlGcHl-w""https://images.unsplash.com/ph…
"--6sqOMUDs8""https://images.unsplash.com/ph…
"--97ozlPF1A""https://images.unsplash.com/ph…
"--D4Gg8RhIk""https://images.unsplash.com/ph…
"--EUYLhCTdc""https://images.unsplash.com/ph…
" + ], + "text/plain": [ + "shape: (10, 2)\n", + "┌─────────────┬─────────────────────────────────┐\n", + "│ photo_id ┆ photo_image_url │\n", + "│ --- ┆ --- │\n", + "│ str ┆ str │\n", + "╞═════════════╪═════════════════════════════════╡\n", + "│ ---jvVJZ34o ┆ https://images.unsplash.com/ph… │\n", + "│ --0-I4GpLZU ┆ https://images.unsplash.com/ph… │\n", + "│ --2IBUMom1I ┆ https://images.unsplash.com/ph… │\n", + "│ --2sDoKRgCg ┆ https://images.unsplash.com/ph… │\n", + "│ --5QEAiAfgE ┆ https://images.unsplash.com/ph… │\n", + "│ --6JlGcHl-w ┆ https://images.unsplash.com/ph… │\n", + "│ --6sqOMUDs8 ┆ https://images.unsplash.com/ph… │\n", + "│ --97ozlPF1A ┆ https://images.unsplash.com/ph… │\n", + "│ --D4Gg8RhIk ┆ https://images.unsplash.com/ph… │\n", + "│ --EUYLhCTdc ┆ https://images.unsplash.com/ph… │\n", + "└─────────────┴─────────────────────────────────┘" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "photos = (\n", " pl.scan_csv(\n", @@ -37,7 +82,7 @@ " )\n", " .filter(pl.col(\"photo_featured\") == \"t\")\n", " .sort(\"photo_id\")\n", - " .select(\"photo_id\", \"photo_url\")\n", + " .select(\"photo_id\", \"photo_image_url\")\n", " .collect()\n", ")\n", "\n", @@ -46,9 +91,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-07-08 19:43:32,887 - INFO - Found 273293 missing photos\n" + ] + } + ], "source": [ "keys = {path.name.split(\".\")[0] for path in TARGET_PATH.glob(\"*\")}\n", "photos = photos.filter(~pl.col(\"photo_id\").is_in(keys))\n", @@ -57,9 +110,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/273293 [00:00