diff --git a/src/fetch_from_unsplash.ipynb b/src/fetch_from_unsplash.ipynb index 1fd51e9..c7bbfef 100644 --- a/src/fetch_from_unsplash.ipynb +++ b/src/fetch_from_unsplash.ipynb @@ -2,792 +2,97 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
photo_idphoto_urlphoto_image_urlphoto_submitted_atphoto_featuredphoto_widthphoto_heightphoto_aspect_ratiophoto_descriptionphotographer_username...photo_location_countryphoto_location_citystats_viewsstats_downloadsai_descriptionai_primary_landmark_nameai_primary_landmark_latitudeai_primary_landmark_longitudeai_primary_landmark_confidenceblur_hash
3690XFmznQhx9lMhttps://unsplash.com/photos/XFmznQhx9lMhttps://images.unsplash.com/photo-156347321301...2019-07-18 18:07:14.031684t444329621.50Fall color in the countryside of Eastern Washi...timothyeberly...NaNNaN2978748547304950orange leaf treesNaNNaNNaNNaNLBJPSa4o0hW?pI4;-.R*E459O?sk
6103YDNvydD1jAYhttps://unsplash.com/photos/YDNvydD1jAYhttps://images.unsplash.com/photo-149034936815...2017-03-24 09:56:57.505262t450030001.50Flowers in springmaartendeckers...BelgiumNaN2722857886416983pink, yellow and brown petaled flowersNaNNaNNaNNaNLQJInG*JMyIm^ROpxbNFyCNGnln4
33894oovIxttThAhttps://unsplash.com/photos/4oovIxttThAhttps://images.unsplash.com/photo-1560850038-f...2019-06-18 09:36:35.94311t502531411.60NaNa8ka...NaNNaN2190084956253730aerial view of houses near oceanNaNNaNNaNNaNLaCt8}~BwNIpozoLofofWBWBaef6
18789BkR842UVXqkhttps://unsplash.com/photos/BkR842UVXqkhttps://images.unsplash.com/photo-1558816280-d...2019-05-25 20:32:08.153319t400060000.67NaNolenkasergienko...NaNNaN1934025254294785pink petaled flowerNaNNaNNaNNaNLA71AxX50_xHt7j[S1ju0_nm^8NZ
21742GRLN5FC4cLghttps://unsplash.com/photos/GRLN5FC4cLghttps://images.unsplash.com/photo-1552300977-c...2019-03-11 10:50:25.9311t299239920.75NaNturner_imagery...NaNNaN1916027735306073high angle photography of cliffNaNNaNNaNNaNLKCr=#~VNat7X-%M%1j?9tNbxaay
..................................................................
32725zsw1PjXg8khttps://unsplash.com/photos/5zsw1PjXg8khttps://images.unsplash.com/photo-142372152343...2015-02-12 06:12:09.092905f244832640.75NaNmelissaaskew...NaNNaN14088812waterfalls in the middle of the forestNaNNaNNaNNaNLG8#NK.84m4mt6f#RjkD9EM_%N-=
3273gqa-fnYASIQhttps://unsplash.com/photos/gqa-fnYASIQhttps://images.unsplash.com/photo-142302719730...2015-02-04 05:19:59.869141f508633911.50NaNwilstewart3...NaNNaN13384858a street sign sitting on the side of a body of...NaNNaNNaNNaNLvKKi,RjM{j[_NWBWBfk5EoLoeaz
12956Cq62qvCW8bMhttps://unsplash.com/photos/Cq62qvCW8bMhttps://images.unsplash.com/photo-142245228993...2015-01-28 13:38:18.071331f489632641.50NaNkseny...NaNNaN130681054man in black shirt and blue pants sitting on b...NaNNaNNaNNaNLGEW2ko~M{%N0;ofnhRkwvozt8of
17074Py8vZdCw35Uhttps://unsplash.com/photos/Py8vZdCw35Uhttps://images.unsplash.com/photo-142034363140...2015-01-04 03:54:41.031772f232015531.49NaNmrbrodeur...NaNNaN12617581man in black jacket standing on brown sand und...NaNNaNNaNNaNLuI~ZRogaxR*0?Rjofj[Mxs.a|fP
1394SpRN0qZPLr8https://unsplash.com/photos/SpRN0qZPLr8https://images.unsplash.com/photo-141621393610...2014-11-17 08:47:33.427134f601640001.50NaNtarunccet...NaNNaN8266101brown wooden house on green grass field near b...NaNNaNNaNNaNLjEppYn}j]kC%jj[f6f6x8fPaxay
\n", - "

25000 rows × 31 columns

\n", - "
" - ], - "text/plain": [ - " photo_id photo_url \\\n", - "3690 XFmznQhx9lM https://unsplash.com/photos/XFmznQhx9lM \n", - "6103 YDNvydD1jAY https://unsplash.com/photos/YDNvydD1jAY \n", - "3389 4oovIxttThA https://unsplash.com/photos/4oovIxttThA \n", - "18789 BkR842UVXqk https://unsplash.com/photos/BkR842UVXqk \n", - "21742 GRLN5FC4cLg https://unsplash.com/photos/GRLN5FC4cLg \n", - "... ... ... \n", - "3272 5zsw1PjXg8k https://unsplash.com/photos/5zsw1PjXg8k \n", - "3273 gqa-fnYASIQ https://unsplash.com/photos/gqa-fnYASIQ \n", - "12956 Cq62qvCW8bM https://unsplash.com/photos/Cq62qvCW8bM \n", - "17074 Py8vZdCw35U https://unsplash.com/photos/Py8vZdCw35U \n", - "1394 SpRN0qZPLr8 https://unsplash.com/photos/SpRN0qZPLr8 \n", - "\n", - " photo_image_url \\\n", - "3690 https://images.unsplash.com/photo-156347321301... \n", - "6103 https://images.unsplash.com/photo-149034936815... \n", - "3389 https://images.unsplash.com/photo-1560850038-f... \n", - "18789 https://images.unsplash.com/photo-1558816280-d... \n", - "21742 https://images.unsplash.com/photo-1552300977-c... \n", - "... ... \n", - "3272 https://images.unsplash.com/photo-142372152343... \n", - "3273 https://images.unsplash.com/photo-142302719730... \n", - "12956 https://images.unsplash.com/photo-142245228993... \n", - "17074 https://images.unsplash.com/photo-142034363140... \n", - "1394 https://images.unsplash.com/photo-141621393610... \n", - "\n", - " photo_submitted_at photo_featured photo_width photo_height \\\n", - "3690 2019-07-18 18:07:14.031684 t 4443 2962 \n", - "6103 2017-03-24 09:56:57.505262 t 4500 3000 \n", - "3389 2019-06-18 09:36:35.94311 t 5025 3141 \n", - "18789 2019-05-25 20:32:08.153319 t 4000 6000 \n", - "21742 2019-03-11 10:50:25.9311 t 2992 3992 \n", - "... ... ... ... ... 
\n", - "3272 2015-02-12 06:12:09.092905 f 2448 3264 \n", - "3273 2015-02-04 05:19:59.869141 f 5086 3391 \n", - "12956 2015-01-28 13:38:18.071331 f 4896 3264 \n", - "17074 2015-01-04 03:54:41.031772 f 2320 1553 \n", - "1394 2014-11-17 08:47:33.427134 f 6016 4000 \n", - "\n", - " photo_aspect_ratio photo_description \\\n", - "3690 1.50 Fall color in the countryside of Eastern Washi... \n", - "6103 1.50 Flowers in spring \n", - "3389 1.60 NaN \n", - "18789 0.67 NaN \n", - "21742 0.75 NaN \n", - "... ... ... \n", - "3272 0.75 NaN \n", - "3273 1.50 NaN \n", - "12956 1.50 NaN \n", - "17074 1.49 NaN \n", - "1394 1.50 NaN \n", - "\n", - " photographer_username ... photo_location_country photo_location_city \\\n", - "3690 timothyeberly ... NaN NaN \n", - "6103 maartendeckers ... Belgium NaN \n", - "3389 a8ka ... NaN NaN \n", - "18789 olenkasergienko ... NaN NaN \n", - "21742 turner_imagery ... NaN NaN \n", - "... ... ... ... ... \n", - "3272 melissaaskew ... NaN NaN \n", - "3273 wilstewart3 ... NaN NaN \n", - "12956 kseny ... NaN NaN \n", - "17074 mrbrodeur ... NaN NaN \n", - "1394 tarunccet ... NaN NaN \n", - "\n", - " stats_views stats_downloads \\\n", - "3690 2978748547 304950 \n", - "6103 2722857886 416983 \n", - "3389 2190084956 253730 \n", - "18789 1934025254 294785 \n", - "21742 1916027735 306073 \n", - "... ... ... \n", - "3272 14088 812 \n", - "3273 13384 858 \n", - "12956 13068 1054 \n", - "17074 12617 581 \n", - "1394 8266 101 \n", - "\n", - " ai_description \\\n", - "3690 orange leaf trees \n", - "6103 pink, yellow and brown petaled flowers \n", - "3389 aerial view of houses near ocean \n", - "18789 pink petaled flower \n", - "21742 high angle photography of cliff \n", - "... ... \n", - "3272 waterfalls in the middle of the forest \n", - "3273 a street sign sitting on the side of a body of... \n", - "12956 man in black shirt and blue pants sitting on b... \n", - "17074 man in black jacket standing on brown sand und... 
# %% Notebook setup: imports, logging and download configuration.
import logging
from pathlib import Path

import polars as pl

from utils import set_up_logging, delete_corrupt_images

set_up_logging(Path("../logs"))

RETRY_COUNT = 10  # maximum download attempts per photo
WORKER_COUNT = 64  # size of the download thread pool
HTTP_TIMEOUT = 30  # seconds; passed as `timeout=` to requests.get
TARGET_PATH = Path("/bulk2/downloaded-unsplash").resolve()

# Validate explicitly instead of with `assert`: asserts are stripped under
# `python -O`, carry no message, and `exists()` would also accept a regular
# file, which the directory globbing below cannot work with.
if not TARGET_PATH.is_dir():
    raise FileNotFoundError(f"Target directory does not exist: {TARGET_PATH}")

# NOTE(review): delete_corrupt_images lives in utils and is not visible here;
# presumably it removes unreadable files left behind by interrupted runs.
delete_corrupt_images(list(TARGET_PATH.glob("*")))
downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n", - "Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n", - "Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n", - "Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n", - "Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n", - "Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n", - "Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n", - "Error downloading vigsqYux_-8: 
HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 12%|█▏ | 2885/25000 [12:37<1:37:39, 3.77it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n", - "Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n", - "Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n", - "Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n", - "Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n", - "Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n", - "Error downloading 
9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n", - "Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n", - "Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n", - "Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 12%|█▏ | 2909/25000 [12:43<1:15:43, 4.86it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Error downloading NcociWzk23A: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 14%|█▍ | 3505/25000 [15:13<1:46:00, 3.38it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n", - "Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 
'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n", - "Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n", - "Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n", - "Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n", - "Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n", - "Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n", - "Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n", - "Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not 
known)\"))\n", - "Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 29%|██▉ | 7352/25000 [31:15<1:03:57, 4.60it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Error downloading T2LEdBxpm54: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 34%|███▎ | 8375/25000 [35:53<1:13:19, 3.78it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Error downloading q_4pIVaXPEk: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 34%|███▍ | 8568/25000 [36:40<1:08:17, 4.01it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Error downloading iGANt1N2ge8: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out. 
(read timeout=10)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 37%|███▋ | 9282/25000 [40:24<1:41:11, 2.59it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Error downloading 2FqpN2CWCLo: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 37%|███▋ | 9343/25000 [40:45<1:22:43, 3.15it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Error downloading 4T7-GLBDLKE: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 38%|███▊ | 9398/25000 [41:03<1:43:40, 2.51it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Error downloading i-xtI6jD7bQ: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 40%|███▉ | 9992/25000 [44:16<1:03:56, 3.91it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Error downloading 0GBafJ-ZenA: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out. 
# %% Load the featured photos from the Unsplash dataset dump.
photos = (
    pl.scan_csv(
        "../data/unsplash-full/photos.tsv000",
        separator="\t",
        infer_schema_length=100000,
    )
    .filter(pl.col("photo_featured") == "t")
    .sort("photo_id")
    # photo_image_url must be selected: download_image() fetches
    # row["photo_image_url"]. Without it every attempt raised KeyError, which
    # the worker's broad `except` logged and retried, so nothing was ever
    # downloaded.
    .select("photo_id", "photo_url", "photo_image_url")
    .collect()
)

photos.limit(10)

# %% Drop photos that already have a file in TARGET_PATH.
# A file is matched to its photo by the part of the file name before the
# first dot, so the extension chosen at download time does not matter.
keys = {path.name.split(".")[0] for path in TARGET_PATH.glob("*")}
photos = photos.filter(~pl.col("photo_id").is_in(keys))
logging.info(f"Found {len(photos)} missing photos")

# %% Download the missing photos concurrently.
import concurrent.futures
import requests
from tqdm import tqdm
from typing import List
from time import sleep

progress = tqdm(total=len(photos))


def download_image(row):
    """Download one photo into TARGET_PATH, retrying on failure.

    Args:
        row: Mapping with the keys ``photo_id``, ``photo_url`` (used only in
            log messages) and ``photo_image_url`` (the URL that is fetched).

    Returns:
        None. On success the image is written to
        ``TARGET_PATH / "<photo_id>.<extension>"`` and the shared progress bar
        is advanced by one. After ``RETRY_COUNT`` failed attempts the photo is
        skipped; every failure is logged and none is raised.
    """
    for retry_count in range(RETRY_COUNT):
        try:
            logging.debug(f"Downloading {row['photo_id']} from {row['photo_url']}")
            response = requests.get(row["photo_image_url"], timeout=HTTP_TIMEOUT)
            response.raise_for_status()
            # Derive the extension from the media subtype only. A header such
            # as "image/jpeg; charset=binary" must not leak its parameters
            # into the file name.
            media_type = response.headers["Content-Type"].split(";")[0].strip()
            extension = media_type.split("/")[-1]
            filename = TARGET_PATH / f"{row['photo_id']}.{extension}"
            with open(filename, "wb") as f:
                f.write(response.content)
            logging.debug(f"Downloaded {row['photo_id']} to {filename}")
            with progress.get_lock():
                progress.update(1)
            return
        except Exception as e:
            # Worker boundary: one bad photo must not take down the pool, so
            # log the failure and retry rather than propagate it.
            logging.error(
                f"Error downloading {row['photo_id']} from {row['photo_url']} (retry {retry_count}): {e}"
            )
            # Linear back-off between attempts; pointless after the last one.
            if retry_count < RETRY_COUNT - 1:
                sleep(retry_count * 0.5)


with concurrent.futures.ThreadPoolExecutor(max_workers=WORKER_COUNT) as executor:
    futures: List[concurrent.futures.Future] = []
    for row in photos.to_dicts():
        future = executor.submit(download_image, row)
        futures.append(future)

    # NOTE(review): the diff this was recovered from omits one line between
    # its two hunks at this point (most likely a comment) - confirm against
    # the notebook.
    concurrent.futures.wait(futures)
progress.close()

# %% Remove any files that ended up corrupt during this run.
delete_corrupt_images(list(TARGET_PATH.glob("*")))