{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
photo_idphoto_urlphoto_image_urlphoto_submitted_atphoto_featuredphoto_widthphoto_heightphoto_aspect_ratiophoto_descriptionphotographer_username...photo_location_countryphoto_location_citystats_viewsstats_downloadsai_descriptionai_primary_landmark_nameai_primary_landmark_latitudeai_primary_landmark_longitudeai_primary_landmark_confidenceblur_hash
3690XFmznQhx9lMhttps://unsplash.com/photos/XFmznQhx9lMhttps://images.unsplash.com/photo-156347321301...2019-07-18 18:07:14.031684t444329621.50Fall color in the countryside of Eastern Washi...timothyeberly...NaNNaN2978748547304950orange leaf treesNaNNaNNaNNaNLBJPSa4o0hW?pI4;-.R*E459O?sk
6103YDNvydD1jAYhttps://unsplash.com/photos/YDNvydD1jAYhttps://images.unsplash.com/photo-149034936815...2017-03-24 09:56:57.505262t450030001.50Flowers in springmaartendeckers...BelgiumNaN2722857886416983pink, yellow and brown petaled flowersNaNNaNNaNNaNLQJInG*JMyIm^ROpxbNFyCNGnln4
33894oovIxttThAhttps://unsplash.com/photos/4oovIxttThAhttps://images.unsplash.com/photo-1560850038-f...2019-06-18 09:36:35.94311t502531411.60NaNa8ka...NaNNaN2190084956253730aerial view of houses near oceanNaNNaNNaNNaNLaCt8}~BwNIpozoLofofWBWBaef6
18789BkR842UVXqkhttps://unsplash.com/photos/BkR842UVXqkhttps://images.unsplash.com/photo-1558816280-d...2019-05-25 20:32:08.153319t400060000.67NaNolenkasergienko...NaNNaN1934025254294785pink petaled flowerNaNNaNNaNNaNLA71AxX50_xHt7j[S1ju0_nm^8NZ
21742GRLN5FC4cLghttps://unsplash.com/photos/GRLN5FC4cLghttps://images.unsplash.com/photo-1552300977-c...2019-03-11 10:50:25.9311t299239920.75NaNturner_imagery...NaNNaN1916027735306073high angle photography of cliffNaNNaNNaNNaNLKCr=#~VNat7X-%M%1j?9tNbxaay
..................................................................
32725zsw1PjXg8khttps://unsplash.com/photos/5zsw1PjXg8khttps://images.unsplash.com/photo-142372152343...2015-02-12 06:12:09.092905f244832640.75NaNmelissaaskew...NaNNaN14088812waterfalls in the middle of the forestNaNNaNNaNNaNLG8#NK.84m4mt6f#RjkD9EM_%N-=
3273gqa-fnYASIQhttps://unsplash.com/photos/gqa-fnYASIQhttps://images.unsplash.com/photo-142302719730...2015-02-04 05:19:59.869141f508633911.50NaNwilstewart3...NaNNaN13384858a street sign sitting on the side of a body of...NaNNaNNaNNaNLvKKi,RjM{j[_NWBWBfk5EoLoeaz
12956Cq62qvCW8bMhttps://unsplash.com/photos/Cq62qvCW8bMhttps://images.unsplash.com/photo-142245228993...2015-01-28 13:38:18.071331f489632641.50NaNkseny...NaNNaN130681054man in black shirt and blue pants sitting on b...NaNNaNNaNNaNLGEW2ko~M{%N0;ofnhRkwvozt8of
17074Py8vZdCw35Uhttps://unsplash.com/photos/Py8vZdCw35Uhttps://images.unsplash.com/photo-142034363140...2015-01-04 03:54:41.031772f232015531.49NaNmrbrodeur...NaNNaN12617581man in black jacket standing on brown sand und...NaNNaNNaNNaNLuI~ZRogaxR*0?Rjofj[Mxs.a|fP
1394SpRN0qZPLr8https://unsplash.com/photos/SpRN0qZPLr8https://images.unsplash.com/photo-141621393610...2014-11-17 08:47:33.427134f601640001.50NaNtarunccet...NaNNaN8266101brown wooden house on green grass field near b...NaNNaNNaNNaNLjEppYn}j]kC%jj[f6f6x8fPaxay
\n", "

25000 rows × 31 columns

\n", "
" ], "text/plain": [ " photo_id photo_url \\\n", "3690 XFmznQhx9lM https://unsplash.com/photos/XFmznQhx9lM \n", "6103 YDNvydD1jAY https://unsplash.com/photos/YDNvydD1jAY \n", "3389 4oovIxttThA https://unsplash.com/photos/4oovIxttThA \n", "18789 BkR842UVXqk https://unsplash.com/photos/BkR842UVXqk \n", "21742 GRLN5FC4cLg https://unsplash.com/photos/GRLN5FC4cLg \n", "... ... ... \n", "3272 5zsw1PjXg8k https://unsplash.com/photos/5zsw1PjXg8k \n", "3273 gqa-fnYASIQ https://unsplash.com/photos/gqa-fnYASIQ \n", "12956 Cq62qvCW8bM https://unsplash.com/photos/Cq62qvCW8bM \n", "17074 Py8vZdCw35U https://unsplash.com/photos/Py8vZdCw35U \n", "1394 SpRN0qZPLr8 https://unsplash.com/photos/SpRN0qZPLr8 \n", "\n", " photo_image_url \\\n", "3690 https://images.unsplash.com/photo-156347321301... \n", "6103 https://images.unsplash.com/photo-149034936815... \n", "3389 https://images.unsplash.com/photo-1560850038-f... \n", "18789 https://images.unsplash.com/photo-1558816280-d... \n", "21742 https://images.unsplash.com/photo-1552300977-c... \n", "... ... \n", "3272 https://images.unsplash.com/photo-142372152343... \n", "3273 https://images.unsplash.com/photo-142302719730... \n", "12956 https://images.unsplash.com/photo-142245228993... \n", "17074 https://images.unsplash.com/photo-142034363140... \n", "1394 https://images.unsplash.com/photo-141621393610... \n", "\n", " photo_submitted_at photo_featured photo_width photo_height \\\n", "3690 2019-07-18 18:07:14.031684 t 4443 2962 \n", "6103 2017-03-24 09:56:57.505262 t 4500 3000 \n", "3389 2019-06-18 09:36:35.94311 t 5025 3141 \n", "18789 2019-05-25 20:32:08.153319 t 4000 6000 \n", "21742 2019-03-11 10:50:25.9311 t 2992 3992 \n", "... ... ... ... ... \n", "3272 2015-02-12 06:12:09.092905 f 2448 3264 \n", "3273 2015-02-04 05:19:59.869141 f 5086 3391 \n", "12956 2015-01-28 13:38:18.071331 f 4896 3264 \n", "17074 2015-01-04 03:54:41.031772 f 2320 1553 \n", "1394 2014-11-17 08:47:33.427134 f 6016 4000 \n", "\n", " photo_aspect_ratio photo_description \\\n", "3690 1.50 Fall color in the countryside of Eastern Washi... \n", "6103 1.50 Flowers in spring \n", "3389 1.60 NaN \n", "18789 0.67 NaN \n", "21742 0.75 NaN \n", "... ... ... \n", "3272 0.75 NaN \n", "3273 1.50 NaN \n", "12956 1.50 NaN \n", "17074 1.49 NaN \n", "1394 1.50 NaN \n", "\n", " photographer_username ... photo_location_country photo_location_city \\\n", "3690 timothyeberly ... NaN NaN \n", "6103 maartendeckers ... Belgium NaN \n", "3389 a8ka ... NaN NaN \n", "18789 olenkasergienko ... NaN NaN \n", "21742 turner_imagery ... NaN NaN \n", "... ... ... ... ... \n", "3272 melissaaskew ... NaN NaN \n", "3273 wilstewart3 ... NaN NaN \n", "12956 kseny ... NaN NaN \n", "17074 mrbrodeur ... NaN NaN \n", "1394 tarunccet ... NaN NaN \n", "\n", " stats_views stats_downloads \\\n", "3690 2978748547 304950 \n", "6103 2722857886 416983 \n", "3389 2190084956 253730 \n", "18789 1934025254 294785 \n", "21742 1916027735 306073 \n", "... ... ... \n", "3272 14088 812 \n", "3273 13384 858 \n", "12956 13068 1054 \n", "17074 12617 581 \n", "1394 8266 101 \n", "\n", " ai_description \\\n", "3690 orange leaf trees \n", "6103 pink, yellow and brown petaled flowers \n", "3389 aerial view of houses near ocean \n", "18789 pink petaled flower \n", "21742 high angle photography of cliff \n", "... ... \n", "3272 waterfalls in the middle of the forest \n", "3273 a street sign sitting on the side of a body of... \n", "12956 man in black shirt and blue pants sitting on b... \n", "17074 man in black jacket standing on brown sand und... \n", "1394 brown wooden house on green grass field near b... \n", "\n", " ai_primary_landmark_name ai_primary_landmark_latitude \\\n", "3690 NaN NaN \n", "6103 NaN NaN \n", "3389 NaN NaN \n", "18789 NaN NaN \n", "21742 NaN NaN \n", "... ... ... \n", "3272 NaN NaN \n", "3273 NaN NaN \n", "12956 NaN NaN \n", "17074 NaN NaN \n", "1394 NaN NaN \n", "\n", " ai_primary_landmark_longitude ai_primary_landmark_confidence \\\n", "3690 NaN NaN \n", "6103 NaN NaN \n", "3389 NaN NaN \n", "18789 NaN NaN \n", "21742 NaN NaN \n", "... ... ... \n", "3272 NaN NaN \n", "3273 NaN NaN \n", "12956 NaN NaN \n", "17074 NaN NaN \n", "1394 NaN NaN \n", "\n", " blur_hash \n", "3690 LBJPSa4o0hW?pI4;-.R*E459O?sk \n", "6103 LQJInG*JMyIm^ROpxbNFyCNGnln4 \n", "3389 LaCt8}~BwNIpozoLofofWBWBaef6 \n", "18789 LA71AxX50_xHt7j[S1ju0_nm^8NZ \n", "21742 LKCr=#~VNat7X-%M%1j?9tNbxaay \n", "... ... \n", "3272 LG8#NK.84m4mt6f#RjkD9EM_%N-= \n", "3273 LvKKi,RjM{j[_NWBWBfk5EoLoeaz \n", "12956 LGEW2ko~M{%N0;ofnhRkwvozt8of \n", "17074 LuI~ZRogaxR*0?Rjofj[Mxs.a|fP \n", "1394 LjEppYn}j]kC%jj[f6f6x8fPaxay \n", "\n", "[25000 rows x 31 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "from pathlib import Path\n", "\n", "DATA_PATH = Path('/mnt/wsl/PHYSICALDRIVE1/data/unsplash')\n", "DATA_PATH.mkdir(exist_ok=True, parents=True)\n", "\n", "unsplash_dataset_path = \"/home/andras/projects/bipolaroid/unsplash-research-dataset-lite-latest/photos.tsv000\"\n", "unsplash_dataset = pd.read_csv(unsplash_dataset_path, sep=\"\\t\")\n", "unsplash_dataset.sort_values(by=\"stats_views\", ascending=False, inplace=True)\n", "unsplash_dataset" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " 0%| | 113/25000 [00:30<1:02:36, 6.62it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error downloading toPRrcyAIUY: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 1%| | 184/25000 [00:48<1:19:49, 5.18it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error downloading t7YycgAoVSw: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 1%| | 219/25000 [00:58<1:40:16, 4.12it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error downloading LOlMe8HfofI: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 3%|▎ | 744/25000 [03:24<1:18:52, 5.12it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n", "Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n", "Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n", "Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n", "Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n", "Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n", "Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n", "Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n", "Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n", "Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 12%|█▏ | 2885/25000 [12:37<1:37:39, 3.77it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n", "Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n", "Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n", "Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n", "Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n", "Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n", "Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n", "Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n", "Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n", "Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 12%|█▏ | 2909/25000 [12:43<1:15:43, 4.86it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error downloading NcociWzk23A: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 14%|█▍ | 3505/25000 [15:13<1:46:00, 3.38it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n", "Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n", "Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n", "Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n", "Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n", "Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n", "Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n", "Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n", "Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n", "Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\": Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 29%|██▉ | 7352/25000 [31:15<1:03:57, 4.60it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error downloading T2LEdBxpm54: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 34%|███▎ | 8375/25000 [35:53<1:13:19, 3.78it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error downloading q_4pIVaXPEk: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 34%|███▍ | 8568/25000 [36:40<1:08:17, 4.01it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error downloading iGANt1N2ge8: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out. (read timeout=10)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 37%|███▋ | 9282/25000 [40:24<1:41:11, 2.59it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error downloading 2FqpN2CWCLo: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 37%|███▋ | 9343/25000 [40:45<1:22:43, 3.15it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error downloading 4T7-GLBDLKE: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 38%|███▊ | 9398/25000 [41:03<1:43:40, 2.51it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error downloading i-xtI6jD7bQ: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 40%|███▉ | 9992/25000 [44:16<1:03:56, 3.91it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error downloading 0GBafJ-ZenA: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out. (read timeout=10)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 46%|████▌ | 11508/25000 [51:29<1:16:16, 2.95it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error downloading k2RWB_aPfqI: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 47%|████▋ | 11626/25000 [52:01<36:59, 6.03it/s] " ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error downloading 7ICXVb10NJs: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 47%|████▋ | 11655/25000 [52:08<1:05:47, 3.38it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error downloading PgBTaq-AgVI: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 62%|██████▏ | 15477/25000 [1:10:12<47:47, 3.32it/s] " ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error downloading zS_b76LrEL8: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 70%|██████▉ | 17470/25000 [1:19:44<19:51, 6.32it/s] " ] } ], "source": [ "import concurrent.futures\n", "import requests\n", "from tqdm import tqdm\n", "from typing import List\n", "\n", "progress = tqdm(total=len(unsplash_dataset))\n", "\n", "def download_image(row):\n", " filename = DATA_PATH / f\"{row['photo_id']}.jpg\"\n", " for _ in range(10):\n", " try:\n", " response = requests.get(row[\"photo_image_url\"], timeout=10)\n", " with open(filename, \"wb\") as f:\n", " f.write(response.content)\n", " with progress.get_lock():\n", " progress.update(1)\n", " break\n", " except Exception as e:\n", " print(f\"Error downloading {row['photo_id']}: {e}\")\n", "\n", "with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:\n", " futures: List[concurrent.futures.Future] = []\n", " for row in unsplash_dataset.iterrows():\n", " row = row[1]\n", " future = executor.submit(download_image, row)\n", " futures.append(future)\n", "\n", " progress.display()\n", " concurrent.futures.wait(futures)\n", "progress.close()" ] } ], "metadata": { "kernelspec": { "display_name": "bipolaroid", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 2 }