Update fetch script

This commit is contained in:
Andras Schmelczer 2024-07-08 07:37:36 +01:00
parent 564134145a
commit 97e6fd8cd2
No known key found for this signature in database
GPG key ID: FC8F2C3D3D1A718C

View file

@ -2,792 +2,97 @@
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>photo_id</th>\n",
" <th>photo_url</th>\n",
" <th>photo_image_url</th>\n",
" <th>photo_submitted_at</th>\n",
" <th>photo_featured</th>\n",
" <th>photo_width</th>\n",
" <th>photo_height</th>\n",
" <th>photo_aspect_ratio</th>\n",
" <th>photo_description</th>\n",
" <th>photographer_username</th>\n",
" <th>...</th>\n",
" <th>photo_location_country</th>\n",
" <th>photo_location_city</th>\n",
" <th>stats_views</th>\n",
" <th>stats_downloads</th>\n",
" <th>ai_description</th>\n",
" <th>ai_primary_landmark_name</th>\n",
" <th>ai_primary_landmark_latitude</th>\n",
" <th>ai_primary_landmark_longitude</th>\n",
" <th>ai_primary_landmark_confidence</th>\n",
" <th>blur_hash</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>3690</th>\n",
" <td>XFmznQhx9lM</td>\n",
" <td>https://unsplash.com/photos/XFmznQhx9lM</td>\n",
" <td>https://images.unsplash.com/photo-156347321301...</td>\n",
" <td>2019-07-18 18:07:14.031684</td>\n",
" <td>t</td>\n",
" <td>4443</td>\n",
" <td>2962</td>\n",
" <td>1.50</td>\n",
" <td>Fall color in the countryside of Eastern Washi...</td>\n",
" <td>timothyeberly</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2978748547</td>\n",
" <td>304950</td>\n",
" <td>orange leaf trees</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>LBJPSa4o0hW?pI4;-.R*E459O?sk</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6103</th>\n",
" <td>YDNvydD1jAY</td>\n",
" <td>https://unsplash.com/photos/YDNvydD1jAY</td>\n",
" <td>https://images.unsplash.com/photo-149034936815...</td>\n",
" <td>2017-03-24 09:56:57.505262</td>\n",
" <td>t</td>\n",
" <td>4500</td>\n",
" <td>3000</td>\n",
" <td>1.50</td>\n",
" <td>Flowers in spring</td>\n",
" <td>maartendeckers</td>\n",
" <td>...</td>\n",
" <td>Belgium</td>\n",
" <td>NaN</td>\n",
" <td>2722857886</td>\n",
" <td>416983</td>\n",
" <td>pink, yellow and brown petaled flowers</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>LQJInG*JMyIm^ROpxbNFyCNGnln4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3389</th>\n",
" <td>4oovIxttThA</td>\n",
" <td>https://unsplash.com/photos/4oovIxttThA</td>\n",
" <td>https://images.unsplash.com/photo-1560850038-f...</td>\n",
" <td>2019-06-18 09:36:35.94311</td>\n",
" <td>t</td>\n",
" <td>5025</td>\n",
" <td>3141</td>\n",
" <td>1.60</td>\n",
" <td>NaN</td>\n",
" <td>a8ka</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2190084956</td>\n",
" <td>253730</td>\n",
" <td>aerial view of houses near ocean</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>LaCt8}~BwNIpozoLofofWBWBaef6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18789</th>\n",
" <td>BkR842UVXqk</td>\n",
" <td>https://unsplash.com/photos/BkR842UVXqk</td>\n",
" <td>https://images.unsplash.com/photo-1558816280-d...</td>\n",
" <td>2019-05-25 20:32:08.153319</td>\n",
" <td>t</td>\n",
" <td>4000</td>\n",
" <td>6000</td>\n",
" <td>0.67</td>\n",
" <td>NaN</td>\n",
" <td>olenkasergienko</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1934025254</td>\n",
" <td>294785</td>\n",
" <td>pink petaled flower</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>LA71AxX50_xHt7j[S1ju0_nm^8NZ</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21742</th>\n",
" <td>GRLN5FC4cLg</td>\n",
" <td>https://unsplash.com/photos/GRLN5FC4cLg</td>\n",
" <td>https://images.unsplash.com/photo-1552300977-c...</td>\n",
" <td>2019-03-11 10:50:25.9311</td>\n",
" <td>t</td>\n",
" <td>2992</td>\n",
" <td>3992</td>\n",
" <td>0.75</td>\n",
" <td>NaN</td>\n",
" <td>turner_imagery</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1916027735</td>\n",
" <td>306073</td>\n",
" <td>high angle photography of cliff</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>LKCr=#~VNat7X-%M%1j?9tNbxaay</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3272</th>\n",
" <td>5zsw1PjXg8k</td>\n",
" <td>https://unsplash.com/photos/5zsw1PjXg8k</td>\n",
" <td>https://images.unsplash.com/photo-142372152343...</td>\n",
" <td>2015-02-12 06:12:09.092905</td>\n",
" <td>f</td>\n",
" <td>2448</td>\n",
" <td>3264</td>\n",
" <td>0.75</td>\n",
" <td>NaN</td>\n",
" <td>melissaaskew</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>14088</td>\n",
" <td>812</td>\n",
" <td>waterfalls in the middle of the forest</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>LG8#NK.84m4mt6f#RjkD9EM_%N-=</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3273</th>\n",
" <td>gqa-fnYASIQ</td>\n",
" <td>https://unsplash.com/photos/gqa-fnYASIQ</td>\n",
" <td>https://images.unsplash.com/photo-142302719730...</td>\n",
" <td>2015-02-04 05:19:59.869141</td>\n",
" <td>f</td>\n",
" <td>5086</td>\n",
" <td>3391</td>\n",
" <td>1.50</td>\n",
" <td>NaN</td>\n",
" <td>wilstewart3</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>13384</td>\n",
" <td>858</td>\n",
" <td>a street sign sitting on the side of a body of...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>LvKKi,RjM{j[_NWBWBfk5EoLoeaz</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12956</th>\n",
" <td>Cq62qvCW8bM</td>\n",
" <td>https://unsplash.com/photos/Cq62qvCW8bM</td>\n",
" <td>https://images.unsplash.com/photo-142245228993...</td>\n",
" <td>2015-01-28 13:38:18.071331</td>\n",
" <td>f</td>\n",
" <td>4896</td>\n",
" <td>3264</td>\n",
" <td>1.50</td>\n",
" <td>NaN</td>\n",
" <td>kseny</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>13068</td>\n",
" <td>1054</td>\n",
" <td>man in black shirt and blue pants sitting on b...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>LGEW2ko~M{%N0;ofnhRkwvozt8of</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17074</th>\n",
" <td>Py8vZdCw35U</td>\n",
" <td>https://unsplash.com/photos/Py8vZdCw35U</td>\n",
" <td>https://images.unsplash.com/photo-142034363140...</td>\n",
" <td>2015-01-04 03:54:41.031772</td>\n",
" <td>f</td>\n",
" <td>2320</td>\n",
" <td>1553</td>\n",
" <td>1.49</td>\n",
" <td>NaN</td>\n",
" <td>mrbrodeur</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>12617</td>\n",
" <td>581</td>\n",
" <td>man in black jacket standing on brown sand und...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>LuI~ZRogaxR*0?Rjofj[Mxs.a|fP</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1394</th>\n",
" <td>SpRN0qZPLr8</td>\n",
" <td>https://unsplash.com/photos/SpRN0qZPLr8</td>\n",
" <td>https://images.unsplash.com/photo-141621393610...</td>\n",
" <td>2014-11-17 08:47:33.427134</td>\n",
" <td>f</td>\n",
" <td>6016</td>\n",
" <td>4000</td>\n",
" <td>1.50</td>\n",
" <td>NaN</td>\n",
" <td>tarunccet</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>8266</td>\n",
" <td>101</td>\n",
" <td>brown wooden house on green grass field near b...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>LjEppYn}j]kC%jj[f6f6x8fPaxay</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>25000 rows × 31 columns</p>\n",
"</div>"
],
"text/plain": [
" photo_id photo_url \\\n",
"3690 XFmznQhx9lM https://unsplash.com/photos/XFmznQhx9lM \n",
"6103 YDNvydD1jAY https://unsplash.com/photos/YDNvydD1jAY \n",
"3389 4oovIxttThA https://unsplash.com/photos/4oovIxttThA \n",
"18789 BkR842UVXqk https://unsplash.com/photos/BkR842UVXqk \n",
"21742 GRLN5FC4cLg https://unsplash.com/photos/GRLN5FC4cLg \n",
"... ... ... \n",
"3272 5zsw1PjXg8k https://unsplash.com/photos/5zsw1PjXg8k \n",
"3273 gqa-fnYASIQ https://unsplash.com/photos/gqa-fnYASIQ \n",
"12956 Cq62qvCW8bM https://unsplash.com/photos/Cq62qvCW8bM \n",
"17074 Py8vZdCw35U https://unsplash.com/photos/Py8vZdCw35U \n",
"1394 SpRN0qZPLr8 https://unsplash.com/photos/SpRN0qZPLr8 \n",
"\n",
" photo_image_url \\\n",
"3690 https://images.unsplash.com/photo-156347321301... \n",
"6103 https://images.unsplash.com/photo-149034936815... \n",
"3389 https://images.unsplash.com/photo-1560850038-f... \n",
"18789 https://images.unsplash.com/photo-1558816280-d... \n",
"21742 https://images.unsplash.com/photo-1552300977-c... \n",
"... ... \n",
"3272 https://images.unsplash.com/photo-142372152343... \n",
"3273 https://images.unsplash.com/photo-142302719730... \n",
"12956 https://images.unsplash.com/photo-142245228993... \n",
"17074 https://images.unsplash.com/photo-142034363140... \n",
"1394 https://images.unsplash.com/photo-141621393610... \n",
"\n",
" photo_submitted_at photo_featured photo_width photo_height \\\n",
"3690 2019-07-18 18:07:14.031684 t 4443 2962 \n",
"6103 2017-03-24 09:56:57.505262 t 4500 3000 \n",
"3389 2019-06-18 09:36:35.94311 t 5025 3141 \n",
"18789 2019-05-25 20:32:08.153319 t 4000 6000 \n",
"21742 2019-03-11 10:50:25.9311 t 2992 3992 \n",
"... ... ... ... ... \n",
"3272 2015-02-12 06:12:09.092905 f 2448 3264 \n",
"3273 2015-02-04 05:19:59.869141 f 5086 3391 \n",
"12956 2015-01-28 13:38:18.071331 f 4896 3264 \n",
"17074 2015-01-04 03:54:41.031772 f 2320 1553 \n",
"1394 2014-11-17 08:47:33.427134 f 6016 4000 \n",
"\n",
" photo_aspect_ratio photo_description \\\n",
"3690 1.50 Fall color in the countryside of Eastern Washi... \n",
"6103 1.50 Flowers in spring \n",
"3389 1.60 NaN \n",
"18789 0.67 NaN \n",
"21742 0.75 NaN \n",
"... ... ... \n",
"3272 0.75 NaN \n",
"3273 1.50 NaN \n",
"12956 1.50 NaN \n",
"17074 1.49 NaN \n",
"1394 1.50 NaN \n",
"\n",
" photographer_username ... photo_location_country photo_location_city \\\n",
"3690 timothyeberly ... NaN NaN \n",
"6103 maartendeckers ... Belgium NaN \n",
"3389 a8ka ... NaN NaN \n",
"18789 olenkasergienko ... NaN NaN \n",
"21742 turner_imagery ... NaN NaN \n",
"... ... ... ... ... \n",
"3272 melissaaskew ... NaN NaN \n",
"3273 wilstewart3 ... NaN NaN \n",
"12956 kseny ... NaN NaN \n",
"17074 mrbrodeur ... NaN NaN \n",
"1394 tarunccet ... NaN NaN \n",
"\n",
" stats_views stats_downloads \\\n",
"3690 2978748547 304950 \n",
"6103 2722857886 416983 \n",
"3389 2190084956 253730 \n",
"18789 1934025254 294785 \n",
"21742 1916027735 306073 \n",
"... ... ... \n",
"3272 14088 812 \n",
"3273 13384 858 \n",
"12956 13068 1054 \n",
"17074 12617 581 \n",
"1394 8266 101 \n",
"\n",
" ai_description \\\n",
"3690 orange leaf trees \n",
"6103 pink, yellow and brown petaled flowers \n",
"3389 aerial view of houses near ocean \n",
"18789 pink petaled flower \n",
"21742 high angle photography of cliff \n",
"... ... \n",
"3272 waterfalls in the middle of the forest \n",
"3273 a street sign sitting on the side of a body of... \n",
"12956 man in black shirt and blue pants sitting on b... \n",
"17074 man in black jacket standing on brown sand und... \n",
"1394 brown wooden house on green grass field near b... \n",
"\n",
" ai_primary_landmark_name ai_primary_landmark_latitude \\\n",
"3690 NaN NaN \n",
"6103 NaN NaN \n",
"3389 NaN NaN \n",
"18789 NaN NaN \n",
"21742 NaN NaN \n",
"... ... ... \n",
"3272 NaN NaN \n",
"3273 NaN NaN \n",
"12956 NaN NaN \n",
"17074 NaN NaN \n",
"1394 NaN NaN \n",
"\n",
" ai_primary_landmark_longitude ai_primary_landmark_confidence \\\n",
"3690 NaN NaN \n",
"6103 NaN NaN \n",
"3389 NaN NaN \n",
"18789 NaN NaN \n",
"21742 NaN NaN \n",
"... ... ... \n",
"3272 NaN NaN \n",
"3273 NaN NaN \n",
"12956 NaN NaN \n",
"17074 NaN NaN \n",
"1394 NaN NaN \n",
"\n",
" blur_hash \n",
"3690 LBJPSa4o0hW?pI4;-.R*E459O?sk \n",
"6103 LQJInG*JMyIm^ROpxbNFyCNGnln4 \n",
"3389 LaCt8}~BwNIpozoLofofWBWBaef6 \n",
"18789 LA71AxX50_xHt7j[S1ju0_nm^8NZ \n",
"21742 LKCr=#~VNat7X-%M%1j?9tNbxaay \n",
"... ... \n",
"3272 LG8#NK.84m4mt6f#RjkD9EM_%N-= \n",
"3273 LvKKi,RjM{j[_NWBWBfk5EoLoeaz \n",
"12956 LGEW2ko~M{%N0;ofnhRkwvozt8of \n",
"17074 LuI~ZRogaxR*0?Rjofj[Mxs.a|fP \n",
"1394 LjEppYn}j]kC%jj[f6f6x8fPaxay \n",
"\n",
"[25000 rows x 31 columns]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"import pandas as pd\n",
"import polars as pl\n",
"from pathlib import Path\n",
"import logging\n",
"from utils import set_up_logging, delete_corrupt_images\n",
"\n",
"DATA_PATH = Path('/mnt/wsl/PHYSICALDRIVE1/data/unsplash')\n",
"DATA_PATH.mkdir(exist_ok=True, parents=True)\n",
"set_up_logging(Path(\"../logs\"))\n",
"\n",
"unsplash_dataset_path = \"/home/andras/projects/bipolaroid/unsplash-research-dataset-lite-latest/photos.tsv000\"\n",
"unsplash_dataset = pd.read_csv(unsplash_dataset_path, sep=\"\\t\")\n",
"unsplash_dataset.sort_values(by=\"stats_views\", ascending=False, inplace=True)\n",
"unsplash_dataset"
"RETRY_COUNT = 10\n",
"WORKER_COUNT = 64\n",
"HTTP_TIMEOUT = 30\n",
"TARGET_PATH = Path(\"/bulk2/downloaded-unsplash\")\n",
"\n",
"TARGET_PATH = TARGET_PATH.resolve()\n",
"assert TARGET_PATH.exists()\n",
"delete_corrupt_images(list(TARGET_PATH.glob(\"*\")))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 113/25000 [00:30<1:02:36, 6.62it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading toPRrcyAIUY: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 1%| | 184/25000 [00:48<1:19:49, 5.18it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading t7YycgAoVSw: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 1%| | 219/25000 [00:58<1:40:16, 4.12it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading LOlMe8HfofI: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 3%|▎ | 744/25000 [03:24<1:18:52, 5.12it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48c37350>: Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7fa9f50686e0>: Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48df49e0>: Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48df7b60>: Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48dc81d0>: Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48dcac90>: Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48df5850>: Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48df7cb0>: Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48c351f0>: Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7fa9f4f6b4a0>: Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 12%|█▏ | 2885/25000 [12:37<1:37:39, 3.77it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa481fd370>: Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n",
"Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa483a9580>: Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n",
"Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48380560>: Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n",
"Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48382ba0>: Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n",
"Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48381940>: Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n",
"Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa483828a0>: Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n",
"Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa483aaf60>: Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n",
"Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa481fee10>: Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n",
"Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7fa9f56cbfb0>: Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n",
"Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48383920>: Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 12%|█▏ | 2909/25000 [12:43<1:15:43, 4.86it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading NcociWzk23A: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 14%|█▍ | 3505/25000 [15:13<1:46:00, 3.38it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa481af4d0>: Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa481ad160>: Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa481943b0>: Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa4818bcb0>: Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48189fa0>: Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48189a00>: Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48196c60>: Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa481ac710>: Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa481693a0>: Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7fa9f56e0110>: Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 29%|██▉ | 7352/25000 [31:15<1:03:57, 4.60it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading T2LEdBxpm54: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 34%|███▎ | 8375/25000 [35:53<1:13:19, 3.78it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading q_4pIVaXPEk: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 34%|███▍ | 8568/25000 [36:40<1:08:17, 4.01it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading iGANt1N2ge8: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out. (read timeout=10)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 37%|███▋ | 9282/25000 [40:24<1:41:11, 2.59it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading 2FqpN2CWCLo: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 37%|███▋ | 9343/25000 [40:45<1:22:43, 3.15it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading 4T7-GLBDLKE: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 38%|███▊ | 9398/25000 [41:03<1:43:40, 2.51it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading i-xtI6jD7bQ: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 40%|███▉ | 9992/25000 [44:16<1:03:56, 3.91it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading 0GBafJ-ZenA: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out. (read timeout=10)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 46%|████▌ | 11508/25000 [51:29<1:16:16, 2.95it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading k2RWB_aPfqI: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 47%|████▋ | 11626/25000 [52:01<36:59, 6.03it/s] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading 7ICXVb10NJs: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 47%|████▋ | 11655/25000 [52:08<1:05:47, 3.38it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading PgBTaq-AgVI: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 62%|██████▏ | 15477/25000 [1:10:12<47:47, 3.32it/s] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading zS_b76LrEL8: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 70%|██████▉ | 17470/25000 [1:19:44<19:51, 6.32it/s] "
]
}
],
"outputs": [],
"source": [
"photos = (\n",
" pl.scan_csv(\n",
" \"../data/unsplash-full/photos.tsv000\",\n",
" separator=\"\\t\",\n",
" infer_schema_length=100000,\n",
" )\n",
" .filter(pl.col(\"photo_featured\") == \"t\")\n",
" .sort(\"photo_id\")\n",
" .select(\"photo_id\", \"photo_url\")\n",
" .collect()\n",
")\n",
"\n",
"photos.limit(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"keys = {path.name.split(\".\")[0] for path in TARGET_PATH.glob(\"*\")}\n",
"photos = photos.filter(~pl.col(\"photo_id\").is_in(keys))\n",
"logging.info(f\"Found {len(photos)} missing photos\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import concurrent.futures\n",
"import requests\n",
"from tqdm import tqdm\n",
"from typing import List\n",
"from time import sleep\n",
"\n",
"progress = tqdm(total=len(unsplash_dataset))\n",
"progress = tqdm(total=len(photos))\n",
"\n",
"def download_image(row):\n",
" filename = DATA_PATH / f\"{row['photo_id']}.jpg\"\n",
" for _ in range(10):\n",
" for retry_count in range(RETRY_COUNT):\n",
" try:\n",
" response = requests.get(row[\"photo_image_url\"], timeout=10)\n",
" logging.debug(f\"Downloading {row['photo_id']} from {row['photo_url']}\")\n",
" response = requests.get(row[\"photo_image_url\"], timeout=HTTP_TIMEOUT)\n",
" response.raise_for_status()\n",
" extension = response.headers[\"Content-Type\"].split(\"/\")[-1]\n",
" filename = TARGET_PATH / f\"{row['photo_id']}.{extension}\"\n",
" with open(filename, \"wb\") as f:\n",
" f.write(response.content)\n",
" logging.debug(f\"Downloaded {row['photo_id']} to {filename}\")\n",
" with progress.get_lock():\n",
" progress.update(1)\n",
" break\n",
" return\n",
" except Exception as e:\n",
" print(f\"Error downloading {row['photo_id']}: {e}\")\n",
" logging.error(\n",
" f\"Error downloading {row['photo_id']} from {row['photo_url']} (retry {retry_count}): {e}\"\n",
" )\n",
" sleep(retry_count * 0.5)\n",
"\n",
"with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:\n",
"\n",
"with concurrent.futures.ThreadPoolExecutor(max_workers=WORKER_COUNT) as executor:\n",
" futures: List[concurrent.futures.Future] = []\n",
" for row in unsplash_dataset.iterrows():\n",
" row = row[1]\n",
" for row in photos.to_dicts():\n",
" future = executor.submit(download_image, row)\n",
" futures.append(future)\n",
"\n",
@ -795,6 +100,15 @@
" concurrent.futures.wait(futures)\n",
"progress.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"delete_corrupt_images(list(TARGET_PATH.glob(\"*\")))"
]
}
],
"metadata": {