Update fetch script
This commit is contained in:
parent
564134145a
commit
97e6fd8cd2
1 changed files with 74 additions and 760 deletions
|
|
@ -2,792 +2,97 @@
|
|||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>photo_id</th>\n",
|
||||
" <th>photo_url</th>\n",
|
||||
" <th>photo_image_url</th>\n",
|
||||
" <th>photo_submitted_at</th>\n",
|
||||
" <th>photo_featured</th>\n",
|
||||
" <th>photo_width</th>\n",
|
||||
" <th>photo_height</th>\n",
|
||||
" <th>photo_aspect_ratio</th>\n",
|
||||
" <th>photo_description</th>\n",
|
||||
" <th>photographer_username</th>\n",
|
||||
" <th>...</th>\n",
|
||||
" <th>photo_location_country</th>\n",
|
||||
" <th>photo_location_city</th>\n",
|
||||
" <th>stats_views</th>\n",
|
||||
" <th>stats_downloads</th>\n",
|
||||
" <th>ai_description</th>\n",
|
||||
" <th>ai_primary_landmark_name</th>\n",
|
||||
" <th>ai_primary_landmark_latitude</th>\n",
|
||||
" <th>ai_primary_landmark_longitude</th>\n",
|
||||
" <th>ai_primary_landmark_confidence</th>\n",
|
||||
" <th>blur_hash</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>3690</th>\n",
|
||||
" <td>XFmznQhx9lM</td>\n",
|
||||
" <td>https://unsplash.com/photos/XFmznQhx9lM</td>\n",
|
||||
" <td>https://images.unsplash.com/photo-156347321301...</td>\n",
|
||||
" <td>2019-07-18 18:07:14.031684</td>\n",
|
||||
" <td>t</td>\n",
|
||||
" <td>4443</td>\n",
|
||||
" <td>2962</td>\n",
|
||||
" <td>1.50</td>\n",
|
||||
" <td>Fall color in the countryside of Eastern Washi...</td>\n",
|
||||
" <td>timothyeberly</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>2978748547</td>\n",
|
||||
" <td>304950</td>\n",
|
||||
" <td>orange leaf trees</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>LBJPSa4o0hW?pI4;-.R*E459O?sk</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6103</th>\n",
|
||||
" <td>YDNvydD1jAY</td>\n",
|
||||
" <td>https://unsplash.com/photos/YDNvydD1jAY</td>\n",
|
||||
" <td>https://images.unsplash.com/photo-149034936815...</td>\n",
|
||||
" <td>2017-03-24 09:56:57.505262</td>\n",
|
||||
" <td>t</td>\n",
|
||||
" <td>4500</td>\n",
|
||||
" <td>3000</td>\n",
|
||||
" <td>1.50</td>\n",
|
||||
" <td>Flowers in spring</td>\n",
|
||||
" <td>maartendeckers</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>Belgium</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>2722857886</td>\n",
|
||||
" <td>416983</td>\n",
|
||||
" <td>pink, yellow and brown petaled flowers</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>LQJInG*JMyIm^ROpxbNFyCNGnln4</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3389</th>\n",
|
||||
" <td>4oovIxttThA</td>\n",
|
||||
" <td>https://unsplash.com/photos/4oovIxttThA</td>\n",
|
||||
" <td>https://images.unsplash.com/photo-1560850038-f...</td>\n",
|
||||
" <td>2019-06-18 09:36:35.94311</td>\n",
|
||||
" <td>t</td>\n",
|
||||
" <td>5025</td>\n",
|
||||
" <td>3141</td>\n",
|
||||
" <td>1.60</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>a8ka</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>2190084956</td>\n",
|
||||
" <td>253730</td>\n",
|
||||
" <td>aerial view of houses near ocean</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>LaCt8}~BwNIpozoLofofWBWBaef6</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>18789</th>\n",
|
||||
" <td>BkR842UVXqk</td>\n",
|
||||
" <td>https://unsplash.com/photos/BkR842UVXqk</td>\n",
|
||||
" <td>https://images.unsplash.com/photo-1558816280-d...</td>\n",
|
||||
" <td>2019-05-25 20:32:08.153319</td>\n",
|
||||
" <td>t</td>\n",
|
||||
" <td>4000</td>\n",
|
||||
" <td>6000</td>\n",
|
||||
" <td>0.67</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>olenkasergienko</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>1934025254</td>\n",
|
||||
" <td>294785</td>\n",
|
||||
" <td>pink petaled flower</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>LA71AxX50_xHt7j[S1ju0_nm^8NZ</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>21742</th>\n",
|
||||
" <td>GRLN5FC4cLg</td>\n",
|
||||
" <td>https://unsplash.com/photos/GRLN5FC4cLg</td>\n",
|
||||
" <td>https://images.unsplash.com/photo-1552300977-c...</td>\n",
|
||||
" <td>2019-03-11 10:50:25.9311</td>\n",
|
||||
" <td>t</td>\n",
|
||||
" <td>2992</td>\n",
|
||||
" <td>3992</td>\n",
|
||||
" <td>0.75</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>turner_imagery</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>1916027735</td>\n",
|
||||
" <td>306073</td>\n",
|
||||
" <td>high angle photography of cliff</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>LKCr=#~VNat7X-%M%1j?9tNbxaay</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3272</th>\n",
|
||||
" <td>5zsw1PjXg8k</td>\n",
|
||||
" <td>https://unsplash.com/photos/5zsw1PjXg8k</td>\n",
|
||||
" <td>https://images.unsplash.com/photo-142372152343...</td>\n",
|
||||
" <td>2015-02-12 06:12:09.092905</td>\n",
|
||||
" <td>f</td>\n",
|
||||
" <td>2448</td>\n",
|
||||
" <td>3264</td>\n",
|
||||
" <td>0.75</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>melissaaskew</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>14088</td>\n",
|
||||
" <td>812</td>\n",
|
||||
" <td>waterfalls in the middle of the forest</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>LG8#NK.84m4mt6f#RjkD9EM_%N-=</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3273</th>\n",
|
||||
" <td>gqa-fnYASIQ</td>\n",
|
||||
" <td>https://unsplash.com/photos/gqa-fnYASIQ</td>\n",
|
||||
" <td>https://images.unsplash.com/photo-142302719730...</td>\n",
|
||||
" <td>2015-02-04 05:19:59.869141</td>\n",
|
||||
" <td>f</td>\n",
|
||||
" <td>5086</td>\n",
|
||||
" <td>3391</td>\n",
|
||||
" <td>1.50</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>wilstewart3</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>13384</td>\n",
|
||||
" <td>858</td>\n",
|
||||
" <td>a street sign sitting on the side of a body of...</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>LvKKi,RjM{j[_NWBWBfk5EoLoeaz</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>12956</th>\n",
|
||||
" <td>Cq62qvCW8bM</td>\n",
|
||||
" <td>https://unsplash.com/photos/Cq62qvCW8bM</td>\n",
|
||||
" <td>https://images.unsplash.com/photo-142245228993...</td>\n",
|
||||
" <td>2015-01-28 13:38:18.071331</td>\n",
|
||||
" <td>f</td>\n",
|
||||
" <td>4896</td>\n",
|
||||
" <td>3264</td>\n",
|
||||
" <td>1.50</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>kseny</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>13068</td>\n",
|
||||
" <td>1054</td>\n",
|
||||
" <td>man in black shirt and blue pants sitting on b...</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>LGEW2ko~M{%N0;ofnhRkwvozt8of</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>17074</th>\n",
|
||||
" <td>Py8vZdCw35U</td>\n",
|
||||
" <td>https://unsplash.com/photos/Py8vZdCw35U</td>\n",
|
||||
" <td>https://images.unsplash.com/photo-142034363140...</td>\n",
|
||||
" <td>2015-01-04 03:54:41.031772</td>\n",
|
||||
" <td>f</td>\n",
|
||||
" <td>2320</td>\n",
|
||||
" <td>1553</td>\n",
|
||||
" <td>1.49</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>mrbrodeur</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>12617</td>\n",
|
||||
" <td>581</td>\n",
|
||||
" <td>man in black jacket standing on brown sand und...</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>LuI~ZRogaxR*0?Rjofj[Mxs.a|fP</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1394</th>\n",
|
||||
" <td>SpRN0qZPLr8</td>\n",
|
||||
" <td>https://unsplash.com/photos/SpRN0qZPLr8</td>\n",
|
||||
" <td>https://images.unsplash.com/photo-141621393610...</td>\n",
|
||||
" <td>2014-11-17 08:47:33.427134</td>\n",
|
||||
" <td>f</td>\n",
|
||||
" <td>6016</td>\n",
|
||||
" <td>4000</td>\n",
|
||||
" <td>1.50</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>tarunccet</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>8266</td>\n",
|
||||
" <td>101</td>\n",
|
||||
" <td>brown wooden house on green grass field near b...</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>LjEppYn}j]kC%jj[f6f6x8fPaxay</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>25000 rows × 31 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" photo_id photo_url \\\n",
|
||||
"3690 XFmznQhx9lM https://unsplash.com/photos/XFmznQhx9lM \n",
|
||||
"6103 YDNvydD1jAY https://unsplash.com/photos/YDNvydD1jAY \n",
|
||||
"3389 4oovIxttThA https://unsplash.com/photos/4oovIxttThA \n",
|
||||
"18789 BkR842UVXqk https://unsplash.com/photos/BkR842UVXqk \n",
|
||||
"21742 GRLN5FC4cLg https://unsplash.com/photos/GRLN5FC4cLg \n",
|
||||
"... ... ... \n",
|
||||
"3272 5zsw1PjXg8k https://unsplash.com/photos/5zsw1PjXg8k \n",
|
||||
"3273 gqa-fnYASIQ https://unsplash.com/photos/gqa-fnYASIQ \n",
|
||||
"12956 Cq62qvCW8bM https://unsplash.com/photos/Cq62qvCW8bM \n",
|
||||
"17074 Py8vZdCw35U https://unsplash.com/photos/Py8vZdCw35U \n",
|
||||
"1394 SpRN0qZPLr8 https://unsplash.com/photos/SpRN0qZPLr8 \n",
|
||||
"\n",
|
||||
" photo_image_url \\\n",
|
||||
"3690 https://images.unsplash.com/photo-156347321301... \n",
|
||||
"6103 https://images.unsplash.com/photo-149034936815... \n",
|
||||
"3389 https://images.unsplash.com/photo-1560850038-f... \n",
|
||||
"18789 https://images.unsplash.com/photo-1558816280-d... \n",
|
||||
"21742 https://images.unsplash.com/photo-1552300977-c... \n",
|
||||
"... ... \n",
|
||||
"3272 https://images.unsplash.com/photo-142372152343... \n",
|
||||
"3273 https://images.unsplash.com/photo-142302719730... \n",
|
||||
"12956 https://images.unsplash.com/photo-142245228993... \n",
|
||||
"17074 https://images.unsplash.com/photo-142034363140... \n",
|
||||
"1394 https://images.unsplash.com/photo-141621393610... \n",
|
||||
"\n",
|
||||
" photo_submitted_at photo_featured photo_width photo_height \\\n",
|
||||
"3690 2019-07-18 18:07:14.031684 t 4443 2962 \n",
|
||||
"6103 2017-03-24 09:56:57.505262 t 4500 3000 \n",
|
||||
"3389 2019-06-18 09:36:35.94311 t 5025 3141 \n",
|
||||
"18789 2019-05-25 20:32:08.153319 t 4000 6000 \n",
|
||||
"21742 2019-03-11 10:50:25.9311 t 2992 3992 \n",
|
||||
"... ... ... ... ... \n",
|
||||
"3272 2015-02-12 06:12:09.092905 f 2448 3264 \n",
|
||||
"3273 2015-02-04 05:19:59.869141 f 5086 3391 \n",
|
||||
"12956 2015-01-28 13:38:18.071331 f 4896 3264 \n",
|
||||
"17074 2015-01-04 03:54:41.031772 f 2320 1553 \n",
|
||||
"1394 2014-11-17 08:47:33.427134 f 6016 4000 \n",
|
||||
"\n",
|
||||
" photo_aspect_ratio photo_description \\\n",
|
||||
"3690 1.50 Fall color in the countryside of Eastern Washi... \n",
|
||||
"6103 1.50 Flowers in spring \n",
|
||||
"3389 1.60 NaN \n",
|
||||
"18789 0.67 NaN \n",
|
||||
"21742 0.75 NaN \n",
|
||||
"... ... ... \n",
|
||||
"3272 0.75 NaN \n",
|
||||
"3273 1.50 NaN \n",
|
||||
"12956 1.50 NaN \n",
|
||||
"17074 1.49 NaN \n",
|
||||
"1394 1.50 NaN \n",
|
||||
"\n",
|
||||
" photographer_username ... photo_location_country photo_location_city \\\n",
|
||||
"3690 timothyeberly ... NaN NaN \n",
|
||||
"6103 maartendeckers ... Belgium NaN \n",
|
||||
"3389 a8ka ... NaN NaN \n",
|
||||
"18789 olenkasergienko ... NaN NaN \n",
|
||||
"21742 turner_imagery ... NaN NaN \n",
|
||||
"... ... ... ... ... \n",
|
||||
"3272 melissaaskew ... NaN NaN \n",
|
||||
"3273 wilstewart3 ... NaN NaN \n",
|
||||
"12956 kseny ... NaN NaN \n",
|
||||
"17074 mrbrodeur ... NaN NaN \n",
|
||||
"1394 tarunccet ... NaN NaN \n",
|
||||
"\n",
|
||||
" stats_views stats_downloads \\\n",
|
||||
"3690 2978748547 304950 \n",
|
||||
"6103 2722857886 416983 \n",
|
||||
"3389 2190084956 253730 \n",
|
||||
"18789 1934025254 294785 \n",
|
||||
"21742 1916027735 306073 \n",
|
||||
"... ... ... \n",
|
||||
"3272 14088 812 \n",
|
||||
"3273 13384 858 \n",
|
||||
"12956 13068 1054 \n",
|
||||
"17074 12617 581 \n",
|
||||
"1394 8266 101 \n",
|
||||
"\n",
|
||||
" ai_description \\\n",
|
||||
"3690 orange leaf trees \n",
|
||||
"6103 pink, yellow and brown petaled flowers \n",
|
||||
"3389 aerial view of houses near ocean \n",
|
||||
"18789 pink petaled flower \n",
|
||||
"21742 high angle photography of cliff \n",
|
||||
"... ... \n",
|
||||
"3272 waterfalls in the middle of the forest \n",
|
||||
"3273 a street sign sitting on the side of a body of... \n",
|
||||
"12956 man in black shirt and blue pants sitting on b... \n",
|
||||
"17074 man in black jacket standing on brown sand und... \n",
|
||||
"1394 brown wooden house on green grass field near b... \n",
|
||||
"\n",
|
||||
" ai_primary_landmark_name ai_primary_landmark_latitude \\\n",
|
||||
"3690 NaN NaN \n",
|
||||
"6103 NaN NaN \n",
|
||||
"3389 NaN NaN \n",
|
||||
"18789 NaN NaN \n",
|
||||
"21742 NaN NaN \n",
|
||||
"... ... ... \n",
|
||||
"3272 NaN NaN \n",
|
||||
"3273 NaN NaN \n",
|
||||
"12956 NaN NaN \n",
|
||||
"17074 NaN NaN \n",
|
||||
"1394 NaN NaN \n",
|
||||
"\n",
|
||||
" ai_primary_landmark_longitude ai_primary_landmark_confidence \\\n",
|
||||
"3690 NaN NaN \n",
|
||||
"6103 NaN NaN \n",
|
||||
"3389 NaN NaN \n",
|
||||
"18789 NaN NaN \n",
|
||||
"21742 NaN NaN \n",
|
||||
"... ... ... \n",
|
||||
"3272 NaN NaN \n",
|
||||
"3273 NaN NaN \n",
|
||||
"12956 NaN NaN \n",
|
||||
"17074 NaN NaN \n",
|
||||
"1394 NaN NaN \n",
|
||||
"\n",
|
||||
" blur_hash \n",
|
||||
"3690 LBJPSa4o0hW?pI4;-.R*E459O?sk \n",
|
||||
"6103 LQJInG*JMyIm^ROpxbNFyCNGnln4 \n",
|
||||
"3389 LaCt8}~BwNIpozoLofofWBWBaef6 \n",
|
||||
"18789 LA71AxX50_xHt7j[S1ju0_nm^8NZ \n",
|
||||
"21742 LKCr=#~VNat7X-%M%1j?9tNbxaay \n",
|
||||
"... ... \n",
|
||||
"3272 LG8#NK.84m4mt6f#RjkD9EM_%N-= \n",
|
||||
"3273 LvKKi,RjM{j[_NWBWBfk5EoLoeaz \n",
|
||||
"12956 LGEW2ko~M{%N0;ofnhRkwvozt8of \n",
|
||||
"17074 LuI~ZRogaxR*0?Rjofj[Mxs.a|fP \n",
|
||||
"1394 LjEppYn}j]kC%jj[f6f6x8fPaxay \n",
|
||||
"\n",
|
||||
"[25000 rows x 31 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import polars as pl\n",
|
||||
"from pathlib import Path\n",
|
||||
"import logging\n",
|
||||
"from utils import set_up_logging, delete_corrupt_images\n",
|
||||
"\n",
|
||||
"DATA_PATH = Path('/mnt/wsl/PHYSICALDRIVE1/data/unsplash')\n",
|
||||
"DATA_PATH.mkdir(exist_ok=True, parents=True)\n",
|
||||
"set_up_logging(Path(\"../logs\"))\n",
|
||||
"\n",
|
||||
"unsplash_dataset_path = \"/home/andras/projects/bipolaroid/unsplash-research-dataset-lite-latest/photos.tsv000\"\n",
|
||||
"unsplash_dataset = pd.read_csv(unsplash_dataset_path, sep=\"\\t\")\n",
|
||||
"unsplash_dataset.sort_values(by=\"stats_views\", ascending=False, inplace=True)\n",
|
||||
"unsplash_dataset"
|
||||
"RETRY_COUNT = 10\n",
|
||||
"WORKER_COUNT = 64\n",
|
||||
"HTTP_TIMEOUT = 30\n",
|
||||
"TARGET_PATH = Path(\"/bulk2/downloaded-unsplash\")\n",
|
||||
"\n",
|
||||
"TARGET_PATH = TARGET_PATH.resolve()\n",
|
||||
"assert TARGET_PATH.exists()\n",
|
||||
"delete_corrupt_images(list(TARGET_PATH.glob(\"*\")))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 0%| | 113/25000 [00:30<1:02:36, 6.62it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Error downloading toPRrcyAIUY: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 1%| | 184/25000 [00:48<1:19:49, 5.18it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Error downloading t7YycgAoVSw: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 1%| | 219/25000 [00:58<1:40:16, 4.12it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Error downloading LOlMe8HfofI: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 3%|▎ | 744/25000 [03:24<1:18:52, 5.12it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48c37350>: Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n",
|
||||
"Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7fa9f50686e0>: Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n",
|
||||
"Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48df49e0>: Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n",
|
||||
"Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48df7b60>: Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n",
|
||||
"Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48dc81d0>: Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n",
|
||||
"Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48dcac90>: Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n",
|
||||
"Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48df5850>: Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n",
|
||||
"Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48df7cb0>: Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n",
|
||||
"Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48c351f0>: Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n",
|
||||
"Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7fa9f4f6b4a0>: Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 12%|█▏ | 2885/25000 [12:37<1:37:39, 3.77it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa481fd370>: Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n",
|
||||
"Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa483a9580>: Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n",
|
||||
"Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48380560>: Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n",
|
||||
"Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48382ba0>: Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n",
|
||||
"Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48381940>: Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n",
|
||||
"Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa483828a0>: Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n",
|
||||
"Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa483aaf60>: Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n",
|
||||
"Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa481fee10>: Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n",
|
||||
"Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7fa9f56cbfb0>: Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n",
|
||||
"Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48383920>: Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 12%|█▏ | 2909/25000 [12:43<1:15:43, 4.86it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Error downloading NcociWzk23A: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 14%|█▍ | 3505/25000 [15:13<1:46:00, 3.38it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa481af4d0>: Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n",
|
||||
"Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa481ad160>: Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n",
|
||||
"Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa481943b0>: Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n",
|
||||
"Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa4818bcb0>: Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n",
|
||||
"Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48189fa0>: Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n",
|
||||
"Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48189a00>: Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n",
|
||||
"Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48196c60>: Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n",
|
||||
"Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa481ac710>: Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n",
|
||||
"Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa481693a0>: Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n",
|
||||
"Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7fa9f56e0110>: Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 29%|██▉ | 7352/25000 [31:15<1:03:57, 4.60it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Error downloading T2LEdBxpm54: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 34%|███▎ | 8375/25000 [35:53<1:13:19, 3.78it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Error downloading q_4pIVaXPEk: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 34%|███▍ | 8568/25000 [36:40<1:08:17, 4.01it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Error downloading iGANt1N2ge8: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out. (read timeout=10)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 37%|███▋ | 9282/25000 [40:24<1:41:11, 2.59it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Error downloading 2FqpN2CWCLo: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 37%|███▋ | 9343/25000 [40:45<1:22:43, 3.15it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Error downloading 4T7-GLBDLKE: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 38%|███▊ | 9398/25000 [41:03<1:43:40, 2.51it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Error downloading i-xtI6jD7bQ: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 40%|███▉ | 9992/25000 [44:16<1:03:56, 3.91it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Error downloading 0GBafJ-ZenA: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out. (read timeout=10)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 46%|████▌ | 11508/25000 [51:29<1:16:16, 2.95it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Error downloading k2RWB_aPfqI: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 47%|████▋ | 11626/25000 [52:01<36:59, 6.03it/s] "
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Error downloading 7ICXVb10NJs: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 47%|████▋ | 11655/25000 [52:08<1:05:47, 3.38it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Error downloading PgBTaq-AgVI: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 62%|██████▏ | 15477/25000 [1:10:12<47:47, 3.32it/s] "
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Error downloading zS_b76LrEL8: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 70%|██████▉ | 17470/25000 [1:19:44<19:51, 6.32it/s] "
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"photos = (\n",
|
||||
" pl.scan_csv(\n",
|
||||
" \"../data/unsplash-full/photos.tsv000\",\n",
|
||||
" separator=\"\\t\",\n",
|
||||
" infer_schema_length=100000,\n",
|
||||
" )\n",
|
||||
" .filter(pl.col(\"photo_featured\") == \"t\")\n",
|
||||
" .sort(\"photo_id\")\n",
|
||||
" .select(\"photo_id\", \"photo_url\")\n",
|
||||
" .collect()\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"photos.limit(10)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"keys = {path.name.split(\".\")[0] for path in TARGET_PATH.glob(\"*\")}\n",
|
||||
"photos = photos.filter(~pl.col(\"photo_id\").is_in(keys))\n",
|
||||
"logging.info(f\"Found {len(photos)} missing photos\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import concurrent.futures\n",
|
||||
"import requests\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"from typing import List\n",
|
||||
"from time import sleep\n",
|
||||
"\n",
|
||||
"progress = tqdm(total=len(unsplash_dataset))\n",
|
||||
"progress = tqdm(total=len(photos))\n",
|
||||
"\n",
|
||||
"def download_image(row):\n",
|
||||
" filename = DATA_PATH / f\"{row['photo_id']}.jpg\"\n",
|
||||
" for _ in range(10):\n",
|
||||
" for retry_count in range(RETRY_COUNT):\n",
|
||||
" try:\n",
|
||||
" response = requests.get(row[\"photo_image_url\"], timeout=10)\n",
|
||||
" logging.debug(f\"Downloading {row['photo_id']} from {row['photo_url']}\")\n",
|
||||
" response = requests.get(row[\"photo_image_url\"], timeout=HTTP_TIMEOUT)\n",
|
||||
" response.raise_for_status()\n",
|
||||
" extension = response.headers[\"Content-Type\"].split(\"/\")[-1]\n",
|
||||
" filename = TARGET_PATH / f\"{row['photo_id']}.{extension}\"\n",
|
||||
" with open(filename, \"wb\") as f:\n",
|
||||
" f.write(response.content)\n",
|
||||
" logging.debug(f\"Downloaded {row['photo_id']} to {filename}\")\n",
|
||||
" with progress.get_lock():\n",
|
||||
" progress.update(1)\n",
|
||||
" break\n",
|
||||
" return\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"Error downloading {row['photo_id']}: {e}\")\n",
|
||||
" logging.error(\n",
|
||||
" f\"Error downloading {row['photo_id']} from {row['photo_url']} (retry {retry_count}): {e}\"\n",
|
||||
" )\n",
|
||||
" sleep(retry_count * 0.5)\n",
|
||||
"\n",
|
||||
"with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:\n",
|
||||
"\n",
|
||||
"with concurrent.futures.ThreadPoolExecutor(max_workers=WORKER_COUNT) as executor:\n",
|
||||
" futures: List[concurrent.futures.Future] = []\n",
|
||||
" for row in unsplash_dataset.iterrows():\n",
|
||||
" row = row[1]\n",
|
||||
" for row in photos.to_dicts():\n",
|
||||
" future = executor.submit(download_image, row)\n",
|
||||
" futures.append(future)\n",
|
||||
"\n",
|
||||
|
|
@ -795,6 +100,15 @@
|
|||
" concurrent.futures.wait(futures)\n",
|
||||
"progress.close()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"delete_corrupt_images(list(TARGET_PATH.glob(\"*\")))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue