move files

This commit is contained in:
Andras Schmelczer 2024-05-09 21:22:28 +01:00
parent 1a41fd6829
commit 231e22cac8
No known key found for this signature in database
GPG key ID: FC8F2C3D3D1A718C
36 changed files with 15580 additions and 79653 deletions

66
src/colour_lut.ipynb Normal file

File diff suppressed because one or more lines are too long

9
src/config.py Normal file
View file

@ -0,0 +1,9 @@
from pathlib import Path
# Every JPEG found directly inside the Unsplash folder, in sorted order.
DATA = sorted(Path("/mnt/wsl/PHYSICALDRIVE1/data/unsplash").glob("*.jpg"))

# Cache directory (absolute) and models directory (relative to the working directory).
CACHE_PATH = Path("/mnt/wsl/PHYSICALDRIVE1/data/cache2")
MODELS_PATH = Path("models")

# Importing this module creates both directories, parents included, if missing.
for _directory in (CACHE_PATH, MODELS_PATH):
    _directory.mkdir(parents=True, exist_ok=True)

59
src/create_edits.ipynb Normal file

File diff suppressed because one or more lines are too long

0
src/editor/__init__.py Normal file
View file

View file

@ -0,0 +1,3 @@
from .regrain import regrain
from .pdf_transfer_1d import pdf_transfer_1d
from .pdf_transfer_3d import pdf_transfer_3d

View file

@ -0,0 +1,13 @@
import numpy as np
def pdf_transfer_1d(pX: np.ndarray, pY: np.ndarray) -> np.ndarray:
    """Compute the 1-D mapping that carries the distribution ``pX`` onto ``pY``.

    Both histograms are accumulated into cumulative distributions normalised
    to end at 1. For every bin of ``pX`` the result holds the (fractional) bin
    index at which the cumulative distribution of ``pY`` reaches the same
    value.

    Args:
        pX: histogram of the data to be remapped.
        pY: histogram of the reference data.

    Returns:
        Array with ``len(pX)`` entries of interpolated bin indices.
    """
    # A tiny offset per bin keeps both cumulative sums strictly increasing.
    tiny = np.finfo(float).eps

    source_cdf = np.cumsum(pX + tiny)
    source_cdf = source_cdf / source_cdf[-1]

    reference_cdf = np.cumsum(pY + tiny)
    reference_cdf = reference_cdf / reference_cdf[-1]

    bin_indices = np.arange(len(pX))
    return np.interp(source_cdf, reference_cdf, bin_indices)

View file

@ -0,0 +1,46 @@
import numpy as np
from editor.utils import generate_rotation_matrices
from editor.histogram_transfer import pdf_transfer_1d
from editor.histogram_transfer import regrain
EPSILON = 1e-6
def pdf_transfer_3d(
    source: np.ndarray,
    target_flattened: np.ndarray,
    relaxation: float = 1,
    bin_count: int = 1000,
    iterations: int = 25,
    smoothness: float = 1,
):
    """Move the colour distribution of ``source`` towards that of a target.

    For each rotation returned by ``generate_rotation_matrices(iterations)``
    both pixel sets are rotated, every rotated axis of the source is matched
    to the same axis of the target with ``pdf_transfer_1d``, and the change is
    rotated back and added to the source scaled by ``relaxation``. The outcome
    is clipped to [0, 255], cast to ``uint8`` and handed to ``regrain``
    together with the untouched ``source``.

    Args:
        source: image array of shape (height, width, channels).
        target_flattened: target pixels laid out as (channels, pixel_count);
            it is multiplied by the same rotation matrices as the flattened
            source.
        relaxation: factor applied to each iteration's correction.
        bin_count: number of histogram bin edges per rotated axis.
        iterations: number of rotations requested.
        smoothness: forwarded to ``regrain``.

    Returns:
        The value returned by ``regrain(source, transferred, smoothness=...)``.
    """
    height, width, channels = source.shape

    # Work in float64 from the start: with an integer image the buffer that
    # receives the matched values would otherwise inherit the integer dtype
    # and truncate (or wrap negative) rotated values on the first iteration.
    source_flattened = source.reshape(-1, channels).transpose().astype(np.float64)

    for rotation in generate_rotation_matrices(iterations):
        source_rotated = rotation @ source_flattened
        target_rotated = rotation @ target_flattened
        matched = np.zeros_like(source_rotated)

        for axis in range(rotation.shape[0]):
            source_axis = source_rotated[axis, :]
            target_axis = target_rotated[axis, :]

            # Shared histogram range, widened slightly so no sample sits on an edge.
            datamin = min(np.min(source_axis), np.min(target_axis)) - EPSILON
            datamax = max(np.max(source_axis), np.max(target_axis)) + EPSILON
            bin_edges = np.linspace(datamin, datamax, bin_count)

            source_pdf, _ = np.histogram(source_axis, bins=bin_edges, density=True)
            target_pdf, _ = np.histogram(target_axis, bins=bin_edges, density=True)
            transfer = pdf_transfer_1d(source_pdf, target_pdf)

            # ``transfer`` is expressed in bin indices; look it up at the left
            # bin edges and scale the index back into data units.
            matched[axis, :] = (
                np.interp(source_axis, bin_edges[:-1], transfer)
                * (datamax - datamin)
                / (bin_count - 1)
                + datamin
            )

        source_flattened = source_flattened + relaxation * (
            rotation.T @ (matched - source_rotated)
        )

    source_flattened.clip(0, 255, out=source_flattened)
    result = (
        source_flattened.astype(np.uint8).transpose().reshape(height, width, channels)
    )
    return regrain(source, result, smoothness=smoothness)

View file

@ -0,0 +1,87 @@
from scipy.ndimage import zoom
import numpy as np
NBITS = [4, 16, 32, 64, 64, 64]
def regrain(img_arr_in, img_arr_col, smoothness: float = 1):
    """Keep the gradients of ``img_arr_in`` and the colours of ``img_arr_col``.

    Both inputs are divided by 255, combined by the recursive solver starting
    at level 0 with the ``NBITS`` iteration schedule, and the result is
    limited to [0, 1] before being scaled back to ``uint8``.
    """
    gradient_image = img_arr_in / 255.0
    colour_image = img_arr_col / 255.0

    # The solver starts from a copy of the gradient image.
    initial_guess = np.array(gradient_image)
    solved = _regrain_rec(
        initial_guess, gradient_image, colour_image, NBITS, 0, smoothness
    )

    solved = np.clip(solved, 0, 1)
    return (255.0 * solved).astype("uint8")
def _regrain_rec(img_arr_out, img_arr_in, img_arr_col, nbits, level, smoothness):
    """Solve one resolution level, recursing into a half-size level first.

    While more than one entry remains in ``nbits`` and the half-size image is
    larger than 20 pixels in both directions, the three images are
    downsampled, solved at ``level + 1`` with ``nbits[1:]``, and the upsampled
    result replaces ``img_arr_out``. The level is then refined by ``_solve``
    with ``nbits[0]`` iterations.
    """
    height, width, _ = img_arr_in.shape
    half_height = (height + 1) // 2
    half_width = (width + 1) // 2

    coarser_level_exists = len(nbits) > 1 and half_height > 20 and half_width > 20
    if coarser_level_exists:
        small_in = _resize_image(img_arr_in, half_width, half_height)
        small_col = _resize_image(img_arr_col, half_width, half_height)
        small_out = _resize_image(img_arr_out, half_width, half_height)
        small_out = _regrain_rec(
            small_out, small_in, small_col, nbits[1:], level + 1, smoothness
        )
        img_arr_out = _resize_image(small_out, width, height)

    return _solve(img_arr_out, img_arr_in, img_arr_col, nbits[0], level, smoothness)
def _solve(img_arr_out, img_arr_in, img_arr_col, nbit, level, smoothness):
    """Run ``nbit`` relaxation sweeps on ``img_arr_out``.

    Each sweep replaces every pixel by a weighted average of the colour image
    and of its four neighbours in the current estimate, each neighbour being
    offset by the corresponding difference in ``img_arr_in``. The new value is
    blended with the previous estimate using ``rho``.
    """
    _, _, channels = img_arr_in.shape

    # Neighbour lookups that repeat the border row/column instead of wrapping.
    def prev_row(arr):
        return np.concatenate((arr[:1, :], arr[:-1, :]), axis=0)

    def prev_col(arr):
        return np.concatenate((arr[:, :1], arr[:, :-1]), axis=1)

    def next_row(arr):
        return np.concatenate((arr[1:, :], arr[-1:, :]), axis=0)

    def next_col(arr):
        return np.concatenate((arr[:, 1:], arr[:, -1:]), axis=1)

    in_next_col = next_col(img_arr_in)
    in_next_row = next_row(img_arr_in)
    in_prev_col = prev_col(img_arr_in)
    in_prev_row = prev_row(img_arr_in)

    # Gradient magnitude of the input, summed over channels.
    grad_x = in_next_col - in_prev_col
    grad_y = in_next_row - in_prev_row
    grad = np.sqrt((grad_x**2 + grad_y**2).sum(axis=2, keepdims=True))

    # Weight of the colour image, capped at 1.
    colour_weight = np.minimum(256 * grad / 5, 1)

    # Neighbour weight, averaged between a pixel and each neighbour.
    neighbour_weight = 30 * 2 ** (-level) / (1 + 10 * grad / smoothness)
    weight_next_col = (next_col(neighbour_weight) + neighbour_weight) / 2
    weight_next_row = (next_row(neighbour_weight) + neighbour_weight) / 2
    weight_prev_col = (prev_col(neighbour_weight) + neighbour_weight) / 2
    weight_prev_row = (prev_row(neighbour_weight) + neighbour_weight) / 2

    blend = 1 / 5.0

    # None of the weights change between sweeps, so expand them once.
    per_channel = [1, 1, channels]
    total_weight = (
        colour_weight
        + weight_next_col
        + weight_next_row
        + weight_prev_col
        + weight_prev_row
    )
    denominator = np.tile(total_weight + 1e-6, per_channel)
    colour_term = np.tile(colour_weight, per_channel) * img_arr_col
    weight_next_col = np.tile(weight_next_col, per_channel)
    weight_next_row = np.tile(weight_next_row, per_channel)
    weight_prev_col = np.tile(weight_prev_col, per_channel)
    weight_prev_row = np.tile(weight_prev_row, per_channel)

    for _ in range(nbit):
        numerator = (
            colour_term
            + weight_next_col * (next_col(img_arr_out) - in_next_col + img_arr_in)
            + weight_next_row * (next_row(img_arr_out) - in_next_row + img_arr_in)
            + weight_prev_col * (prev_col(img_arr_out) - in_prev_col + img_arr_in)
            + weight_prev_row * (prev_row(img_arr_out) - in_prev_row + img_arr_in)
        )
        img_arr_out = numerator / denominator * (1 - blend) + blend * img_arr_out

    return img_arr_out
def _resize_image(data, target_width, target_height):
    """Resample ``data`` to ``target_height`` rows and ``target_width`` columns.

    The first two axes are scaled with ``scipy.ndimage.zoom``; the third axis
    keeps a zoom factor of 1.
    """
    source_height, source_width = data.shape[0], data.shape[1]
    zoom_factors = (target_height / source_height, target_width / source_width, 1)
    return zoom(data, zoom_factors)

View file

View file

@ -0,0 +1,3 @@
from .add_noise import add_noise
from .change_temperature import change_temperature
from .add_random_colour_spill import add_random_colour_spill

View file

@ -0,0 +1,11 @@
import numpy as np
from PIL import Image
def add_noise(img: Image, alpha: float) -> Image:
    """Blend uniformly random RGB noise into ``img``.

    The image is converted to RGB, a noise image of the same size is drawn
    from NumPy's global random generator (every channel value in 0..255), and
    the two are mixed by ``Image.blend`` with factor ``alpha``.
    """
    rgb_image = img.convert("RGB")
    columns, rows = rgb_image.size

    noise_pixels = np.random.randint(0, 256, (rows, columns, 3), dtype=np.uint8)
    noise_image = Image.fromarray(noise_pixels, mode="RGB")

    return Image.blend(rgb_image, noise_image, alpha)

View file

@ -0,0 +1,20 @@
from PIL import Image
from ..utils import random
def add_random_colour_spill(image: Image, range: float) -> Image:
    """Scale the three colour channels by independent random gains.

    Each gain is drawn with ``random(1 / range, range)`` and placed on the
    diagonal of a 3x4 colour matrix; all other entries are zero. The matrix is
    applied through ``image.convert("RGB", matrix)``.
    """
    lowest_gain, highest_gain = 1 / range, range

    # Drawn in channel order so the sequence of random draws is unchanged.
    first_gain = random(lowest_gain, highest_gain)
    second_gain = random(lowest_gain, highest_gain)
    third_gain = random(lowest_gain, highest_gain)

    colour_matrix = (
        first_gain, 0.0, 0.0, 0.0,
        0.0, second_gain, 0.0, 0.0,
        0.0, 0.0, third_gain, 0.0,
    )
    return image.convert("RGB", colour_matrix)

View file

@ -0,0 +1,42 @@
from PIL import Image
# RGB triple for each colour temperature, keyed by Kelvin in steps of 500
# from 1000 to 10000.
kelvin_table = {
    1000: (255, 56, 0),
    1500: (255, 109, 0),
    2000: (255, 137, 18),
    2500: (255, 161, 72),
    3000: (255, 180, 107),
    3500: (255, 196, 137),
    4000: (255, 209, 163),
    4500: (255, 219, 186),
    5000: (255, 228, 206),
    5500: (255, 236, 224),
    6000: (255, 243, 239),
    6500: (255, 249, 253),
    7000: (245, 243, 255),
    7500: (235, 238, 255),
    8000: (227, 233, 255),
    8500: (220, 229, 255),
    9000: (214, 225, 255),
    9500: (208, 222, 255),
    10000: (204, 219, 255),
}


def change_temperature(image: Image, temperature: float) -> Image:
    """Tint ``image`` with the table entry for ``temperature``.

    The RGB triple stored in ``kelvin_table`` is divided by 255 and used as
    the diagonal of a 3x4 colour matrix passed to ``image.convert``.

    Raises:
        KeyError: if ``temperature`` is not one of the keys of ``kelvin_table``.
    """
    red, green, blue = kelvin_table[temperature]
    tint_matrix = (
        red / 255.0, 0.0, 0.0, 0.0,
        0.0, green / 255.0, 0.0, 0.0,
        0.0, 0.0, blue / 255.0, 0.0,
    )
    return image.convert("RGB", tint_matrix)

View file

@ -0,0 +1,3 @@
from .histogram_dataset import HistogramDataset
from .random_edit import random_edit
from .progressive_pooling_loss import ProgressivePoolingLoss

View file

@ -0,0 +1,89 @@
from torch.utils.data import Dataset
from typing import List, Optional, Tuple
from editor.utils import compute_histogram
from .random_edit import random_edit
from PIL import Image
from tqdm import tqdm
import torch
from pathlib import Path
import PIL.Image
# Remove Pillow's pixel-count limit so very large photos can still be opened.
PIL.Image.MAX_IMAGE_PIXELS = None
class HistogramDataset(Dataset):
    """Pairs of colour histograms: a randomly edited image and its original.

    Every source image contributes ``edit_count`` samples. Sample ``idx`` uses
    image ``idx // edit_count`` and is edited with ``random_edit(seed=idx)``.
    When ``cache_path`` is given, each sample is stored there as ``{idx}.pt``
    and loaded from that file on later accesses.
    """

    def __init__(
        self,
        paths: List[Path],
        edit_count: int = 5,
        bin_count: int = 32,
        target_size=(480, 480),
        delete_corrupt_images: bool = False,
        cache_path: Optional[Path] = None,
    ):
        """Store the configuration and optionally purge unreadable images.

        Args:
            paths: image files; they are kept in sorted order.
            edit_count: number of edited samples generated per image.
            bin_count: number of histogram bins per colour channel.
            target_size: bounding box the images are shrunk into.
            delete_corrupt_images: when true, files Pillow cannot open are
                deleted from disk and left out of the dataset.
            cache_path: directory for cached samples, or ``None`` for no cache.
        """
        self._paths = sorted(paths)
        self._edit_count = edit_count
        self._bin_count = bin_count
        self._target_size = target_size
        self._cache_path = cache_path
        if delete_corrupt_images:
            self._delete_corrupt_images()

    def _delete_corrupt_images(self) -> None:
        """Delete every file Pillow cannot open and drop it from the dataset."""
        readable_paths = []
        deleted_count = 0
        for path in tqdm(self._paths):
            try:
                # The context manager closes the file handle of the probe.
                with Image.open(path):
                    pass
            except Exception:
                # Deliberately not a bare ``except``: interrupting the scan
                # with Ctrl-C must not delete the image being checked.
                print(f"Failed to open {path}, deleting...")
                deleted_count += 1
                path.unlink(missing_ok=True)
            else:
                readable_paths.append(path)
        # Keep indexing consistent with what is left on disk.
        self._paths = readable_paths
        print(f"Deleted {deleted_count} corrupt images")

    def __len__(self):
        """Number of samples: images times edits per image."""
        return len(self._paths) * self._edit_count

    def get_original_image(self, original_idx: int) -> Image.Image:
        """Open image ``original_idx``, shrunk to fit ``target_size``, as RGB."""
        original_path = self._paths[original_idx]
        original = Image.open(original_path)
        original.thumbnail(
            self._target_size, Image.Resampling.LANCZOS
        )  # size will be at most target_size, the aspect ratio is preserved
        # The histogram code reshapes pixels to (-1, 3); greyscale or CMYK
        # files would otherwise produce a wrong histogram or fail.
        return original.convert("RGB")

    def get_edited_image(self, original_idx: int, edit_idx: int) -> Image.Image:
        """Return image ``original_idx`` edited with ``edit_idx`` as the seed."""
        original_image = self.get_original_image(original_idx)
        return random_edit(original_image, seed=edit_idx)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return ``(edited_histogram, original_histogram)`` for sample ``idx``.

        Both tensors are float histograms with a leading dimension of size 1.
        A cached copy is returned when it exists and can be loaded; otherwise
        the sample is computed and, if caching is enabled, written back.
        """
        cached_data_path = None
        if self._cache_path is not None:
            cached_data_path = self._cache_path / f"{idx}.pt"
            if cached_data_path.exists():
                try:
                    return torch.load(cached_data_path)
                except Exception:
                    print(f"Failed to load {cached_data_path}, regenerating...")

        original_idx = idx // self._edit_count
        original = self.get_original_image(original_idx)
        edited = random_edit(original, seed=idx)

        edited_histogram = compute_histogram(
            edited, bins=self._bin_count, normalize=True
        )
        original_histogram = compute_histogram(
            original, bins=self._bin_count, normalize=True
        )
        result = (
            torch.tensor(edited_histogram, dtype=torch.float).unsqueeze(0),
            torch.tensor(original_histogram, dtype=torch.float).unsqueeze(0),
        )

        if cached_data_path is not None:
            torch.save(result, cached_data_path)
        return result

View file

@ -0,0 +1,38 @@
from typing import List
import torch
import torch.nn as nn
import torch.nn.functional as F
class ProgressivePoolingLoss(nn.Module):
    """Mean absolute difference of two cubic volumes at several resolutions.

    For every entry of ``target_sizes`` both inputs are sum-pooled with a
    window of ``depth // target_size`` and the mean absolute difference of the
    pooled volumes is added to the loss. The first term has weight 1 and each
    following weight is the previous one multiplied by ``damping``.
    """

    def __init__(self, target_sizes: List[int], damping: float):
        super().__init__()
        self._target_sizes = target_sizes
        self._damping = damping

    def forward(self, tensor_a, tensor_b):
        """Return the weighted multi-resolution loss of two 5-D cube tensors."""
        assert (
            tensor_a.shape == tensor_b.shape
        ), f"Input tensors must have the same size, got {tensor_a.size()} and {tensor_b.size()}"
        assert (
            tensor_a.dim() == 5
        ), f"Input tensors must have 5 dimensions, got {tensor_a.size()}"

        depth, height, width = tensor_a.shape[2:]
        assert depth == height == width, "Input tensors must be cubes."

        total = 0.0
        term_weight = 1
        for target_size in self._target_sizes:
            window = depth // target_size
            # Average pooling times the window volume is a sum over the window.
            cells_per_window = window**3
            summed_a = F.avg_pool3d(tensor_a, window) * cells_per_window
            summed_b = F.avg_pool3d(tensor_b, window) * cells_per_window
            total = total + (summed_a - summed_b).abs().mean() * term_weight
            term_weight = term_weight * self._damping
        return total

View file

@ -0,0 +1,19 @@
from PIL import Image, ImageEnhance
from ..utils import random, get_colour_lut, apply_pixel_shader
from ..operations import add_noise, add_random_colour_spill
import numpy as np
def random_edit(img: Image, seed: int = 42) -> Image:
    """Apply a seeded chain of random edits to ``img`` and return an RGB image.

    NumPy's global random generator is seeded with ``seed``. The image then
    receives noise, a random contrast change and a random colour spill, and
    its saturation and value channels are remapped in HSV space through two
    random lookup tables (linear and cubic respectively).
    """
    np.random.seed(seed)

    noisy = add_noise(img, random(0, 0.2))
    contrast = ImageEnhance.Contrast(noisy)
    contrasted = contrast.enhance(random(0.5, 2))
    spilled = add_random_colour_spill(contrasted, 1.3)

    hsv_image = spilled.convert("HSV")
    saturation_lut = get_colour_lut(variance=0.3, count=5, type="linear")
    brightness_lut = get_colour_lut(variance=0.3, count=5, type="cubic")

    def remap(h, s, v):
        # Hue is left alone; saturation and value go through their tables.
        return (h, saturation_lut[s], brightness_lut[v])

    remapped = apply_pixel_shader(hsv_image, remap)
    return remapped.convert("RGB")

View file

@ -0,0 +1,7 @@
from .interpolate import interpolate
from .random import random
from .apply_pixel_shader import apply_pixel_shader
from .get_colour_lut import get_colour_lut
from .compute_histogram import compute_histogram
from .kldiv import kldiv
from .generate_rotation_matrices import generate_rotation_matrices

View file

@ -0,0 +1,14 @@
from typing import Callable, Tuple
from PIL import Image
def apply_pixel_shader(
    img: Image, callback: Callable[[int, int, int], Tuple[int, int, int]]
):
    """Replace every pixel of ``img`` in place with ``callback`` of its channels.

    Pixels are visited column by column. Each pixel must unpack into exactly
    three values, which are passed to ``callback`` as separate arguments.

    Returns:
        The same ``img`` object, after modification.
    """
    columns, rows = img.size
    pixel_access = img.load()
    for column in range(columns):
        for row in range(rows):
            position = (column, row)
            first, second, third = pixel_access[position]
            pixel_access[position] = callback(first, second, third)
    return img

View file

@ -0,0 +1,22 @@
from PIL import Image
import numpy as np
def compute_histogram(
    image: Image.Image | np.ndarray,
    bins: int,
    value_range=(0, 256),
    normalize: bool = True,
) -> np.ndarray:
    """Compute the joint 3-D histogram of an image's three channels.

    Args:
        image: a Pillow image (converted with ``np.array``) or an array whose
            data can be reshaped to rows of three values.
        bins: number of bins along each of the three axes.
        value_range: ``(low, high)`` range applied to all three axes.
        normalize: divide the counts by their sum when true.

    Returns:
        A ``float32`` array with ``bins`` entries along each of its three axes.
    """
    pixels = image
    if isinstance(image, Image.Image):
        pixels = np.array(image)

    counts, _edges = np.histogramdd(
        pixels.reshape(-1, 3), bins=bins, range=[value_range] * 3
    )
    counts = counts.astype(np.float32)

    if not normalize:
        return counts
    return counts / np.sum(counts)

View file

@ -0,0 +1,66 @@
from random import shuffle
from typing import List, Tuple
import numpy as np
from functools import lru_cache
from numpy.typing import NDArray
@lru_cache
def generate_rotation_matrices(count: int) -> List[NDArray[np.float64]]:
    """Build ``count`` validated 3x3 rotation matrices.

    Rotation axes come from ``fibonacci_sphere`` and are shuffled; the angles
    are ``count`` evenly spaced values in [0, 2*pi). Results are memoised per
    ``count``, so repeated calls return the same list object.

    Raises:
        ValueError: if a generated matrix fails ``_check_rotation_matrix``.
    """
    rotation_axes = fibonacci_sphere(count)
    shuffle(rotation_axes)
    rotation_angles = np.linspace(0, 2 * np.pi, count, endpoint=False)

    rotations = []
    for axis, angle in zip(rotation_axes, rotation_angles):
        rotation = _rotation_matrix(axis, angle)
        _check_rotation_matrix(rotation)
        rotations.append(rotation)
    return rotations
def fibonacci_sphere(samples: int) -> List[Tuple[float, float, float]]:
    """Return ``samples`` points on the unit sphere along a golden-angle spiral.

    The y coordinate runs linearly from 1 down to -1 and each successive point
    is turned by the golden angle around the y axis.

    Args:
        samples: number of points. Zero or less yields an empty list; a single
            sample yields the point at y = 1.

    Returns:
        A list of ``[x, y, z]`` coordinate lists.
    """
    points = []
    phi = np.pi * (3.0 - np.sqrt(5.0))  # Golden angle in radians

    for i in range(samples):
        # With a single sample there is no interval to divide; dividing by
        # ``samples - 1`` would raise ZeroDivisionError.
        y = 1 - (i / float(samples - 1)) * 2 if samples > 1 else 1.0
        radius = np.sqrt(1 - y * y)  # radius of the circle at height y
        theta = phi * i  # golden angle increment

        x = np.cos(theta) * radius
        z = np.sin(theta) * radius
        points.append([x, y, z])

    return points
def _rotation_matrix(
    axis: Tuple[float, float, float], theta: float
) -> NDArray[np.float64]:
    """Return the 3x3 matrix rotating by ``theta`` radians about ``axis``.

    The axis is normalised first, so it may have any non-zero length. The
    matrix is assembled from the half-angle cosine and the axis scaled by the
    negated half-angle sine.
    """
    direction = np.asarray(axis)
    direction = direction / np.sqrt(np.dot(direction, direction))

    half_angle = theta / 2.0
    a = np.cos(half_angle)
    b, c, d = -direction * np.sin(half_angle)

    a2, b2, c2, d2 = a * a, b * b, c * c, d * d
    first_row = [a2 + b2 - c2 - d2, 2 * (b * c + a * d), 2 * (b * d - a * c)]
    second_row = [2 * (b * c - a * d), a2 + c2 - b2 - d2, 2 * (c * d + a * b)]
    third_row = [2 * (b * d + a * c), 2 * (c * d - a * b), a2 + d2 - b2 - c2]
    return np.array([first_row, second_row, third_row])
def _check_rotation_matrix(R: NDArray[np.float64]):
    """Raise ``ValueError`` unless ``R`` is a proper 3x3 rotation matrix.

    Three conditions are checked in order: the shape is (3, 3), ``R.T @ R`` is
    close to the identity, and the determinant is close to +1.
    """
    if R.shape != (3, 3):
        raise ValueError("Matrix must be 3x3.")

    # Orthogonality: the transpose must act as the inverse.
    is_orthogonal = np.allclose(R.T @ R, np.eye(3))
    if not is_orthogonal:
        raise ValueError("allclose")

    # A determinant of -1 would be a reflection rather than a rotation.
    has_unit_determinant = np.isclose(np.linalg.det(R), 1.0)
    if not has_unit_determinant:
        raise ValueError(f"det {np.linalg.det(R)}")

View file

@ -0,0 +1,21 @@
import numpy as np
from typing import List
from .random import random
from .interpolate import interpolate, INTERPOLATION_TYPE
def get_edit_points(variance: float, count: int) -> List[float]:
    """Draw ``count`` random control points around evenly spaced centres.

    The centres are ``i / (count - 1)`` for ``i`` in ``range(count)``; each
    point is drawn with ``random`` from the interval of half-width
    ``variance`` around its centre.
    """
    edit_points = []
    for i in range(count):
        centre = i / (count - 1)
        edit_points.append(random(centre - variance, centre + variance))
    return edit_points
def get_colour_lut(
    variance=0.1, count=5, type: INTERPOLATION_TYPE = "cubic"
) -> List[int]:
    """Build a random 256-entry lookup table for one 8-bit channel.

    Random control points are generated once; each input level 0..255 is then
    scaled to [0, 1], passed through ``interpolate`` with the chosen ``type``,
    scaled back by 255 and rounded.
    """
    edit_points = get_edit_points(variance=variance, count=count)

    lookup_table = []
    for level in np.linspace(0, 255, 256):
        curve_value = interpolate(edit_points, level / 255, type=type)
        lookup_table.append(round(curve_value * 255))
    return lookup_table

View file

@ -0,0 +1,35 @@
import numpy as np
from scipy.interpolate import CubicSpline
from typing import List, Literal
INTERPOLATION_TYPE = Literal["cubic", "linear"]


def interpolate(
    control_points: List[float], t: float, type: INTERPOLATION_TYPE
) -> float:
    """Evaluate a curve through the sorted control points at position ``t``.

    The control points are sorted and placed at evenly spaced positions
    between 0 and 1. ``"cubic"`` evaluates a ``CubicSpline`` through them;
    ``"linear"`` interpolates within the segment that contains ``t``.

    Raises:
        ValueError: if ``type`` is neither ``"cubic"`` nor ``"linear"``.
    """
    ordered_points = sorted(control_points)
    knots = np.linspace(0, 1, len(ordered_points))

    if type == "cubic":
        spline = CubicSpline(knots, ordered_points)
        return spline(t)

    if type != "linear":
        raise ValueError("Invalid type")

    # The right end has no segment to its right, so it is answered directly.
    if t == 1:
        return ordered_points[-1]

    segment = np.searchsorted(knots, t, side="right") - 1
    segment_start = knots[segment]
    segment_end = knots[segment + 1]
    fraction = (t - segment_start) / (segment_end - segment_start)

    start_value = ordered_points[segment]
    end_value = ordered_points[segment + 1]
    return start_value + fraction * (end_value - start_value)

11
src/editor/utils/kldiv.py Normal file
View file

@ -0,0 +1,11 @@
import numpy as np
def kldiv(P: np.ndarray, Q: np.ndarray) -> float:
    """Return the Kullback-Leibler divergence of ``P`` from ``Q``.

    Both inputs are normalised to sum to 1 on private copies, so the caller's
    arrays are left untouched and integer arrays are accepted. Entries are
    floored at machine epsilon before the logarithm so empty bins do not
    produce infinities or NaNs.

    Args:
        P: non-negative weights of the first distribution.
        Q: non-negative weights of the second distribution, same shape as ``P``.

    Returns:
        ``sum(P * log(P / Q))`` using the natural logarithm.
    """
    # Out-of-place division: ``P /= P.sum()`` would overwrite the caller's
    # data and raises a casting error for integer arrays.
    P = np.asarray(P, dtype=float)
    Q = np.asarray(Q, dtype=float)
    P = P / P.sum()
    Q = Q / Q.sum()

    smallest = np.finfo(float).eps
    P_safe = np.maximum(P, smallest)
    Q_safe = np.maximum(Q, smallest)
    return float(np.sum(P_safe * np.log(P_safe / Q_safe)))

View file

@ -0,0 +1,10 @@
import numpy as np
def random(min: float = 0, max: float = 1):
    """Draw a normally distributed sample centred in ``[min, max]``.

    The mean is the midpoint of the interval and the standard deviation is one
    sixth of its width, so about 99.7% of draws fall inside the interval; the
    rest are clipped to its bounds. NumPy's global random generator is used.
    """
    midpoint = (max + min) / 2
    # Both bounds sit three standard deviations from the midpoint.
    spread = (max - min) / 6
    return np.clip(np.random.normal(midpoint, spread), min, max)

View file

@ -0,0 +1,3 @@
from .display_images import display_images
from .plot_histograms_in_3d import plot_histograms_in_3d
from .plot_histograms_in_2d import plot_histograms_in_2d

View file

@ -0,0 +1,25 @@
import matplotlib.pyplot as plt
from typing import Dict
from PIL.Image import Image
from math import ceil
def display_images(images: Dict[str, Image], images_per_row: int = 3):
    """Show the given images in a grid, each titled with its dictionary key.

    Args:
        images: mapping from title to image; nothing is drawn when it is empty.
        images_per_row: maximum number of images placed in one row.
    """
    if not images:
        # A grid with zero rows or columns cannot be created.
        return

    # ``squeeze=False`` always yields a 2-D array of axes. Without it a single
    # image produces a bare Axes object that has no ``flatten`` method.
    _fig, axes = plt.subplots(
        nrows=ceil(len(images) / images_per_row),
        ncols=min(images_per_row, len(images)),
        figsize=(12, 8),
        squeeze=False,
    )
    axes = axes.flatten()

    for i, (title, image) in enumerate(images.items()):
        axes[i].imshow(image)
        axes[i].axis("off")
        axes[i].set_title(title)

    # Hide the unused cells of the last row.
    for i in range(len(images), len(axes)):
        axes[i].axis("off")

    plt.tight_layout()
    plt.show()

View file

@ -0,0 +1,32 @@
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
from typing import Dict
def plot_histograms_in_2d(histograms: Dict[str, np.ndarray]):
    """Draw each 3-D histogram as a scatter plot in one row of a figure.

    Every bin becomes a marker at its bin indices, sized by the bin value and
    coloured by its indices scaled to [0, 1]. The subplots use Matplotlib's
    ``"3d"`` projection.

    Args:
        histograms: mapping from subplot title to a histogram with three axes.

    Returns:
        The Matplotlib figure holding one subplot per histogram.
    """
    fig = plt.figure(figsize=(15, 5))

    # The row keeps its three columns and only widens beyond three
    # histograms, where a fixed 1x3 grid would reject the subplot index.
    column_count = max(3, len(histograms))

    for i, (title, histogram) in enumerate(histograms.items(), 1):
        ax = fig.add_subplot(1, column_count, i, projection="3d")
        size = histogram.shape[0]

        x, y, z = (indices.flatten() for indices in np.indices(histogram.shape))
        sizes = histogram.flatten() * 5000
        # Guard the scale so a single-bin histogram does not divide by zero.
        colors = np.vstack((x, y, z)).T / max(size - 1, 1)

        ax.scatter(x, y, z, c=colors, s=sizes, marker="o", alpha=0.5)
        ax.set_xlim([0, (size - 1)])
        ax.set_ylim([0, (size - 1)])
        ax.set_zlim([0, (size - 1)])
        ax.set_title(title)

    return fig

View file

@ -0,0 +1,62 @@
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from math import ceil
from typing import Dict
import numpy as np
def plot_histograms_in_3d(
    histograms: Dict[str, np.ndarray], histogram_per_row: int = 3
):
    """Show each histogram as an interactive Plotly 3-D scatter subplot.

    Args:
        histograms: mapping from trace name to histogram.
        histogram_per_row: maximum number of subplots in one row.
    """
    cols = min(histogram_per_row, len(histograms))
    rows = ceil(len(histograms) / histogram_per_row)
    fig = make_subplots(
        rows=rows,
        cols=cols,
        specs=[[{"type": "scatter3d"} for _ in range(cols)] for _ in range(rows)],
    )

    for i, (title, histogram) in enumerate(histograms.items()):
        # Row and column must be derived from the same divisor; dividing by
        # ``histogram_per_row + 1`` put the first trace of every later row on
        # top of a cell in the previous row.
        row_index, col_index = divmod(i, histogram_per_row)
        fig.add_trace(
            _get_3d_scatter_plot_from_histogram(title, histogram),
            row=row_index + 1,
            col=col_index + 1,
        )

    # Same camera position for every scene.
    scenes = {
        f"scene{i}": dict(camera=dict(eye=dict(x=0.1, y=0, z=2)))
        for i in range(1, len(histograms) + 1)
    }
    fig.update_layout(**scenes)
    fig.show()
def _get_3d_scatter_plot_from_histogram(title, histogram):
    """Turn the non-empty bins of a 3-D histogram into a ``Scatter3d`` trace.

    Each bin with a positive value becomes one marker at its bin indices. The
    marker size is the value times 10000, capped at 20, and the colour is the
    index triple scaled by ``256 / bins``.
    """
    bins = len(histogram)

    x, y, z, marker_size = [], [], [], []
    for (i, j, k), value in np.ndenumerate(histogram):
        if value > 0:
            x.append(i)
            y.append(j)
            z.append(k)
            marker_size.append(value)

    sizes = [min(20, ms * 10000) for ms in marker_size]
    colours = [
        f"rgb({xi*256/bins},{yi*256/bins},{zi*256/bins})"
        for xi, yi, zi in zip(x, y, z)
    ]
    marker = dict(size=sizes, color=colours, opacity=1, line=dict(width=0))
    return go.Scatter3d(x=x, y=y, z=z, mode="markers", marker=marker, name=title)

View file

@ -0,0 +1,821 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>photo_id</th>\n",
" <th>photo_url</th>\n",
" <th>photo_image_url</th>\n",
" <th>photo_submitted_at</th>\n",
" <th>photo_featured</th>\n",
" <th>photo_width</th>\n",
" <th>photo_height</th>\n",
" <th>photo_aspect_ratio</th>\n",
" <th>photo_description</th>\n",
" <th>photographer_username</th>\n",
" <th>...</th>\n",
" <th>photo_location_country</th>\n",
" <th>photo_location_city</th>\n",
" <th>stats_views</th>\n",
" <th>stats_downloads</th>\n",
" <th>ai_description</th>\n",
" <th>ai_primary_landmark_name</th>\n",
" <th>ai_primary_landmark_latitude</th>\n",
" <th>ai_primary_landmark_longitude</th>\n",
" <th>ai_primary_landmark_confidence</th>\n",
" <th>blur_hash</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>3690</th>\n",
" <td>XFmznQhx9lM</td>\n",
" <td>https://unsplash.com/photos/XFmznQhx9lM</td>\n",
" <td>https://images.unsplash.com/photo-156347321301...</td>\n",
" <td>2019-07-18 18:07:14.031684</td>\n",
" <td>t</td>\n",
" <td>4443</td>\n",
" <td>2962</td>\n",
" <td>1.50</td>\n",
" <td>Fall color in the countryside of Eastern Washi...</td>\n",
" <td>timothyeberly</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2978748547</td>\n",
" <td>304950</td>\n",
" <td>orange leaf trees</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>LBJPSa4o0hW?pI4;-.R*E459O?sk</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6103</th>\n",
" <td>YDNvydD1jAY</td>\n",
" <td>https://unsplash.com/photos/YDNvydD1jAY</td>\n",
" <td>https://images.unsplash.com/photo-149034936815...</td>\n",
" <td>2017-03-24 09:56:57.505262</td>\n",
" <td>t</td>\n",
" <td>4500</td>\n",
" <td>3000</td>\n",
" <td>1.50</td>\n",
" <td>Flowers in spring</td>\n",
" <td>maartendeckers</td>\n",
" <td>...</td>\n",
" <td>Belgium</td>\n",
" <td>NaN</td>\n",
" <td>2722857886</td>\n",
" <td>416983</td>\n",
" <td>pink, yellow and brown petaled flowers</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>LQJInG*JMyIm^ROpxbNFyCNGnln4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3389</th>\n",
" <td>4oovIxttThA</td>\n",
" <td>https://unsplash.com/photos/4oovIxttThA</td>\n",
" <td>https://images.unsplash.com/photo-1560850038-f...</td>\n",
" <td>2019-06-18 09:36:35.94311</td>\n",
" <td>t</td>\n",
" <td>5025</td>\n",
" <td>3141</td>\n",
" <td>1.60</td>\n",
" <td>NaN</td>\n",
" <td>a8ka</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2190084956</td>\n",
" <td>253730</td>\n",
" <td>aerial view of houses near ocean</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>LaCt8}~BwNIpozoLofofWBWBaef6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18789</th>\n",
" <td>BkR842UVXqk</td>\n",
" <td>https://unsplash.com/photos/BkR842UVXqk</td>\n",
" <td>https://images.unsplash.com/photo-1558816280-d...</td>\n",
" <td>2019-05-25 20:32:08.153319</td>\n",
" <td>t</td>\n",
" <td>4000</td>\n",
" <td>6000</td>\n",
" <td>0.67</td>\n",
" <td>NaN</td>\n",
" <td>olenkasergienko</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1934025254</td>\n",
" <td>294785</td>\n",
" <td>pink petaled flower</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>LA71AxX50_xHt7j[S1ju0_nm^8NZ</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21742</th>\n",
" <td>GRLN5FC4cLg</td>\n",
" <td>https://unsplash.com/photos/GRLN5FC4cLg</td>\n",
" <td>https://images.unsplash.com/photo-1552300977-c...</td>\n",
" <td>2019-03-11 10:50:25.9311</td>\n",
" <td>t</td>\n",
" <td>2992</td>\n",
" <td>3992</td>\n",
" <td>0.75</td>\n",
" <td>NaN</td>\n",
" <td>turner_imagery</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1916027735</td>\n",
" <td>306073</td>\n",
" <td>high angle photography of cliff</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>LKCr=#~VNat7X-%M%1j?9tNbxaay</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3272</th>\n",
" <td>5zsw1PjXg8k</td>\n",
" <td>https://unsplash.com/photos/5zsw1PjXg8k</td>\n",
" <td>https://images.unsplash.com/photo-142372152343...</td>\n",
" <td>2015-02-12 06:12:09.092905</td>\n",
" <td>f</td>\n",
" <td>2448</td>\n",
" <td>3264</td>\n",
" <td>0.75</td>\n",
" <td>NaN</td>\n",
" <td>melissaaskew</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>14088</td>\n",
" <td>812</td>\n",
" <td>waterfalls in the middle of the forest</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>LG8#NK.84m4mt6f#RjkD9EM_%N-=</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3273</th>\n",
" <td>gqa-fnYASIQ</td>\n",
" <td>https://unsplash.com/photos/gqa-fnYASIQ</td>\n",
" <td>https://images.unsplash.com/photo-142302719730...</td>\n",
" <td>2015-02-04 05:19:59.869141</td>\n",
" <td>f</td>\n",
" <td>5086</td>\n",
" <td>3391</td>\n",
" <td>1.50</td>\n",
" <td>NaN</td>\n",
" <td>wilstewart3</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>13384</td>\n",
" <td>858</td>\n",
" <td>a street sign sitting on the side of a body of...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>LvKKi,RjM{j[_NWBWBfk5EoLoeaz</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12956</th>\n",
" <td>Cq62qvCW8bM</td>\n",
" <td>https://unsplash.com/photos/Cq62qvCW8bM</td>\n",
" <td>https://images.unsplash.com/photo-142245228993...</td>\n",
" <td>2015-01-28 13:38:18.071331</td>\n",
" <td>f</td>\n",
" <td>4896</td>\n",
" <td>3264</td>\n",
" <td>1.50</td>\n",
" <td>NaN</td>\n",
" <td>kseny</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>13068</td>\n",
" <td>1054</td>\n",
" <td>man in black shirt and blue pants sitting on b...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>LGEW2ko~M{%N0;ofnhRkwvozt8of</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17074</th>\n",
" <td>Py8vZdCw35U</td>\n",
" <td>https://unsplash.com/photos/Py8vZdCw35U</td>\n",
" <td>https://images.unsplash.com/photo-142034363140...</td>\n",
" <td>2015-01-04 03:54:41.031772</td>\n",
" <td>f</td>\n",
" <td>2320</td>\n",
" <td>1553</td>\n",
" <td>1.49</td>\n",
" <td>NaN</td>\n",
" <td>mrbrodeur</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>12617</td>\n",
" <td>581</td>\n",
" <td>man in black jacket standing on brown sand und...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>LuI~ZRogaxR*0?Rjofj[Mxs.a|fP</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1394</th>\n",
" <td>SpRN0qZPLr8</td>\n",
" <td>https://unsplash.com/photos/SpRN0qZPLr8</td>\n",
" <td>https://images.unsplash.com/photo-141621393610...</td>\n",
" <td>2014-11-17 08:47:33.427134</td>\n",
" <td>f</td>\n",
" <td>6016</td>\n",
" <td>4000</td>\n",
" <td>1.50</td>\n",
" <td>NaN</td>\n",
" <td>tarunccet</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>8266</td>\n",
" <td>101</td>\n",
" <td>brown wooden house on green grass field near b...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>LjEppYn}j]kC%jj[f6f6x8fPaxay</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>25000 rows × 31 columns</p>\n",
"</div>"
],
"text/plain": [
" photo_id photo_url \\\n",
"3690 XFmznQhx9lM https://unsplash.com/photos/XFmznQhx9lM \n",
"6103 YDNvydD1jAY https://unsplash.com/photos/YDNvydD1jAY \n",
"3389 4oovIxttThA https://unsplash.com/photos/4oovIxttThA \n",
"18789 BkR842UVXqk https://unsplash.com/photos/BkR842UVXqk \n",
"21742 GRLN5FC4cLg https://unsplash.com/photos/GRLN5FC4cLg \n",
"... ... ... \n",
"3272 5zsw1PjXg8k https://unsplash.com/photos/5zsw1PjXg8k \n",
"3273 gqa-fnYASIQ https://unsplash.com/photos/gqa-fnYASIQ \n",
"12956 Cq62qvCW8bM https://unsplash.com/photos/Cq62qvCW8bM \n",
"17074 Py8vZdCw35U https://unsplash.com/photos/Py8vZdCw35U \n",
"1394 SpRN0qZPLr8 https://unsplash.com/photos/SpRN0qZPLr8 \n",
"\n",
" photo_image_url \\\n",
"3690 https://images.unsplash.com/photo-156347321301... \n",
"6103 https://images.unsplash.com/photo-149034936815... \n",
"3389 https://images.unsplash.com/photo-1560850038-f... \n",
"18789 https://images.unsplash.com/photo-1558816280-d... \n",
"21742 https://images.unsplash.com/photo-1552300977-c... \n",
"... ... \n",
"3272 https://images.unsplash.com/photo-142372152343... \n",
"3273 https://images.unsplash.com/photo-142302719730... \n",
"12956 https://images.unsplash.com/photo-142245228993... \n",
"17074 https://images.unsplash.com/photo-142034363140... \n",
"1394 https://images.unsplash.com/photo-141621393610... \n",
"\n",
" photo_submitted_at photo_featured photo_width photo_height \\\n",
"3690 2019-07-18 18:07:14.031684 t 4443 2962 \n",
"6103 2017-03-24 09:56:57.505262 t 4500 3000 \n",
"3389 2019-06-18 09:36:35.94311 t 5025 3141 \n",
"18789 2019-05-25 20:32:08.153319 t 4000 6000 \n",
"21742 2019-03-11 10:50:25.9311 t 2992 3992 \n",
"... ... ... ... ... \n",
"3272 2015-02-12 06:12:09.092905 f 2448 3264 \n",
"3273 2015-02-04 05:19:59.869141 f 5086 3391 \n",
"12956 2015-01-28 13:38:18.071331 f 4896 3264 \n",
"17074 2015-01-04 03:54:41.031772 f 2320 1553 \n",
"1394 2014-11-17 08:47:33.427134 f 6016 4000 \n",
"\n",
" photo_aspect_ratio photo_description \\\n",
"3690 1.50 Fall color in the countryside of Eastern Washi... \n",
"6103 1.50 Flowers in spring \n",
"3389 1.60 NaN \n",
"18789 0.67 NaN \n",
"21742 0.75 NaN \n",
"... ... ... \n",
"3272 0.75 NaN \n",
"3273 1.50 NaN \n",
"12956 1.50 NaN \n",
"17074 1.49 NaN \n",
"1394 1.50 NaN \n",
"\n",
" photographer_username ... photo_location_country photo_location_city \\\n",
"3690 timothyeberly ... NaN NaN \n",
"6103 maartendeckers ... Belgium NaN \n",
"3389 a8ka ... NaN NaN \n",
"18789 olenkasergienko ... NaN NaN \n",
"21742 turner_imagery ... NaN NaN \n",
"... ... ... ... ... \n",
"3272 melissaaskew ... NaN NaN \n",
"3273 wilstewart3 ... NaN NaN \n",
"12956 kseny ... NaN NaN \n",
"17074 mrbrodeur ... NaN NaN \n",
"1394 tarunccet ... NaN NaN \n",
"\n",
" stats_views stats_downloads \\\n",
"3690 2978748547 304950 \n",
"6103 2722857886 416983 \n",
"3389 2190084956 253730 \n",
"18789 1934025254 294785 \n",
"21742 1916027735 306073 \n",
"... ... ... \n",
"3272 14088 812 \n",
"3273 13384 858 \n",
"12956 13068 1054 \n",
"17074 12617 581 \n",
"1394 8266 101 \n",
"\n",
" ai_description \\\n",
"3690 orange leaf trees \n",
"6103 pink, yellow and brown petaled flowers \n",
"3389 aerial view of houses near ocean \n",
"18789 pink petaled flower \n",
"21742 high angle photography of cliff \n",
"... ... \n",
"3272 waterfalls in the middle of the forest \n",
"3273 a street sign sitting on the side of a body of... \n",
"12956 man in black shirt and blue pants sitting on b... \n",
"17074 man in black jacket standing on brown sand und... \n",
"1394 brown wooden house on green grass field near b... \n",
"\n",
" ai_primary_landmark_name ai_primary_landmark_latitude \\\n",
"3690 NaN NaN \n",
"6103 NaN NaN \n",
"3389 NaN NaN \n",
"18789 NaN NaN \n",
"21742 NaN NaN \n",
"... ... ... \n",
"3272 NaN NaN \n",
"3273 NaN NaN \n",
"12956 NaN NaN \n",
"17074 NaN NaN \n",
"1394 NaN NaN \n",
"\n",
" ai_primary_landmark_longitude ai_primary_landmark_confidence \\\n",
"3690 NaN NaN \n",
"6103 NaN NaN \n",
"3389 NaN NaN \n",
"18789 NaN NaN \n",
"21742 NaN NaN \n",
"... ... ... \n",
"3272 NaN NaN \n",
"3273 NaN NaN \n",
"12956 NaN NaN \n",
"17074 NaN NaN \n",
"1394 NaN NaN \n",
"\n",
" blur_hash \n",
"3690 LBJPSa4o0hW?pI4;-.R*E459O?sk \n",
"6103 LQJInG*JMyIm^ROpxbNFyCNGnln4 \n",
"3389 LaCt8}~BwNIpozoLofofWBWBaef6 \n",
"18789 LA71AxX50_xHt7j[S1ju0_nm^8NZ \n",
"21742 LKCr=#~VNat7X-%M%1j?9tNbxaay \n",
"... ... \n",
"3272 LG8#NK.84m4mt6f#RjkD9EM_%N-= \n",
"3273 LvKKi,RjM{j[_NWBWBfk5EoLoeaz \n",
"12956 LGEW2ko~M{%N0;ofnhRkwvozt8of \n",
"17074 LuI~ZRogaxR*0?Rjofj[Mxs.a|fP \n",
"1394 LjEppYn}j]kC%jj[f6f6x8fPaxay \n",
"\n",
"[25000 rows x 31 columns]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"from pathlib import Path\n",
"\n",
"DATA_PATH = Path('/mnt/wsl/PHYSICALDRIVE1/data/unsplash')\n",
"DATA_PATH.mkdir(exist_ok=True, parents=True)\n",
"\n",
"# Load the Unsplash photo metadata (tab-separated) and order it by view count, most viewed first.\n",
"unsplash_dataset_path = \"/home/andras/projects/bipolaroid/unsplash-research-dataset-lite-latest/photos.tsv000\"\n",
"unsplash_dataset = pd.read_csv(unsplash_dataset_path, sep=\"\\t\").sort_values(\n",
"    by=\"stats_views\", ascending=False\n",
")\n",
"unsplash_dataset"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 113/25000 [00:30<1:02:36, 6.62it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading toPRrcyAIUY: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 1%| | 184/25000 [00:48<1:19:49, 5.18it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading t7YycgAoVSw: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 1%| | 219/25000 [00:58<1:40:16, 4.12it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading LOlMe8HfofI: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 3%|▎ | 744/25000 [03:24<1:18:52, 5.12it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48c37350>: Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7fa9f50686e0>: Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48df49e0>: Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48df7b60>: Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48dc81d0>: Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48dcac90>: Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48df5850>: Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48df7cb0>: Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48c351f0>: Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading vigsqYux_-8: HTTPSConnectionPool(host='images.unsplash.com_thebeach.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7fa9f4f6b4a0>: Failed to resolve 'images.unsplash.com_thebeach.jpg' ([Errno -2] Name or service not known)\"))\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 12%|█▏ | 2885/25000 [12:37<1:37:39, 3.77it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa481fd370>: Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n",
"Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa483a9580>: Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n",
"Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48380560>: Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n",
"Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48382ba0>: Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n",
"Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48381940>: Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n",
"Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa483828a0>: Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n",
"Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa483aaf60>: Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n",
"Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa481fee10>: Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n",
"Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7fa9f56cbfb0>: Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n",
"Error downloading 9_9hzZVjV8s: HTTPSConnectionPool(host='images.unsplash.company', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48383920>: Failed to resolve 'images.unsplash.company' ([Errno -2] Name or service not known)\"))\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 12%|█▏ | 2909/25000 [12:43<1:15:43, 4.86it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading NcociWzk23A: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 14%|█▍ | 3505/25000 [15:13<1:46:00, 3.38it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa481af4d0>: Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa481ad160>: Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa481943b0>: Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa4818bcb0>: Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48189fa0>: Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48189a00>: Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa48196c60>: Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa481ac710>: Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7faa481693a0>: Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n",
"Error downloading rsJtMXn3p_c: HTTPSConnectionPool(host='images.unsplash.com-grass-sun.jpg', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7fa9f56e0110>: Failed to resolve 'images.unsplash.com-grass-sun.jpg' ([Errno -2] Name or service not known)\"))\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 29%|██▉ | 7352/25000 [31:15<1:03:57, 4.60it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading T2LEdBxpm54: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 34%|███▎ | 8375/25000 [35:53<1:13:19, 3.78it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading q_4pIVaXPEk: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 34%|███▍ | 8568/25000 [36:40<1:08:17, 4.01it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading iGANt1N2ge8: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out. (read timeout=10)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 37%|███▋ | 9282/25000 [40:24<1:41:11, 2.59it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading 2FqpN2CWCLo: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 37%|███▋ | 9343/25000 [40:45<1:22:43, 3.15it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading 4T7-GLBDLKE: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 38%|███▊ | 9398/25000 [41:03<1:43:40, 2.51it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading i-xtI6jD7bQ: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 40%|███▉ | 9992/25000 [44:16<1:03:56, 3.91it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading 0GBafJ-ZenA: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out. (read timeout=10)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 46%|████▌ | 11508/25000 [51:29<1:16:16, 2.95it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading k2RWB_aPfqI: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 47%|████▋ | 11626/25000 [52:01<36:59, 6.03it/s] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading 7ICXVb10NJs: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 47%|████▋ | 11655/25000 [52:08<1:05:47, 3.38it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading PgBTaq-AgVI: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 62%|██████▏ | 15477/25000 [1:10:12<47:47, 3.32it/s] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error downloading zS_b76LrEL8: HTTPSConnectionPool(host='images.unsplash.com', port=443): Read timed out.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 70%|██████▉ | 17470/25000 [1:19:44<19:51, 6.32it/s] "
]
}
],
"source": [
"import concurrent.futures\n",
"import requests\n",
"from tqdm import tqdm\n",
"from typing import List\n",
"\n",
"progress = tqdm(total=len(unsplash_dataset))\n",
"\n",
"# Fetch one photo and store it as DATA_PATH/<photo_id>.jpg, making up to 10 attempts.\n",
"# `row` must provide 'photo_id' and 'photo_image_url'. Failures are printed, never raised.\n",
"# Called from worker threads, so the shared `progress` bar is only updated under its lock.\n",
"def download_image(row):\n",
"    filename = DATA_PATH / f\"{row['photo_id']}.jpg\"\n",
"    for _ in range(10):\n",
"        try:\n",
"            response = requests.get(row[\"photo_image_url\"], timeout=10)\n",
"            # Treat HTTP 4xx/5xx as a failed attempt; otherwise the error page would be saved as a .jpg.\n",
"            response.raise_for_status()\n",
"            with open(filename, \"wb\") as f:\n",
"                f.write(response.content)\n",
"            break\n",
"        except Exception as e:\n",
"            print(f\"Error downloading {row['photo_id']}: {e}\")\n",
"    # Advance exactly once per row, even if every attempt failed, so the bar can reach its total.\n",
"    with progress.get_lock():\n",
"        progress.update(1)\n",
"\n",
"# Queue one download task per dataset row on a 20-thread pool, then block until all have finished.\n",
"with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:\n",
"    futures: List[concurrent.futures.Future] = []\n",
"    for _, record in unsplash_dataset.iterrows():\n",
"        futures.append(executor.submit(download_image, record))\n",
"\n",
"    progress.display()\n",
"    concurrent.futures.wait(futures)\n",
"progress.close()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "bipolaroid",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

109
src/laion.ipynb Normal file
View file

@ -0,0 +1,109 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 3439/3439 [00:00<00:00, 6104.95it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 6988.74it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 6957.73it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 6734.31it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 7696.85it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 7331.94it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 6240.69it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 7451.37it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 7135.27it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 3855.91it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 3567.51it/s]\n",
"100%|██████████| 3439/3439 [00:01<00:00, 2853.24it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 6952.67it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 6177.45it/s]\n",
"100%|██████████| 3439/3439 [00:01<00:00, 3130.18it/s]\n",
"100%|██████████| 3439/3439 [00:01<00:00, 3303.45it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 3662.39it/s]\n",
"100%|██████████| 3439/3439 [00:01<00:00, 2754.25it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 6633.24it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 6548.62it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 4601.06it/s]\n",
"100%|██████████| 3439/3439 [00:01<00:00, 2288.88it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 3635.54it/s]\n",
"100%|██████████| 3439/3439 [00:01<00:00, 2179.42it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 6750.76it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 6691.62it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 5768.00it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 3440.06it/s]\n",
"100%|██████████| 3439/3439 [00:01<00:00, 2743.69it/s]\n",
"100%|██████████| 3439/3439 [00:01<00:00, 3034.45it/s]\n",
"100%|██████████| 3439/3439 [00:02<00:00, 1261.15it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 6129.07it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 6573.12it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 6425.97it/s]\n",
"100%|██████████| 3439/3439 [00:01<00:00, 2865.05it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 4130.32it/s]\n",
"100%|██████████| 3439/3439 [00:01<00:00, 3020.61it/s]\n",
"100%|██████████| 3439/3439 [00:02<00:00, 1446.82it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 8095.71it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 7679.18it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 7918.50it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 3519.17it/s]\n",
"100%|██████████| 3439/3439 [00:01<00:00, 3258.94it/s]\n",
"100%|██████████| 3439/3439 [00:01<00:00, 2436.68it/s]\n",
"100%|██████████| 3439/3439 [00:03<00:00, 1000.79it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 7625.18it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 7752.86it/s]\n",
"100%|██████████| 3439/3439 [00:00<00:00, 7538.78it/s]\n",
"100%|██████████| 3439/3439 [00:01<00:00, 3115.93it/s]\n"
]
}
],
"source": [
"import pandas as pd\n",
"from pathlib import Path\n",
"from tqdm import tqdm\n",
"import hashlib\n",
"\n",
"\n",
"DATA_PATH = Path('/mnt/wsl/PHYSICALDRIVE1/data/laion')\n",
"DATA_PATH.mkdir(exist_ok=True, parents=True)\n",
"\n",
"LAION_PATH = Path('/home/andras/projects/laion_improved_aesthetics_6.5plus_with_images/data')\n",
"\n",
"\n",
"# Write every embedded image from the parquet files to DATA_PATH/<sha1 of the image bytes>.jpg.\n",
"# Files are named by content hash, so identical images overwrite each other instead of duplicating.\n",
"for file in LAION_PATH.glob(\"*.parquet\"):\n",
"    df = pd.read_parquet(file)\n",
"    # Iterate the image column directly instead of materialising every row with list(df.iterrows()).\n",
"    for image in tqdm(df[\"image\"]):\n",
"        image_bytes = image[\"bytes\"]  # not named `bytes`, to avoid shadowing the builtin\n",
"        digest = hashlib.sha1(image_bytes).hexdigest()\n",
"        with open(DATA_PATH / f\"{digest}.jpg\", \"wb\") as f:\n",
"            f.write(image_bytes)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "bipolaroid",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

31958
src/pdf_transfer.ipynb Normal file

File diff suppressed because one or more lines are too long

40783
src/show_histograms.ipynb Normal file

File diff suppressed because one or more lines are too long

111557
src/train.ipynb Normal file

File diff suppressed because one or more lines are too long