{ "cells": [ { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 3439/3439 [00:00<00:00, 6104.95it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 6988.74it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 6957.73it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 6734.31it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 7696.85it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 7331.94it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 6240.69it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 7451.37it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 7135.27it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 3855.91it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 3567.51it/s]\n", "100%|██████████| 3439/3439 [00:01<00:00, 2853.24it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 6952.67it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 6177.45it/s]\n", "100%|██████████| 3439/3439 [00:01<00:00, 3130.18it/s]\n", "100%|██████████| 3439/3439 [00:01<00:00, 3303.45it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 3662.39it/s]\n", "100%|██████████| 3439/3439 [00:01<00:00, 2754.25it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 6633.24it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 6548.62it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 4601.06it/s]\n", "100%|██████████| 3439/3439 [00:01<00:00, 2288.88it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 3635.54it/s]\n", "100%|██████████| 3439/3439 [00:01<00:00, 2179.42it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 6750.76it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 6691.62it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 5768.00it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 3440.06it/s]\n", "100%|██████████| 3439/3439 [00:01<00:00, 2743.69it/s]\n", "100%|██████████| 3439/3439 [00:01<00:00, 3034.45it/s]\n", "100%|██████████| 3439/3439 [00:02<00:00, 1261.15it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 6129.07it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 6573.12it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 6425.97it/s]\n", "100%|██████████| 3439/3439 [00:01<00:00, 2865.05it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 4130.32it/s]\n", "100%|██████████| 3439/3439 [00:01<00:00, 3020.61it/s]\n", "100%|██████████| 3439/3439 [00:02<00:00, 1446.82it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 8095.71it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 7679.18it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 7918.50it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 3519.17it/s]\n", "100%|██████████| 3439/3439 [00:01<00:00, 3258.94it/s]\n", "100%|██████████| 3439/3439 [00:01<00:00, 2436.68it/s]\n", "100%|██████████| 3439/3439 [00:03<00:00, 1000.79it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 7625.18it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 7752.86it/s]\n", "100%|██████████| 3439/3439 [00:00<00:00, 7538.78it/s]\n", "100%|██████████| 3439/3439 [00:01<00:00, 3115.93it/s]\n" ] } ], "source": [ "import pandas as pd\n", "from pathlib import Path\n", "from tqdm import tqdm\n", "import hashlib\n", "\n", "\n", "DATA_PATH = Path('/mnt/wsl/PHYSICALDRIVE1/data/laion')\n", "DATA_PATH.mkdir(exist_ok=True, parents=True)\n", "\n", "LAION_PATH = Path('/home/andras/projects/laion_improved_aesthetics_6.5plus_with_images/data')\n", "\n", "\n", "for file in LAION_PATH.glob(\"*.parquet\"):\n", " df = pd.read_parquet(file)\n", " for row in tqdm(list(df.iterrows())):\n", " row = row[1]\n", " bytes = row['image']['bytes']\n", " digest = hashlib.sha1(bytes).hexdigest()\n", " with open(DATA_PATH / f\"{digest}.jpg\", 'wb') as f:\n", " f.write(bytes)" ] } ], "metadata": { "kernelspec": { "display_name": "bipolaroid", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 2 }