Move transform logic around

This commit is contained in:
Andras Schmelczer 2026-01-31 12:48:29 +00:00
parent e1b38a1b95
commit 38b0cf1ea1
14 changed files with 1073 additions and 336 deletions

View file

@@ -8,17 +8,35 @@ import osmium
import polars as pl
from tqdm import tqdm
from .config import (
BATCH_SIZE,
GEOFABRIK_GB_URL,
MIN_OCCURENCE_COUNT,
POI_TAG_KEYS,
UK_BBOX_EAST,
UK_BBOX_NORTH,
UK_BBOX_SOUTH,
UK_BBOX_WEST,
from pathlib import Path
BATCH_SIZE = 50_000
MIN_OCCURENCE_COUNT = 20
GEOFABRIK_GB_URL = (
"https://download.geofabrik.de/europe/great-britain-latest.osm.pbf"
)
UK_BBOX_WEST = -7.57
UK_BBOX_SOUTH = 49.96
UK_BBOX_EAST = 1.68
UK_BBOX_NORTH = 58.64
POI_TAG_KEYS: list[str] = [
"amenity",
"building",
"craft",
"emergency",
"healthcare",
"leisure",
"office",
"shop",
"tourism",
"public_transport",
]
def download_pbf(pbf_file: Path) -> None:
@@ -144,10 +162,9 @@ def main() -> None:
)
df = df.join(valid_categories.select("category"), on="category", how="semi")
args.output.parent.mkdir(parents=True, exist_ok=True)
print(f"Total POIs: {handler.poi_count:,}")
df.sink_parquet(args.output)
print(f"Saved to {args.output}")
print(f"Total POIs: {handler.poi_count:,}")
if __name__ == "__main__":

View file

@@ -1,32 +0,0 @@
from pathlib import Path
DATA_DIR = Path("./data_sources")
GB_PBF_FILE = DATA_DIR / "great-britain-latest.osm.pbf"
OUTPUT_FILE = DATA_DIR / "uk_pois.parquet"
BATCH_SIZE = 50_000
MIN_OCCURENCE_COUNT = 20
GEOFABRIK_GB_URL = (
"https://download.geofabrik.de/europe/great-britain-latest.osm.pbf"
)
UK_BBOX_WEST = -7.57
UK_BBOX_SOUTH = 49.96
UK_BBOX_EAST = 1.68
UK_BBOX_NORTH = 58.64
POI_TAG_KEYS: list[str] = [
"amenity",
"building",
"craft",
"emergency",
"healthcare",
"leisure",
"office",
"shop",
"tourism",
"public_transport",
]