From ac45af85145faa83469729b86ae4e42f06b28b3a Mon Sep 17 00:00:00 2001 From: Andras Schmelczer Date: Sun, 1 Feb 2026 08:48:07 +0000 Subject: [PATCH] Move dict --- pipeline/transform/poi_proximity.py | 12 +++++++++++- pipeline/utils/__init__.py | 3 +-- pipeline/utils/poi_counts.py | 20 +++++++------------- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/pipeline/transform/poi_proximity.py b/pipeline/transform/poi_proximity.py index dce09cc..0be1a9f 100644 --- a/pipeline/transform/poi_proximity.py +++ b/pipeline/transform/poi_proximity.py @@ -8,6 +8,16 @@ import polars as pl from pipeline.utils.poi_counts import _count_pois_per_postcode +# POI category groups for proximity counting +POI_GROUPS = { + "restaurants": ["Restaurant", "Fast Food"], + "groceries": ["Greengrocer", "Grocery Shop", "Supermarket", "Convenience Store"], + "parks": ["Park", "Garden", "Nature Reserve"], + "public_transport": ["Metro or Tram stop", "Rail station", "Bus stop", "Bus station"], # comes from naptan.py +} + + + def main(): parser = argparse.ArgumentParser( description="Count POIs within radius per postcode" @@ -31,7 +41,7 @@ def main(): pois = pl.read_parquet(args.pois) - result = _count_pois_per_postcode(postcodes, pois, radius_km=2) + result = _count_pois_per_postcode(postcodes, pois, groups=POI_GROUPS, radius_km=2) result.write_parquet(args.output) size_mb = args.output.stat().st_size / (1024 * 1024) diff --git a/pipeline/utils/__init__.py b/pipeline/utils/__init__.py index 772d4ae..9dc42cc 100644 --- a/pipeline/utils/__init__.py +++ b/pipeline/utils/__init__.py @@ -1,7 +1,7 @@ from .download import download, extract_zip from .fuzzy_join import fuzzy_join_on_postcode from .haversine import haversine_km, haversine_km_expr -from .poi_counts import POI_GROUPS, count_pois_within_radius +from .poi_counts import count_pois_within_radius __all__ = [ "download", @@ -9,6 +9,5 @@ __all__ = [ "fuzzy_join_on_postcode", "haversine_km", "haversine_km_expr", - "POI_GROUPS", "count_pois_within_radius", ] diff --git a/pipeline/utils/poi_counts.py b/pipeline/utils/poi_counts.py index 8c39288..0112080 100644 --- a/pipeline/utils/poi_counts.py +++ b/pipeline/utils/poi_counts.py @@ -1,6 +1,5 @@ """Count POIs within a radius of properties, optimized via postcode deduplication.""" -import os import tempfile import numpy as np @@ -8,17 +7,12 @@ import polars as pl from .haversine import haversine_km -# POI category groups for proximity counting -POI_GROUPS = { - "restaurants": ["Restaurant", "Fast Food"], - "groceries": ["Greengrocer", "Grocery Shop", "Supermarket", "Convenience Store"], - "parks": ["Park", "Garden", "Nature Reserve"], - "public_transport": ["Station", "Stop", "Bus Station"], -} - def _count_pois_per_postcode( - postcodes_df: pl.DataFrame, pois: pl.DataFrame, radius_km: float = 2.0 + postcodes_df: pl.DataFrame, + pois: pl.DataFrame, + groups: dict[str, list[str]], + radius_km: float = 2.0, ) -> pl.DataFrame: """ For each unique postcode, count POIs within radius_km by category group. @@ -59,7 +53,7 @@ def _count_pois_per_postcode( # Pre-compute category masks category_masks = {} - for group, categories in POI_GROUPS.items(): + for group, categories in groups.items(): mask = np.isin(poi_cats, categories) category_masks[group] = mask print(f" {group}: {mask.sum():,} POIs") @@ -71,7 +65,7 @@ def _count_pois_per_postcode( # Initialize result arrays result_counts = { - group: np.zeros(n_postcodes, dtype=np.int32) for group in POI_GROUPS + group: np.zeros(n_postcodes, dtype=np.int32) for group in groups } # Process in batches with progress @@ -128,7 +122,7 @@ def _count_pois_per_postcode( # Build result dataframe result_data = {"postcode": pc_codes} - for group in POI_GROUPS: + for group in groups: result_data[f"{group}_{int(radius_km)}km"] = result_counts[group] result = pl.DataFrame(result_data)