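Move POI_GROUPS dict out of pipeline.utils.poi_counts into the calling script, pass it to _count_pois_per_postcode via a new `groups` parameter, and update the public_transport category labels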

This commit is contained in:
Andras Schmelczer 2026-02-01 08:48:07 +00:00
parent a3c1b6090e
commit ac45af8514
3 changed files with 19 additions and 16 deletions

View file

@ -8,6 +8,16 @@ import polars as pl
from pipeline.utils.poi_counts import _count_pois_per_postcode from pipeline.utils.poi_counts import _count_pois_per_postcode
# POI category groups used for proximity counting.
# Each key is a group name; its list holds the POI category labels that are
# counted together under that group. Key order is kept as declared.
POI_GROUPS = {
    "restaurants": [
        "Restaurant",
        "Fast Food",
    ],
    "groceries": [
        "Greengrocer",
        "Grocery Shop",
        "Supermarket",
        "Convenience Store",
    ],
    "parks": [
        "Park",
        "Garden",
        "Nature Reserve",
    ],
    # The public-transport labels below come from naptan.py.
    "public_transport": [
        "Metro or Tram stop",
        "Rail station",
        "Bus stop",
        "Bus station",
    ],
}
def main(): def main():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Count POIs within radius per postcode" description="Count POIs within radius per postcode"
@ -31,7 +41,7 @@ def main():
pois = pl.read_parquet(args.pois) pois = pl.read_parquet(args.pois)
result = _count_pois_per_postcode(postcodes, pois, radius_km=2) result = _count_pois_per_postcode(postcodes, pois, groups=POI_GROUPS, radius_km=2)
result.write_parquet(args.output) result.write_parquet(args.output)
size_mb = args.output.stat().st_size / (1024 * 1024) size_mb = args.output.stat().st_size / (1024 * 1024)

View file

@ -1,7 +1,7 @@
from .download import download, extract_zip from .download import download, extract_zip
from .fuzzy_join import fuzzy_join_on_postcode from .fuzzy_join import fuzzy_join_on_postcode
from .haversine import haversine_km, haversine_km_expr from .haversine import haversine_km, haversine_km_expr
from .poi_counts import POI_GROUPS, count_pois_within_radius from .poi_counts import count_pois_within_radius
__all__ = [ __all__ = [
"download", "download",
@ -9,6 +9,5 @@ __all__ = [
"fuzzy_join_on_postcode", "fuzzy_join_on_postcode",
"haversine_km", "haversine_km",
"haversine_km_expr", "haversine_km_expr",
"POI_GROUPS",
"count_pois_within_radius", "count_pois_within_radius",
] ]

View file

@ -1,6 +1,5 @@
"""Count POIs within a radius of properties, optimized via postcode deduplication.""" """Count POIs within a radius of properties, optimized via postcode deduplication."""
import os
import tempfile import tempfile
import numpy as np import numpy as np
@ -8,17 +7,12 @@ import polars as pl
from .haversine import haversine_km from .haversine import haversine_km
# POI category groups for proximity counting
POI_GROUPS = {
"restaurants": ["Restaurant", "Fast Food"],
"groceries": ["Greengrocer", "Grocery Shop", "Supermarket", "Convenience Store"],
"parks": ["Park", "Garden", "Nature Reserve"],
"public_transport": ["Station", "Stop", "Bus Station"],
}
def _count_pois_per_postcode( def _count_pois_per_postcode(
postcodes_df: pl.DataFrame, pois: pl.DataFrame, radius_km: float = 2.0 postcodes_df: pl.DataFrame,
pois: pl.DataFrame,
groups: dict[str, list[str]],
radius_km: float = 2.0,
) -> pl.DataFrame: ) -> pl.DataFrame:
""" """
For each unique postcode, count POIs within radius_km by category group. For each unique postcode, count POIs within radius_km by category group.
@ -59,7 +53,7 @@ def _count_pois_per_postcode(
# Pre-compute category masks # Pre-compute category masks
category_masks = {} category_masks = {}
for group, categories in POI_GROUPS.items(): for group, categories in groups.items():
mask = np.isin(poi_cats, categories) mask = np.isin(poi_cats, categories)
category_masks[group] = mask category_masks[group] = mask
print(f" {group}: {mask.sum():,} POIs") print(f" {group}: {mask.sum():,} POIs")
@ -71,7 +65,7 @@ def _count_pois_per_postcode(
# Initialize result arrays # Initialize result arrays
result_counts = { result_counts = {
group: np.zeros(n_postcodes, dtype=np.int32) for group in POI_GROUPS group: np.zeros(n_postcodes, dtype=np.int32) for group in groups
} }
# Process in batches with progress # Process in batches with progress
@ -128,7 +122,7 @@ def _count_pois_per_postcode(
# Build result dataframe # Build result dataframe
result_data = {"postcode": pc_codes} result_data = {"postcode": pc_codes}
for group in POI_GROUPS: for group in groups:
result_data[f"{group}_{int(radius_km)}km"] = result_counts[group] result_data[f"{group}_{int(radius_km)}km"] = result_counts[group]
result = pl.DataFrame(result_data) result = pl.DataFrame(result_data)