Last night
This commit is contained in:
parent
2906b01734
commit
42ee2d4c51
47 changed files with 848 additions and 478 deletions
|
|
@ -1,14 +1,12 @@
|
|||
"""Count POIs within a radius of properties, optimized via postcode deduplication."""
|
||||
|
||||
import tempfile
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
|
||||
from .haversine import haversine_km
|
||||
|
||||
|
||||
def _count_pois_per_postcode(
|
||||
def count_pois_per_postcode(
|
||||
postcodes_df: pl.DataFrame,
|
||||
pois: pl.DataFrame,
|
||||
groups: dict[str, list[str]],
|
||||
|
|
@ -64,9 +62,7 @@ def _count_pois_per_postcode(
|
|||
pc_codes = postcodes_df["postcode"].to_list()
|
||||
|
||||
# Initialize result arrays
|
||||
result_counts = {
|
||||
group: np.zeros(n_postcodes, dtype=np.int32) for group in groups
|
||||
}
|
||||
result_counts = {group: np.zeros(n_postcodes, dtype=np.int32) for group in groups}
|
||||
|
||||
# Process in batches with progress
|
||||
batch_size = 50000
|
||||
|
|
@ -128,47 +124,3 @@ def _count_pois_per_postcode(
|
|||
result = pl.DataFrame(result_data)
|
||||
print(" Completed POI counting")
|
||||
return result
|
||||
|
||||
|
||||
def count_pois_within_radius(
    properties: pl.DataFrame, pois: pl.DataFrame, radius_km: float = 2.0
) -> dict[str, pl.Series]:
    """Count POIs within a radius of each property, computed once per postcode.

    The properties are reduced to one row per postcode, the per-postcode
    counts are computed by ``_count_pois_per_postcode``, and the counts are
    then left-joined back onto the ``postcode`` column of ``properties``.

    Args:
        properties: Frame with at least ``postcode``, ``lat`` and ``lon``
            columns.
        pois: POI frame, passed through unchanged to
            ``_count_pois_per_postcode``.
        radius_km: Search radius; also used (truncated to an int) to build
            the expected count column names ``"<group>_<radius>km"``.

    Returns:
        ``{column_name: count_series}`` with one entry per POI group. Each
        series has one value per row of ``properties``, in the same row
        order; postcodes with no match in the counts get 0.
    """
    import os

    # One row per postcode so the distance work is not repeated for every
    # property that shares a postcode.
    print("Deduplicating postcodes...")
    unique_postcodes = properties.select(["postcode", "lat", "lon"]).unique(
        subset=["postcode"]
    )
    print(
        f" {len(properties):,} properties → {len(unique_postcodes):,} unique postcodes"
    )

    # NOTE(review): the visible part of the helper's signature takes `groups`
    # as its third positional parameter; confirm that passing `radius_km`
    # here is what the helper expects.
    postcode_counts = _count_pois_per_postcode(unique_postcodes, pois, radius_km)

    print(" Writing postcode counts to temp file...")
    # A temporary directory (rather than an open NamedTemporaryFile) lets
    # polars create, write and re-open the parquet file by path on every
    # platform, and keeps the file alive until the join has been collected.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = os.path.join(tmp_dir, "postcode_counts.parquet")
        postcode_counts.write_parquet(tmp_path)

        print(" Joining counts back to properties (lazy)...")
        # NOTE(review): int() truncates, so e.g. 0.5 km yields "..._0km";
        # confirm this matches the column names the helper produces.
        count_cols = [f"{group}_{int(radius_km)}km" for group in POI_GROUPS]

        result_df = (
            properties.lazy()
            .select("postcode")
            .join(
                pl.scan_parquet(tmp_path),
                on="postcode",
                how="left",
                # Row order must follow `properties` for the returned series
                # to line up with it; the join does not promise that unless
                # asked explicitly.
                maintain_order="left",
            )
            .select(count_cols)
            .fill_null(0)
            # Must run before the temp directory is removed: the scan is
            # lazy and only reads the parquet file at collect time.
            .collect(engine="streaming")
        )

    return {col: result_df[col] for col in count_cols}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue