Last night
This commit is contained in:
parent
2906b01734
commit
42ee2d4c51
47 changed files with 848 additions and 478 deletions
|
|
@ -98,10 +98,9 @@ def main():
|
|||
if checkpoint_path.exists():
|
||||
checkpoint_df = pl.read_parquet(checkpoint_path)
|
||||
# Deduplicate checkpoint rows per postcode, preferring rows with data
|
||||
checkpoint_df = (
|
||||
checkpoint_df.sort("public_transport_quick_minutes", nulls_last=True)
|
||||
.unique(subset=["postcode"], keep="first")
|
||||
)
|
||||
checkpoint_df = checkpoint_df.sort(
|
||||
"public_transport_quick_minutes", nulls_last=True
|
||||
).unique(subset=["postcode"], keep="first")
|
||||
completed_postcodes = set(checkpoint_df["postcode"].to_list())
|
||||
prior_results = [
|
||||
JourneyResult(
|
||||
|
|
@ -145,9 +144,9 @@ def main():
|
|||
results_df = results_to_dataframe(all_results)
|
||||
|
||||
all_postcodes = {r.postcode for r in all_results}
|
||||
coords_df = postcodes_df.filter(
|
||||
pl.col("postcode").is_in(all_postcodes)
|
||||
).select(["postcode", "lat", "long"])
|
||||
coords_df = postcodes_df.filter(pl.col("postcode").is_in(all_postcodes)).select(
|
||||
["postcode", "lat", "long"]
|
||||
)
|
||||
results_df = coords_df.join(results_df, on="postcode", how="left")
|
||||
|
||||
results_df = results_df.with_columns(
|
||||
|
|
|
|||
|
|
@ -113,7 +113,12 @@ def _build_wide(
|
|||
*[pl.col(c).fill_nan(None) for c in noise_cols],
|
||||
)
|
||||
.with_columns(
|
||||
pl.max_horizontal(*noise_cols).fill_null(0).alias("noise_lden_db"),
|
||||
pl.max_horizontal(*noise_cols).alias("noise_lden_db"),
|
||||
)
|
||||
.with_columns(
|
||||
pl.col("noise_lden_db")
|
||||
.fill_null(pl.col("noise_lden_db").min())
|
||||
.alias("noise_lden_db"),
|
||||
)
|
||||
.select("postcode", "noise_lden_db")
|
||||
)
|
||||
|
|
@ -153,8 +158,8 @@ def _build_wide(
|
|||
wide = wide.with_columns(
|
||||
pl.when(pl.col("pp_property_type").is_in(["Terraced", "Semi-Detached"]))
|
||||
.then(pl.col("built_form"))
|
||||
.otherwise(pl.col("epc_property_type"))
|
||||
.alias("epc_property_type")
|
||||
.otherwise(pl.col("pp_property_type"))
|
||||
.alias("property_type")
|
||||
)
|
||||
|
||||
wide = (
|
||||
|
|
@ -191,12 +196,13 @@ def _build_wide(
|
|||
"Barriers to Housing and Services Score",
|
||||
"lsoa21",
|
||||
"oa21",
|
||||
"epc_property_type",
|
||||
"pp_property_type",
|
||||
"built_form",
|
||||
)
|
||||
.rename(
|
||||
{
|
||||
"date_of_transfer": "Previous transaction date",
|
||||
"date_of_transfer": "Date of last transaction",
|
||||
"construction_age_band": "Construction age",
|
||||
"is_construction_date_approximate": "Is construction date approximate",
|
||||
"pp_address": "Address per Property Register",
|
||||
|
|
@ -206,11 +212,11 @@ def _build_wide(
|
|||
"current_energy_rating": "Current energy rating",
|
||||
"potential_energy_rating": "Potential energy rating",
|
||||
"total_floor_area": "Total floor area (sqm)",
|
||||
"epc_property_type": "Property type",
|
||||
"restaurants_2km": "Restaurants within 2km",
|
||||
"groceries_2km": "Groceries within 2km",
|
||||
"parks_2km": "Parks within 2km",
|
||||
"public_transport_2km": "Public transport within 2km",
|
||||
"property_type": "Property type",
|
||||
"restaurants_2km": "Number of restaurants within 2km",
|
||||
"groceries_2km": "Number of grocery shops and supermarkets within 2km",
|
||||
"parks_2km": "Number of parks within 2km",
|
||||
"public_transport_2km": "Number of public transport stations within 2km",
|
||||
"latest_price": "Last known price",
|
||||
"number_habitable_rooms": "Number of bedrooms & living rooms",
|
||||
"noise_lden_db": "Noise (dB)",
|
||||
|
|
@ -219,7 +225,6 @@ def _build_wide(
|
|||
"max_download_speed": "Max available download speed (Mbps)",
|
||||
"serious_crime_avg_yr": "Serious crime (avg/yr)",
|
||||
"minor_crime_avg_yr": "Minor crime (avg/yr)",
|
||||
"transaction_year": "Transaction year",
|
||||
"environmental_risk": "Environmental risk",
|
||||
"collapsible_deposits_risk": "Collapsible deposits risk",
|
||||
"compressible_ground_risk": "Compressible ground risk",
|
||||
|
|
|
|||
|
|
@ -42,7 +42,10 @@ def process_oa(
|
|||
|
||||
for pc, polys in pc_inspire_polys.items():
|
||||
merged = unary_union(polys)
|
||||
clipped = merged.intersection(oa_geom)
|
||||
if not merged.is_valid:
|
||||
merged = make_valid(merged)
|
||||
valid_oa = oa_geom if oa_geom.is_valid else make_valid(oa_geom)
|
||||
clipped = merged.intersection(valid_oa)
|
||||
if not clipped.is_empty:
|
||||
if not clipped.is_valid:
|
||||
clipped = make_valid(clipped)
|
||||
|
|
@ -58,11 +61,13 @@ def process_oa(
|
|||
used = None
|
||||
for pc, geom in claimed.items():
|
||||
if used is not None:
|
||||
if not geom.is_valid:
|
||||
geom = make_valid(geom)
|
||||
if not used.is_valid:
|
||||
used = make_valid(used)
|
||||
geom = geom.difference(used)
|
||||
if geom.is_empty:
|
||||
continue
|
||||
if not geom.is_valid:
|
||||
geom = make_valid(geom)
|
||||
geom = _extract_polygonal(geom)
|
||||
if geom is None:
|
||||
continue
|
||||
|
|
@ -75,11 +80,12 @@ def process_oa(
|
|||
all_claimed = unary_union(list(claimed.values()))
|
||||
if not all_claimed.is_valid:
|
||||
all_claimed = make_valid(all_claimed)
|
||||
remaining = oa_geom.difference(all_claimed)
|
||||
valid_oa = oa_geom if oa_geom.is_valid else make_valid(oa_geom)
|
||||
remaining = valid_oa.difference(all_claimed)
|
||||
if not remaining.is_valid:
|
||||
remaining = make_valid(remaining)
|
||||
else:
|
||||
remaining = oa_geom
|
||||
remaining = oa_geom if oa_geom.is_valid else make_valid(oa_geom)
|
||||
|
||||
# Distribute remaining area via Voronoi
|
||||
if not remaining.is_empty and remaining.area > 0.01:
|
||||
|
|
|
|||
|
|
@ -75,6 +75,9 @@ def compute_voronoi_regions(
|
|||
n_real = len(pts)
|
||||
pc_polys: dict[str, list[Polygon]] = defaultdict(list)
|
||||
|
||||
if not boundary.is_valid:
|
||||
boundary = make_valid(boundary)
|
||||
|
||||
for i in range(n_real):
|
||||
region_idx = vor.point_region[i]
|
||||
region = vor.regions[region_idx]
|
||||
|
|
@ -100,6 +103,8 @@ def _equal_split_fallback(
|
|||
postcodes: list[str], boundary: Polygon | MultiPolygon
|
||||
) -> dict[str, Polygon | MultiPolygon]:
|
||||
"""Split boundary into roughly equal horizontal strips, one per postcode."""
|
||||
if not boundary.is_valid:
|
||||
boundary = make_valid(boundary)
|
||||
min_x, min_y, max_x, max_y = boundary.bounds
|
||||
n = len(postcodes)
|
||||
result = {}
|
||||
|
|
|
|||
|
|
@ -1,14 +1,12 @@
|
|||
"""Count POIs within a radius of properties, optimized via postcode deduplication."""
|
||||
|
||||
import tempfile
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
|
||||
from .haversine import haversine_km
|
||||
|
||||
|
||||
def _count_pois_per_postcode(
|
||||
def count_pois_per_postcode(
|
||||
postcodes_df: pl.DataFrame,
|
||||
pois: pl.DataFrame,
|
||||
groups: dict[str, list[str]],
|
||||
|
|
@ -64,9 +62,7 @@ def _count_pois_per_postcode(
|
|||
pc_codes = postcodes_df["postcode"].to_list()
|
||||
|
||||
# Initialize result arrays
|
||||
result_counts = {
|
||||
group: np.zeros(n_postcodes, dtype=np.int32) for group in groups
|
||||
}
|
||||
result_counts = {group: np.zeros(n_postcodes, dtype=np.int32) for group in groups}
|
||||
|
||||
# Process in batches with progress
|
||||
batch_size = 50000
|
||||
|
|
@ -128,47 +124,3 @@ def _count_pois_per_postcode(
|
|||
result = pl.DataFrame(result_data)
|
||||
print(" Completed POI counting")
|
||||
return result
|
||||
|
||||
|
||||
def count_pois_within_radius(
|
||||
properties: pl.DataFrame, pois: pl.DataFrame, radius_km: float = 2.0
|
||||
) -> dict[str, pl.Series]:
|
||||
"""
|
||||
Count POIs within radius for properties, optimized by deduplicating postcodes.
|
||||
|
||||
Returns dict of {column_name: count_series} aligned to properties dataframe.
|
||||
"""
|
||||
# Get unique postcodes with coordinates
|
||||
print("Deduplicating postcodes...")
|
||||
unique_postcodes = properties.select(["postcode", "lat", "lon"]).unique(
|
||||
subset=["postcode"]
|
||||
)
|
||||
|
||||
print(
|
||||
f" {len(properties):,} properties → {len(unique_postcodes):,} unique postcodes"
|
||||
)
|
||||
|
||||
# Count POIs per postcode
|
||||
postcode_counts = _count_pois_per_postcode(unique_postcodes, pois, radius_km)
|
||||
|
||||
print(" Writing postcode counts to temp file...")
|
||||
with tempfile.NamedTemporaryFile(suffix=".parquet") as tmp:
|
||||
tmp_path = tmp.name
|
||||
postcode_counts.write_parquet(tmp_path)
|
||||
|
||||
# Join using lazy evaluation
|
||||
print(" Joining counts back to properties (lazy)...")
|
||||
count_cols = [f"{group}_{int(radius_km)}km" for group in POI_GROUPS]
|
||||
|
||||
# Convert properties to lazy frame, join, then collect
|
||||
result_lazy = (
|
||||
properties.lazy()
|
||||
.select("postcode")
|
||||
.join(pl.scan_parquet(tmp_path), on="postcode", how="left")
|
||||
.select(count_cols)
|
||||
.fill_null(0)
|
||||
)
|
||||
|
||||
result_df = result_lazy.collect(engine="streaming")
|
||||
|
||||
return {col: result_df[col] for col in count_cols}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue