Fix data loading

This commit is contained in:
Andras Schmelczer 2026-02-01 13:09:58 +00:00
parent 7235df0a97
commit c84af213e2
2 changed files with 65 additions and 15 deletions

View file

@ -2,6 +2,9 @@ import argparse
import polars as pl
from pathlib import Path
# Lower bound on `latest_price`: rows are kept only when
# latest_price >= MIN_PRICE (inclusive) in the final filter of _build_wide.
MIN_PRICE = 10_000
# Lower bound on `total_floor_area`: rows are kept only when
# total_floor_area > MIN_FLOOR_AREA_M2 (exclusive) in the final filter of _build_wide.
# NOTE(review): unit is presumably square metres, per the name — confirm.
MIN_FLOOR_AREA_M2 = 10
def _build_wide(
epc_pp_path: Path,
@ -27,11 +30,17 @@ def _build_wide(
)
wide = wide.join(arcgis, on="postcode", how="inner")
journey_times = pl.scan_parquet(journey_times_path).select(
"postcode",
"public_transport_easy_minutes",
"public_transport_quick_minutes",
"cycling_minutes",
# Reduce the journey-times table to one row per postcode: order rows by
# public_transport_quick_minutes (nulls last) and take the first row of each
# postcode group.
# NOTE(review): presumably this prevents duplicate postcodes from fanning out
# rows in the left join that follows — confirm against the journey-times data.
# NOTE(review): this relies on group_by().first() honouring the preceding
# sort within each group — verify for the lazy engine / Polars version in use.
journey_times = (
pl.scan_parquet(journey_times_path)
.select(
"postcode",
"public_transport_easy_minutes",
"public_transport_quick_minutes",
"cycling_minutes",
)
.sort("public_transport_quick_minutes", nulls_last=True)
.group_by("postcode")
.first()
)
wide = wide.join(journey_times, on="postcode", how="left")
@ -49,15 +58,39 @@ def _build_wide(
crime = pl.scan_parquet(crime_path)
wide = wide.join(crime, left_on="lsoa21", right_on="LSOA code", how="left")
# Collapse the per-category crime columns into two row-wise totals:
# "serious_crime_avg_yr" and "minor_crime_avg_yr".
# NOTE(review): crime is left-joined on LSOA just above, so rows without a
# match carry nulls in every category column. Check how sum_horizontal treats
# nulls in the Polars version in use (it may yield 0 rather than null, which
# would read as "no crime" instead of "no data") — confirm.
wide = wide.with_columns(
pl.sum_horizontal(
"Violence and sexual offences (avg/yr)",
"Robbery (avg/yr)",
"Burglary (avg/yr)",
"Possession of weapons (avg/yr)",
).alias("serious_crime_avg_yr"),
pl.sum_horizontal(
"Anti-social behaviour (avg/yr)",
"Criminal damage and arson (avg/yr)",
"Shoplifting (avg/yr)",
"Bicycle theft (avg/yr)",
"Theft from the person (avg/yr)",
"Other theft (avg/yr)",
"Vehicle crime (avg/yr)",
"Public order (avg/yr)",
"Drugs (avg/yr)",
"Other crime (avg/yr)",
).alias("minor_crime_avg_yr"),
)
poi_counts = pl.scan_parquet(poi_proximity_path)
wide = wide.join(poi_counts, on="postcode", how="left")
noise_cols = ["road_noise_lden_db", "rail_noise_lden_db", "airport_noise_lden_db"]
noise = (
pl.scan_parquet(noise_path)
.with_columns(
pl.max_horizontal(
"road_noise_lden_db", "rail_noise_lden_db", "airport_noise_lden_db"
).alias("noise_lden_db"),
# NaN → null so max_horizontal ignores missing instead of propagating NaN
*[pl.col(c).fill_nan(None) for c in noise_cols],
)
.with_columns(
pl.max_horizontal(*noise_cols).fill_null(0).alias("noise_lden_db"),
)
.select("postcode", "noise_lden_db")
)
@ -102,7 +135,18 @@ def _build_wide(
)
wide = (
wide.filter(pl.col("total_floor_area") > 0)
wide.filter(pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
.filter(pl.col("latest_price") >= MIN_PRICE)
.with_columns(
pl.when(pl.col("duration") == "U")
.then(None)
.otherwise(pl.col("duration"))
.alias("duration"),
pl.when(pl.col("current_energy_rating") == "INVALID!")
.then(None)
.otherwise(pl.col("current_energy_rating"))
.alias("current_energy_rating"),
)
.with_columns(
(pl.col("latest_price") / pl.col("total_floor_area"))
.round(0)
@ -146,11 +190,13 @@ def _build_wide(
"parks_2km": "Parks within 2km",
"public_transport_2km": "Public transport within 2km",
"latest_price": "Last known price",
"number_habitable_rooms": "Rooms (including bedrooms & bathrooms)",
"number_habitable_rooms": "Number of bedrooms & living rooms",
"noise_lden_db": "Noise (dB)",
"good_primary_5km": "Good+ primary schools within 5km",
"good_secondary_5km": "Good+ secondary schools within 5km",
"max_download_speed": "Max available download speed (Mbps)",
"serious_crime_avg_yr": "Serious crime (avg/yr)",
"minor_crime_avg_yr": "Minor crime (avg/yr)",
}
)
)