Fix data loading
This commit is contained in:
parent
7235df0a97
commit
c84af213e2
2 changed files with 65 additions and 15 deletions
|
|
@ -190,7 +190,8 @@ def sample_noise_at_postcodes(
|
||||||
easting: np.ndarray,
|
easting: np.ndarray,
|
||||||
northing: np.ndarray,
|
northing: np.ndarray,
|
||||||
label: str,
|
label: str,
|
||||||
) -> np.ndarray:
|
col_name: str,
|
||||||
|
) -> pl.Series:
|
||||||
"""Sample noise values from merged tiles at given BNG coordinates."""
|
"""Sample noise values from merged tiles at given BNG coordinates."""
|
||||||
print(f"[{label}] Merging {len(tile_paths)} tiles...")
|
print(f"[{label}] Merging {len(tile_paths)} tiles...")
|
||||||
datasets = [rasterio.open(p) for p in tile_paths]
|
datasets = [rasterio.open(p) for p in tile_paths]
|
||||||
|
|
@ -223,7 +224,10 @@ def sample_noise_at_postcodes(
|
||||||
|
|
||||||
valid_count = int(np.sum(~np.isnan(noise_db)))
|
valid_count = int(np.sum(~np.isnan(noise_db)))
|
||||||
print(f"[{label}] Sampled {valid_count:,} / {len(easting):,} postcodes with noise data")
|
print(f"[{label}] Sampled {valid_count:,} / {len(easting):,} postcodes with noise data")
|
||||||
return noise_db
|
|
||||||
|
# Return as masked Series: use null (not NaN) so that Polars max_horizontal
|
||||||
|
# correctly ignores missing values instead of propagating NaN.
|
||||||
|
return pl.Series(col_name, noise_db).fill_nan(None)
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
|
|
@ -264,11 +268,11 @@ def main() -> None:
|
||||||
|
|
||||||
if not tile_paths:
|
if not tile_paths:
|
||||||
print(f"[{label}] WARNING: No tiles downloaded — column will be all null")
|
print(f"[{label}] WARNING: No tiles downloaded — column will be all null")
|
||||||
noise_db = np.full(len(lat), np.nan, dtype=np.float32)
|
series = pl.Series(col_name, [None] * len(lat), dtype=pl.Float32)
|
||||||
else:
|
else:
|
||||||
noise_db = sample_noise_at_postcodes(tile_paths, easting, northing, label)
|
series = sample_noise_at_postcodes(tile_paths, easting, northing, label, col_name)
|
||||||
|
|
||||||
result = result.with_columns(pl.Series(col_name, noise_db))
|
result = result.with_columns(series)
|
||||||
|
|
||||||
result.write_parquet(args.output, compression="zstd")
|
result.write_parquet(args.output, compression="zstd")
|
||||||
size_mb = args.output.stat().st_size / (1024 * 1024)
|
size_mb = args.output.stat().st_size / (1024 * 1024)
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,9 @@ import argparse
|
||||||
import polars as pl
|
import polars as pl
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
MIN_PRICE = 10_000
|
||||||
|
MIN_FLOOR_AREA_M2 = 10
|
||||||
|
|
||||||
|
|
||||||
def _build_wide(
|
def _build_wide(
|
||||||
epc_pp_path: Path,
|
epc_pp_path: Path,
|
||||||
|
|
@ -27,11 +30,17 @@ def _build_wide(
|
||||||
)
|
)
|
||||||
wide = wide.join(arcgis, on="postcode", how="inner")
|
wide = wide.join(arcgis, on="postcode", how="inner")
|
||||||
|
|
||||||
journey_times = pl.scan_parquet(journey_times_path).select(
|
journey_times = (
|
||||||
"postcode",
|
pl.scan_parquet(journey_times_path)
|
||||||
"public_transport_easy_minutes",
|
.select(
|
||||||
"public_transport_quick_minutes",
|
"postcode",
|
||||||
"cycling_minutes",
|
"public_transport_easy_minutes",
|
||||||
|
"public_transport_quick_minutes",
|
||||||
|
"cycling_minutes",
|
||||||
|
)
|
||||||
|
.sort("public_transport_quick_minutes", nulls_last=True)
|
||||||
|
.group_by("postcode")
|
||||||
|
.first()
|
||||||
)
|
)
|
||||||
wide = wide.join(journey_times, on="postcode", how="left")
|
wide = wide.join(journey_times, on="postcode", how="left")
|
||||||
|
|
||||||
|
|
@ -49,15 +58,39 @@ def _build_wide(
|
||||||
crime = pl.scan_parquet(crime_path)
|
crime = pl.scan_parquet(crime_path)
|
||||||
wide = wide.join(crime, left_on="lsoa21", right_on="LSOA code", how="left")
|
wide = wide.join(crime, left_on="lsoa21", right_on="LSOA code", how="left")
|
||||||
|
|
||||||
|
wide = wide.with_columns(
|
||||||
|
pl.sum_horizontal(
|
||||||
|
"Violence and sexual offences (avg/yr)",
|
||||||
|
"Robbery (avg/yr)",
|
||||||
|
"Burglary (avg/yr)",
|
||||||
|
"Possession of weapons (avg/yr)",
|
||||||
|
).alias("serious_crime_avg_yr"),
|
||||||
|
pl.sum_horizontal(
|
||||||
|
"Anti-social behaviour (avg/yr)",
|
||||||
|
"Criminal damage and arson (avg/yr)",
|
||||||
|
"Shoplifting (avg/yr)",
|
||||||
|
"Bicycle theft (avg/yr)",
|
||||||
|
"Theft from the person (avg/yr)",
|
||||||
|
"Other theft (avg/yr)",
|
||||||
|
"Vehicle crime (avg/yr)",
|
||||||
|
"Public order (avg/yr)",
|
||||||
|
"Drugs (avg/yr)",
|
||||||
|
"Other crime (avg/yr)",
|
||||||
|
).alias("minor_crime_avg_yr"),
|
||||||
|
)
|
||||||
|
|
||||||
poi_counts = pl.scan_parquet(poi_proximity_path)
|
poi_counts = pl.scan_parquet(poi_proximity_path)
|
||||||
wide = wide.join(poi_counts, on="postcode", how="left")
|
wide = wide.join(poi_counts, on="postcode", how="left")
|
||||||
|
|
||||||
|
noise_cols = ["road_noise_lden_db", "rail_noise_lden_db", "airport_noise_lden_db"]
|
||||||
noise = (
|
noise = (
|
||||||
pl.scan_parquet(noise_path)
|
pl.scan_parquet(noise_path)
|
||||||
.with_columns(
|
.with_columns(
|
||||||
pl.max_horizontal(
|
# NaN → null so max_horizontal ignores missing instead of propagating NaN
|
||||||
"road_noise_lden_db", "rail_noise_lden_db", "airport_noise_lden_db"
|
*[pl.col(c).fill_nan(None) for c in noise_cols],
|
||||||
).alias("noise_lden_db"),
|
)
|
||||||
|
.with_columns(
|
||||||
|
pl.max_horizontal(*noise_cols).fill_null(0).alias("noise_lden_db"),
|
||||||
)
|
)
|
||||||
.select("postcode", "noise_lden_db")
|
.select("postcode", "noise_lden_db")
|
||||||
)
|
)
|
||||||
|
|
@ -102,7 +135,18 @@ def _build_wide(
|
||||||
)
|
)
|
||||||
|
|
||||||
wide = (
|
wide = (
|
||||||
wide.filter(pl.col("total_floor_area") > 0)
|
wide.filter(pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
|
||||||
|
.filter(pl.col("latest_price") >= MIN_PRICE)
|
||||||
|
.with_columns(
|
||||||
|
pl.when(pl.col("duration") == "U")
|
||||||
|
.then(None)
|
||||||
|
.otherwise(pl.col("duration"))
|
||||||
|
.alias("duration"),
|
||||||
|
pl.when(pl.col("current_energy_rating") == "INVALID!")
|
||||||
|
.then(None)
|
||||||
|
.otherwise(pl.col("current_energy_rating"))
|
||||||
|
.alias("current_energy_rating"),
|
||||||
|
)
|
||||||
.with_columns(
|
.with_columns(
|
||||||
(pl.col("latest_price") / pl.col("total_floor_area"))
|
(pl.col("latest_price") / pl.col("total_floor_area"))
|
||||||
.round(0)
|
.round(0)
|
||||||
|
|
@ -146,11 +190,13 @@ def _build_wide(
|
||||||
"parks_2km": "Parks within 2km",
|
"parks_2km": "Parks within 2km",
|
||||||
"public_transport_2km": "Public transport within 2km",
|
"public_transport_2km": "Public transport within 2km",
|
||||||
"latest_price": "Last known price",
|
"latest_price": "Last known price",
|
||||||
"number_habitable_rooms": "Rooms (including bedrooms & bathrooms)",
|
"number_habitable_rooms": "Number of bedrooms & living rooms",
|
||||||
"noise_lden_db": "Noise (dB)",
|
"noise_lden_db": "Noise (dB)",
|
||||||
"good_primary_5km": "Good+ primary schools within 5km",
|
"good_primary_5km": "Good+ primary schools within 5km",
|
||||||
"good_secondary_5km": "Good+ secondary schools within 5km",
|
"good_secondary_5km": "Good+ secondary schools within 5km",
|
||||||
"max_download_speed": "Max available download speed (Mbps)",
|
"max_download_speed": "Max available download speed (Mbps)",
|
||||||
|
"serious_crime_avg_yr": "Serious crime (avg/yr)",
|
||||||
|
"minor_crime_avg_yr": "Minor crime (avg/yr)",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue