Improve data pipeline
This commit is contained in:
parent
e8345cbdc1
commit
f99bd4e5c9
36 changed files with 966 additions and 129 deletions
|
|
@ -17,7 +17,11 @@ from shapely.strtree import STRtree
|
|||
from thefuzz import fuzz
|
||||
|
||||
from pipeline.local_temp import local_tmp_dir
|
||||
from pipeline.transform.join_epc_pp import _scan_epc_certificates
|
||||
from pipeline.transform.join_epc_pp import _scan_epc_certificates, epc_band_to_year
|
||||
from pipeline.transform.price_estimation.knn import (
|
||||
MAX_COMPARABLE_PSM,
|
||||
MIN_COMPARABLE_PSM,
|
||||
)
|
||||
from pipeline.utils.fuzzy_join import (
|
||||
normalize_address_key,
|
||||
normalize_postcode_key,
|
||||
|
|
@ -59,7 +63,7 @@ _AREA_COLUMNS = [
|
|||
"Air Quality and Road Safety Score",
|
||||
# Ethnicity
|
||||
"% South Asian",
|
||||
"% East Asian",
|
||||
"% East/SE Asian",
|
||||
"% Black",
|
||||
"% Mixed",
|
||||
"% White",
|
||||
|
|
@ -1060,14 +1064,10 @@ def _canonical_epc_property_type_expr() -> pl.Expr:
|
|||
|
||||
|
||||
def _construction_year_expr(column: str = "construction_age_band") -> pl.Expr:
|
||||
return (
|
||||
pl.col(column)
|
||||
.cast(pl.Utf8)
|
||||
.str.replace("England and Wales: ", "")
|
||||
.str.replace(" onwards", "")
|
||||
.str.extract(r"(\d{4})", 1)
|
||||
.cast(pl.UInt16, strict=False)
|
||||
)
|
||||
# Use the shared band->midpoint-year mapping so the direct-EPC / listings
|
||||
# path matches join_epc_pp (band midpoint, not lower bound; 'before 1900' and
|
||||
# implausible years -> null). Already-numeric inputs pass through unchanged.
|
||||
return epc_band_to_year(pl.col(column))
|
||||
|
||||
|
||||
def _address_score(query: str, candidate: str | None) -> int:
|
||||
|
|
@ -1956,7 +1956,9 @@ def _build(
|
|||
|
||||
# Broadband: derive max available download speed tier per postcode from
|
||||
# Ofcom availability percentages. Tiers: Gigabit ≥1000, UFBB ≥300,
|
||||
# UFBB(100) ≥100, SFBB ≥30 Mbps. Stored as string enum.
|
||||
# UFBB(100) ≥100, SFBB ≥30 Mbps. Stored as a numeric (UInt16) Mbps value so
|
||||
# it sorts/filters correctly; null (not a fabricated 10) when no availability
|
||||
# tier is present, so "no data" is distinguishable from a genuine 10 Mbps.
|
||||
broadband = (
|
||||
pl.scan_parquet(broadband_path)
|
||||
.select(
|
||||
|
|
@ -1969,13 +1971,12 @@ def _build(
|
|||
.then(100)
|
||||
.when(pl.col("SFBB availability (% premises)") > 0)
|
||||
.then(30)
|
||||
.otherwise(10)
|
||||
.otherwise(None)
|
||||
.cast(pl.UInt16)
|
||||
.alias("max_download_speed"),
|
||||
)
|
||||
.group_by("bb_postcode")
|
||||
.agg(pl.col("max_download_speed").max())
|
||||
.with_columns(pl.col("max_download_speed").cast(pl.Utf8))
|
||||
)
|
||||
area_side_tables = {
|
||||
"iod": iod,
|
||||
|
|
@ -2052,9 +2053,20 @@ def _build(
|
|||
.otherwise(pl.col("current_energy_rating"))
|
||||
.alias("current_energy_rating"),
|
||||
).with_columns(
|
||||
(pl.col("latest_price") / pl.col("total_floor_area"))
|
||||
.round(0)
|
||||
.cast(pl.Int32)
|
||||
# Null out implausible per-sqm values (floor area at or below MIN_FLOOR_AREA_M2, or price/sqm outside the kNN comparable band):
|
||||
# bulk/block transactions divided by a single unit's floor area otherwise
|
||||
# produce figures up to ~£1.5M/sqm.
|
||||
pl.when(
|
||||
(pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
|
||||
& (
|
||||
(pl.col("latest_price") / pl.col("total_floor_area"))
|
||||
.is_between(MIN_COMPARABLE_PSM, MAX_COMPARABLE_PSM)
|
||||
)
|
||||
)
|
||||
.then(
|
||||
(pl.col("latest_price") / pl.col("total_floor_area")).round(0).cast(pl.Int32)
|
||||
)
|
||||
.otherwise(None)
|
||||
.alias("Price per sqm"),
|
||||
)
|
||||
wide = _finalize_merged_columns(wide)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue