This commit is contained in:
Andras Schmelczer 2026-03-15 21:22:28 +00:00
parent 479ef92236
commit c38d654ac7
44 changed files with 2526 additions and 701 deletions

View file

@ -43,48 +43,39 @@ def build_knn_pool(
"""
print("Building kNN pool...")
lf = pl.scan_parquet(source) if isinstance(source, Path) else source
query = (
lf
.select(
"Postcode",
"Property type",
"lat",
"lon",
"Total floor area (sqm)",
"Last known price",
"Date of last transaction",
)
.filter(
pl.col("lat").is_not_null(),
pl.col("lon").is_not_null(),
pl.col("Total floor area (sqm)").is_not_null(),
pl.col("Total floor area (sqm)") > 0,
pl.col("Last known price").is_not_null(),
pl.col("Last known price") > 0,
pl.col("Postcode").is_not_null(),
pl.col("Date of last transaction").is_not_null(),
)
query = lf.select(
"Postcode",
"Property type",
"lat",
"lon",
"Total floor area (sqm)",
"Last known price",
"Date of last transaction",
).filter(
pl.col("lat").is_not_null(),
pl.col("lon").is_not_null(),
pl.col("Total floor area (sqm)").is_not_null(),
pl.col("Total floor area (sqm)") > 0,
pl.col("Last known price").is_not_null(),
pl.col("Last known price") > 0,
pl.col("Postcode").is_not_null(),
pl.col("Date of last transaction").is_not_null(),
)
if max_sale_year is not None:
query = query.filter(
pl.col("Date of last transaction").dt.year() < max_sale_year
)
pool = (
query.with_columns(
sector_expr(),
type_group_expr(),
(
pl.col("Date of last transaction").dt.year().cast(pl.Float64)
+ (
pl.col("Date of last transaction").dt.month().cast(pl.Float64)
- 1.0
)
/ 12.0
).alias("_sale_fy"),
pl.lit(ref_frac_year).alias("_ref_fy"),
).collect()
)
pool = query.with_columns(
sector_expr(),
type_group_expr(),
(
pl.col("Date of last transaction").dt.year().cast(pl.Float64)
+ (pl.col("Date of last transaction").dt.month().cast(pl.Float64) - 1.0)
/ 12.0
).alias("_sale_fy"),
pl.lit(ref_frac_year).alias("_ref_fy"),
).collect()
pool = pool.filter(pl.col("type_group").is_not_null())
print(f" {len(pool):,} pool properties with lat/lon, floor area, price")