Improve data pipeline

This commit is contained in:
Andras Schmelczer 2026-06-01 20:10:03 +01:00
parent e8345cbdc1
commit f99bd4e5c9
36 changed files with 966 additions and 129 deletions

View file

@@ -67,6 +67,16 @@ def extract_test_set(input_path: Path) -> pl.DataFrame:
.struct.field("price")
.alias("input_price"),
)
.with_columns(
# Date of the input (second-to-last) sale, used by the kNN leakage
# filter to exclude the target property's own prior sale from its
# comparables. Built from year+month (day defaults to the 1st).
pl.date(
pl.col("input_year").cast(pl.Int32),
pl.col("input_month").cast(pl.Int32),
1,
).alias("input_date"),
)
.with_columns(
(
pl.col("actual_year").cast(pl.Float64)