This commit is contained in:
Andras Schmelczer 2026-02-18 21:22:15 +00:00
parent 524580eb25
commit ffe080adef
82 changed files with 2652 additions and 2956 deletions

View file

@ -328,14 +328,19 @@ def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
return filled
def build_index(input_path: Path, max_pair_year: int | None = None) -> pl.DataFrame:
def build_index(
input_path: Path,
max_pair_year: int | None = None,
postcodes_path: Path | None = None,
) -> pl.DataFrame:
"""Build the full price index from raw data.
If max_pair_year is set, only pairs before that year are used (backtesting holdout).
The index is still forward-filled to CURRENT_YEAR.
postcodes_path: if provided, lat/lon are read from this file instead of input_path.
"""
pairs = extract_pairs(input_path, max_year2=max_pair_year)
centroids = extract_centroids(input_path)
centroids = extract_centroids(postcodes_path or input_path)
min_year = int(pairs["year1"].min())
max_year = CURRENT_YEAR
@ -448,10 +453,12 @@ def main():
description="Build improved repeat-sales price index"
)
parser.add_argument("--input", type=Path, required=True)
parser.add_argument("--postcodes", type=Path, required=True,
help="Path to postcode.parquet (for lat/lon centroids)")
parser.add_argument("--output", type=Path, required=True)
args = parser.parse_args()
result = build_index(args.input)
result = build_index(args.input, postcodes_path=args.postcodes)
result.write_parquet(args.output)
size_mb = args.output.stat().st_size / (1024 * 1024)