This commit is contained in:
Andras Schmelczer 2026-02-08 18:40:17 +00:00
parent 6c90cf3c0f
commit 5b68c8da04
14 changed files with 687 additions and 551 deletions

View file

@ -1,298 +0,0 @@
version: '3'
vars:
DATA_DIR: /bulk/property-data
TILES_OUTPUT: "{{.DATA_DIR}}/uk.pmtiles"
ARCGIS_OUTPUT: "{{.DATA_DIR}}/arcgis_data.parquet"
PRICE_PAID_OUTPUT: "{{.DATA_DIR}}/price-paid-complete.parquet"
IOD_OUTPUT: "{{.DATA_DIR}}/IoD2025_Scores.parquet"
POIS_RAW_OUTPUT: "{{.DATA_DIR}}/uk_pois.parquet"
POIS_FILTERED_OUTPUT: "{{.DATA_DIR}}/filtered_uk_pois.parquet"
POI_PROXIMITY_OUTPUT: "{{.DATA_DIR}}/poi_proximity.parquet"
EPC_PP_OUTPUT: "{{.DATA_DIR}}/epc_pp.parquet"
WIDE_OUTPUT: "{{.DATA_DIR}}/wide.parquet"
EPC: "{{.DATA_DIR}}/certificates.csv"
JOURNEY_TIMES_BANK: "{{.DATA_DIR}}/journey_times_bank.parquet"
JOURNEY_TIMES_FITZROVIA: "{{.DATA_DIR}}/journey_times_fitzrovia.parquet"
ETHNICITY_OUTPUT: "{{.DATA_DIR}}/ethnicity_by_la.parquet"
CRIME_DIR: "{{.DATA_DIR}}/crime"
CRIME_OUTPUT: "{{.DATA_DIR}}/crime_by_lsoa.parquet"
NOISE_OUTPUT: "{{.DATA_DIR}}/road_noise.parquet"
OFSTED_OUTPUT: "{{.DATA_DIR}}/ofsted.parquet"
NAPTAN_OUTPUT: "{{.DATA_DIR}}/naptan.parquet"
BROADBAND_OUTPUT: "{{.DATA_DIR}}/broadband.parquet"
SCHOOL_PROXIMITY_OUTPUT: "{{.DATA_DIR}}/school_proximity.parquet"
POSTCODES_OUTPUT: "{{.DATA_DIR}}/postcodes"
GEOSURE_OUTPUT: "{{.DATA_DIR}}/geosure"
GEOSURE_PARQUET: "{{.DATA_DIR}}/geosure.parquet"
INSPIRE_OUTPUT: "{{.DATA_DIR}}/inspire"
OA_BOUNDARIES_OUTPUT: "{{.DATA_DIR}}/oa_boundaries.gpkg"
UPRN_LOOKUP_OUTPUT: "{{.DATA_DIR}}/uprn_lookup.parquet"
POSTCODE_BOUNDARIES_OUTPUT: "{{.DATA_DIR}}/new_postcode_boundaries"
tasks:
download:tiles:
desc: Download UK map tiles (PMTiles format from Protomaps)
status:
- test -f {{.TILES_OUTPUT}}
vars:
PMTILES_VERSION: "1.22.3"
PMTILES_BIN: "{{.DATA_DIR}}/pmtiles"
cmds:
- |
echo "Downloading UK PMTiles (~1.5GB)..."
echo "This extracts UK tiles from the Protomaps planet file."
echo ""
# Download pmtiles CLI if not present
if [ ! -f "{{.PMTILES_BIN}}" ]; then
echo "Downloading pmtiles CLI v{{.PMTILES_VERSION}}..."
curl -sL "https://github.com/protomaps/go-pmtiles/releases/download/v{{.PMTILES_VERSION}}/go-pmtiles_{{.PMTILES_VERSION}}_Linux_x86_64.tar.gz" | tar -xz -C "{{.DATA_DIR}}" pmtiles
chmod +x "{{.PMTILES_BIN}}"
fi
# Extract UK region (bbox: -10.5,49.5,2.5,61)
# Using a recent daily build from Protomaps
"{{.PMTILES_BIN}}" extract https://build.protomaps.com/20260201.pmtiles {{.TILES_OUTPUT}} --bbox=-10.5,49.5,2.5,61
prompt:epc:
desc: Prompt user to download EPC dataset (requires registration)
status:
- test -f {{.EPC}}
cmds:
- |
echo ""
echo "=== EPC dataset not found ==="
echo "The EPC certificates file is required: {{.EPC}}"
echo ""
echo "To obtain it, register at https://epc.opendatacommunities.org/login"
echo ""
exit 1
prompt:journey-times:
desc: Download TFL journey times if missing (requires API key registration)
status:
- test -f {{.JOURNEY_TIMES_BANK}} || test -f {{.JOURNEY_TIMES_FITZROVIA}}
deps:
- download:arcgis
cmds:
- |
echo ""
echo "=== TFL journey times not found ==="
echo "Register for a TFL API key at https://api-portal.tfl.gov.uk/signin"
echo "Then set the TFL_API_KEY environment variable and run:"
echo " task download:journey-times -- bank"
echo " task download:journey-times -- fitzrovia"
echo ""
exit 1
download:arcgis:
desc: Download and convert ArcGIS postcode data
status:
- test -f {{.ARCGIS_OUTPUT}}
cmds:
- uv run python -m pipeline.download.arcgis --output {{.ARCGIS_OUTPUT}}
download:price-paid:
desc: Download and convert Land Registry price-paid data
status:
- test -f {{.PRICE_PAID_OUTPUT}}
cmds:
- uv run python -m pipeline.download.price_paid --output {{.PRICE_PAID_OUTPUT}}
download:deprivation:
desc: Download and convert Index of Deprivation data
status:
- test -f {{.IOD_OUTPUT}}
cmds:
- uv run python -m pipeline.download.deprivation_data --output {{.IOD_OUTPUT}}
download:ethnicity:
desc: Download ethnicity by local authority data
status:
- test -f {{.ETHNICITY_OUTPUT}}
cmds:
- uv run python -m pipeline.download.ethnicity --output {{.ETHNICITY_OUTPUT}}
download:naptan:
desc: Download NaPTAN station data
status:
- test -f {{.NAPTAN_OUTPUT}}
cmds:
- uv run python -m pipeline.download.naptan --output {{.NAPTAN_OUTPUT}}
download:pois:
desc: Download and extract POIs from OpenStreetMap
status:
- test -f {{.POIS_RAW_OUTPUT}}
cmds:
- uv run python -m pipeline.download.pois --output {{.POIS_RAW_OUTPUT}}
download:ofsted:
desc: Download Ofsted school inspection outcomes
status:
- test -f {{.OFSTED_OUTPUT}}
cmds:
- uv run python -m pipeline.download.ofsted --output {{.OFSTED_OUTPUT}}
download:broadband:
desc: Download Ofcom broadband performance data
status:
- test -f {{.BROADBAND_OUTPUT}}
cmds:
- uv run python -m pipeline.download.broadband --output {{.BROADBAND_OUTPUT}}
download:postcodes:
desc: Download GB postcodes data from MapIt
status:
- test -f {{.POSTCODES_OUTPUT}}
cmds:
- uv run python -m pipeline.download.postcodes --output {{.POSTCODES_OUTPUT}}
download:geosure:
desc: Download OS GeoSure ground stability data (5km hex grid)
status:
- test -d {{.GEOSURE_OUTPUT}}
cmds:
- uv run python -m pipeline.download.geosure --output {{.GEOSURE_OUTPUT}}
download:noise:
desc: Download Defra noise data (road, rail, airport) sampled at postcode centroids
deps:
- download:arcgis
status:
- test -f {{.NOISE_OUTPUT}}
cmds:
- uv run python -m pipeline.download.noise --arcgis {{.ARCGIS_OUTPUT}} --output {{.NOISE_OUTPUT}}
transform:pois:
desc: Transform raw POIs to filtered version with friendly names
deps:
- download:pois
- download:naptan
status:
- test -f {{.POIS_FILTERED_OUTPUT}}
cmds:
- uv run python -m pipeline.transform.transform_poi --input {{.POIS_RAW_OUTPUT}} --naptan {{.NAPTAN_OUTPUT}} --output {{.POIS_FILTERED_OUTPUT}}
transform:epc-pp:
desc: Fuzzy join EPC and Price Paid data
deps:
- download:price-paid
- prompt:epc
status:
- test -f {{.EPC_PP_OUTPUT}}
cmds:
- uv run python -m pipeline.transform.join_epc_pp --epc {{.EPC}} --price-paid {{.PRICE_PAID_OUTPUT}} --output {{.EPC_PP_OUTPUT}}
transform:crime:
desc: Transform crime CSVs into yearly averages by LSOA
status:
- test -f {{.CRIME_OUTPUT}}
cmds:
- uv run python -m pipeline.transform.crime --input {{.CRIME_DIR}} --output {{.CRIME_OUTPUT}}
transform:poi-proximity:
desc: Compute POI proximity counts per postcode
deps:
- download:arcgis
- transform:pois
status:
- test -f {{.POI_PROXIMITY_OUTPUT}}
cmds:
- uv run python -m pipeline.transform.poi_proximity --arcgis {{.ARCGIS_OUTPUT}} --pois {{.POIS_FILTERED_OUTPUT}} --output {{.POI_PROXIMITY_OUTPUT}}
transform:school-proximity:
desc: Compute good+ school proximity counts per postcode
deps:
- download:ofsted
- download:arcgis
status:
- test -f {{.SCHOOL_PROXIMITY_OUTPUT}}
cmds:
- uv run python -m pipeline.transform.school_proximity --ofsted {{.OFSTED_OUTPUT}} --arcgis {{.ARCGIS_OUTPUT}} --output {{.SCHOOL_PROXIMITY_OUTPUT}}
transform:geosure:
desc: Spatial-join GeoSure ground stability data to postcode centroids
deps:
- download:geosure
- download:arcgis
status:
- test -f {{.GEOSURE_PARQUET}}
cmds:
- uv run python -m pipeline.transform.transform_geosure --geosure {{.GEOSURE_OUTPUT}} --arcgis {{.ARCGIS_OUTPUT}} --output {{.GEOSURE_PARQUET}}
download:inspire:
desc: Download INSPIRE Index Polygon GML files from HM Land Registry
status:
- test -d {{.INSPIRE_OUTPUT}}
cmds:
- uv run python -m pipeline.download.inspire --output {{.INSPIRE_OUTPUT}}
download:oa-boundaries:
desc: Download Output Areas (2021) boundary polygons (England & Wales)
status:
- test -f {{.OA_BOUNDARIES_OUTPUT}}
cmds:
- uv run python -m pipeline.download.oa_boundaries --output {{.OA_BOUNDARIES_OUTPUT}}
download:uprn-lookup:
desc: Download National Statistics UPRN Lookup and convert to parquet
status:
- test -f {{.UPRN_LOOKUP_OUTPUT}}
cmds:
- uv run python -m pipeline.download.uprn_lookup --output {{.UPRN_LOOKUP_OUTPUT}}
transform:postcode-boundaries:
desc: Generate postcode boundary polygons from OA boundaries + INSPIRE + UPRNs
deps:
- download:oa-boundaries
- download:inspire
- download:uprn-lookup
cmds:
- >-
uv run python -m pipeline.transform.postcode_boundaries
--uprn {{.UPRN_LOOKUP_OUTPUT}}
--oa-boundaries {{.OA_BOUNDARIES_OUTPUT}}
--inspire {{.INSPIRE_OUTPUT}}
--output {{.POSTCODE_BOUNDARIES_OUTPUT}}
download:journey-times:
desc: "Fetch TfL journey times: task download:journey-times"
deps:
- download:arcgis
status:
- test -f {{.DATA_DIR}}/journey_times_*.parquet
cmds:
- uv run python -m pipeline.journey_times --destination {{.CLI_ARGS}} --output-dir {{.DATA_DIR}} --postcodes {{.ARCGIS_OUTPUT}}
prepare:
desc: Build wide property dataframe with all joins
deps:
# - transform:epc-pp
# - download:arcgis
# - download:deprivation
# - download:ethnicity
# - download:broadband
# - download:noise
# - transform:crime
# - transform:poi-proximity
# - transform:school-proximity
# - transform:geosure
# - prompt:journey-times
status:
- test -f {{.WIDE_OUTPUT}}
cmds:
- >-
uv run python -m pipeline.transform.merge
--epc-pp {{.EPC_PP_OUTPUT}}
--arcgis {{.ARCGIS_OUTPUT}}
--iod {{.IOD_OUTPUT}}
--poi-proximity {{.POI_PROXIMITY_OUTPUT}}
--journey-times-bank {{.JOURNEY_TIMES_BANK}}
--journey-times-fitzrovia {{.JOURNEY_TIMES_FITZROVIA}}
--ethnicity {{.ETHNICITY_OUTPUT}}
--crime {{.CRIME_OUTPUT}}
--noise {{.NOISE_OUTPUT}}
--school-proximity {{.SCHOOL_PROXIMITY_OUTPUT}}
--broadband {{.BROADBAND_OUTPUT}}
--geosure {{.GEOSURE_PARQUET}}
--output {{.WIDE_OUTPUT}}

View file

@ -1,10 +1,5 @@
version: '3'
includes:
data:
taskfile: ./Taskfile.data.yml
flatten: true
tasks:
install:

View file

@ -13,7 +13,8 @@ const DATA_SOURCES = [
id: 'epc',
name: 'Energy Performance Certificates (EPC)',
origin: 'Ministry of Housing, Communities & Local Government',
use: 'Domestic Energy Performance Certificates providing floor area, number of rooms, construction age, energy ratings, property type, and built form. Fuzzy-joined with Price Paid records by address within postcode buckets.',
use: 'Domestic Energy Performance Certificates providing floor area, number of rooms, construction age, energy ratings, property type, and built form. Fuzzy-joined with Price Paid records by address within postcode buckets. Property owners can opt out of public disclosure.',
optOutUrl: 'https://www.gov.uk/guidance/energy-performance-certificates-opt-out-of-public-disclosure',
url: 'https://epc.opendatacommunities.org/downloads/domestic',
license: 'Open Government Licence v3.0',
},
@ -180,6 +181,18 @@ export default function DataSourcesPage() {
>
{source.url}
</a>
{'optOutUrl' in source && source.optOutUrl && (
<div className="mt-2">
<a
href={source.optOutUrl}
target="_blank"
rel="noopener noreferrer"
className="text-sm text-teal-600 dark:text-teal-400 hover:text-teal-800 dark:hover:text-teal-300 hover:underline"
>
Opt out of public disclosure
</a>
</div>
)}
</div>
))}
</div>

View file

@ -0,0 +1,22 @@
interface IconProps {
className?: string;
}
export function LogoIcon({ className = 'w-4 h-4' }: IconProps) {
return (
<svg
className={className}
viewBox="0 0 24 24"
fill="none"
stroke="currentColor"
strokeWidth={2}
>
<path
strokeLinecap="round"
strokeLinejoin="round"
d="M12 2L20.7 7v10L12 22l-8.7-5V7z"
/>
<path strokeLinecap="round" strokeLinejoin="round" d="M8.5 12.5l2.5 2.5 4.5-5" />
</svg>
);
}

View file

@ -3,6 +3,7 @@ const HtmlWebpackPlugin = require('html-webpack-plugin');
const MiniCssExtractPlugin = require('mini-css-extract-plugin');
const CopyWebpackPlugin = require('copy-webpack-plugin');
const ReactRefreshWebpackPlugin = require('@pmmmwh/react-refresh-webpack-plugin');
const FaviconsWebpackPlugin = require('favicons-webpack-plugin');
module.exports = (env, argv) => {
const isProduction = argv.mode === 'production';
@ -54,6 +55,20 @@ module.exports = (env, argv) => {
new CopyWebpackPlugin({
patterns: [{ from: 'public', noErrorOnMissing: true }],
}),
new FaviconsWebpackPlugin({
logo: './public/favicon.svg',
favicons: {
background: '#0c0a09',
icons: {
favicons: true,
android: false,
appleIcon: false,
appleStartup: false,
yandex: false,
windows: false,
},
},
}),
...(isProduction
? [new MiniCssExtractPlugin()]
: [new ReactRefreshWebpackPlugin()]),

View file

@ -2,7 +2,7 @@
(title) Find your <green>perfect postcode</green> before you find your property.
Set the sliders according to your expectations and the map will highlight the areas that actually match. Instantly.
Set the sliders to your expectations and the map highlights the areas that actually match. Instantly.
<example with two sliders, last known price, minor crime/>
@ -10,9 +10,9 @@ That's just two. We've built 43 — spanning transport links, amenities, demogra
Here's the problem with property search: listings only show you what's on the market right now — a thin slice of what an area is actually like. And even if you could look beyond them, there are millions of postcodes across England. You can't research them all yourself.
So we've built this dashboard for You. We took millions of historical transactions and public records, connected and extended them with proprietary algorithms on top — so the map doesn't just show you raw data, it surfaces the patterns that matter.
We built this for you — years of historical transactions and public records, extended with proprietary algorithms so the map doesn't just show raw data, it surfaces the patterns that matter.
Understand areas first. Then find the right property within them, with expectations you've set yourself rather than ones the market set for you.
Understand areas first. Then find the right property within them, with expectations you've set rather than ones the market set for you.
(Fun cereal graphic on the side with this popup) You might buy a box of cereal because it's 20% off. Your next home is not a box of cereal. Don't let a discount on the wrong property distract you from finding the right one. Know what you're looking for, then go looking.

2
manual-data/.gitignore vendored Normal file
View file

@ -0,0 +1,2 @@
certificates.csv
crime

View file

@ -3,6 +3,7 @@
Test set: properties with 2+ sales where the last sale is 2022-2025.
Uses the second-to-last sale as input, predicts the last sale price.
Compares index-based prediction against a naive baseline (raw input price).
Uses type-stratified index when available, falling back to "All" type.
Output: backtest_results.parquet with predictions vs actuals.
"""
@ -15,6 +16,17 @@ import polars as pl
CURRENT_YEAR = 2025
TEST_YEAR_MIN = 2022
TERRACE_TYPES = ["Mid-Terrace", "End-Terrace", "Enclosed Mid-Terrace", "Enclosed End-Terrace"]
def type_group_expr():
return (
pl.when(pl.col("Property type").is_in(TERRACE_TYPES)).then(pl.lit("Terraced"))
.when(pl.col("Property type") == "Flats/Maisonettes").then(pl.lit("Flats"))
.when(pl.col("Property type").is_in(["Detached", "Semi-Detached"])).then(pl.col("Property type"))
.otherwise(pl.lit(None))
.alias("type_group")
)
def extract_test_set(input_path: Path) -> pl.DataFrame:
@ -22,13 +34,14 @@ def extract_test_set(input_path: Path) -> pl.DataFrame:
print("Loading test set...")
df = (
pl.scan_parquet(input_path)
.select("Postcode", "historical_prices")
.select("Postcode", "historical_prices", "Property type")
.filter(
pl.col("Postcode").is_not_null(),
pl.col("historical_prices").list.len() >= 2,
)
.with_columns(
pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("sector"),
type_group_expr(),
# Last sale (ground truth)
pl.col("historical_prices").list.last().struct.field("year").alias("actual_year"),
pl.col("historical_prices").list.last().struct.field("price").alias("actual_price"),
@ -49,21 +62,60 @@ def extract_test_set(input_path: Path) -> pl.DataFrame:
def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
"""Index-based prediction: adjust input price by sector index change."""
# Join index at input year
test = test.join(
index.select("sector", "year", pl.col("log_index").alias("log_index_input")),
left_on=["sector", "input_year"],
right_on=["sector", "year"],
how="left",
)
# Join index at actual year
test = test.join(
index.select("sector", "year", pl.col("log_index").alias("log_index_actual")),
left_on=["sector", "actual_year"],
right_on=["sector", "year"],
how="left",
)
"""Index-based prediction with type-stratified fallback."""
has_type_group = "type_group" in index.columns
if has_type_group:
idx_typed = index.filter(pl.col("type_group") != "All")
idx_all = index.filter(pl.col("type_group") == "All")
# Join type-specific index at input year
test = test.join(
idx_typed.select("sector", "type_group", "year", pl.col("log_index").alias("li_in_typed")),
left_on=["sector", "type_group", "input_year"],
right_on=["sector", "type_group", "year"],
how="left",
)
# Join "All" index at input year
test = test.join(
idx_all.select("sector", "year", pl.col("log_index").alias("li_in_all")),
left_on=["sector", "input_year"],
right_on=["sector", "year"],
how="left",
)
# Join type-specific index at actual year
test = test.join(
idx_typed.select("sector", "type_group", "year", pl.col("log_index").alias("li_act_typed")),
left_on=["sector", "type_group", "actual_year"],
right_on=["sector", "type_group", "year"],
how="left",
)
# Join "All" index at actual year
test = test.join(
idx_all.select("sector", "year", pl.col("log_index").alias("li_act_all")),
left_on=["sector", "actual_year"],
right_on=["sector", "year"],
how="left",
)
test = test.with_columns(
pl.col("li_in_typed").fill_null(pl.col("li_in_all")).alias("log_index_input"),
pl.col("li_act_typed").fill_null(pl.col("li_act_all")).alias("log_index_actual"),
)
else:
# Unstratified index
test = test.join(
index.select("sector", "year", pl.col("log_index").alias("log_index_input")),
left_on=["sector", "input_year"],
right_on=["sector", "year"],
how="left",
)
test = test.join(
index.select("sector", "year", pl.col("log_index").alias("log_index_actual")),
left_on=["sector", "actual_year"],
right_on=["sector", "year"],
how="left",
)
test = test.with_columns(
(
@ -75,7 +127,6 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
def compute_metrics(actual: np.ndarray, predicted: np.ndarray) -> dict:
"""Compute error metrics."""
valid = np.isfinite(predicted) & np.isfinite(actual) & (actual > 0)
actual = actual[valid]
predicted = predicted[valid]
@ -95,7 +146,6 @@ def compute_metrics(actual: np.ndarray, predicted: np.ndarray) -> dict:
def print_metrics_table(metrics_by_stage: dict):
"""Print a comparison table of metrics."""
print("\n" + "=" * 55)
print("BACKTEST RESULTS")
print("=" * 55)
@ -103,7 +153,6 @@ def print_metrics_table(metrics_by_stage: dict):
metric_names = ["MdAPE (%)", "% within 10%", "% within 20%", "% within 30%", "MAE (£)", "Mean signed error (£)", "n"]
stages = list(metrics_by_stage.keys())
# Header
header = f"{'Metric':<25s}"
for stage in stages:
header += f" {stage:>14s}"
@ -133,7 +182,12 @@ def main():
args = parser.parse_args()
index = pl.read_parquet(args.index)
print(f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors")
has_type_group = "type_group" in index.columns
if has_type_group:
print(f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
f"{index['type_group'].n_unique()} type groups")
else:
print(f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors")
test = extract_test_set(args.input)

View file

@ -1,9 +1,10 @@
"""Apply repeat-sales price index to estimate current property prices.
"""Augment wide.parquet with an estimated current price column.
Joins the precomputed price index (from price_index.py) with each property's
last known sale to produce an inflation-adjusted current price estimate.
Joins the precomputed repeat-sales price index (from price_index.py) with each
property's last known sale to produce an inflation-adjusted current price estimate.
Uses type-stratified index when available, falling back to "All" type.
Output: estimated_prices.parquet with per-property estimates.
Modifies wide.parquet in-place, adding the "Estimated current price" column.
"""
import argparse
@ -12,78 +13,133 @@ from pathlib import Path
import polars as pl
CURRENT_YEAR = 2025
TERRACE_TYPES = ["Mid-Terrace", "End-Terrace", "Enclosed Mid-Terrace", "Enclosed End-Terrace"]
def type_group_expr():
return (
pl.when(pl.col("Property type").is_in(TERRACE_TYPES)).then(pl.lit("Terraced"))
.when(pl.col("Property type") == "Flats/Maisonettes").then(pl.lit("Flats"))
.when(pl.col("Property type").is_in(["Detached", "Semi-Detached"])).then(pl.col("Property type"))
.otherwise(pl.lit(None))
.alias("type_group")
)
def main():
parser = argparse.ArgumentParser(description="Estimate current property prices")
parser.add_argument("--input", type=Path, required=True, help="Path to wide.parquet")
parser = argparse.ArgumentParser(description="Augment wide.parquet with estimated current prices")
parser.add_argument("--input", type=Path, required=True, help="Path to wide.parquet (modified in-place)")
parser.add_argument("--index", type=Path, required=True, help="Path to price_index.parquet")
parser.add_argument("--output", type=Path, required=True, help="Output estimated_prices.parquet")
args = parser.parse_args()
print("Loading property data...")
df = (
pl.scan_parquet(args.input)
.select("Postcode", "Address per Property Register", "Last known price", "Date of last transaction")
.filter(
pl.col("Last known price").is_not_null(),
pl.col("Postcode").is_not_null(),
)
.with_columns(
pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("sector"),
pl.col("Date of last transaction").dt.year().alias("sale_year"),
)
.collect()
print("Loading wide.parquet...")
df = pl.read_parquet(args.input)
print(f" {len(df):,} rows, {len(df.columns)} columns")
# Drop existing estimated price column if re-running
if "Estimated current price" in df.columns:
df = df.drop("Estimated current price")
# Derive helper columns for the join
has_price = (
pl.col("Last known price").is_not_null()
& pl.col("Postcode").is_not_null()
& pl.col("Date of last transaction").is_not_null()
)
df = df.with_columns(
pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("_sector"),
pl.col("Date of last transaction").dt.year().alias("_sale_year"),
type_group_expr().alias("_type_group"),
)
print(f" {len(df):,} properties with known price and postcode")
index = pl.read_parquet(args.index)
print(f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors")
has_type_group = "type_group" in index.columns
if has_type_group:
print(f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
f"{index['type_group'].n_unique()} type groups")
else:
print(f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors (unstratified)")
print("\nApplying repeat-sales index...")
# Join index at sale year
df = df.join(
index.select("sector", "year", pl.col("log_index").alias("log_index_sale")),
left_on=["sector", "sale_year"],
right_on=["sector", "year"],
how="left",
)
if has_type_group:
idx_typed = index.filter(pl.col("type_group") != "All")
idx_all = index.filter(pl.col("type_group") == "All")
# Join index at current year
index_current = (
index.filter(pl.col("year") == CURRENT_YEAR)
.select("sector", pl.col("log_index").alias("log_index_current"))
)
df = df.join(index_current, on="sector", how="left")
# Compute estimate; fall back to raw price when no index available
df = df.with_columns(
(
pl.col("Last known price").cast(pl.Float64)
* (pl.col("log_index_current") - pl.col("log_index_sale")).exp()
# Join type-specific index at sale year
df = df.join(
idx_typed.select("sector", "type_group", "year", pl.col("log_index").alias("log_idx_sale_typed")),
left_on=["_sector", "_type_group", "_sale_year"],
right_on=["sector", "type_group", "year"],
how="left",
)
.fill_null(pl.col("Last known price").cast(pl.Float64))
.alias("estimated_price"),
# Join "All" index at sale year
df = df.join(
idx_all.select("sector", "year", pl.col("log_index").alias("log_idx_sale_all")),
left_on=["_sector", "_sale_year"],
right_on=["sector", "year"],
how="left",
)
# Join type-specific index at current year
df = df.join(
idx_typed.filter(pl.col("year") == CURRENT_YEAR)
.select("sector", "type_group", pl.col("log_index").alias("log_idx_cur_typed")),
left_on=["_sector", "_type_group"],
right_on=["sector", "type_group"],
how="left",
)
# Join "All" index at current year
df = df.join(
idx_all.filter(pl.col("year") == CURRENT_YEAR)
.select("sector", pl.col("log_index").alias("log_idx_cur_all")),
left_on="_sector",
right_on="sector",
how="left",
)
df = df.with_columns(
pl.col("log_idx_sale_typed").fill_null(pl.col("log_idx_sale_all")).alias("_log_index_sale"),
pl.col("log_idx_cur_typed").fill_null(pl.col("log_idx_cur_all")).alias("_log_index_current"),
)
else:
df = df.join(
index.select("sector", "year", pl.col("log_index").alias("_log_index_sale")),
left_on=["_sector", "_sale_year"],
right_on=["sector", "year"],
how="left",
)
index_current = (
index.filter(pl.col("year") == CURRENT_YEAR)
.select("sector", pl.col("log_index").alias("_log_index_current"))
)
df = df.join(index_current, left_on="_sector", right_on="sector", how="left")
# Compute estimate — only for rows with a known price
df = df.with_columns(
pl.when(has_price)
.then(
pl.col("Last known price").cast(pl.Float64)
* (pl.col("_log_index_current") - pl.col("_log_index_sale")).exp()
)
.otherwise(pl.lit(None))
.alias("Estimated current price"),
)
n_adjusted = df.filter(pl.col("log_index_sale").is_not_null()).height
print(f" {n_adjusted:,} properties adjusted by index ({n_adjusted / len(df) * 100:.1f}%)")
n_adjusted = df.filter(
has_price & pl.col("_log_index_sale").is_not_null()
).height
n_with_price = df.filter(has_price).height
print(f" {n_adjusted:,} of {n_with_price:,} properties adjusted by index ({n_adjusted / max(n_with_price, 1) * 100:.1f}%)")
# Select output columns
output = df.select(
"Postcode",
"Address per Property Register",
pl.col("Last known price").alias("last_price"),
"sale_year",
"sector",
"estimated_price",
)
# Drop all temporary columns
temp_cols = [c for c in df.columns if c.startswith("_") or c.startswith("log_idx_")]
df = df.drop(temp_cols)
output.write_parquet(args.output)
size_mb = args.output.stat().st_size / (1024 * 1024)
print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
print(f" {len(output):,} rows")
df.write_parquet(args.input)
size_mb = args.input.stat().st_size / (1024 * 1024)
print(f"\nWrote {args.input} ({size_mb:.1f} MB)")
print(f" {len(df):,} rows, {len(df.columns)} columns (including 'Estimated current price')")
if __name__ == "__main__":

View file

@ -1,10 +1,12 @@
"""Stage 1: Repeat-Sales Price Index
"""Repeat-Sales Price Index (improved)
Builds a hierarchical Case-Shiller repeat-sales price index from historical
transaction data. Solves WLS regression per postcode sector, district, area,
and nationally, then applies Bayesian shrinkage toward parent geographies.
Builds a hierarchical repeat-sales price index with:
1. Stratification by property type (Detached/Semi-Detached/Terraced/Flats)
2. Robust regression (IRLS with Huber weights) instead of hard outlier cutoff
3. National hedonic time-dummy model as ultimate shrinkage fallback
4. Spatial smoothing for sparse sectors via KD-tree nearest neighbors
Output: price_index.parquet with columns: sector, year, log_index, n_pairs
Output: price_index.parquet sector × type_group × year log_index
"""
import argparse
@ -14,31 +16,74 @@ import numpy as np
import polars as pl
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import lsqr
from scipy.spatial import KDTree
from tqdm import tqdm
MIN_PAIRS = 5 # minimum pairs to compute an index
SHRINKAGE_K = 50 # shrinkage parameter: higher = more shrinkage toward parent
OUTLIER_THRESHOLD = 2.5 # |log_ratio| > this → drop (>12x price change)
# --- Constants ---
MIN_PAIRS = 5
SHRINKAGE_K = 50
OUTLIER_THRESHOLD = 3.0 # hard pre-filter; Huber handles the rest
HUBER_K = 1.345
IRLS_ITERATIONS = 5
SPATIAL_NEIGHBORS = 5
SPATIAL_BLEND_K = 30
CURRENT_YEAR = 2025
TYPE_GROUPS = ["Detached", "Semi-Detached", "Terraced", "Flats"]
TERRACE_TYPES = ["Mid-Terrace", "End-Terrace", "Enclosed Mid-Terrace", "Enclosed End-Terrace"]
AGE_BREAKS = [1900, 1930, 1950, 1967, 1983, 2000, 2010]
AGE_LABELS = ["pre-1900", "1900-1929", "1930-1949", "1950-1966", "1967-1982", "1983-1999", "2000-2009", "2010+"]
def type_group_expr():
"""Polars expression: Property type → type_group."""
return (
pl.when(pl.col("Property type").is_in(TERRACE_TYPES)).then(pl.lit("Terraced"))
.when(pl.col("Property type") == "Flats/Maisonettes").then(pl.lit("Flats"))
.when(pl.col("Property type").is_in(["Detached", "Semi-Detached"])).then(pl.col("Property type"))
.otherwise(pl.lit(None))
.alias("type_group")
)
def age_band_expr():
"""Polars expression: Construction age (UInt16 year) → age band string."""
expr = pl.when(pl.col("Construction age").is_null()).then(pl.lit(None))
for i, brk in enumerate(AGE_BREAKS):
expr = expr.when(pl.col("Construction age") < brk).then(pl.lit(AGE_LABELS[i]))
return expr.otherwise(pl.lit(AGE_LABELS[-1])).alias("age_band")
def sector_expr():
"""Polars expression: Postcode → sector (drop last 2 chars, strip)."""
return pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("sector")
def hierarchy_keys(sector: str) -> tuple[str, str]:
"""Return (district, area) for a sector string."""
district = sector.rsplit(" ", 1)[0] if " " in sector else sector
area = ""
for ch in district:
if ch.isalpha():
area += ch
else:
break
return district, area
# --- Pair extraction ---
def extract_pairs(input_path: Path) -> pl.DataFrame:
"""Extract consecutive sale pairs from historical_prices."""
print("Loading historical prices...")
print("Extracting repeat-sale pairs...")
df = (
pl.scan_parquet(input_path)
.select("Postcode", "historical_prices")
.filter(
pl.col("Postcode").is_not_null(),
pl.col("historical_prices").list.len() >= 2,
)
.with_columns(
pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("sector"),
)
.select("Postcode", "historical_prices", "Property type")
.filter(pl.col("Postcode").is_not_null(), pl.col("historical_prices").list.len() >= 2)
.with_columns(sector_expr(), type_group_expr())
.collect()
)
print(f" {len(df):,} properties with 2+ transactions")
print("Extracting consecutive pairs...")
pairs = (
df.lazy()
.with_columns(
@ -52,12 +97,8 @@ def extract_pairs(input_path: Path) -> pl.DataFrame:
pl.col("to_txn").struct.field("year").alias("year2"),
pl.col("to_txn").struct.field("price").alias("price2"),
)
.select("sector", "year1", "price1", "year2", "price2")
.filter(
pl.col("price1") > 0,
pl.col("price2") > 0,
pl.col("year2") > pl.col("year1"),
)
.select("sector", "type_group", "year1", "price1", "year2", "price2")
.filter(pl.col("price1") > 0, pl.col("price2") > 0, pl.col("year2") > pl.col("year1"))
.with_columns(
(pl.col("price2").cast(pl.Float64) / pl.col("price1").cast(pl.Float64)).log().alias("log_ratio"),
(1.0 / (pl.col("year2") - pl.col("year1")).cast(pl.Float64).sqrt()).alias("weight"),
@ -65,207 +106,428 @@ def extract_pairs(input_path: Path) -> pl.DataFrame:
.filter(pl.col("log_ratio").abs() <= OUTLIER_THRESHOLD)
.collect()
)
print(f" {len(pairs):,} consecutive pairs extracted")
# Add hierarchy columns
pairs = pairs.with_columns(
pl.col("sector").str.replace(r"\s+\d+$", "").alias("district"),
).with_columns(
pl.col("district").str.replace(r"\d.*$", "").alias("area"),
)
print(f" {len(pairs):,} pairs extracted")
return pairs
def solve_wls_index(years1: np.ndarray, years2: np.ndarray, log_ratios: np.ndarray, weights: np.ndarray) -> dict[int, float]:
"""Solve WLS repeat-sales regression for a set of pairs.
# --- Sector centroids ---
Model: log(P2/P1) = beta[year2] - beta[year1], weighted by 1/sqrt(gap).
Pin beta[min_year] = 0.
Returns dict mapping year -> log_index (cumulative).
"""
if len(years1) < MIN_PAIRS:
def extract_centroids(input_path: Path) -> dict[str, tuple[float, float]]:
print("Computing sector centroids...")
df = (
pl.scan_parquet(input_path)
.select("Postcode", "lat", "lon")
.filter(pl.col("Postcode").is_not_null(), pl.col("lat").is_not_null())
.with_columns(sector_expr())
.group_by("sector")
.agg(pl.col("lat").mean(), pl.col("lon").mean())
.collect()
)
centroids = {}
for row in df.iter_rows(named=True):
centroids[row["sector"]] = (row["lat"], row["lon"])
print(f" {len(centroids):,} sector centroids")
return centroids
# --- Robust IRLS solver ---
def solve_robust_index(
years1: np.ndarray, years2: np.ndarray,
log_ratios: np.ndarray, base_weights: np.ndarray,
) -> dict[int, float]:
"""IRLS Huber M-estimation for the Case-Shiller repeat-sales model."""
n = len(years1)
if n < MIN_PAIRS:
return {}
all_years = np.union1d(years1, years2)
min_year = int(all_years.min())
# Map years to column indices, skipping min_year (pinned to 0)
col = 0
year_to_col = {}
for y in all_years:
if int(y) != min_year:
year_to_col[int(y)] = col
iy = int(y)
if iy != min_year:
year_to_col[iy] = col
col += 1
n_cols = len(year_to_col)
if n_cols == 0:
return {}
n_rows = len(years1)
row_idx = []
col_idx = []
data = []
# Vectorized column index mapping
col2 = np.full(n, -1, dtype=np.int32)
col1 = np.full(n, -1, dtype=np.int32)
for year, c in year_to_col.items():
col2[years2 == year] = c
col1[years1 == year] = c
for i in range(n_rows):
y1, y2 = int(years1[i]), int(years2[i])
if y2 in year_to_col:
row_idx.append(i)
col_idx.append(year_to_col[y2])
data.append(weights[i])
if y1 in year_to_col:
row_idx.append(i)
col_idx.append(year_to_col[y1])
data.append(-weights[i])
# Sparse matrix structure (fixed across iterations)
mask2 = col2 >= 0
mask1 = col1 >= 0
rows_arr = np.concatenate([np.where(mask2)[0], np.where(mask1)[0]])
cols_arr = np.concatenate([col2[mask2], col1[mask1]])
signs_arr = np.concatenate([np.ones(mask2.sum()), -np.ones(mask1.sum())])
A = csc_matrix((data, (row_idx, col_idx)), shape=(n_rows, n_cols))
b = log_ratios * weights
weights = base_weights.copy()
result = lsqr(A, b, atol=1e-10, btol=1e-10)
betas = result[0]
for _ in range(IRLS_ITERATIONS):
data = signs_arr * weights[rows_arr]
A = csc_matrix((data, (rows_arr, cols_arr)), shape=(n, n_cols))
b = log_ratios * weights
betas = lsqr(A, b, atol=1e-10, btol=1e-10)[0]
# Residuals
predicted = np.zeros(n)
predicted[mask2] += betas[col2[mask2]]
predicted[mask1] -= betas[col1[mask1]]
residuals = log_ratios - predicted
# Huber reweighting
abs_r = np.abs(residuals)
huber_w = np.where(abs_r <= HUBER_K, 1.0, HUBER_K / np.maximum(abs_r, 1e-10))
weights = base_weights * huber_w
index = {min_year: 0.0}
for year, col in year_to_col.items():
index[year] = float(betas[col])
for year, c in year_to_col.items():
index[year] = float(betas[c])
return index
def compute_indices_for_level(pairs: pl.DataFrame, group_col: str) -> dict[str, dict[int, float]]:
"""Compute raw indices for each geographic group."""
def compute_indices_for_level(pairs: pl.DataFrame, group_col: str):
"""Solve robust indices for each group. Returns (indices, n_pairs) dicts."""
groups = pairs.group_by(group_col).agg(
pl.col("year1"), pl.col("year2"), pl.col("log_ratio"), pl.col("weight"),
)
indices = {}
n_pairs_map = {}
for row in tqdm(groups.iter_rows(named=True), total=len(groups), desc=f" Solving {group_col}"):
n_pairs = {}
for row in tqdm(groups.iter_rows(named=True), total=len(groups), desc=f" {group_col}"):
key = row[group_col]
y1 = np.array(row["year1"], dtype=np.int32)
y2 = np.array(row["year2"], dtype=np.int32)
lr = np.array(row["log_ratio"], dtype=np.float64)
w = np.array(row["weight"], dtype=np.float64)
idx = solve_wls_index(y1, y2, lr, w)
idx = solve_robust_index(y1, y2, lr, w)
if idx:
indices[key] = idx
n_pairs_map[key] = len(y1)
return indices, n_pairs_map
n_pairs[key] = len(y1)
return indices, n_pairs
def shrink_index(raw: dict[int, float], parent: dict[int, float], n_pairs: int) -> dict[int, float]:
"""Bayesian shrinkage toward parent index."""
w = n_pairs / (n_pairs + SHRINKAGE_K)
# --- Hedonic model ---
def compute_hedonic_index(input_path: Path, min_year: int, max_year: int) -> dict[int, float]:
"""Two-step hedonic index: regress log(price) on features, average residual by year."""
print("Computing hedonic index...")
df = (
pl.scan_parquet(input_path)
.select(
"Last known price", "Date of last transaction", "Property type",
"Total floor area (sqm)", "Current energy rating",
"Number of bedrooms & living rooms", "Construction age",
)
.filter(
pl.col("Last known price").is_not_null(),
pl.col("Total floor area (sqm)").is_not_null(),
pl.col("Total floor area (sqm)") > 0,
pl.col("Current energy rating").is_in(["A", "B", "C", "D", "E", "F", "G"]),
pl.col("Number of bedrooms & living rooms").is_not_null(),
pl.col("Construction age").is_not_null(),
)
.with_columns(
pl.col("Date of last transaction").dt.year().alias("sale_year"),
type_group_expr(),
age_band_expr(),
)
.filter(
pl.col("type_group").is_not_null(),
pl.col("sale_year").is_not_null(),
pl.col("sale_year") >= min_year,
pl.col("sale_year") <= max_year,
)
.collect()
)
print(f" {len(df):,} complete cases for hedonic model")
# Target
log_price = np.log(df["Last known price"].to_numpy().astype(np.float64))
sale_years = df["sale_year"].to_numpy()
# Build feature matrix
parts = []
# log(floor_area)
fa = df["Total floor area (sqm)"].to_numpy().astype(np.float32)
parts.append(np.log(np.maximum(fa, 1.0)).reshape(-1, 1))
# Type dummies (ref: Detached)
tg = df["type_group"].to_numpy()
for t in ["Terraced", "Semi-Detached", "Flats"]:
parts.append((tg == t).astype(np.float32).reshape(-1, 1))
# EPC dummies (ref: D)
epc = df["Current energy rating"].to_numpy()
for r in ["A", "B", "C", "E", "F", "G"]:
parts.append((epc == r).astype(np.float32).reshape(-1, 1))
# Rooms
parts.append(df["Number of bedrooms & living rooms"].to_numpy().astype(np.float32).reshape(-1, 1))
# Age band dummies (ref: pre-1900)
ab = df["age_band"].to_numpy()
for band in AGE_LABELS[1:]:
parts.append((ab == band).astype(np.float32).reshape(-1, 1))
# Intercept
parts.append(np.ones((len(df), 1), dtype=np.float32))
F = np.hstack(parts)
print(f" Feature matrix: {F.shape[0]:,} × {F.shape[1]}")
# Step 1: regress log(price) on features → quality score
betas = np.linalg.lstsq(F.astype(np.float64), log_price, rcond=None)[0]
quality_score = F.astype(np.float64) @ betas
residuals = log_price - quality_score
# Step 2: average residual by year = hedonic index
hedonic = {}
for y in range(min_year, max_year + 1):
mask = sale_years == y
if mask.sum() > 0:
hedonic[y] = float(np.mean(residuals[mask]))
# Normalize: min_year = 0
base = hedonic.get(min_year, 0.0)
for y in hedonic:
hedonic[y] -= base
print(f" Hedonic index: {len(hedonic)} years, range {min(hedonic.values()):.3f} to {max(hedonic.values()):.3f}")
return hedonic
# --- Shrinkage ---
def shrink_index(raw: dict, parent: dict, n_pairs: int, k: int = SHRINKAGE_K) -> dict:
w = n_pairs / (n_pairs + k)
result = {}
all_years = set(raw.keys()) | set(parent.keys())
for y in all_years:
raw_val = raw.get(y, parent.get(y, 0.0))
parent_val = parent.get(y, raw.get(y, 0.0))
result[y] = w * raw_val + (1 - w) * parent_val
for y in set(raw) | set(parent):
r = raw.get(y, parent.get(y, 0.0))
p = parent.get(y, raw.get(y, 0.0))
result[y] = w * r + (1 - w) * p
return result
def forward_fill_index(index: dict[int, float], min_year: int, max_year: int) -> dict[int, float]:
"""Forward-fill missing years so index is continuous."""
def apply_shrinkage(
sector_idx, sector_n, district_idx, district_n,
area_idx, area_n, national_idx, national_n,
hedonic_idx, all_sectors, sector_to_dist, dist_to_area,
):
"""Top-down hierarchical shrinkage: national→hedonic, area→national, etc."""
# National → hedonic
national_shrunk = shrink_index(national_idx, hedonic_idx, national_n)
# Area → national
area_shrunk = {}
for area, idx in area_idx.items():
area_shrunk[area] = shrink_index(idx, national_shrunk, area_n[area])
# District → area
district_shrunk = {}
for dist, idx in district_idx.items():
a = dist_to_area.get(dist, "")
parent = area_shrunk.get(a, national_shrunk)
district_shrunk[dist] = shrink_index(idx, parent, district_n[dist])
# Sector → district
sector_shrunk = {}
for sec, idx in sector_idx.items():
d = sector_to_dist.get(sec, "")
parent = district_shrunk.get(d, national_shrunk)
sector_shrunk[sec] = shrink_index(idx, parent, sector_n[sec])
# Fill sectors without their own index
for sec in all_sectors:
if sec not in sector_shrunk:
d = sector_to_dist.get(sec, "")
a = dist_to_area.get(d, "")
sector_shrunk[sec] = district_shrunk.get(
d, area_shrunk.get(a, national_shrunk)
)
return sector_shrunk
# --- Spatial smoothing ---
def spatial_smooth(
sector_indices: dict, centroids: dict, n_pairs_map: dict,
) -> dict:
"""Blend sparse sector indices with K nearest neighbors."""
# Build coordinate arrays for sectors with centroids
sectors_with_coords = [s for s in sector_indices if s in centroids]
if len(sectors_with_coords) < SPATIAL_NEIGHBORS + 1:
return sector_indices
coords = np.array([centroids[s] for s in sectors_with_coords])
# Scale longitude by cos(mean_lat) for approximate Euclidean distance
mean_lat = np.mean(coords[:, 0])
scale = np.cos(np.radians(mean_lat))
scaled_coords = np.column_stack([coords[:, 0], coords[:, 1] * scale])
tree = KDTree(scaled_coords)
result = dict(sector_indices)
for i, sec in enumerate(sectors_with_coords):
n = n_pairs_map.get(sec, 0)
self_w = n / (n + SPATIAL_BLEND_K)
if self_w > 0.95:
continue # enough data, skip smoothing
dists, idxs = tree.query(scaled_coords[i], k=SPATIAL_NEIGHBORS + 1)
# Skip self (index 0, distance ~0)
neighbor_dists = dists[1:]
neighbor_idxs = idxs[1:]
inv_dists = []
neighbor_indices = []
for d, j in zip(neighbor_dists, neighbor_idxs):
ns = sectors_with_coords[j]
if d > 0 and ns in sector_indices:
inv_dists.append(1.0 / d)
neighbor_indices.append(sector_indices[ns])
if not neighbor_indices:
continue
total_inv = sum(inv_dists)
nbr_w = 1.0 - self_w
ws = [iw / total_inv * nbr_w for iw in inv_dists]
blended = {}
all_years = set(sector_indices[sec])
for ni in neighbor_indices:
all_years |= set(ni)
for y in all_years:
val = self_w * sector_indices[sec].get(y, 0.0)
for ni, w in zip(neighbor_indices, ws):
val += w * ni.get(y, 0.0)
blended[y] = val
result[sec] = blended
return result
# --- Forward fill ---
def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
filled = {}
last_val = 0.0
last = 0.0
for y in range(min_year, max_year + 1):
if y in index:
last_val = index[y]
filled[y] = last_val
last = index[y]
filled[y] = last
return filled
# --- Main ---
def main():
parser = argparse.ArgumentParser(description="Build repeat-sales price index")
parser.add_argument("--input", type=Path, required=True, help="Path to wide.parquet")
parser.add_argument("--output", type=Path, required=True, help="Output price_index.parquet")
parser = argparse.ArgumentParser(description="Build improved repeat-sales price index")
parser.add_argument("--input", type=Path, required=True)
parser.add_argument("--output", type=Path, required=True)
args = parser.parse_args()
pairs = extract_pairs(args.input)
centroids = extract_centroids(args.input)
# Derive geographic hierarchy columns
pairs = pairs.with_columns(
# district = sector minus trailing digit(s), e.g. "SW1A 1" -> "SW1A"
pl.col("sector").str.replace(r"\s+\d+$", "").alias("district"),
).with_columns(
# area = leading letters only, e.g. "SW1A" -> "SW"
pl.col("district").str.replace(r"\d.*$", "").alias("area"),
)
# Solve indices at each level
print("\nComputing national index...")
pairs_np = pairs.select("year1", "year2", "log_ratio", "weight")
national_idx = solve_wls_index(
pairs_np["year1"].to_numpy(),
pairs_np["year2"].to_numpy(),
pairs_np["log_ratio"].to_numpy(),
pairs_np["weight"].to_numpy(),
)
print(f" National index: {len(national_idx)} years")
print("\nComputing area indices...")
area_indices, area_pairs = compute_indices_for_level(pairs, "area")
print(f" {len(area_indices)} areas with indices")
print("\nComputing district indices...")
district_indices, district_pairs = compute_indices_for_level(pairs, "district")
print(f" {len(district_indices)} districts with indices")
print("\nComputing sector indices...")
sector_indices, sector_pairs = compute_indices_for_level(pairs, "sector")
print(f" {len(sector_indices)} sectors with indices")
# Shrink area -> national
print("\nApplying hierarchical shrinkage...")
for area, idx in tqdm(area_indices.items(), desc=" Area shrinkage"):
area_indices[area] = shrink_index(idx, national_idx, area_pairs[area])
# Shrink district -> area
for dist, idx in tqdm(district_indices.items(), desc=" District shrinkage"):
area = dist.replace(r"\d.*$", "")
# Extract area from district (leading letters)
area_key = ""
for ch in dist:
if ch.isalpha():
area_key += ch
else:
break
parent = area_indices.get(area_key, national_idx)
district_indices[dist] = shrink_index(idx, parent, district_pairs[dist])
# Shrink sector -> district
for sector, idx in tqdm(sector_indices.items(), desc=" Sector shrinkage"):
# District = sector minus trailing space+digit
dist_key = sector.rsplit(" ", 1)[0] if " " in sector else sector
parent = district_indices.get(dist_key, national_idx)
sector_indices[sector] = shrink_index(idx, parent, sector_pairs[sector])
# For sectors without enough data, fall back to district/area/national
all_sectors = pairs["sector"].unique().to_list()
min_year = int(pairs["year1"].min())
max_year = max(int(pairs["year2"].max()), 2025)
max_year = max(int(pairs["year2"].max()), CURRENT_YEAR)
print(f"\nFilling gaps and forward-filling ({min_year}-{max_year})...")
hedonic_idx = compute_hedonic_index(args.input, min_year, max_year)
# Precompute hierarchy
all_sectors = pairs["sector"].unique().to_list()
sector_to_dist = {}
dist_to_area = {}
for s in all_sectors:
d, a = hierarchy_keys(s)
sector_to_dist[s] = d
dist_to_area[d] = a
# Process each type group + "All"
all_type_groups = ["All"] + TYPE_GROUPS
final = {} # {type_group: {sector: {year: log_index}}}
final_n = {} # {type_group: {sector: n_pairs}}
for tg in all_type_groups:
print(f"\n--- {tg} ---")
typed = pairs if tg == "All" else pairs.filter(pl.col("type_group") == tg)
if len(typed) < MIN_PAIRS:
print(f" Skipping (only {len(typed)} pairs)")
final[tg] = {s: dict(hedonic_idx) for s in all_sectors}
final_n[tg] = {s: 0 for s in all_sectors}
continue
print(f" {len(typed):,} pairs")
# National
np_arrs = typed.select("year1", "year2", "log_ratio", "weight")
national_idx = solve_robust_index(
np_arrs["year1"].to_numpy(), np_arrs["year2"].to_numpy(),
np_arrs["log_ratio"].to_numpy(), np_arrs["weight"].to_numpy(),
)
national_n = len(typed)
print(f" National: {len(national_idx)} years")
# Area, district, sector
print(" Computing per-level indices:")
area_idx, area_n = compute_indices_for_level(typed, "area")
district_idx, district_n = compute_indices_for_level(typed, "district")
sector_idx, sector_n = compute_indices_for_level(typed, "sector")
print(f" {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors")
# Shrinkage
print(" Applying shrinkage...")
sector_shrunk = apply_shrinkage(
sector_idx, sector_n, district_idx, district_n,
area_idx, area_n, national_idx, national_n,
hedonic_idx, all_sectors, sector_to_dist, dist_to_area,
)
# Spatial smoothing
print(" Spatial smoothing...")
sector_smoothed = spatial_smooth(sector_shrunk, centroids, sector_n)
# Forward fill
for sec in all_sectors:
sector_smoothed[sec] = forward_fill(
sector_smoothed.get(sec, hedonic_idx), min_year, max_year
)
final[tg] = sector_smoothed
final_n[tg] = sector_n
# Assemble output
print("\nAssembling output...")
rows = []
for sector in tqdm(all_sectors, desc=" Forward-fill"):
if sector in sector_indices:
idx = sector_indices[sector]
else:
# Fall back to district, area, national
dist_key = sector.rsplit(" ", 1)[0] if " " in sector else sector
area_key = ""
for ch in dist_key:
if ch.isalpha():
area_key += ch
else:
break
idx = district_indices.get(dist_key, area_indices.get(area_key, national_idx))
n = sector_pairs.get(sector, 0)
filled = forward_fill_index(idx, min_year, max_year)
for year, log_idx in filled.items():
rows.append((sector, year, log_idx, n))
for tg in all_type_groups:
for sec in all_sectors:
n = final_n[tg].get(sec, 0)
for year, log_idx in final[tg][sec].items():
rows.append((sec, tg, year, log_idx, n))
result = pl.DataFrame(
rows,
schema={"sector": pl.String, "year": pl.Int32, "log_index": pl.Float64, "n_pairs": pl.Int64},
schema={"sector": pl.String, "type_group": pl.String, "year": pl.Int32,
"log_index": pl.Float64, "n_pairs": pl.Int64},
orient="row",
)
).sort("type_group", "sector", "year")
result = result.sort("sector", "year")
result.write_parquet(args.output)
size_mb = args.output.stat().st_size / (1024 * 1024)
print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
print(f" {result['sector'].n_unique():,} sectors × {max_year - min_year + 1} years = {len(result):,} rows")
print(f" {result['sector'].n_unique():,} sectors × {len(all_type_groups)} types × {max_year - min_year + 1} years = {len(result):,} rows")
if __name__ == "__main__":

View file

@ -5,7 +5,7 @@ from pathlib import Path
import polars as pl
from pipeline.utils.poi_counts import _count_pois_per_postcode
from pipeline.utils.poi_counts import count_pois_per_postcode
SCHOOL_GROUPS = {
"good_primary": ["good_primary"],
@ -60,7 +60,7 @@ def main():
# Load all postcodes for proximity counting
postcodes = arcgis.rename({"lng": "lon"})
result = _count_pois_per_postcode(
result = count_pois_per_postcode(
postcodes, schools, radius_km=5, groups=SCHOOL_GROUPS
)

View file

@ -1 +0,0 @@
/bulk/property-data

2
property-data/.gitignore vendored Normal file
View file

@ -0,0 +1,2 @@
*
!.gitignore

View file

@ -86,6 +86,20 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
suffix: "",
raw: false,
},
// FeatureConfig {
// name: "Estimated current price",
// bounds: Bounds::Fixed {
// min: 0.0,
// max: 2_000_000.0,
// },
// step: 10000.0,
// description: "Inflation-adjusted estimate of the current property value",
// detail: "Estimated by applying a repeat-sales price index to the last known sale price. The index tracks price changes within each postcode sector and property type. Properties sold recently will have estimates close to their sale price; older sales are adjusted more. Coverage depends on having enough repeat sales in the local area to build the index.",
// source: "price-paid",
// prefix: "£",
// suffix: "",
// raw: false,
// },
FeatureConfig {
name: "Price per sqm",
bounds: Bounds::Percentile {