Changes
This commit is contained in:
parent
6c90cf3c0f
commit
5b68c8da04
14 changed files with 687 additions and 551 deletions
|
|
@ -1,298 +0,0 @@
|
|||
version: '3'
|
||||
|
||||
vars:
|
||||
DATA_DIR: /bulk/property-data
|
||||
TILES_OUTPUT: "{{.DATA_DIR}}/uk.pmtiles"
|
||||
ARCGIS_OUTPUT: "{{.DATA_DIR}}/arcgis_data.parquet"
|
||||
PRICE_PAID_OUTPUT: "{{.DATA_DIR}}/price-paid-complete.parquet"
|
||||
IOD_OUTPUT: "{{.DATA_DIR}}/IoD2025_Scores.parquet"
|
||||
POIS_RAW_OUTPUT: "{{.DATA_DIR}}/uk_pois.parquet"
|
||||
POIS_FILTERED_OUTPUT: "{{.DATA_DIR}}/filtered_uk_pois.parquet"
|
||||
POI_PROXIMITY_OUTPUT: "{{.DATA_DIR}}/poi_proximity.parquet"
|
||||
EPC_PP_OUTPUT: "{{.DATA_DIR}}/epc_pp.parquet"
|
||||
WIDE_OUTPUT: "{{.DATA_DIR}}/wide.parquet"
|
||||
EPC: "{{.DATA_DIR}}/certificates.csv"
|
||||
JOURNEY_TIMES_BANK: "{{.DATA_DIR}}/journey_times_bank.parquet"
|
||||
JOURNEY_TIMES_FITZROVIA: "{{.DATA_DIR}}/journey_times_fitzrovia.parquet"
|
||||
ETHNICITY_OUTPUT: "{{.DATA_DIR}}/ethnicity_by_la.parquet"
|
||||
CRIME_DIR: "{{.DATA_DIR}}/crime"
|
||||
CRIME_OUTPUT: "{{.DATA_DIR}}/crime_by_lsoa.parquet"
|
||||
NOISE_OUTPUT: "{{.DATA_DIR}}/road_noise.parquet"
|
||||
OFSTED_OUTPUT: "{{.DATA_DIR}}/ofsted.parquet"
|
||||
NAPTAN_OUTPUT: "{{.DATA_DIR}}/naptan.parquet"
|
||||
BROADBAND_OUTPUT: "{{.DATA_DIR}}/broadband.parquet"
|
||||
SCHOOL_PROXIMITY_OUTPUT: "{{.DATA_DIR}}/school_proximity.parquet"
|
||||
POSTCODES_OUTPUT: "{{.DATA_DIR}}/postcodes"
|
||||
GEOSURE_OUTPUT: "{{.DATA_DIR}}/geosure"
|
||||
GEOSURE_PARQUET: "{{.DATA_DIR}}/geosure.parquet"
|
||||
INSPIRE_OUTPUT: "{{.DATA_DIR}}/inspire"
|
||||
OA_BOUNDARIES_OUTPUT: "{{.DATA_DIR}}/oa_boundaries.gpkg"
|
||||
UPRN_LOOKUP_OUTPUT: "{{.DATA_DIR}}/uprn_lookup.parquet"
|
||||
POSTCODE_BOUNDARIES_OUTPUT: "{{.DATA_DIR}}/new_postcode_boundaries"
|
||||
|
||||
tasks:
|
||||
download:tiles:
|
||||
desc: Download UK map tiles (PMTiles format from Protomaps)
|
||||
status:
|
||||
- test -f {{.TILES_OUTPUT}}
|
||||
vars:
|
||||
PMTILES_VERSION: "1.22.3"
|
||||
PMTILES_BIN: "{{.DATA_DIR}}/pmtiles"
|
||||
cmds:
|
||||
- |
|
||||
echo "Downloading UK PMTiles (~1.5GB)..."
|
||||
echo "This extracts UK tiles from the Protomaps planet file."
|
||||
echo ""
|
||||
# Download pmtiles CLI if not present
|
||||
if [ ! -f "{{.PMTILES_BIN}}" ]; then
|
||||
echo "Downloading pmtiles CLI v{{.PMTILES_VERSION}}..."
|
||||
curl -sL "https://github.com/protomaps/go-pmtiles/releases/download/v{{.PMTILES_VERSION}}/go-pmtiles_{{.PMTILES_VERSION}}_Linux_x86_64.tar.gz" | tar -xz -C "{{.DATA_DIR}}" pmtiles
|
||||
chmod +x "{{.PMTILES_BIN}}"
|
||||
fi
|
||||
# Extract UK region (bbox: -10.5,49.5,2.5,61)
|
||||
# Using a recent daily build from Protomaps
|
||||
"{{.PMTILES_BIN}}" extract https://build.protomaps.com/20260201.pmtiles {{.TILES_OUTPUT}} --bbox=-10.5,49.5,2.5,61
|
||||
|
||||
prompt:epc:
|
||||
desc: Prompt user to download EPC dataset (requires registration)
|
||||
status:
|
||||
- test -f {{.EPC}}
|
||||
cmds:
|
||||
- |
|
||||
echo ""
|
||||
echo "=== EPC dataset not found ==="
|
||||
echo "The EPC certificates file is required: {{.EPC}}"
|
||||
echo ""
|
||||
echo "To obtain it, register at https://epc.opendatacommunities.org/login"
|
||||
echo ""
|
||||
exit 1
|
||||
|
||||
prompt:journey-times:
|
||||
desc: Download TFL journey times if missing (requires API key registration)
|
||||
status:
|
||||
- test -f {{.JOURNEY_TIMES_BANK}} || test -f {{.JOURNEY_TIMES_FITZROVIA}}
|
||||
deps:
|
||||
- download:arcgis
|
||||
cmds:
|
||||
- |
|
||||
echo ""
|
||||
echo "=== TFL journey times not found ==="
|
||||
echo "Register for a TFL API key at https://api-portal.tfl.gov.uk/signin"
|
||||
echo "Then set the TFL_API_KEY environment variable and run:"
|
||||
echo " task download:journey-times -- bank"
|
||||
echo " task download:journey-times -- fitzrovia"
|
||||
echo ""
|
||||
exit 1
|
||||
|
||||
download:arcgis:
|
||||
desc: Download and convert ArcGIS postcode data
|
||||
status:
|
||||
- test -f {{.ARCGIS_OUTPUT}}
|
||||
cmds:
|
||||
- uv run python -m pipeline.download.arcgis --output {{.ARCGIS_OUTPUT}}
|
||||
|
||||
download:price-paid:
|
||||
desc: Download and convert Land Registry price-paid data
|
||||
status:
|
||||
- test -f {{.PRICE_PAID_OUTPUT}}
|
||||
cmds:
|
||||
- uv run python -m pipeline.download.price_paid --output {{.PRICE_PAID_OUTPUT}}
|
||||
|
||||
download:deprivation:
|
||||
desc: Download and convert Index of Deprivation data
|
||||
status:
|
||||
- test -f {{.IOD_OUTPUT}}
|
||||
cmds:
|
||||
- uv run python -m pipeline.download.deprivation_data --output {{.IOD_OUTPUT}}
|
||||
|
||||
download:ethnicity:
|
||||
desc: Download ethnicity by local authority data
|
||||
status:
|
||||
- test -f {{.ETHNICITY_OUTPUT}}
|
||||
cmds:
|
||||
- uv run python -m pipeline.download.ethnicity --output {{.ETHNICITY_OUTPUT}}
|
||||
|
||||
download:naptan:
|
||||
desc: Download NaPTAN station data
|
||||
status:
|
||||
- test -f {{.NAPTAN_OUTPUT}}
|
||||
cmds:
|
||||
- uv run python -m pipeline.download.naptan --output {{.NAPTAN_OUTPUT}}
|
||||
|
||||
download:pois:
|
||||
desc: Download and extract POIs from OpenStreetMap
|
||||
status:
|
||||
- test -f {{.POIS_RAW_OUTPUT}}
|
||||
cmds:
|
||||
- uv run python -m pipeline.download.pois --output {{.POIS_RAW_OUTPUT}}
|
||||
|
||||
download:ofsted:
|
||||
desc: Download Ofsted school inspection outcomes
|
||||
status:
|
||||
- test -f {{.OFSTED_OUTPUT}}
|
||||
cmds:
|
||||
- uv run python -m pipeline.download.ofsted --output {{.OFSTED_OUTPUT}}
|
||||
|
||||
download:broadband:
|
||||
desc: Download Ofcom broadband performance data
|
||||
status:
|
||||
- test -f {{.BROADBAND_OUTPUT}}
|
||||
cmds:
|
||||
- uv run python -m pipeline.download.broadband --output {{.BROADBAND_OUTPUT}}
|
||||
|
||||
download:postcodes:
|
||||
desc: Download GB postcodes data from MapIt
|
||||
status:
|
||||
- test -f {{.POSTCODES_OUTPUT}}
|
||||
cmds:
|
||||
- uv run python -m pipeline.download.postcodes --output {{.POSTCODES_OUTPUT}}
|
||||
|
||||
download:geosure:
|
||||
desc: Download OS GeoSure ground stability data (5km hex grid)
|
||||
status:
|
||||
- test -d {{.GEOSURE_OUTPUT}}
|
||||
cmds:
|
||||
- uv run python -m pipeline.download.geosure --output {{.GEOSURE_OUTPUT}}
|
||||
|
||||
download:noise:
|
||||
desc: Download Defra noise data (road, rail, airport) sampled at postcode centroids
|
||||
deps:
|
||||
- download:arcgis
|
||||
status:
|
||||
- test -f {{.NOISE_OUTPUT}}
|
||||
cmds:
|
||||
- uv run python -m pipeline.download.noise --arcgis {{.ARCGIS_OUTPUT}} --output {{.NOISE_OUTPUT}}
|
||||
|
||||
transform:pois:
|
||||
desc: Transform raw POIs to filtered version with friendly names
|
||||
deps:
|
||||
- download:pois
|
||||
- download:naptan
|
||||
status:
|
||||
- test -f {{.POIS_FILTERED_OUTPUT}}
|
||||
cmds:
|
||||
- uv run python -m pipeline.transform.transform_poi --input {{.POIS_RAW_OUTPUT}} --naptan {{.NAPTAN_OUTPUT}} --output {{.POIS_FILTERED_OUTPUT}}
|
||||
|
||||
transform:epc-pp:
|
||||
desc: Fuzzy join EPC and Price Paid data
|
||||
deps:
|
||||
- download:price-paid
|
||||
- prompt:epc
|
||||
status:
|
||||
- test -f {{.EPC_PP_OUTPUT}}
|
||||
cmds:
|
||||
- uv run python -m pipeline.transform.join_epc_pp --epc {{.EPC}} --price-paid {{.PRICE_PAID_OUTPUT}} --output {{.EPC_PP_OUTPUT}}
|
||||
|
||||
transform:crime:
|
||||
desc: Transform crime CSVs into yearly averages by LSOA
|
||||
status:
|
||||
- test -f {{.CRIME_OUTPUT}}
|
||||
cmds:
|
||||
- uv run python -m pipeline.transform.crime --input {{.CRIME_DIR}} --output {{.CRIME_OUTPUT}}
|
||||
|
||||
transform:poi-proximity:
|
||||
desc: Compute POI proximity counts per postcode
|
||||
deps:
|
||||
- download:arcgis
|
||||
- transform:pois
|
||||
status:
|
||||
- test -f {{.POI_PROXIMITY_OUTPUT}}
|
||||
cmds:
|
||||
- uv run python -m pipeline.transform.poi_proximity --arcgis {{.ARCGIS_OUTPUT}} --pois {{.POIS_FILTERED_OUTPUT}} --output {{.POI_PROXIMITY_OUTPUT}}
|
||||
|
||||
transform:school-proximity:
|
||||
desc: Compute good+ school proximity counts per postcode
|
||||
deps:
|
||||
- download:ofsted
|
||||
- download:arcgis
|
||||
status:
|
||||
- test -f {{.SCHOOL_PROXIMITY_OUTPUT}}
|
||||
cmds:
|
||||
- uv run python -m pipeline.transform.school_proximity --ofsted {{.OFSTED_OUTPUT}} --arcgis {{.ARCGIS_OUTPUT}} --output {{.SCHOOL_PROXIMITY_OUTPUT}}
|
||||
|
||||
transform:geosure:
|
||||
desc: Spatial-join GeoSure ground stability data to postcode centroids
|
||||
deps:
|
||||
- download:geosure
|
||||
- download:arcgis
|
||||
status:
|
||||
- test -f {{.GEOSURE_PARQUET}}
|
||||
cmds:
|
||||
- uv run python -m pipeline.transform.transform_geosure --geosure {{.GEOSURE_OUTPUT}} --arcgis {{.ARCGIS_OUTPUT}} --output {{.GEOSURE_PARQUET}}
|
||||
|
||||
download:inspire:
|
||||
desc: Download INSPIRE Index Polygon GML files from HM Land Registry
|
||||
status:
|
||||
- test -d {{.INSPIRE_OUTPUT}}
|
||||
cmds:
|
||||
- uv run python -m pipeline.download.inspire --output {{.INSPIRE_OUTPUT}}
|
||||
|
||||
download:oa-boundaries:
|
||||
desc: Download Output Areas (2021) boundary polygons (England & Wales)
|
||||
status:
|
||||
- test -f {{.OA_BOUNDARIES_OUTPUT}}
|
||||
cmds:
|
||||
- uv run python -m pipeline.download.oa_boundaries --output {{.OA_BOUNDARIES_OUTPUT}}
|
||||
|
||||
download:uprn-lookup:
|
||||
desc: Download National Statistics UPRN Lookup and convert to parquet
|
||||
status:
|
||||
- test -f {{.UPRN_LOOKUP_OUTPUT}}
|
||||
cmds:
|
||||
- uv run python -m pipeline.download.uprn_lookup --output {{.UPRN_LOOKUP_OUTPUT}}
|
||||
|
||||
transform:postcode-boundaries:
|
||||
desc: Generate postcode boundary polygons from OA boundaries + INSPIRE + UPRNs
|
||||
deps:
|
||||
- download:oa-boundaries
|
||||
- download:inspire
|
||||
- download:uprn-lookup
|
||||
cmds:
|
||||
- >-
|
||||
uv run python -m pipeline.transform.postcode_boundaries
|
||||
--uprn {{.UPRN_LOOKUP_OUTPUT}}
|
||||
--oa-boundaries {{.OA_BOUNDARIES_OUTPUT}}
|
||||
--inspire {{.INSPIRE_OUTPUT}}
|
||||
--output {{.POSTCODE_BOUNDARIES_OUTPUT}}
|
||||
|
||||
download:journey-times:
|
||||
desc: "Fetch TfL journey times: task download:journey-times"
|
||||
deps:
|
||||
- download:arcgis
|
||||
status:
|
||||
- test -f {{.DATA_DIR}}/journey_times_*.parquet
|
||||
cmds:
|
||||
- uv run python -m pipeline.journey_times --destination {{.CLI_ARGS}} --output-dir {{.DATA_DIR}} --postcodes {{.ARCGIS_OUTPUT}}
|
||||
|
||||
prepare:
|
||||
desc: Build wide property dataframe with all joins
|
||||
deps:
|
||||
# - transform:epc-pp
|
||||
# - download:arcgis
|
||||
# - download:deprivation
|
||||
# - download:ethnicity
|
||||
# - download:broadband
|
||||
# - download:noise
|
||||
# - transform:crime
|
||||
# - transform:poi-proximity
|
||||
# - transform:school-proximity
|
||||
# - transform:geosure
|
||||
# - prompt:journey-times
|
||||
status:
|
||||
- test -f {{.WIDE_OUTPUT}}
|
||||
cmds:
|
||||
- >-
|
||||
uv run python -m pipeline.transform.merge
|
||||
--epc-pp {{.EPC_PP_OUTPUT}}
|
||||
--arcgis {{.ARCGIS_OUTPUT}}
|
||||
--iod {{.IOD_OUTPUT}}
|
||||
--poi-proximity {{.POI_PROXIMITY_OUTPUT}}
|
||||
--journey-times-bank {{.JOURNEY_TIMES_BANK}}
|
||||
--journey-times-fitzrovia {{.JOURNEY_TIMES_FITZROVIA}}
|
||||
--ethnicity {{.ETHNICITY_OUTPUT}}
|
||||
--crime {{.CRIME_OUTPUT}}
|
||||
--noise {{.NOISE_OUTPUT}}
|
||||
--school-proximity {{.SCHOOL_PROXIMITY_OUTPUT}}
|
||||
--broadband {{.BROADBAND_OUTPUT}}
|
||||
--geosure {{.GEOSURE_PARQUET}}
|
||||
--output {{.WIDE_OUTPUT}}
|
||||
|
|
@ -1,10 +1,5 @@
|
|||
version: '3'
|
||||
|
||||
includes:
|
||||
data:
|
||||
taskfile: ./Taskfile.data.yml
|
||||
flatten: true
|
||||
|
||||
|
||||
tasks:
|
||||
install:
|
||||
|
|
|
|||
|
|
@ -13,7 +13,8 @@ const DATA_SOURCES = [
|
|||
id: 'epc',
|
||||
name: 'Energy Performance Certificates (EPC)',
|
||||
origin: 'Ministry of Housing, Communities & Local Government',
|
||||
use: 'Domestic Energy Performance Certificates providing floor area, number of rooms, construction age, energy ratings, property type, and built form. Fuzzy-joined with Price Paid records by address within postcode buckets.',
|
||||
use: 'Domestic Energy Performance Certificates providing floor area, number of rooms, construction age, energy ratings, property type, and built form. Fuzzy-joined with Price Paid records by address within postcode buckets. Property owners can opt out of public disclosure.',
|
||||
optOutUrl: 'https://www.gov.uk/guidance/energy-performance-certificates-opt-out-of-public-disclosure',
|
||||
url: 'https://epc.opendatacommunities.org/downloads/domestic',
|
||||
license: 'Open Government Licence v3.0',
|
||||
},
|
||||
|
|
@ -180,6 +181,18 @@ export default function DataSourcesPage() {
|
|||
>
|
||||
{source.url}
|
||||
</a>
|
||||
{'optOutUrl' in source && source.optOutUrl && (
|
||||
<div className="mt-2">
|
||||
<a
|
||||
href={source.optOutUrl}
|
||||
target="_blank"
|
||||
rel="noopener noreferrer"
|
||||
className="text-sm text-teal-600 dark:text-teal-400 hover:text-teal-800 dark:hover:text-teal-300 hover:underline"
|
||||
>
|
||||
Opt out of public disclosure
|
||||
</a>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
|
|
|
|||
22
frontend/src/components/ui/icons/LogoIcon.tsx
Normal file
22
frontend/src/components/ui/icons/LogoIcon.tsx
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
interface IconProps {
|
||||
className?: string;
|
||||
}
|
||||
|
||||
export function LogoIcon({ className = 'w-4 h-4' }: IconProps) {
|
||||
return (
|
||||
<svg
|
||||
className={className}
|
||||
viewBox="0 0 24 24"
|
||||
fill="none"
|
||||
stroke="currentColor"
|
||||
strokeWidth={2}
|
||||
>
|
||||
<path
|
||||
strokeLinecap="round"
|
||||
strokeLinejoin="round"
|
||||
d="M12 2L20.7 7v10L12 22l-8.7-5V7z"
|
||||
/>
|
||||
<path strokeLinecap="round" strokeLinejoin="round" d="M8.5 12.5l2.5 2.5 4.5-5" />
|
||||
</svg>
|
||||
);
|
||||
}
|
||||
|
|
@ -3,6 +3,7 @@ const HtmlWebpackPlugin = require('html-webpack-plugin');
|
|||
const MiniCssExtractPlugin = require('mini-css-extract-plugin');
|
||||
const CopyWebpackPlugin = require('copy-webpack-plugin');
|
||||
const ReactRefreshWebpackPlugin = require('@pmmmwh/react-refresh-webpack-plugin');
|
||||
const FaviconsWebpackPlugin = require('favicons-webpack-plugin');
|
||||
|
||||
module.exports = (env, argv) => {
|
||||
const isProduction = argv.mode === 'production';
|
||||
|
|
@ -54,6 +55,20 @@ module.exports = (env, argv) => {
|
|||
new CopyWebpackPlugin({
|
||||
patterns: [{ from: 'public', noErrorOnMissing: true }],
|
||||
}),
|
||||
new FaviconsWebpackPlugin({
|
||||
logo: './public/favicon.svg',
|
||||
favicons: {
|
||||
background: '#0c0a09',
|
||||
icons: {
|
||||
favicons: true,
|
||||
android: false,
|
||||
appleIcon: false,
|
||||
appleStartup: false,
|
||||
yandex: false,
|
||||
windows: false,
|
||||
},
|
||||
},
|
||||
}),
|
||||
...(isProduction
|
||||
? [new MiniCssExtractPlugin()]
|
||||
: [new ReactRefreshWebpackPlugin()]),
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
(title) Find your <green>perfect postcode</green> before you find your property.
|
||||
|
||||
Set the sliders according to your expectations and the map will highlight the areas that actually match. Instantly.
|
||||
Set the sliders to your expectations and the map highlights the areas that actually match. Instantly.
|
||||
|
||||
<example with two sliders, last known price, minor crime/>
|
||||
|
||||
|
|
@ -10,9 +10,9 @@ That's just two. We've built 43 — spanning transport links, amenities, demogra
|
|||
|
||||
Here's the problem with property search: listings only show you what's on the market right now — a thin slice of what an area is actually like. And even if you could look beyond them, there are millions of postcodes across England. You can't research them all yourself.
|
||||
|
||||
So we've built this dashboard for You. We took millions of historical transactions and public records, connected and extended them with proprietary algorithms on top — so the map doesn't just show you raw data, it surfaces the patterns that matter.
|
||||
We built this for you — years of historical transactions and public records, extended with proprietary algorithms so the map doesn't just show raw data, it surfaces the patterns that matter.
|
||||
|
||||
Understand areas first. Then find the right property within them, with expectations you've set yourself rather than ones the market set for you.
|
||||
Understand areas first. Then find the right property within them, with expectations you've set rather than ones the market set for you.
|
||||
|
||||
(Fun cereal graphic on the side with this popup) You might buy a box of cereal because it's 20% off. Your next home is not a box of cereal. Don't let a discount on the wrong property distract you from finding the right one. Know what you're looking for, then go looking.
|
||||
|
||||
|
|
|
|||
2
manual-data/.gitignore
vendored
Normal file
2
manual-data/.gitignore
vendored
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
certificates.csv
|
||||
crime
|
||||
|
|
@ -3,6 +3,7 @@
|
|||
Test set: properties with 2+ sales where the last sale is 2022-2025.
|
||||
Uses the second-to-last sale as input, predicts the last sale price.
|
||||
Compares index-based prediction against a naive baseline (raw input price).
|
||||
Uses type-stratified index when available, falling back to "All" type.
|
||||
|
||||
Output: backtest_results.parquet with predictions vs actuals.
|
||||
"""
|
||||
|
|
@ -15,6 +16,17 @@ import polars as pl
|
|||
|
||||
CURRENT_YEAR = 2025
|
||||
TEST_YEAR_MIN = 2022
|
||||
TERRACE_TYPES = ["Mid-Terrace", "End-Terrace", "Enclosed Mid-Terrace", "Enclosed End-Terrace"]
|
||||
|
||||
|
||||
def type_group_expr():
|
||||
return (
|
||||
pl.when(pl.col("Property type").is_in(TERRACE_TYPES)).then(pl.lit("Terraced"))
|
||||
.when(pl.col("Property type") == "Flats/Maisonettes").then(pl.lit("Flats"))
|
||||
.when(pl.col("Property type").is_in(["Detached", "Semi-Detached"])).then(pl.col("Property type"))
|
||||
.otherwise(pl.lit(None))
|
||||
.alias("type_group")
|
||||
)
|
||||
|
||||
|
||||
def extract_test_set(input_path: Path) -> pl.DataFrame:
|
||||
|
|
@ -22,13 +34,14 @@ def extract_test_set(input_path: Path) -> pl.DataFrame:
|
|||
print("Loading test set...")
|
||||
df = (
|
||||
pl.scan_parquet(input_path)
|
||||
.select("Postcode", "historical_prices")
|
||||
.select("Postcode", "historical_prices", "Property type")
|
||||
.filter(
|
||||
pl.col("Postcode").is_not_null(),
|
||||
pl.col("historical_prices").list.len() >= 2,
|
||||
)
|
||||
.with_columns(
|
||||
pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("sector"),
|
||||
type_group_expr(),
|
||||
# Last sale (ground truth)
|
||||
pl.col("historical_prices").list.last().struct.field("year").alias("actual_year"),
|
||||
pl.col("historical_prices").list.last().struct.field("price").alias("actual_price"),
|
||||
|
|
@ -49,21 +62,60 @@ def extract_test_set(input_path: Path) -> pl.DataFrame:
|
|||
|
||||
|
||||
def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
|
||||
"""Index-based prediction: adjust input price by sector index change."""
|
||||
# Join index at input year
|
||||
test = test.join(
|
||||
index.select("sector", "year", pl.col("log_index").alias("log_index_input")),
|
||||
left_on=["sector", "input_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
# Join index at actual year
|
||||
test = test.join(
|
||||
index.select("sector", "year", pl.col("log_index").alias("log_index_actual")),
|
||||
left_on=["sector", "actual_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
"""Index-based prediction with type-stratified fallback."""
|
||||
has_type_group = "type_group" in index.columns
|
||||
|
||||
if has_type_group:
|
||||
idx_typed = index.filter(pl.col("type_group") != "All")
|
||||
idx_all = index.filter(pl.col("type_group") == "All")
|
||||
|
||||
# Join type-specific index at input year
|
||||
test = test.join(
|
||||
idx_typed.select("sector", "type_group", "year", pl.col("log_index").alias("li_in_typed")),
|
||||
left_on=["sector", "type_group", "input_year"],
|
||||
right_on=["sector", "type_group", "year"],
|
||||
how="left",
|
||||
)
|
||||
# Join "All" index at input year
|
||||
test = test.join(
|
||||
idx_all.select("sector", "year", pl.col("log_index").alias("li_in_all")),
|
||||
left_on=["sector", "input_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
# Join type-specific index at actual year
|
||||
test = test.join(
|
||||
idx_typed.select("sector", "type_group", "year", pl.col("log_index").alias("li_act_typed")),
|
||||
left_on=["sector", "type_group", "actual_year"],
|
||||
right_on=["sector", "type_group", "year"],
|
||||
how="left",
|
||||
)
|
||||
# Join "All" index at actual year
|
||||
test = test.join(
|
||||
idx_all.select("sector", "year", pl.col("log_index").alias("li_act_all")),
|
||||
left_on=["sector", "actual_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
|
||||
test = test.with_columns(
|
||||
pl.col("li_in_typed").fill_null(pl.col("li_in_all")).alias("log_index_input"),
|
||||
pl.col("li_act_typed").fill_null(pl.col("li_act_all")).alias("log_index_actual"),
|
||||
)
|
||||
else:
|
||||
# Unstratified index
|
||||
test = test.join(
|
||||
index.select("sector", "year", pl.col("log_index").alias("log_index_input")),
|
||||
left_on=["sector", "input_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
test = test.join(
|
||||
index.select("sector", "year", pl.col("log_index").alias("log_index_actual")),
|
||||
left_on=["sector", "actual_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
|
||||
test = test.with_columns(
|
||||
(
|
||||
|
|
@ -75,7 +127,6 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
|
|||
|
||||
|
||||
def compute_metrics(actual: np.ndarray, predicted: np.ndarray) -> dict:
|
||||
"""Compute error metrics."""
|
||||
valid = np.isfinite(predicted) & np.isfinite(actual) & (actual > 0)
|
||||
actual = actual[valid]
|
||||
predicted = predicted[valid]
|
||||
|
|
@ -95,7 +146,6 @@ def compute_metrics(actual: np.ndarray, predicted: np.ndarray) -> dict:
|
|||
|
||||
|
||||
def print_metrics_table(metrics_by_stage: dict):
|
||||
"""Print a comparison table of metrics."""
|
||||
print("\n" + "=" * 55)
|
||||
print("BACKTEST RESULTS")
|
||||
print("=" * 55)
|
||||
|
|
@ -103,7 +153,6 @@ def print_metrics_table(metrics_by_stage: dict):
|
|||
metric_names = ["MdAPE (%)", "% within 10%", "% within 20%", "% within 30%", "MAE (£)", "Mean signed error (£)", "n"]
|
||||
stages = list(metrics_by_stage.keys())
|
||||
|
||||
# Header
|
||||
header = f"{'Metric':<25s}"
|
||||
for stage in stages:
|
||||
header += f" {stage:>14s}"
|
||||
|
|
@ -133,7 +182,12 @@ def main():
|
|||
args = parser.parse_args()
|
||||
|
||||
index = pl.read_parquet(args.index)
|
||||
print(f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors")
|
||||
has_type_group = "type_group" in index.columns
|
||||
if has_type_group:
|
||||
print(f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
|
||||
f"{index['type_group'].n_unique()} type groups")
|
||||
else:
|
||||
print(f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors")
|
||||
|
||||
test = extract_test_set(args.input)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,9 +1,10 @@
|
|||
"""Apply repeat-sales price index to estimate current property prices.
|
||||
"""Augment wide.parquet with an estimated current price column.
|
||||
|
||||
Joins the precomputed price index (from price_index.py) with each property's
|
||||
last known sale to produce an inflation-adjusted current price estimate.
|
||||
Joins the precomputed repeat-sales price index (from price_index.py) with each
|
||||
property's last known sale to produce an inflation-adjusted current price estimate.
|
||||
Uses type-stratified index when available, falling back to "All" type.
|
||||
|
||||
Output: estimated_prices.parquet with per-property estimates.
|
||||
Modifies wide.parquet in-place, adding the "Estimated current price" column.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
|
@ -12,78 +13,133 @@ from pathlib import Path
|
|||
import polars as pl
|
||||
|
||||
CURRENT_YEAR = 2025
|
||||
TERRACE_TYPES = ["Mid-Terrace", "End-Terrace", "Enclosed Mid-Terrace", "Enclosed End-Terrace"]
|
||||
|
||||
|
||||
def type_group_expr():
|
||||
return (
|
||||
pl.when(pl.col("Property type").is_in(TERRACE_TYPES)).then(pl.lit("Terraced"))
|
||||
.when(pl.col("Property type") == "Flats/Maisonettes").then(pl.lit("Flats"))
|
||||
.when(pl.col("Property type").is_in(["Detached", "Semi-Detached"])).then(pl.col("Property type"))
|
||||
.otherwise(pl.lit(None))
|
||||
.alias("type_group")
|
||||
)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Estimate current property prices")
|
||||
parser.add_argument("--input", type=Path, required=True, help="Path to wide.parquet")
|
||||
parser = argparse.ArgumentParser(description="Augment wide.parquet with estimated current prices")
|
||||
parser.add_argument("--input", type=Path, required=True, help="Path to wide.parquet (modified in-place)")
|
||||
parser.add_argument("--index", type=Path, required=True, help="Path to price_index.parquet")
|
||||
parser.add_argument("--output", type=Path, required=True, help="Output estimated_prices.parquet")
|
||||
args = parser.parse_args()
|
||||
|
||||
print("Loading property data...")
|
||||
df = (
|
||||
pl.scan_parquet(args.input)
|
||||
.select("Postcode", "Address per Property Register", "Last known price", "Date of last transaction")
|
||||
.filter(
|
||||
pl.col("Last known price").is_not_null(),
|
||||
pl.col("Postcode").is_not_null(),
|
||||
)
|
||||
.with_columns(
|
||||
pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("sector"),
|
||||
pl.col("Date of last transaction").dt.year().alias("sale_year"),
|
||||
)
|
||||
.collect()
|
||||
print("Loading wide.parquet...")
|
||||
df = pl.read_parquet(args.input)
|
||||
print(f" {len(df):,} rows, {len(df.columns)} columns")
|
||||
|
||||
# Drop existing estimated price column if re-running
|
||||
if "Estimated current price" in df.columns:
|
||||
df = df.drop("Estimated current price")
|
||||
|
||||
# Derive helper columns for the join
|
||||
has_price = (
|
||||
pl.col("Last known price").is_not_null()
|
||||
& pl.col("Postcode").is_not_null()
|
||||
& pl.col("Date of last transaction").is_not_null()
|
||||
)
|
||||
|
||||
df = df.with_columns(
|
||||
pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("_sector"),
|
||||
pl.col("Date of last transaction").dt.year().alias("_sale_year"),
|
||||
type_group_expr().alias("_type_group"),
|
||||
)
|
||||
print(f" {len(df):,} properties with known price and postcode")
|
||||
|
||||
index = pl.read_parquet(args.index)
|
||||
print(f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors")
|
||||
has_type_group = "type_group" in index.columns
|
||||
if has_type_group:
|
||||
print(f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
|
||||
f"{index['type_group'].n_unique()} type groups")
|
||||
else:
|
||||
print(f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors (unstratified)")
|
||||
|
||||
print("\nApplying repeat-sales index...")
|
||||
|
||||
# Join index at sale year
|
||||
df = df.join(
|
||||
index.select("sector", "year", pl.col("log_index").alias("log_index_sale")),
|
||||
left_on=["sector", "sale_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
if has_type_group:
|
||||
idx_typed = index.filter(pl.col("type_group") != "All")
|
||||
idx_all = index.filter(pl.col("type_group") == "All")
|
||||
|
||||
# Join index at current year
|
||||
index_current = (
|
||||
index.filter(pl.col("year") == CURRENT_YEAR)
|
||||
.select("sector", pl.col("log_index").alias("log_index_current"))
|
||||
)
|
||||
df = df.join(index_current, on="sector", how="left")
|
||||
|
||||
# Compute estimate; fall back to raw price when no index available
|
||||
df = df.with_columns(
|
||||
(
|
||||
pl.col("Last known price").cast(pl.Float64)
|
||||
* (pl.col("log_index_current") - pl.col("log_index_sale")).exp()
|
||||
# Join type-specific index at sale year
|
||||
df = df.join(
|
||||
idx_typed.select("sector", "type_group", "year", pl.col("log_index").alias("log_idx_sale_typed")),
|
||||
left_on=["_sector", "_type_group", "_sale_year"],
|
||||
right_on=["sector", "type_group", "year"],
|
||||
how="left",
|
||||
)
|
||||
.fill_null(pl.col("Last known price").cast(pl.Float64))
|
||||
.alias("estimated_price"),
|
||||
# Join "All" index at sale year
|
||||
df = df.join(
|
||||
idx_all.select("sector", "year", pl.col("log_index").alias("log_idx_sale_all")),
|
||||
left_on=["_sector", "_sale_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
# Join type-specific index at current year
|
||||
df = df.join(
|
||||
idx_typed.filter(pl.col("year") == CURRENT_YEAR)
|
||||
.select("sector", "type_group", pl.col("log_index").alias("log_idx_cur_typed")),
|
||||
left_on=["_sector", "_type_group"],
|
||||
right_on=["sector", "type_group"],
|
||||
how="left",
|
||||
)
|
||||
# Join "All" index at current year
|
||||
df = df.join(
|
||||
idx_all.filter(pl.col("year") == CURRENT_YEAR)
|
||||
.select("sector", pl.col("log_index").alias("log_idx_cur_all")),
|
||||
left_on="_sector",
|
||||
right_on="sector",
|
||||
how="left",
|
||||
)
|
||||
|
||||
df = df.with_columns(
|
||||
pl.col("log_idx_sale_typed").fill_null(pl.col("log_idx_sale_all")).alias("_log_index_sale"),
|
||||
pl.col("log_idx_cur_typed").fill_null(pl.col("log_idx_cur_all")).alias("_log_index_current"),
|
||||
)
|
||||
else:
|
||||
df = df.join(
|
||||
index.select("sector", "year", pl.col("log_index").alias("_log_index_sale")),
|
||||
left_on=["_sector", "_sale_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
index_current = (
|
||||
index.filter(pl.col("year") == CURRENT_YEAR)
|
||||
.select("sector", pl.col("log_index").alias("_log_index_current"))
|
||||
)
|
||||
df = df.join(index_current, left_on="_sector", right_on="sector", how="left")
|
||||
|
||||
# Compute estimate — only for rows with a known price
|
||||
df = df.with_columns(
|
||||
pl.when(has_price)
|
||||
.then(
|
||||
pl.col("Last known price").cast(pl.Float64)
|
||||
* (pl.col("_log_index_current") - pl.col("_log_index_sale")).exp()
|
||||
)
|
||||
.otherwise(pl.lit(None))
|
||||
.alias("Estimated current price"),
|
||||
)
|
||||
|
||||
n_adjusted = df.filter(pl.col("log_index_sale").is_not_null()).height
|
||||
print(f" {n_adjusted:,} properties adjusted by index ({n_adjusted / len(df) * 100:.1f}%)")
|
||||
n_adjusted = df.filter(
|
||||
has_price & pl.col("_log_index_sale").is_not_null()
|
||||
).height
|
||||
n_with_price = df.filter(has_price).height
|
||||
print(f" {n_adjusted:,} of {n_with_price:,} properties adjusted by index ({n_adjusted / max(n_with_price, 1) * 100:.1f}%)")
|
||||
|
||||
# Select output columns
|
||||
output = df.select(
|
||||
"Postcode",
|
||||
"Address per Property Register",
|
||||
pl.col("Last known price").alias("last_price"),
|
||||
"sale_year",
|
||||
"sector",
|
||||
"estimated_price",
|
||||
)
|
||||
# Drop all temporary columns
|
||||
temp_cols = [c for c in df.columns if c.startswith("_") or c.startswith("log_idx_")]
|
||||
df = df.drop(temp_cols)
|
||||
|
||||
output.write_parquet(args.output)
|
||||
size_mb = args.output.stat().st_size / (1024 * 1024)
|
||||
print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
|
||||
print(f" {len(output):,} rows")
|
||||
df.write_parquet(args.input)
|
||||
size_mb = args.input.stat().st_size / (1024 * 1024)
|
||||
print(f"\nWrote {args.input} ({size_mb:.1f} MB)")
|
||||
print(f" {len(df):,} rows, {len(df.columns)} columns (including 'Estimated current price')")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
|
|
@ -1,10 +1,12 @@
|
|||
"""Stage 1: Repeat-Sales Price Index
|
||||
"""Repeat-Sales Price Index (improved)
|
||||
|
||||
Builds a hierarchical Case-Shiller repeat-sales price index from historical
|
||||
transaction data. Solves WLS regression per postcode sector, district, area,
|
||||
and nationally, then applies Bayesian shrinkage toward parent geographies.
|
||||
Builds a hierarchical repeat-sales price index with:
|
||||
1. Stratification by property type (Detached/Semi-Detached/Terraced/Flats)
|
||||
2. Robust regression (IRLS with Huber weights) instead of hard outlier cutoff
|
||||
3. National hedonic time-dummy model as ultimate shrinkage fallback
|
||||
4. Spatial smoothing for sparse sectors via KD-tree nearest neighbors
|
||||
|
||||
Output: price_index.parquet with columns: sector, year, log_index, n_pairs
|
||||
Output: price_index.parquet — sector × type_group × year → log_index
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
|
@ -14,31 +16,74 @@ import numpy as np
|
|||
import polars as pl
|
||||
from scipy.sparse import csc_matrix
|
||||
from scipy.sparse.linalg import lsqr
|
||||
from scipy.spatial import KDTree
|
||||
from tqdm import tqdm
|
||||
|
||||
MIN_PAIRS = 5 # minimum pairs to compute an index
|
||||
SHRINKAGE_K = 50 # shrinkage parameter: higher = more shrinkage toward parent
|
||||
OUTLIER_THRESHOLD = 2.5 # |log_ratio| > this → drop (>12x price change)
|
||||
# --- Constants ---
|
||||
MIN_PAIRS = 5
|
||||
SHRINKAGE_K = 50
|
||||
OUTLIER_THRESHOLD = 3.0 # hard pre-filter; Huber handles the rest
|
||||
HUBER_K = 1.345
|
||||
IRLS_ITERATIONS = 5
|
||||
SPATIAL_NEIGHBORS = 5
|
||||
SPATIAL_BLEND_K = 30
|
||||
CURRENT_YEAR = 2025
|
||||
|
||||
TYPE_GROUPS = ["Detached", "Semi-Detached", "Terraced", "Flats"]
|
||||
TERRACE_TYPES = ["Mid-Terrace", "End-Terrace", "Enclosed Mid-Terrace", "Enclosed End-Terrace"]
|
||||
AGE_BREAKS = [1900, 1930, 1950, 1967, 1983, 2000, 2010]
|
||||
AGE_LABELS = ["pre-1900", "1900-1929", "1930-1949", "1950-1966", "1967-1982", "1983-1999", "2000-2009", "2010+"]
|
||||
|
||||
|
||||
def type_group_expr():
|
||||
"""Polars expression: Property type → type_group."""
|
||||
return (
|
||||
pl.when(pl.col("Property type").is_in(TERRACE_TYPES)).then(pl.lit("Terraced"))
|
||||
.when(pl.col("Property type") == "Flats/Maisonettes").then(pl.lit("Flats"))
|
||||
.when(pl.col("Property type").is_in(["Detached", "Semi-Detached"])).then(pl.col("Property type"))
|
||||
.otherwise(pl.lit(None))
|
||||
.alias("type_group")
|
||||
)
|
||||
|
||||
|
||||
def age_band_expr():
|
||||
"""Polars expression: Construction age (UInt16 year) → age band string."""
|
||||
expr = pl.when(pl.col("Construction age").is_null()).then(pl.lit(None))
|
||||
for i, brk in enumerate(AGE_BREAKS):
|
||||
expr = expr.when(pl.col("Construction age") < brk).then(pl.lit(AGE_LABELS[i]))
|
||||
return expr.otherwise(pl.lit(AGE_LABELS[-1])).alias("age_band")
|
||||
|
||||
|
||||
def sector_expr():
|
||||
"""Polars expression: Postcode → sector (drop last 2 chars, strip)."""
|
||||
return pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("sector")
|
||||
|
||||
|
||||
def hierarchy_keys(sector: str) -> tuple[str, str]:
|
||||
"""Return (district, area) for a sector string."""
|
||||
district = sector.rsplit(" ", 1)[0] if " " in sector else sector
|
||||
area = ""
|
||||
for ch in district:
|
||||
if ch.isalpha():
|
||||
area += ch
|
||||
else:
|
||||
break
|
||||
return district, area
|
||||
|
||||
|
||||
# --- Pair extraction ---
|
||||
|
||||
def extract_pairs(input_path: Path) -> pl.DataFrame:
|
||||
"""Extract consecutive sale pairs from historical_prices."""
|
||||
print("Loading historical prices...")
|
||||
print("Extracting repeat-sale pairs...")
|
||||
df = (
|
||||
pl.scan_parquet(input_path)
|
||||
.select("Postcode", "historical_prices")
|
||||
.filter(
|
||||
pl.col("Postcode").is_not_null(),
|
||||
pl.col("historical_prices").list.len() >= 2,
|
||||
)
|
||||
.with_columns(
|
||||
pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("sector"),
|
||||
)
|
||||
.select("Postcode", "historical_prices", "Property type")
|
||||
.filter(pl.col("Postcode").is_not_null(), pl.col("historical_prices").list.len() >= 2)
|
||||
.with_columns(sector_expr(), type_group_expr())
|
||||
.collect()
|
||||
)
|
||||
print(f" {len(df):,} properties with 2+ transactions")
|
||||
|
||||
print("Extracting consecutive pairs...")
|
||||
pairs = (
|
||||
df.lazy()
|
||||
.with_columns(
|
||||
|
|
@ -52,12 +97,8 @@ def extract_pairs(input_path: Path) -> pl.DataFrame:
|
|||
pl.col("to_txn").struct.field("year").alias("year2"),
|
||||
pl.col("to_txn").struct.field("price").alias("price2"),
|
||||
)
|
||||
.select("sector", "year1", "price1", "year2", "price2")
|
||||
.filter(
|
||||
pl.col("price1") > 0,
|
||||
pl.col("price2") > 0,
|
||||
pl.col("year2") > pl.col("year1"),
|
||||
)
|
||||
.select("sector", "type_group", "year1", "price1", "year2", "price2")
|
||||
.filter(pl.col("price1") > 0, pl.col("price2") > 0, pl.col("year2") > pl.col("year1"))
|
||||
.with_columns(
|
||||
(pl.col("price2").cast(pl.Float64) / pl.col("price1").cast(pl.Float64)).log().alias("log_ratio"),
|
||||
(1.0 / (pl.col("year2") - pl.col("year1")).cast(pl.Float64).sqrt()).alias("weight"),
|
||||
|
|
@ -65,207 +106,428 @@ def extract_pairs(input_path: Path) -> pl.DataFrame:
|
|||
.filter(pl.col("log_ratio").abs() <= OUTLIER_THRESHOLD)
|
||||
.collect()
|
||||
)
|
||||
print(f" {len(pairs):,} consecutive pairs extracted")
|
||||
|
||||
# Add hierarchy columns
|
||||
pairs = pairs.with_columns(
|
||||
pl.col("sector").str.replace(r"\s+\d+$", "").alias("district"),
|
||||
).with_columns(
|
||||
pl.col("district").str.replace(r"\d.*$", "").alias("area"),
|
||||
)
|
||||
|
||||
print(f" {len(pairs):,} pairs extracted")
|
||||
return pairs
|
||||
|
||||
|
||||
def solve_wls_index(years1: np.ndarray, years2: np.ndarray, log_ratios: np.ndarray, weights: np.ndarray) -> dict[int, float]:
|
||||
"""Solve WLS repeat-sales regression for a set of pairs.
|
||||
# --- Sector centroids ---
|
||||
|
||||
Model: log(P2/P1) = beta[year2] - beta[year1], weighted by 1/sqrt(gap).
|
||||
Pin beta[min_year] = 0.
|
||||
Returns dict mapping year -> log_index (cumulative).
|
||||
"""
|
||||
if len(years1) < MIN_PAIRS:
|
||||
def extract_centroids(input_path: Path) -> dict[str, tuple[float, float]]:
|
||||
print("Computing sector centroids...")
|
||||
df = (
|
||||
pl.scan_parquet(input_path)
|
||||
.select("Postcode", "lat", "lon")
|
||||
.filter(pl.col("Postcode").is_not_null(), pl.col("lat").is_not_null())
|
||||
.with_columns(sector_expr())
|
||||
.group_by("sector")
|
||||
.agg(pl.col("lat").mean(), pl.col("lon").mean())
|
||||
.collect()
|
||||
)
|
||||
centroids = {}
|
||||
for row in df.iter_rows(named=True):
|
||||
centroids[row["sector"]] = (row["lat"], row["lon"])
|
||||
print(f" {len(centroids):,} sector centroids")
|
||||
return centroids
|
||||
|
||||
|
||||
# --- Robust IRLS solver ---
|
||||
|
||||
def solve_robust_index(
|
||||
years1: np.ndarray, years2: np.ndarray,
|
||||
log_ratios: np.ndarray, base_weights: np.ndarray,
|
||||
) -> dict[int, float]:
|
||||
"""IRLS Huber M-estimation for the Case-Shiller repeat-sales model."""
|
||||
n = len(years1)
|
||||
if n < MIN_PAIRS:
|
||||
return {}
|
||||
|
||||
all_years = np.union1d(years1, years2)
|
||||
min_year = int(all_years.min())
|
||||
# Map years to column indices, skipping min_year (pinned to 0)
|
||||
|
||||
col = 0
|
||||
year_to_col = {}
|
||||
for y in all_years:
|
||||
if int(y) != min_year:
|
||||
year_to_col[int(y)] = col
|
||||
iy = int(y)
|
||||
if iy != min_year:
|
||||
year_to_col[iy] = col
|
||||
col += 1
|
||||
n_cols = len(year_to_col)
|
||||
if n_cols == 0:
|
||||
return {}
|
||||
|
||||
n_rows = len(years1)
|
||||
row_idx = []
|
||||
col_idx = []
|
||||
data = []
|
||||
# Vectorized column index mapping
|
||||
col2 = np.full(n, -1, dtype=np.int32)
|
||||
col1 = np.full(n, -1, dtype=np.int32)
|
||||
for year, c in year_to_col.items():
|
||||
col2[years2 == year] = c
|
||||
col1[years1 == year] = c
|
||||
|
||||
for i in range(n_rows):
|
||||
y1, y2 = int(years1[i]), int(years2[i])
|
||||
if y2 in year_to_col:
|
||||
row_idx.append(i)
|
||||
col_idx.append(year_to_col[y2])
|
||||
data.append(weights[i])
|
||||
if y1 in year_to_col:
|
||||
row_idx.append(i)
|
||||
col_idx.append(year_to_col[y1])
|
||||
data.append(-weights[i])
|
||||
# Sparse matrix structure (fixed across iterations)
|
||||
mask2 = col2 >= 0
|
||||
mask1 = col1 >= 0
|
||||
rows_arr = np.concatenate([np.where(mask2)[0], np.where(mask1)[0]])
|
||||
cols_arr = np.concatenate([col2[mask2], col1[mask1]])
|
||||
signs_arr = np.concatenate([np.ones(mask2.sum()), -np.ones(mask1.sum())])
|
||||
|
||||
A = csc_matrix((data, (row_idx, col_idx)), shape=(n_rows, n_cols))
|
||||
b = log_ratios * weights
|
||||
weights = base_weights.copy()
|
||||
|
||||
result = lsqr(A, b, atol=1e-10, btol=1e-10)
|
||||
betas = result[0]
|
||||
for _ in range(IRLS_ITERATIONS):
|
||||
data = signs_arr * weights[rows_arr]
|
||||
A = csc_matrix((data, (rows_arr, cols_arr)), shape=(n, n_cols))
|
||||
b = log_ratios * weights
|
||||
betas = lsqr(A, b, atol=1e-10, btol=1e-10)[0]
|
||||
|
||||
# Residuals
|
||||
predicted = np.zeros(n)
|
||||
predicted[mask2] += betas[col2[mask2]]
|
||||
predicted[mask1] -= betas[col1[mask1]]
|
||||
residuals = log_ratios - predicted
|
||||
|
||||
# Huber reweighting
|
||||
abs_r = np.abs(residuals)
|
||||
huber_w = np.where(abs_r <= HUBER_K, 1.0, HUBER_K / np.maximum(abs_r, 1e-10))
|
||||
weights = base_weights * huber_w
|
||||
|
||||
index = {min_year: 0.0}
|
||||
for year, col in year_to_col.items():
|
||||
index[year] = float(betas[col])
|
||||
for year, c in year_to_col.items():
|
||||
index[year] = float(betas[c])
|
||||
return index
|
||||
|
||||
|
||||
def compute_indices_for_level(pairs: pl.DataFrame, group_col: str) -> dict[str, dict[int, float]]:
|
||||
"""Compute raw indices for each geographic group."""
|
||||
def compute_indices_for_level(pairs: pl.DataFrame, group_col: str):
|
||||
"""Solve robust indices for each group. Returns (indices, n_pairs) dicts."""
|
||||
groups = pairs.group_by(group_col).agg(
|
||||
pl.col("year1"), pl.col("year2"), pl.col("log_ratio"), pl.col("weight"),
|
||||
)
|
||||
|
||||
indices = {}
|
||||
n_pairs_map = {}
|
||||
for row in tqdm(groups.iter_rows(named=True), total=len(groups), desc=f" Solving {group_col}"):
|
||||
n_pairs = {}
|
||||
for row in tqdm(groups.iter_rows(named=True), total=len(groups), desc=f" {group_col}"):
|
||||
key = row[group_col]
|
||||
y1 = np.array(row["year1"], dtype=np.int32)
|
||||
y2 = np.array(row["year2"], dtype=np.int32)
|
||||
lr = np.array(row["log_ratio"], dtype=np.float64)
|
||||
w = np.array(row["weight"], dtype=np.float64)
|
||||
idx = solve_wls_index(y1, y2, lr, w)
|
||||
idx = solve_robust_index(y1, y2, lr, w)
|
||||
if idx:
|
||||
indices[key] = idx
|
||||
n_pairs_map[key] = len(y1)
|
||||
return indices, n_pairs_map
|
||||
n_pairs[key] = len(y1)
|
||||
return indices, n_pairs
|
||||
|
||||
|
||||
def shrink_index(raw: dict[int, float], parent: dict[int, float], n_pairs: int) -> dict[int, float]:
|
||||
"""Bayesian shrinkage toward parent index."""
|
||||
w = n_pairs / (n_pairs + SHRINKAGE_K)
|
||||
# --- Hedonic model ---
|
||||
|
||||
def compute_hedonic_index(input_path: Path, min_year: int, max_year: int) -> dict[int, float]:
|
||||
"""Two-step hedonic index: regress log(price) on features, average residual by year."""
|
||||
print("Computing hedonic index...")
|
||||
df = (
|
||||
pl.scan_parquet(input_path)
|
||||
.select(
|
||||
"Last known price", "Date of last transaction", "Property type",
|
||||
"Total floor area (sqm)", "Current energy rating",
|
||||
"Number of bedrooms & living rooms", "Construction age",
|
||||
)
|
||||
.filter(
|
||||
pl.col("Last known price").is_not_null(),
|
||||
pl.col("Total floor area (sqm)").is_not_null(),
|
||||
pl.col("Total floor area (sqm)") > 0,
|
||||
pl.col("Current energy rating").is_in(["A", "B", "C", "D", "E", "F", "G"]),
|
||||
pl.col("Number of bedrooms & living rooms").is_not_null(),
|
||||
pl.col("Construction age").is_not_null(),
|
||||
)
|
||||
.with_columns(
|
||||
pl.col("Date of last transaction").dt.year().alias("sale_year"),
|
||||
type_group_expr(),
|
||||
age_band_expr(),
|
||||
)
|
||||
.filter(
|
||||
pl.col("type_group").is_not_null(),
|
||||
pl.col("sale_year").is_not_null(),
|
||||
pl.col("sale_year") >= min_year,
|
||||
pl.col("sale_year") <= max_year,
|
||||
)
|
||||
.collect()
|
||||
)
|
||||
print(f" {len(df):,} complete cases for hedonic model")
|
||||
|
||||
# Target
|
||||
log_price = np.log(df["Last known price"].to_numpy().astype(np.float64))
|
||||
sale_years = df["sale_year"].to_numpy()
|
||||
|
||||
# Build feature matrix
|
||||
parts = []
|
||||
# log(floor_area)
|
||||
fa = df["Total floor area (sqm)"].to_numpy().astype(np.float32)
|
||||
parts.append(np.log(np.maximum(fa, 1.0)).reshape(-1, 1))
|
||||
# Type dummies (ref: Detached)
|
||||
tg = df["type_group"].to_numpy()
|
||||
for t in ["Terraced", "Semi-Detached", "Flats"]:
|
||||
parts.append((tg == t).astype(np.float32).reshape(-1, 1))
|
||||
# EPC dummies (ref: D)
|
||||
epc = df["Current energy rating"].to_numpy()
|
||||
for r in ["A", "B", "C", "E", "F", "G"]:
|
||||
parts.append((epc == r).astype(np.float32).reshape(-1, 1))
|
||||
# Rooms
|
||||
parts.append(df["Number of bedrooms & living rooms"].to_numpy().astype(np.float32).reshape(-1, 1))
|
||||
# Age band dummies (ref: pre-1900)
|
||||
ab = df["age_band"].to_numpy()
|
||||
for band in AGE_LABELS[1:]:
|
||||
parts.append((ab == band).astype(np.float32).reshape(-1, 1))
|
||||
# Intercept
|
||||
parts.append(np.ones((len(df), 1), dtype=np.float32))
|
||||
|
||||
F = np.hstack(parts)
|
||||
print(f" Feature matrix: {F.shape[0]:,} × {F.shape[1]}")
|
||||
|
||||
# Step 1: regress log(price) on features → quality score
|
||||
betas = np.linalg.lstsq(F.astype(np.float64), log_price, rcond=None)[0]
|
||||
quality_score = F.astype(np.float64) @ betas
|
||||
residuals = log_price - quality_score
|
||||
|
||||
# Step 2: average residual by year = hedonic index
|
||||
hedonic = {}
|
||||
for y in range(min_year, max_year + 1):
|
||||
mask = sale_years == y
|
||||
if mask.sum() > 0:
|
||||
hedonic[y] = float(np.mean(residuals[mask]))
|
||||
|
||||
# Normalize: min_year = 0
|
||||
base = hedonic.get(min_year, 0.0)
|
||||
for y in hedonic:
|
||||
hedonic[y] -= base
|
||||
|
||||
print(f" Hedonic index: {len(hedonic)} years, range {min(hedonic.values()):.3f} to {max(hedonic.values()):.3f}")
|
||||
return hedonic
|
||||
|
||||
|
||||
# --- Shrinkage ---
|
||||
|
||||
def shrink_index(raw: dict, parent: dict, n_pairs: int, k: int = SHRINKAGE_K) -> dict:
|
||||
w = n_pairs / (n_pairs + k)
|
||||
result = {}
|
||||
all_years = set(raw.keys()) | set(parent.keys())
|
||||
for y in all_years:
|
||||
raw_val = raw.get(y, parent.get(y, 0.0))
|
||||
parent_val = parent.get(y, raw.get(y, 0.0))
|
||||
result[y] = w * raw_val + (1 - w) * parent_val
|
||||
for y in set(raw) | set(parent):
|
||||
r = raw.get(y, parent.get(y, 0.0))
|
||||
p = parent.get(y, raw.get(y, 0.0))
|
||||
result[y] = w * r + (1 - w) * p
|
||||
return result
|
||||
|
||||
|
||||
def forward_fill_index(index: dict[int, float], min_year: int, max_year: int) -> dict[int, float]:
|
||||
"""Forward-fill missing years so index is continuous."""
|
||||
def apply_shrinkage(
|
||||
sector_idx, sector_n, district_idx, district_n,
|
||||
area_idx, area_n, national_idx, national_n,
|
||||
hedonic_idx, all_sectors, sector_to_dist, dist_to_area,
|
||||
):
|
||||
"""Top-down hierarchical shrinkage: national→hedonic, area→national, etc."""
|
||||
# National → hedonic
|
||||
national_shrunk = shrink_index(national_idx, hedonic_idx, national_n)
|
||||
|
||||
# Area → national
|
||||
area_shrunk = {}
|
||||
for area, idx in area_idx.items():
|
||||
area_shrunk[area] = shrink_index(idx, national_shrunk, area_n[area])
|
||||
|
||||
# District → area
|
||||
district_shrunk = {}
|
||||
for dist, idx in district_idx.items():
|
||||
a = dist_to_area.get(dist, "")
|
||||
parent = area_shrunk.get(a, national_shrunk)
|
||||
district_shrunk[dist] = shrink_index(idx, parent, district_n[dist])
|
||||
|
||||
# Sector → district
|
||||
sector_shrunk = {}
|
||||
for sec, idx in sector_idx.items():
|
||||
d = sector_to_dist.get(sec, "")
|
||||
parent = district_shrunk.get(d, national_shrunk)
|
||||
sector_shrunk[sec] = shrink_index(idx, parent, sector_n[sec])
|
||||
|
||||
# Fill sectors without their own index
|
||||
for sec in all_sectors:
|
||||
if sec not in sector_shrunk:
|
||||
d = sector_to_dist.get(sec, "")
|
||||
a = dist_to_area.get(d, "")
|
||||
sector_shrunk[sec] = district_shrunk.get(
|
||||
d, area_shrunk.get(a, national_shrunk)
|
||||
)
|
||||
|
||||
return sector_shrunk
|
||||
|
||||
|
||||
# --- Spatial smoothing ---
|
||||
|
||||
def spatial_smooth(
|
||||
sector_indices: dict, centroids: dict, n_pairs_map: dict,
|
||||
) -> dict:
|
||||
"""Blend sparse sector indices with K nearest neighbors."""
|
||||
# Build coordinate arrays for sectors with centroids
|
||||
sectors_with_coords = [s for s in sector_indices if s in centroids]
|
||||
if len(sectors_with_coords) < SPATIAL_NEIGHBORS + 1:
|
||||
return sector_indices
|
||||
|
||||
coords = np.array([centroids[s] for s in sectors_with_coords])
|
||||
# Scale longitude by cos(mean_lat) for approximate Euclidean distance
|
||||
mean_lat = np.mean(coords[:, 0])
|
||||
scale = np.cos(np.radians(mean_lat))
|
||||
scaled_coords = np.column_stack([coords[:, 0], coords[:, 1] * scale])
|
||||
tree = KDTree(scaled_coords)
|
||||
|
||||
result = dict(sector_indices)
|
||||
for i, sec in enumerate(sectors_with_coords):
|
||||
n = n_pairs_map.get(sec, 0)
|
||||
self_w = n / (n + SPATIAL_BLEND_K)
|
||||
if self_w > 0.95:
|
||||
continue # enough data, skip smoothing
|
||||
|
||||
dists, idxs = tree.query(scaled_coords[i], k=SPATIAL_NEIGHBORS + 1)
|
||||
# Skip self (index 0, distance ~0)
|
||||
neighbor_dists = dists[1:]
|
||||
neighbor_idxs = idxs[1:]
|
||||
|
||||
inv_dists = []
|
||||
neighbor_indices = []
|
||||
for d, j in zip(neighbor_dists, neighbor_idxs):
|
||||
ns = sectors_with_coords[j]
|
||||
if d > 0 and ns in sector_indices:
|
||||
inv_dists.append(1.0 / d)
|
||||
neighbor_indices.append(sector_indices[ns])
|
||||
|
||||
if not neighbor_indices:
|
||||
continue
|
||||
|
||||
total_inv = sum(inv_dists)
|
||||
nbr_w = 1.0 - self_w
|
||||
ws = [iw / total_inv * nbr_w for iw in inv_dists]
|
||||
|
||||
blended = {}
|
||||
all_years = set(sector_indices[sec])
|
||||
for ni in neighbor_indices:
|
||||
all_years |= set(ni)
|
||||
for y in all_years:
|
||||
val = self_w * sector_indices[sec].get(y, 0.0)
|
||||
for ni, w in zip(neighbor_indices, ws):
|
||||
val += w * ni.get(y, 0.0)
|
||||
blended[y] = val
|
||||
result[sec] = blended
|
||||
|
||||
return result
|
||||
|
||||
|
||||
# --- Forward fill ---
|
||||
|
||||
def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
|
||||
filled = {}
|
||||
last_val = 0.0
|
||||
last = 0.0
|
||||
for y in range(min_year, max_year + 1):
|
||||
if y in index:
|
||||
last_val = index[y]
|
||||
filled[y] = last_val
|
||||
last = index[y]
|
||||
filled[y] = last
|
||||
return filled
|
||||
|
||||
|
||||
# --- Main ---
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Build repeat-sales price index")
|
||||
parser.add_argument("--input", type=Path, required=True, help="Path to wide.parquet")
|
||||
parser.add_argument("--output", type=Path, required=True, help="Output price_index.parquet")
|
||||
parser = argparse.ArgumentParser(description="Build improved repeat-sales price index")
|
||||
parser.add_argument("--input", type=Path, required=True)
|
||||
parser.add_argument("--output", type=Path, required=True)
|
||||
args = parser.parse_args()
|
||||
|
||||
pairs = extract_pairs(args.input)
|
||||
centroids = extract_centroids(args.input)
|
||||
|
||||
# Derive geographic hierarchy columns
|
||||
pairs = pairs.with_columns(
|
||||
# district = sector minus trailing digit(s), e.g. "SW1A 1" -> "SW1A"
|
||||
pl.col("sector").str.replace(r"\s+\d+$", "").alias("district"),
|
||||
).with_columns(
|
||||
# area = leading letters only, e.g. "SW1A" -> "SW"
|
||||
pl.col("district").str.replace(r"\d.*$", "").alias("area"),
|
||||
)
|
||||
|
||||
# Solve indices at each level
|
||||
print("\nComputing national index...")
|
||||
pairs_np = pairs.select("year1", "year2", "log_ratio", "weight")
|
||||
national_idx = solve_wls_index(
|
||||
pairs_np["year1"].to_numpy(),
|
||||
pairs_np["year2"].to_numpy(),
|
||||
pairs_np["log_ratio"].to_numpy(),
|
||||
pairs_np["weight"].to_numpy(),
|
||||
)
|
||||
print(f" National index: {len(national_idx)} years")
|
||||
|
||||
print("\nComputing area indices...")
|
||||
area_indices, area_pairs = compute_indices_for_level(pairs, "area")
|
||||
print(f" {len(area_indices)} areas with indices")
|
||||
|
||||
print("\nComputing district indices...")
|
||||
district_indices, district_pairs = compute_indices_for_level(pairs, "district")
|
||||
print(f" {len(district_indices)} districts with indices")
|
||||
|
||||
print("\nComputing sector indices...")
|
||||
sector_indices, sector_pairs = compute_indices_for_level(pairs, "sector")
|
||||
print(f" {len(sector_indices)} sectors with indices")
|
||||
|
||||
# Shrink area -> national
|
||||
print("\nApplying hierarchical shrinkage...")
|
||||
for area, idx in tqdm(area_indices.items(), desc=" Area shrinkage"):
|
||||
area_indices[area] = shrink_index(idx, national_idx, area_pairs[area])
|
||||
|
||||
# Shrink district -> area
|
||||
for dist, idx in tqdm(district_indices.items(), desc=" District shrinkage"):
|
||||
area = dist.replace(r"\d.*$", "")
|
||||
# Extract area from district (leading letters)
|
||||
area_key = ""
|
||||
for ch in dist:
|
||||
if ch.isalpha():
|
||||
area_key += ch
|
||||
else:
|
||||
break
|
||||
parent = area_indices.get(area_key, national_idx)
|
||||
district_indices[dist] = shrink_index(idx, parent, district_pairs[dist])
|
||||
|
||||
# Shrink sector -> district
|
||||
for sector, idx in tqdm(sector_indices.items(), desc=" Sector shrinkage"):
|
||||
# District = sector minus trailing space+digit
|
||||
dist_key = sector.rsplit(" ", 1)[0] if " " in sector else sector
|
||||
parent = district_indices.get(dist_key, national_idx)
|
||||
sector_indices[sector] = shrink_index(idx, parent, sector_pairs[sector])
|
||||
|
||||
# For sectors without enough data, fall back to district/area/national
|
||||
all_sectors = pairs["sector"].unique().to_list()
|
||||
min_year = int(pairs["year1"].min())
|
||||
max_year = max(int(pairs["year2"].max()), 2025)
|
||||
max_year = max(int(pairs["year2"].max()), CURRENT_YEAR)
|
||||
|
||||
print(f"\nFilling gaps and forward-filling ({min_year}-{max_year})...")
|
||||
hedonic_idx = compute_hedonic_index(args.input, min_year, max_year)
|
||||
|
||||
# Precompute hierarchy
|
||||
all_sectors = pairs["sector"].unique().to_list()
|
||||
sector_to_dist = {}
|
||||
dist_to_area = {}
|
||||
for s in all_sectors:
|
||||
d, a = hierarchy_keys(s)
|
||||
sector_to_dist[s] = d
|
||||
dist_to_area[d] = a
|
||||
|
||||
# Process each type group + "All"
|
||||
all_type_groups = ["All"] + TYPE_GROUPS
|
||||
final = {} # {type_group: {sector: {year: log_index}}}
|
||||
final_n = {} # {type_group: {sector: n_pairs}}
|
||||
|
||||
for tg in all_type_groups:
|
||||
print(f"\n--- {tg} ---")
|
||||
typed = pairs if tg == "All" else pairs.filter(pl.col("type_group") == tg)
|
||||
if len(typed) < MIN_PAIRS:
|
||||
print(f" Skipping (only {len(typed)} pairs)")
|
||||
final[tg] = {s: dict(hedonic_idx) for s in all_sectors}
|
||||
final_n[tg] = {s: 0 for s in all_sectors}
|
||||
continue
|
||||
|
||||
print(f" {len(typed):,} pairs")
|
||||
|
||||
# National
|
||||
np_arrs = typed.select("year1", "year2", "log_ratio", "weight")
|
||||
national_idx = solve_robust_index(
|
||||
np_arrs["year1"].to_numpy(), np_arrs["year2"].to_numpy(),
|
||||
np_arrs["log_ratio"].to_numpy(), np_arrs["weight"].to_numpy(),
|
||||
)
|
||||
national_n = len(typed)
|
||||
print(f" National: {len(national_idx)} years")
|
||||
|
||||
# Area, district, sector
|
||||
print(" Computing per-level indices:")
|
||||
area_idx, area_n = compute_indices_for_level(typed, "area")
|
||||
district_idx, district_n = compute_indices_for_level(typed, "district")
|
||||
sector_idx, sector_n = compute_indices_for_level(typed, "sector")
|
||||
print(f" {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors")
|
||||
|
||||
# Shrinkage
|
||||
print(" Applying shrinkage...")
|
||||
sector_shrunk = apply_shrinkage(
|
||||
sector_idx, sector_n, district_idx, district_n,
|
||||
area_idx, area_n, national_idx, national_n,
|
||||
hedonic_idx, all_sectors, sector_to_dist, dist_to_area,
|
||||
)
|
||||
|
||||
# Spatial smoothing
|
||||
print(" Spatial smoothing...")
|
||||
sector_smoothed = spatial_smooth(sector_shrunk, centroids, sector_n)
|
||||
|
||||
# Forward fill
|
||||
for sec in all_sectors:
|
||||
sector_smoothed[sec] = forward_fill(
|
||||
sector_smoothed.get(sec, hedonic_idx), min_year, max_year
|
||||
)
|
||||
|
||||
final[tg] = sector_smoothed
|
||||
final_n[tg] = sector_n
|
||||
|
||||
# Assemble output
|
||||
print("\nAssembling output...")
|
||||
rows = []
|
||||
for sector in tqdm(all_sectors, desc=" Forward-fill"):
|
||||
if sector in sector_indices:
|
||||
idx = sector_indices[sector]
|
||||
else:
|
||||
# Fall back to district, area, national
|
||||
dist_key = sector.rsplit(" ", 1)[0] if " " in sector else sector
|
||||
area_key = ""
|
||||
for ch in dist_key:
|
||||
if ch.isalpha():
|
||||
area_key += ch
|
||||
else:
|
||||
break
|
||||
idx = district_indices.get(dist_key, area_indices.get(area_key, national_idx))
|
||||
|
||||
n = sector_pairs.get(sector, 0)
|
||||
filled = forward_fill_index(idx, min_year, max_year)
|
||||
for year, log_idx in filled.items():
|
||||
rows.append((sector, year, log_idx, n))
|
||||
for tg in all_type_groups:
|
||||
for sec in all_sectors:
|
||||
n = final_n[tg].get(sec, 0)
|
||||
for year, log_idx in final[tg][sec].items():
|
||||
rows.append((sec, tg, year, log_idx, n))
|
||||
|
||||
result = pl.DataFrame(
|
||||
rows,
|
||||
schema={"sector": pl.String, "year": pl.Int32, "log_index": pl.Float64, "n_pairs": pl.Int64},
|
||||
schema={"sector": pl.String, "type_group": pl.String, "year": pl.Int32,
|
||||
"log_index": pl.Float64, "n_pairs": pl.Int64},
|
||||
orient="row",
|
||||
)
|
||||
).sort("type_group", "sector", "year")
|
||||
|
||||
result = result.sort("sector", "year")
|
||||
result.write_parquet(args.output)
|
||||
size_mb = args.output.stat().st_size / (1024 * 1024)
|
||||
print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
|
||||
print(f" {result['sector'].n_unique():,} sectors × {max_year - min_year + 1} years = {len(result):,} rows")
|
||||
print(f" {result['sector'].n_unique():,} sectors × {len(all_type_groups)} types × {max_year - min_year + 1} years = {len(result):,} rows")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ from pathlib import Path
|
|||
|
||||
import polars as pl
|
||||
|
||||
from pipeline.utils.poi_counts import _count_pois_per_postcode
|
||||
from pipeline.utils.poi_counts import count_pois_per_postcode
|
||||
|
||||
SCHOOL_GROUPS = {
|
||||
"good_primary": ["good_primary"],
|
||||
|
|
@ -60,7 +60,7 @@ def main():
|
|||
# Load all postcodes for proximity counting
|
||||
postcodes = arcgis.rename({"lng": "lon"})
|
||||
|
||||
result = _count_pois_per_postcode(
|
||||
result = count_pois_per_postcode(
|
||||
postcodes, schools, radius_km=5, groups=SCHOOL_GROUPS
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -1 +0,0 @@
|
|||
/bulk/property-data
|
||||
2
property-data/.gitignore
vendored
Normal file
2
property-data/.gitignore
vendored
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
*
|
||||
!.gitignore
|
||||
|
|
@ -86,6 +86,20 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
|
|||
suffix: "",
|
||||
raw: false,
|
||||
},
|
||||
// FeatureConfig {
|
||||
// name: "Estimated current price",
|
||||
// bounds: Bounds::Fixed {
|
||||
// min: 0.0,
|
||||
// max: 2_000_000.0,
|
||||
// },
|
||||
// step: 10000.0,
|
||||
// description: "Inflation-adjusted estimate of the current property value",
|
||||
// detail: "Estimated by applying a repeat-sales price index to the last known sale price. The index tracks price changes within each postcode sector and property type. Properties sold recently will have estimates close to their sale price; older sales are adjusted more. Coverage depends on having enough repeat sales in the local area to build the index.",
|
||||
// source: "price-paid",
|
||||
// prefix: "£",
|
||||
// suffix: "",
|
||||
// raw: false,
|
||||
// },
|
||||
FeatureConfig {
|
||||
name: "Price per sqm",
|
||||
bounds: Bounds::Percentile {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue