perfect-postcode/pipeline/transform/join_epc_pp.py

693 lines
27 KiB
Python

import argparse
import csv
import io
import tempfile
import zipfile
from pathlib import Path
import polars as pl
import pyarrow as pa
import pyarrow.csv as pa_csv
import pyarrow.parquet as pq
from pipeline.local_temp import local_tmp_dir
from ..utils import (
fuzzy_join_on_postcode,
normalize_address_key,
normalize_postcode_key,
)
# Show every column when a frame is printed (a negative value lifts the
# column limit), so the debug prints below are not truncated.
pl.Config.set_tbl_cols(-1)
# Ordinal rank of each EPC energy rating letter: A is rank 1, G is rank 7.
RATING_RANK = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7}
# Value-quality floor for price aggregations. A flat nominal floor is a blunt
# tool against a deflating threshold — £50k was completely normal for a 1990s
# house, so a 50k floor wrongly discarded ~a third of legitimate 1990s
# open-market sales (and deleted properties whose only sales were old/cheap),
# biasing early-year price history upward. 10k recovers the large [10k,50k)
# band of genuine cheaper sales while still excluding the nominal/junk transfers
# (£1 etc.). A small tail of real sub-10k sales is still dropped — a deliberate
# conservative tradeoff to keep clearly-implausible transfers out.
MIN_PRICE = 10_000
# Time-aware consecutive-sale jump guard. Price-paid contains keyed-in price
# errors that pass the MIN_PRICE/category filters — e.g. 13 QUICKSETTS HR2 7PP,
# a 93 m² terrace, sold £140,000 in 2016 then "£207,500,000" in 2026 (clearly
# £207,500 with extra digits, lodged as category A) — and would otherwise
# become latest_price. A quality sale is flagged when it exceeds its
# neighbouring sale by more than JUMP_TOLERANCE * JUMP_GROWTH_PER_YEAR ** years
# between the two sales. Calibration: genuine extreme appreciation (prime
# London 1995->2026 is roughly x50 over 31 years) stays comfortably under
# 12 * 1.10**31 ≈ 230, while the HR2 case (x1,482 over 10 years against a
# threshold of 12 * 1.10**10 ≈ 31) is caught. JUMP_MIN_PRICE is an absolute
# floor on the flagged price itself so right-to-buy resales and other
# legitimate x20-50 jumps on cheap properties are never flagged.
JUMP_TOLERANCE = 12.0
JUMP_GROWTH_PER_YEAR = 1.10
JUMP_MIN_PRICE = 2_000_000
# Plausible construction-year range; band-derived years outside it (e.g. OCR
# noise like 1012 or 2202) are nulled rather than published.
MIN_BUILD_YEAR = 1700
MAX_BUILD_YEAR = 2030
# Plausibility bounds for raw EPC dimensions. EPC lodgements contain data-entry
# errors (0 m storey heights, 116 m "interior height", 9,210 m² floor areas, 99
# habitable rooms) that otherwise propagate verbatim into the published per-
# property columns. Values outside these bands are nulled (treated as unknown)
# rather than shown. Bounds are deliberately wide so only clear errors are cut.
MIN_FLOOR_HEIGHT_M = 1.5  # below this a storey is not habitable
MAX_FLOOR_HEIGHT_M = 6.0  # above this is a data error, not a normal storey
MAX_TOTAL_FLOOR_AREA_M2 = 2000.0  # ~21,500 sqft; larger is a bulk/garbage record
MAX_HABITABLE_ROOMS = 20  # dwellings above this are data errors
def epc_band_to_year(band: pl.Expr) -> pl.Expr:
    """Reduce an EPC construction age band to one representative build year.

    Rules, applied to the band text after removing the ``England and Wales: ``
    prefix and the `` onwards`` suffix:

    * text starting with ``before `` (an open-ended lower band) -> null;
    * two four-digit years (a range such as ``1950-1966``) -> the rounded
      midpoint (1958), not the lower bound, which would understate the year;
    * a single four-digit year (including ``... onwards`` bands and values
      that are already a plain year) -> that year.

    Any result outside [MIN_BUILD_YEAR, MAX_BUILD_YEAR] is nulled. The
    returned expression is UInt16.
    """
    label = (
        band.cast(pl.Utf8)
        .str.replace("England and Wales: ", "")
        .str.replace(" onwards", "")
    )

    # First four-digit run, and (when present) the second one of a range.
    first_year = label.str.extract(r"(\d{4})", 1).cast(pl.Int32, strict=False)
    second_year = label.str.extract(r"(\d{4})\D+(\d{4})", 2).cast(
        pl.Int32, strict=False
    )

    is_open_ended = label.str.starts_with("before ")
    midpoint = ((first_year + second_year) / 2).round(0).cast(pl.Int32)
    representative = (
        pl.when(is_open_ended)
        .then(None)
        .when(second_year.is_not_null())
        .then(midpoint)
        .otherwise(first_year)
    )

    is_plausible = (representative >= MIN_BUILD_YEAR) & (
        representative <= MAX_BUILD_YEAR
    )
    bounded = pl.when(is_plausible).then(representative).otherwise(None)
    return bounded.cast(pl.UInt16, strict=False)
# Raw EPC certificate columns the pipeline keeps, named as they appear after
# header normalisation (trimmed, lowercased). The zip reader selects exactly
# these columns, as strings, and relies on this order when renaming batches.
EPC_SOURCE_COLUMNS = [
    "address",
    "postcode",
    "uprn",
    "current_energy_rating",
    "potential_energy_rating",
    "property_type",
    "built_form",
    "inspection_date",
    "total_floor_area",
    "number_habitable_rooms",
    "floor_height",
    "construction_age_band",
    "tenure",
]
def _normalise_csv_columns(columns: list[str]) -> list[str]:
return [column.strip().lower() for column in columns]
def _clean_string(column: str) -> pl.Expr:
    """Build an expression for ``column`` as trimmed text, with blanks as null.

    The column is cast to String and stripped; a value that is empty after
    stripping becomes null.
    """
    trimmed = pl.col(column).cast(pl.String).str.strip_chars()
    is_blank = trimmed == ""
    return pl.when(is_blank).then(None).otherwise(trimmed)
def _clean_number(column: str, dtype: pl.DataType) -> pl.Expr:
    """Build an expression for ``column`` cleaned as text, then cast to ``dtype``.

    The cast is non-strict, so text that does not parse yields null instead of
    raising.
    """
    text_value = _clean_string(column)
    return text_value.cast(dtype, strict=False)
def _join_address_parts(*columns: str) -> pl.Expr:
    """Combine address component columns into one single-spaced address.

    Each component is first cleaned so that an empty string becomes null;
    ``concat_str(..., ignore_nulls=True)`` skips only nulls, so without this
    step an empty component would still contribute its separator (a leading
    space, or a doubled space for an empty middle part). Whitespace runs in the
    joined text are then collapsed to one space and the ends are stripped. An
    address that ends up empty is returned as null rather than as whitespace.
    """
    parts = list(map(_clean_string, columns))
    combined = pl.concat_str(parts, separator=" ", ignore_nulls=True)
    tidy = combined.str.replace_all(r"\s+", " ").str.strip_chars()
    is_empty = tidy == ""
    return pl.when(is_empty).then(None).otherwise(tidy)
def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame:
    """Select, clean and type the EPC columns used by the pipeline.

    ``raw`` must expose the EPC_SOURCE_COLUMNS names. Text columns are trimmed
    with blanks turned into null; ratings and postcode are uppercased;
    ``inspection_date`` is parsed to a Date; the numeric columns are cast
    non-strictly (unparseable -> null). Rows without an address are dropped.

    Implausible dimensions are then nulled (treated as unknown):

    * ``number_habitable_rooms`` outside [1, MAX_HABITABLE_ROOMS];
    * ``floor_height`` outside [MIN_FLOOR_HEIGHT_M, MAX_FLOOR_HEIGHT_M];
    * ``total_floor_area`` that is not positive or exceeds
      MAX_TOTAL_FLOOR_AREA_M2. The lower bound matters: a zero or negative
      area is as much a data-entry error as an oversized one and must not be
      published or compared against later certificates.

    Returns a LazyFrame with columns epc_address, epc_postcode, uprn,
    current_energy_rating, potential_energy_rating, epc_property_type,
    built_form, inspection_date, total_floor_area, number_habitable_rooms,
    floor_height, construction_age_band and tenure.
    """
    rooms = pl.col("number_habitable_rooms")
    height = pl.col("floor_height")
    area = pl.col("total_floor_area")

    return (
        raw.select(
            _clean_string("address").alias("epc_address"),
            _clean_string("postcode").str.to_uppercase().alias("epc_postcode"),
            # UPRN keys an exact listing->EPC join downstream (~99% populated).
            _clean_string("uprn").alias("uprn"),
            _clean_string("current_energy_rating")
            .str.to_uppercase()
            .alias("current_energy_rating"),
            _clean_string("potential_energy_rating")
            .str.to_uppercase()
            .alias("potential_energy_rating"),
            _clean_string("property_type").alias("epc_property_type"),
            _clean_string("built_form").alias("built_form"),
            # Parse to a real Date once (unparseable/blank -> null) so dedup can
            # sort newest-first with nulls_last and _event_year can use dt.year();
            # a lexicographic string sort would let a null/garbled date win under
            # Polars' default nulls-first descending order. EPC inspection dates
            # are ISO (YYYY-MM-DD).
            _clean_string("inspection_date")
            .str.to_date(format="%Y-%m-%d", strict=False)
            .alias("inspection_date"),
            _clean_number("total_floor_area", pl.Float64).alias("total_floor_area"),
            _clean_number("number_habitable_rooms", pl.Int16).alias(
                "number_habitable_rooms"
            ),
            _clean_number("floor_height", pl.Float64).alias("floor_height"),
            _clean_string("construction_age_band").alias("construction_age_band"),
            _clean_string("tenure").alias("tenure"),
        )
        .filter(pl.col("epc_address").is_not_null())
        .with_columns(
            # Null implausible EPC dimensions so data-entry errors don't reach
            # the published per-property columns (Interior height, Total floor
            # area, Number of bedrooms & living rooms). Treated as unknown.
            pl.when((rooms >= 1) & (rooms <= MAX_HABITABLE_ROOMS))
            .then(rooms)
            .otherwise(None)
            .alias("number_habitable_rooms"),
            pl.when(height.is_between(MIN_FLOOR_HEIGHT_M, MAX_FLOOR_HEIGHT_M))
            .then(height)
            .otherwise(None)
            .alias("floor_height"),
            # Lower bound added: non-positive areas are errors too, in line
            # with the lower bounds already applied to rooms and storey height.
            pl.when((area > 0) & (area <= MAX_TOTAL_FLOOR_AREA_M2))
            .then(area)
            .otherwise(None)
            .alias("total_floor_area"),
        )
    )
def _certificate_member_names(zip_file: zipfile.ZipFile) -> list[str]:
return sorted(
name
for name in zip_file.namelist()
if not name.endswith("/")
and Path(name).name.lower().startswith("certificates")
and name.lower().endswith(".csv")
)
def _read_zip_csv_header(zip_file: zipfile.ZipFile, member_name: str) -> list[str]:
with zip_file.open(member_name) as member:
text = io.TextIOWrapper(member, encoding="utf-8-sig", newline="")
try:
return next(csv.reader(text))
except StopIteration as exc:
raise ValueError(f"EPC CSV member is empty: {member_name}") from exc
def _source_columns_for_header(header: list[str]) -> list[str]:
    """Map each wanted EPC column to its spelling in this CSV header.

    Header names are matched after normalisation (trimmed, lowercased). The
    result has one entry per EPC_SOURCE_COLUMNS item, in that order: the
    header's own spelling when the column is present, otherwise the normalised
    name itself. If two header names normalise to the same key, the later one
    wins.
    """
    spelling_by_key: dict[str, str] = {}
    for original, key in zip(header, _normalise_csv_columns(header)):
        spelling_by_key[key] = original
    return [spelling_by_key.get(wanted, wanted) for wanted in EPC_SOURCE_COLUMNS]
def _zip_certificates_to_parquet(zip_path: Path, output_path: Path) -> None:
    """Stream every certificate CSV in an EPC zip into one parquet file.

    Only the EPC_SOURCE_COLUMNS are read, all as strings. A column missing
    from a member is still requested (``include_missing_columns=True``), so
    every batch has the full column set. Batches are renamed to the
    normalised column names and appended to a zstd-compressed parquet file at
    ``output_path``. Members are processed in sorted name order.

    Raises:
        ValueError: if ``zip_path`` is not a readable zip archive, if it holds
            no certificate CSV, or if a certificate CSV member is empty.
    """
    string_schema = pa.schema((name, pa.string()) for name in EPC_SOURCE_COLUMNS)
    parquet_writer = pq.ParquetWriter(
        output_path, schema=string_schema, compression="zstd"
    )
    try:
        try:
            archive = zipfile.ZipFile(zip_path)
        except zipfile.BadZipFile as exc:
            raise ValueError(
                f"{zip_path} is not a readable EPC zip archive; re-download "
                "domestic-csv.zip and try again"
            ) from exc

        with archive:
            csv_members = _certificate_member_names(archive)
            if not csv_members:
                raise ValueError(f"No certificate CSV files found in {zip_path}")

            # The same 64 MiB read block size is used for every member.
            read_options = pa_csv.ReadOptions(block_size=64 * 1024 * 1024)

            for csv_member in csv_members:
                print(f"Reading EPC certificates from {csv_member}")
                header = _read_zip_csv_header(archive, csv_member)
                wanted = _source_columns_for_header(header)
                convert_options = pa_csv.ConvertOptions(
                    include_columns=wanted,
                    include_missing_columns=True,
                    column_types=dict.fromkeys(wanted, pa.string()),
                    strings_can_be_null=True,
                )
                with archive.open(csv_member) as stream:
                    reader = pa_csv.open_csv(
                        stream,
                        read_options=read_options,
                        convert_options=convert_options,
                    )
                    while True:
                        try:
                            batch = reader.read_next_batch()
                        except StopIteration:
                            break
                        if batch.num_rows:
                            # Columns arrive in `wanted` order, which mirrors
                            # EPC_SOURCE_COLUMNS, so a positional rename is safe.
                            parquet_writer.write_batch(
                                batch.rename_columns(EPC_SOURCE_COLUMNS)
                            )
    finally:
        parquet_writer.close()
def _scan_epc_certificates(epc_path: Path, temp_dir: Path) -> pl.LazyFrame:
    """Open the EPC certificates lazily, from either a zip archive or a CSV.

    A ``.zip`` path (case-insensitive suffix) is first converted to
    ``epc-certificates.parquet`` inside ``temp_dir`` and that file is scanned.
    Any other path is scanned as CSV with schema inference disabled and header
    names normalised. Either way the result goes through
    ``_select_epc_columns``.
    """
    is_zip = epc_path.suffix.lower() == ".zip"
    if not is_zip:
        csv_scan = pl.scan_csv(
            epc_path,
            infer_schema=False,
            with_column_names=_normalise_csv_columns,
        )
        return _select_epc_columns(csv_scan)

    parquet_path = temp_dir / "epc-certificates.parquet"
    _zip_certificates_to_parquet(epc_path, parquet_path)
    return _select_epc_columns(pl.scan_parquet(parquet_path))
def flag_price_outliers(slim: pl.DataFrame) -> pl.DataFrame:
    """Return the sales that sit on the high side of an implausible price jump.

    ``slim`` has one row per sale with the columns _pp_group_address,
    _pp_group_postcode, date_of_transfer and price. Within each property
    (address + postcode), sales are ordered by date and each one is compared
    with its neighbours. The allowed ratio between two neighbouring sales is
    ``JUMP_TOLERANCE * JUMP_GROWTH_PER_YEAR ** years``, where ``years`` is the
    gap between them, floored at half a year.

    A sale is flagged when its price is at least JUMP_MIN_PRICE and either

    * it exceeds the PREVIOUS sale by more than the allowed ratio (a spike
      following a normal sale), or
    * the NEXT sale is below this one by more than the allowed ratio (a spike
      preceding a normal sale).

    This is an eager DataFrame pass; the file's own notes say the shift/over
    windows may not run under the streaming sink used by the fuzzy join, which
    is why the flags are computed here and joined back by the caller.

    Returns the flagged rows (the two group keys, date_of_transfer, price)
    de-duplicated, plus a ``_price_outlier`` column that is always True.
    """
    keys = ["_pp_group_address", "_pp_group_postcode"]
    price = pl.col("price")
    sale_date = pl.col("date_of_transfer")

    def _years_between(later: pl.Expr, earlier: pl.Expr) -> pl.Expr:
        # Floor at six months so back-to-back transfers don't produce a
        # near-zero exponent and an over-tight threshold.
        return ((later - earlier).dt.total_days() / 365.25).clip(lower_bound=0.5)

    def _allowed_ratio(years: pl.Expr) -> pl.Expr:
        return JUMP_TOLERANCE * pl.lit(JUMP_GROWTH_PER_YEAR).pow(years)

    spike_after_normal = (price / pl.col("_prev_price")) > _allowed_ratio(
        _years_between(sale_date, pl.col("_prev_date"))
    )
    spike_before_normal = (pl.col("_next_price") / price) < 1 / _allowed_ratio(
        _years_between(pl.col("_next_date"), sale_date)
    )
    # A missing neighbour (first/last sale of a group) makes that comparison
    # null; fill_null(False) reads it as "not flagged".
    is_spike = spike_after_normal.fill_null(False) | spike_before_normal.fill_null(
        False
    )

    neighbours = [
        price.shift(1).over(keys).alias("_prev_price"),
        sale_date.shift(1).over(keys).alias("_prev_date"),
        price.shift(-1).over(keys).alias("_next_price"),
        sale_date.shift(-1).over(keys).alias("_next_date"),
    ]

    ordered = slim.sort([*keys, "date_of_transfer"]).with_columns(neighbours)
    flagged = ordered.filter(is_spike & (price >= JUMP_MIN_PRICE))
    distinct = flagged.select(*keys, "date_of_transfer", "price").unique()
    return distinct.with_columns(pl.lit(True).alias("_price_outlier"))
def main():
    """Parse the command line and run the join inside a temporary directory.

    Three required path arguments are accepted: ``--epc``, ``--price-paid``
    and ``--output``. The temporary directory is created under
    ``local_tmp_dir()`` and removed when the run finishes.
    """
    cli = argparse.ArgumentParser(description="Fuzzy join EPC and Price Paid data")
    required_paths = (
        ("--epc", "EPC certificates CSV file or zip"),
        ("--price-paid", "Price paid parquet file"),
        ("--output", "Output parquet file path"),
    )
    for flag, help_text in required_paths:
        cli.add_argument(flag, type=Path, required=True, help=help_text)
    parsed = cli.parse_args()

    with tempfile.TemporaryDirectory(
        prefix="epc_certificates_", dir=local_tmp_dir()
    ) as scratch:
        _run(parsed.epc, parsed.price_paid, parsed.output, Path(scratch))
def _run(
    epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Path
) -> None:
    """Join EPC certificates to price-paid sales and write one row per property.

    Steps:

    1. Scan the EPC certificates and add normalised address/postcode keys.
    2. Keep the newest certificate per key; separately derive a per-property
       ``renovation_history`` (changes between consecutive certificates) and a
       ``was_council_house`` flag (any certificate with a "social" tenure).
    3. Load price-paid, drop "Other" property types, build the display
       address, de-duplicate repeated lodgements and flag implausible price
       jumps.
    4. Aggregate sales per property. Price aggregations use only
       quality-passing sales; first_transfer_date / old_new use all sales.
    5. Fuzzy-join properties to EPC records within a postcode, derive the
       construction year, lowercase the column names and write the parquet.

    Args:
        epc_path: EPC certificates CSV file or zip archive.
        price_paid_path: Price-paid parquet file.
        output_path: Destination parquet file.
        temp_dir: Scratch directory used when the EPC input is a zip.
    """
    epc_base = _scan_epc_certificates(epc_path, temp_dir).with_columns(
        normalize_address_key(pl.col("epc_address")).alias("_epc_match_address"),
        normalize_postcode_key(pl.col("epc_postcode")).alias("_epc_match_postcode"),
    )
    # Dedup fork: keep latest certificate per property. inspection_date is a typed
    # Date (see _select_epc_columns); nulls_last keeps a real-dated cert ahead of a
    # null/unparseable-dated one so the genuinely newest certificate is chosen.
    epc = (
        epc_base.sort("inspection_date", descending=True, nulls_last=True)
        .group_by("_epc_match_address", "_epc_match_postcode")
        .first()
        .drop("tenure")
    )
    # Events fork: detect renovation events between consecutive certificates
    # Collect eagerly because .over() window functions don't work in streaming
    # engine (fuzzy_join.py:50 uses sink_parquet which requires streaming).
    events = (
        epc_base.sort("inspection_date")
        .with_columns(
            pl.col("current_energy_rating")
            .replace_strict(RATING_RANK, default=None, return_dtype=pl.Int32)
            .alias("_rating_rank"),
        )
        .with_columns(
            pl.col("number_habitable_rooms")
            .shift(1)
            .over("_epc_match_address", "_epc_match_postcode")
            .alias("_prev_rooms"),
            pl.col("total_floor_area")
            .shift(1)
            .over("_epc_match_address", "_epc_match_postcode")
            .alias("_prev_area"),
            pl.col("_rating_rank")
            .shift(1)
            .over("_epc_match_address", "_epc_match_postcode")
            .alias("_prev_rating_rank"),
        )
        .with_columns(
            # Priority order: a room-count change wins over a larger floor
            # area, which wins over an improved (lower-rank) energy rating.
            pl.when(
                pl.col("number_habitable_rooms").is_not_null()
                & pl.col("_prev_rooms").is_not_null()
                & (pl.col("number_habitable_rooms") != pl.col("_prev_rooms"))
            )
            .then(pl.lit("Remodelling"))
            .when(
                pl.col("total_floor_area").is_not_null()
                & pl.col("_prev_area").is_not_null()
                & (pl.col("total_floor_area") > pl.col("_prev_area"))
            )
            .then(pl.lit("Extension"))
            .when(
                pl.col("_rating_rank").is_not_null()
                & pl.col("_prev_rating_rank").is_not_null()
                & (pl.col("_rating_rank") < pl.col("_prev_rating_rank"))
            )
            .then(pl.lit("Renovation"))
            .otherwise(pl.lit(None, dtype=pl.String))
            .alias("_event"),
        )
        .filter(pl.col("_event").is_not_null())
        .with_columns(
            pl.col("inspection_date").dt.year().cast(pl.Int32).alias("_event_year"),
        )
        .group_by("_epc_match_address", "_epc_match_postcode")
        .agg(
            pl.struct(
                pl.col("_event_year").alias("year"),
                pl.col("_event").alias("event"),
            ).alias("renovation_history"),
        )
        .collect()
    )
    event_counts = (
        events["renovation_history"].explode().struct.field("event").value_counts()
    )
    print(f"Renovation events: {events.height} properties with events")
    print(event_counts)
    # Social tenure fork: flag properties that were ever social housing
    social_tenure = (
        epc_base.filter(pl.col("tenure").str.to_lowercase().str.contains("social"))
        .select("_epc_match_address", "_epc_match_postcode")
        .unique()
        .with_columns(pl.lit("Yes").alias("was_council_house"))
        .collect()
    )
    print(f"Former council houses (EPC social tenure): {social_tenure.height}")
    # Left-join events and social tenure back onto dedup EPC
    epc = (
        epc.join(
            events.lazy(),
            on=["_epc_match_address", "_epc_match_postcode"],
            how="left",
        )
        .join(
            social_tenure.lazy(),
            on=["_epc_match_address", "_epc_match_postcode"],
            how="left",
        )
        .with_columns(
            pl.col("was_council_house").fill_null("No"),
        )
    )
    print("EPC dataset")
    print(epc.head().collect())
    # https://www.gov.uk/guidance/about-the-price-paid-data
    property_type_map = {
        "D": "Detached",
        "S": "Semi-Detached",
        "T": "Terraced",
        "F": "Flats/Maisonettes",
        "O": "Other",
    }
    duration_map = {"F": "Freehold", "L": "Leasehold"}
    # price >= MIN_PRICE and ppd_category == "A" (standard open-market sale) are
    # VALUE-QUALITY filters: they gate the price aggregations only. Category B
    # entries (repossessions, bulk/portfolio, power-of-sale transfers), sub-MIN
    # sales and jump-flagged outliers must not pollute latest_price /
    # historical_prices (and the downstream price-per-sqm feature), but they
    # MUST still count for first_transfer_date / old_new so a new-build's
    # genuine earliest transfer year is preserved.
    price_ok = pl.col("price") >= MIN_PRICE
    category_ok = pl.col("ppd_category") == "A"
    value_ok = price_ok & category_ok
    # quality_ok additionally excludes consecutive-sale jump outliers (see
    # flag_price_outliers); _price_outlier exists only after the join below.
    quality_ok = value_ok & pl.col("_price_outlier").is_null()
    price_paid_base = (
        pl.scan_parquet(price_paid_path)
        .select(
            "price",
            "date_of_transfer",
            pl.col("property_type")
            .alias("pp_property_type")
            .replace(property_type_map),
            pl.col("postcode").str.strip_chars(),
            "paon",
            "saon",
            "street",
            "locality",
            "town_city",
            pl.col("duration").replace(duration_map),
            "old_new",
            "ppd_category",
        )
        .filter(pl.col("pp_property_type") != "Other")
        .with_columns(
            _join_address_parts("saon", "paon", "street").alias("pp_address"),
        )
        .with_columns(
            normalize_address_key(pl.col("pp_address")).alias("_pp_match_address"),
            normalize_postcode_key(pl.col("postcode")).alias("_pp_match_postcode"),
        )
        .filter(pl.col("_pp_match_postcode").is_not_null())
        .with_columns(
            pl.coalesce("_pp_match_address", "pp_address").alias("_pp_group_address"),
            pl.col("_pp_match_postcode").alias("_pp_group_postcode"),
        )
        .filter(pl.col("pp_address").is_not_null())
        # Price-paid carries ~72k duplicate (address, postcode, date, price)
        # transaction groups with DISTINCT transaction ids — the same completed
        # sale lodged twice — which double-counted sales in historical_prices.
        # Collapse each to one row. ppd_category stays in the subset so an
        # A/B-categorised pair of the same sale survives as two rows; only the
        # A row feeds the price aggregations (quality_ok), which is intentional.
        .unique(
            subset=[
                "_pp_group_address",
                "_pp_group_postcode",
                "date_of_transfer",
                "price",
                "ppd_category",
            ],
            keep="any",
        )
    )
    # Bounded eager pass over the quality sales only (~30M rows x 4 narrow
    # columns): the window functions inside flag_price_outliers may not run
    # under the streaming sink used by fuzzy_join_on_postcode, so the outlier
    # flags are computed here and joined back into the lazy stream.
    outliers = flag_price_outliers(
        price_paid_base.filter(value_ok)
        .select(
            "_pp_group_address", "_pp_group_postcode", "date_of_transfer", "price"
        )
        .collect(engine="streaming")
    )
    print(f"Implausible consecutive-sale price jumps flagged: {outliers.height}")
    price_paid = (
        # Outlier rows stay in the stream (they still count for
        # first_transfer_date / old_new, same as category-B sales); quality_ok
        # merely drops them from the price aggregations. _price_outlier is not
        # aggregated below, so the helper column dies with the group_by.
        price_paid_base.join(
            outliers.lazy(),
            on=[
                "_pp_group_address",
                "_pp_group_postcode",
                "date_of_transfer",
                "price",
            ],
            how="left",
        )
        .sort("date_of_transfer")
        .group_by("_pp_group_address", "_pp_group_postcode", maintain_order=True)
        .agg(
            pl.col("pp_address").last(),
            pl.col("postcode").last(),
            pl.col("_pp_match_address").last(),
            pl.col("_pp_match_postcode").last(),
            # Price aggregations are restricted to quality-passing sales.
            pl.struct(
                pl.col("date_of_transfer").dt.year().alias("year"),
                pl.col("date_of_transfer").dt.month().cast(pl.UInt8).alias("month"),
                "price",
            )
            .filter(quality_ok)
            .alias("historical_prices"),
            pl.col("pp_property_type").last(),
            pl.col("duration").last(),
            pl.col("price").filter(quality_ok).last().alias("latest_price"),
            pl.col("date_of_transfer").filter(quality_ok).last(),
            # first_transfer_date / old_new reflect the genuine earliest transfer
            # over the full per-group transaction stream (not value-filtered).
            pl.col("date_of_transfer").first().alias("first_transfer_date"),
            pl.col("old_new").first(),
        )
        # Preserve the property universe: previously a property needed >=1 sale
        # >=MIN_PRICE to form a group, so drop groups with no quality-passing sale.
        .filter(pl.col("latest_price").is_not_null())
    )
    print("Price paid dataset")
    print(price_paid.head().collect())
    joined = (
        fuzzy_join_on_postcode(
            left=price_paid,
            right=epc,
            left_address_col="pp_address",
            right_address_col="epc_address",
            left_postcode_col="postcode",
            right_postcode_col="epc_postcode",
        )
        .drop("epc_postcode")
        # Audit trail: keep the fuzzy-match confidence (100 = exact address
        # match) in the published output; null means no EPC match.
        .rename({"_match_score": "epc_match_score"})
        .collect(engine="streaming")
    )
    matched = joined.filter(
        pl.col("epc_address").is_not_null() & pl.col("pp_address").is_not_null()
    )
    total = joined.height
    # Guard the percentage: with no properties at all the division would raise
    # ZeroDivisionError and abort the run before the output is written.
    matched_pct = 100 * matched.height / total if total else 0.0
    print(f"Unique properties: {total}")
    print(f"Matched: {matched.height} ({matched_pct:.1f}%)")
    print(f"Unmatched: {total - matched.height}")
    # For new-builds (old_new == "Y"), use the first transaction date year as
    # the exact construction date; otherwise fall back to the EPC age band.
    epc_band_year = epc_band_to_year(pl.col("construction_age_band"))
    transfer_year = (
        pl.col("first_transfer_date").dt.year().cast(pl.UInt16, strict=False)
    )
    is_new_build = pl.col("old_new") == "Y"
    joined = joined.with_columns(
        pl.when(is_new_build & transfer_year.is_not_null())
        .then(transfer_year)
        .otherwise(epc_band_year)
        .alias("construction_age_band"),
        # 0 = exact (new-build transfer year), 1 = approximate (EPC band),
        # null = no construction year available.
        pl.when(is_new_build & transfer_year.is_not_null())
        .then(pl.lit(0, dtype=pl.UInt8))
        .when(epc_band_year.is_not_null())
        .then(pl.lit(1, dtype=pl.UInt8))
        .otherwise(pl.lit(None, dtype=pl.UInt8))
        .alias("is_construction_date_approximate"),
    ).drop(
        [
            "old_new",
            "first_transfer_date",
            "_pp_match_address",
            "_pp_match_postcode",
            "_pp_group_address",
            "_pp_group_postcode",
            "_epc_match_address",
            "_epc_match_postcode",
        ],
        strict=False,
    )
    joined = joined.rename({col: col.lower() for col in joined.columns})
    print(joined.head())
    joined.write_parquet(output_path)
    print(f"Wrote {output_path}")
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()