Format python

This commit is contained in:
Andras Schmelczer 2026-01-31 13:07:09 +00:00
parent 85f5770e09
commit 4c258018c3
17 changed files with 348 additions and 248 deletions

View file

@@ -9,79 +9,108 @@ pl.Config.set_tbl_cols(-1)
def main():
parser = argparse.ArgumentParser(description="Fuzzy join EPC and Price Paid data")
parser.add_argument("--epc", type=Path, required=True, help="EPC certificates CSV file")
parser.add_argument("--price-paid", type=Path, required=True, help="Price paid parquet file")
parser.add_argument("--output", type=Path, required=True, help="Output parquet file path")
parser.add_argument(
"--epc", type=Path, required=True, help="EPC certificates CSV file"
)
parser.add_argument(
"--price-paid", type=Path, required=True, help="Price paid parquet file"
)
parser.add_argument(
"--output", type=Path, required=True, help="Output parquet file path"
)
args = parser.parse_args()
epc = pl.scan_csv(args.epc).select(
pl.col('ADDRESS').alias('epc_address'),
'POSTCODE',
'CURRENT_ENERGY_RATING',
'POTENTIAL_ENERGY_RATING',
pl.col('PROPERTY_TYPE').alias('epc_property_type'),
'BUILT_FORM',
'INSPECTION_DATE',
'TOTAL_FLOOR_AREA',
'NUMBER_HABITABLE_ROOMS',
'FLOOR_HEIGHT',
'CONSTRUCTION_AGE_BAND'
).filter(pl.col('epc_address').is_not_null()).sort('INSPECTION_DATE', descending=True).group_by('epc_address', 'POSTCODE').first()
epc = (
pl.scan_csv(args.epc)
.select(
pl.col("ADDRESS").alias("epc_address"),
"POSTCODE",
"CURRENT_ENERGY_RATING",
"POTENTIAL_ENERGY_RATING",
pl.col("PROPERTY_TYPE").alias("epc_property_type"),
"BUILT_FORM",
"INSPECTION_DATE",
"TOTAL_FLOOR_AREA",
"NUMBER_HABITABLE_ROOMS",
"FLOOR_HEIGHT",
"CONSTRUCTION_AGE_BAND",
)
.filter(pl.col("epc_address").is_not_null())
.sort("INSPECTION_DATE", descending=True)
.group_by("epc_address", "POSTCODE")
.first()
)
print("EPC dataset")
print(epc.head().collect())
# https://www.gov.uk/guidance/about-the-price-paid-data
property_type_map = {"D": "Detached", "S": "Semi-Detached", "T": "Terraced", "F": "Flats/Maisonettes", "O": "Other"}
property_type_map = {
"D": "Detached",
"S": "Semi-Detached",
"T": "Terraced",
"F": "Flats/Maisonettes",
"O": "Other",
}
duration_map = {"F": "Freehold", "L": "Leasehold"}
price_paid = (pl.scan_parquet(args.price_paid).select(
"price",
"date_of_transfer",
pl.col('property_type').alias("pp_property_type").replace(property_type_map),
"postcode",
'paon',
'saon',
'street',
'locality',
'town_city',
pl.col('duration').replace(duration_map)
)
.filter(pl.col('pp_property_type') != 'Other').with_columns(
pl.concat_str(
[pl.col('saon'), pl.col('paon'), pl.col('street')],
separator=' ',
ignore_nulls=True,
).alias('pp_address'),
price_paid = (
pl.scan_parquet(args.price_paid)
.select(
"price",
"date_of_transfer",
pl.col("property_type")
.alias("pp_property_type")
.replace(property_type_map),
"postcode",
"paon",
"saon",
"street",
"locality",
"town_city",
pl.col("duration").replace(duration_map),
)
.sort('date_of_transfer')
.group_by('pp_address', 'postcode', maintain_order=True)
.filter(pl.col("pp_property_type") != "Other")
.with_columns(
pl.concat_str(
[pl.col("saon"), pl.col("paon"), pl.col("street")],
separator=" ",
ignore_nulls=True,
).alias("pp_address"),
)
.sort("date_of_transfer")
.group_by("pp_address", "postcode", maintain_order=True)
.agg(
pl.struct(
pl.col('date_of_transfer').dt.year().alias('year'),
'price',
).alias('historical_prices'),
pl.col('pp_property_type').last(),
pl.col('duration').last(),
pl.col('price').last().alias('latest_price'),
pl.col('date_of_transfer').last(),
pl.col("date_of_transfer").dt.year().alias("year"),
"price",
).alias("historical_prices"),
pl.col("pp_property_type").last(),
pl.col("duration").last(),
pl.col("price").last().alias("latest_price"),
pl.col("date_of_transfer").last(),
)
).filter(pl.col('pp_address').is_not_null())
).filter(pl.col("pp_address").is_not_null())
print("Price paid dataset")
print(price_paid.head().collect())
joined = fuzzy_join_on_postcode(
left=price_paid,
right=epc,
left_address_col='pp_address',
right_address_col='epc_address',
left_postcode_col='postcode',
right_postcode_col='POSTCODE',
).drop('POSTCODE').collect()
joined = (
fuzzy_join_on_postcode(
left=price_paid,
right=epc,
left_address_col="pp_address",
right_address_col="epc_address",
left_postcode_col="postcode",
right_postcode_col="POSTCODE",
)
.drop("POSTCODE")
.collect()
)
matched = joined.filter(pl.col('epc_address').is_not_null() & pl.col('pp_address').is_not_null())
matched = joined.filter(
pl.col("epc_address").is_not_null() & pl.col("pp_address").is_not_null()
)
total = joined.height
print(f"Unique properties: {total}")
print(f"Matched: {matched.height} ({100 * matched.height / total:.1f}%)")

View file

@@ -24,7 +24,9 @@ def _build_wide(
"lsoa21",
)
wide = wide.join(arcgis, on="postcode", how="inner")
print(f" {wide.shape[0]:,} rows after GPS join, {wide.estimated_size('mb'):.1f} MB")
print(
f" {wide.shape[0]:,} rows after GPS join, {wide.estimated_size('mb'):.1f} MB"
)
# Journey times (optional)
if journey_times_path and journey_times_path.exists():
@@ -42,9 +44,7 @@ def _build_wide(
if iod_path and iod_path.exists():
print("Joining IoD scores...")
iod = pl.read_parquet(iod_path)
wide = wide.join(
iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left"
)
wide = wide.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
print(f" {wide.estimated_size('mb'):.1f} MB after IoD")
# POI proximity counts (pre-computed per postcode)
@@ -66,44 +66,68 @@ def _build_wide(
)
# Derived columns
wide = wide.with_columns(
(pl.col("latest_price") / pl.col("total_floor_area")).alias("Price per sqm"),
).drop(
'date_of_transfer',
'inspection_date',
'floor_height',
'lsoa21',
'LSOA code (2021)',
'Local Authority District code (2024)',
'Local Authority District name (2024)',
'imd_score',
'housing_barriers_score',
'idaci_score',
'idaopi_score',
'children_young_people_score',
'adult_skills_score',
'geographical_barriers_score',
'wider_barriers_score',
).rename({
'construction_age_band': "Approximate construction age",
"income_score": "Income Score (rate)",
"employment_score": "Employment Score (rate)",
"education_score": "Education, Skills and Training Score",
"health_score": "Health Deprivation and Disability Score",
"crime_score": "Crime Score",
})
wide = (
wide.with_columns(
(pl.col("latest_price") / pl.col("total_floor_area")).alias(
"Price per sqm"
),
)
.drop(
"date_of_transfer",
"inspection_date",
"floor_height",
"lsoa21",
"LSOA code (2021)",
"Local Authority District code (2024)",
"Local Authority District name (2024)",
"imd_score",
"housing_barriers_score",
"idaci_score",
"idaopi_score",
"children_young_people_score",
"adult_skills_score",
"geographical_barriers_score",
"wider_barriers_score",
)
.rename(
{
"construction_age_band": "Approximate construction age",
"income_score": "Income Score (rate)",
"employment_score": "Employment Score (rate)",
"education_score": "Education, Skills and Training Score",
"health_score": "Health Deprivation and Disability Score",
"crime_score": "Crime Score",
}
)
)
return wide
def main():
parser = argparse.ArgumentParser(description="Build wide property dataframe with all joins")
parser.add_argument("--epc-pp", type=Path, required=True, help="EPC-Price Paid joined parquet file")
parser.add_argument("--arcgis", type=Path, required=True, help="ArcGIS postcode data parquet file")
parser.add_argument("--iod", type=Path, help="Index of Deprivation parquet file (optional)")
parser.add_argument("--poi-proximity", type=Path, help="POI proximity counts parquet file (optional)")
parser.add_argument("--journey-times", type=Path, help="Journey times parquet file (optional)")
parser.add_argument("--output", type=Path, required=True, help="Output parquet file path")
parser = argparse.ArgumentParser(
description="Build wide property dataframe with all joins"
)
parser.add_argument(
"--epc-pp", type=Path, required=True, help="EPC-Price Paid joined parquet file"
)
parser.add_argument(
"--arcgis", type=Path, required=True, help="ArcGIS postcode data parquet file"
)
parser.add_argument(
"--iod", type=Path, help="Index of Deprivation parquet file (optional)"
)
parser.add_argument(
"--poi-proximity",
type=Path,
help="POI proximity counts parquet file (optional)",
)
parser.add_argument(
"--journey-times", type=Path, help="Journey times parquet file (optional)"
)
parser.add_argument(
"--output", type=Path, required=True, help="Output parquet file path"
)
args = parser.parse_args()
wide = _build_wide(
@@ -119,7 +143,7 @@ def main():
wide.write_parquet(args.output)
size_mb = args.output.stat().st_size / (1024 * 1024)
print(f"Wrote {args.output} ({size_mb:.1f} MB)")

View file

@@ -584,9 +584,7 @@ def transform(input_path: Path) -> pl.LazyFrame:
if cat not in DROP_CATEGORIES and cat not in CATEGORY_MAP:
unmapped.append(cat)
if unmapped:
raise ValueError(
f"Categories missing from CATEGORY_MAP: {sorted(unmapped)}"
)
raise ValueError(f"Categories missing from CATEGORY_MAP: {sorted(unmapped)}")
# Verify every CATEGORY_MAP key actually exists in the data (catch typos)
mapped_but_absent = []
@@ -623,9 +621,15 @@ def transform(input_path: Path) -> pl.LazyFrame:
def main():
parser = argparse.ArgumentParser(description="Transform raw POIs to filtered version with friendly names")
parser.add_argument("--input", type=Path, required=True, help="Raw POIs parquet file")
parser.add_argument("--output", type=Path, required=True, help="Output filtered POIs parquet file")
parser = argparse.ArgumentParser(
description="Transform raw POIs to filtered version with friendly names"
)
parser.add_argument(
"--input", type=Path, required=True, help="Raw POIs parquet file"
)
parser.add_argument(
"--output", type=Path, required=True, help="Output filtered POIs parquet file"
)
args = parser.parse_args()
df = transform(args.input).collect()