Format python

2026-01-31 13:07:09 +00:00 · 2026-01-31 13:07:09 +00:00 · 4c258018c3
commit 4c258018c3
parent 85f5770e09
17 changed files with 348 additions and 248 deletions
--- a/pipeline/transform/join_epc_pp.py
+++ b/pipeline/transform/join_epc_pp.py
@@ -9,79 +9,108 @@ pl.Config.set_tbl_cols(-1)

 def main():
    parser = argparse.ArgumentParser(description="Fuzzy join EPC and Price Paid data")
-    parser.add_argument("--epc", type=Path, required=True, help="EPC certificates CSV file")
-    parser.add_argument("--price-paid", type=Path, required=True, help="Price paid parquet file")
-    parser.add_argument("--output", type=Path, required=True, help="Output parquet file path")
+    parser.add_argument(
+        "--epc", type=Path, required=True, help="EPC certificates CSV file"
+    )
+    parser.add_argument(
+        "--price-paid", type=Path, required=True, help="Price paid parquet file"
+    )
+    parser.add_argument(
+        "--output", type=Path, required=True, help="Output parquet file path"
+    )
    args = parser.parse_args()

-    epc = pl.scan_csv(args.epc).select(
-        pl.col('ADDRESS').alias('epc_address'),
-        'POSTCODE',
-        'CURRENT_ENERGY_RATING',
-        'POTENTIAL_ENERGY_RATING',
-        pl.col('PROPERTY_TYPE').alias('epc_property_type'),
-        'BUILT_FORM',
-        'INSPECTION_DATE',
-        'TOTAL_FLOOR_AREA',
-        'NUMBER_HABITABLE_ROOMS',
-        'FLOOR_HEIGHT',
-        'CONSTRUCTION_AGE_BAND'
-    ).filter(pl.col('epc_address').is_not_null()).sort('INSPECTION_DATE', descending=True).group_by('epc_address', 'POSTCODE').first()
-
+    epc = (
+        pl.scan_csv(args.epc)
+        .select(
+            pl.col("ADDRESS").alias("epc_address"),
+            "POSTCODE",
+            "CURRENT_ENERGY_RATING",
+            "POTENTIAL_ENERGY_RATING",
+            pl.col("PROPERTY_TYPE").alias("epc_property_type"),
+            "BUILT_FORM",
+            "INSPECTION_DATE",
+            "TOTAL_FLOOR_AREA",
+            "NUMBER_HABITABLE_ROOMS",
+            "FLOOR_HEIGHT",
+            "CONSTRUCTION_AGE_BAND",
+        )
+        .filter(pl.col("epc_address").is_not_null())
+        .sort("INSPECTION_DATE", descending=True)
+        .group_by("epc_address", "POSTCODE")
+        .first()
+    )

    print("EPC dataset")
    print(epc.head().collect())

    # https://www.gov.uk/guidance/about-the-price-paid-data
-    property_type_map = {"D": "Detached", "S": "Semi-Detached", "T": "Terraced", "F": "Flats/Maisonettes", "O": "Other"}
+    property_type_map = {
+        "D": "Detached",
+        "S": "Semi-Detached",
+        "T": "Terraced",
+        "F": "Flats/Maisonettes",
+        "O": "Other",
+    }
    duration_map = {"F": "Freehold", "L": "Leasehold"}

-    price_paid = (pl.scan_parquet(args.price_paid).select(
-        "price",
-        "date_of_transfer",
-        pl.col('property_type').alias("pp_property_type").replace(property_type_map),
-        "postcode",
-        'paon',
-        'saon',
-        'street',
-        'locality',
-        'town_city',
-        pl.col('duration').replace(duration_map)
-    )
-    .filter(pl.col('pp_property_type') != 'Other').with_columns(
-            pl.concat_str(
-                [pl.col('saon'), pl.col('paon'), pl.col('street')],
-                separator=' ',
-                ignore_nulls=True,
-            ).alias('pp_address'),
+    price_paid = (
+        pl.scan_parquet(args.price_paid)
+        .select(
+            "price",
+            "date_of_transfer",
+            pl.col("property_type")
+            .alias("pp_property_type")
+            .replace(property_type_map),
+            "postcode",
+            "paon",
+            "saon",
+            "street",
+            "locality",
+            "town_city",
+            pl.col("duration").replace(duration_map),
        )
-        .sort('date_of_transfer')
-        .group_by('pp_address', 'postcode', maintain_order=True)
+        .filter(pl.col("pp_property_type") != "Other")
+        .with_columns(
+            pl.concat_str(
+                [pl.col("saon"), pl.col("paon"), pl.col("street")],
+                separator=" ",
+                ignore_nulls=True,
+            ).alias("pp_address"),
+        )
+        .sort("date_of_transfer")
+        .group_by("pp_address", "postcode", maintain_order=True)
        .agg(
            pl.struct(
-                pl.col('date_of_transfer').dt.year().alias('year'),
-                'price',
-            ).alias('historical_prices'),
-            pl.col('pp_property_type').last(),
-            pl.col('duration').last(),
-            pl.col('price').last().alias('latest_price'),
-            pl.col('date_of_transfer').last(),
+                pl.col("date_of_transfer").dt.year().alias("year"),
+                "price",
+            ).alias("historical_prices"),
+            pl.col("pp_property_type").last(),
+            pl.col("duration").last(),
+            pl.col("price").last().alias("latest_price"),
+            pl.col("date_of_transfer").last(),
        )
-    ).filter(pl.col('pp_address').is_not_null())
+    ).filter(pl.col("pp_address").is_not_null())

    print("Price paid dataset")
    print(price_paid.head().collect())

-    joined = fuzzy_join_on_postcode(
-        left=price_paid,
-        right=epc,
-        left_address_col='pp_address',
-        right_address_col='epc_address',
-        left_postcode_col='postcode',
-        right_postcode_col='POSTCODE',
-    ).drop('POSTCODE').collect()
+    joined = (
+        fuzzy_join_on_postcode(
+            left=price_paid,
+            right=epc,
+            left_address_col="pp_address",
+            right_address_col="epc_address",
+            left_postcode_col="postcode",
+            right_postcode_col="POSTCODE",
+        )
+        .drop("POSTCODE")
+        .collect()
+    )

-    matched = joined.filter(pl.col('epc_address').is_not_null() & pl.col('pp_address').is_not_null())
+    matched = joined.filter(
+        pl.col("epc_address").is_not_null() & pl.col("pp_address").is_not_null()
+    )
    total = joined.height
    print(f"Unique properties: {total}")
    print(f"Matched: {matched.height} ({100 * matched.height / total:.1f}%)")
--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@@ -24,7 +24,9 @@ def _build_wide(
        "lsoa21",
    )
    wide = wide.join(arcgis, on="postcode", how="inner")
-    print(f"  {wide.shape[0]:,} rows after GPS join, {wide.estimated_size('mb'):.1f} MB")
+    print(
+        f"  {wide.shape[0]:,} rows after GPS join, {wide.estimated_size('mb'):.1f} MB"
+    )

    # Journey times (optional)
    if journey_times_path and journey_times_path.exists():
@@ -42,9 +44,7 @@ def _build_wide(
    if iod_path and iod_path.exists():
        print("Joining IoD scores...")
        iod = pl.read_parquet(iod_path)
-        wide = wide.join(
-            iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left"
-        )
+        wide = wide.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
        print(f"  {wide.estimated_size('mb'):.1f} MB after IoD")

    # POI proximity counts (pre-computed per postcode)
@@ -66,44 +66,68 @@ def _build_wide(
        )

    # Derived columns
-    wide = wide.with_columns(
-        (pl.col("latest_price") / pl.col("total_floor_area")).alias("Price per sqm"),
-    ).drop(
-        'date_of_transfer',
-        'inspection_date',
-        'floor_height',
-        'lsoa21',
-        'LSOA code (2021)',
-        'Local Authority District code (2024)',
-        'Local Authority District name (2024)',
-        'imd_score',
-        'housing_barriers_score',
-        'idaci_score',
-        'idaopi_score',
-        'children_young_people_score',
-        'adult_skills_score',
-        'geographical_barriers_score',
-        'wider_barriers_score',
-    ).rename({
-        'construction_age_band': "Approximate construction age",
-        "income_score": "Income Score (rate)",
-        "employment_score": "Employment Score (rate)",
-        "education_score": "Education, Skills and Training Score",
-        "health_score": "Health Deprivation and Disability Score",
-        "crime_score": "Crime Score",
-    })
+    wide = (
+        wide.with_columns(
+            (pl.col("latest_price") / pl.col("total_floor_area")).alias(
+                "Price per sqm"
+            ),
+        )
+        .drop(
+            "date_of_transfer",
+            "inspection_date",
+            "floor_height",
+            "lsoa21",
+            "LSOA code (2021)",
+            "Local Authority District code (2024)",
+            "Local Authority District name (2024)",
+            "imd_score",
+            "housing_barriers_score",
+            "idaci_score",
+            "idaopi_score",
+            "children_young_people_score",
+            "adult_skills_score",
+            "geographical_barriers_score",
+            "wider_barriers_score",
+        )
+        .rename(
+            {
+                "construction_age_band": "Approximate construction age",
+                "income_score": "Income Score (rate)",
+                "employment_score": "Employment Score (rate)",
+                "education_score": "Education, Skills and Training Score",
+                "health_score": "Health Deprivation and Disability Score",
+                "crime_score": "Crime Score",
+            }
+        )
+    )

    return wide


 def main():
-    parser = argparse.ArgumentParser(description="Build wide property dataframe with all joins")
-    parser.add_argument("--epc-pp", type=Path, required=True, help="EPC-Price Paid joined parquet file")
-    parser.add_argument("--arcgis", type=Path, required=True, help="ArcGIS postcode data parquet file")
-    parser.add_argument("--iod", type=Path, help="Index of Deprivation parquet file (optional)")
-    parser.add_argument("--poi-proximity", type=Path, help="POI proximity counts parquet file (optional)")
-    parser.add_argument("--journey-times", type=Path, help="Journey times parquet file (optional)")
-    parser.add_argument("--output", type=Path, required=True, help="Output parquet file path")
+    parser = argparse.ArgumentParser(
+        description="Build wide property dataframe with all joins"
+    )
+    parser.add_argument(
+        "--epc-pp", type=Path, required=True, help="EPC-Price Paid joined parquet file"
+    )
+    parser.add_argument(
+        "--arcgis", type=Path, required=True, help="ArcGIS postcode data parquet file"
+    )
+    parser.add_argument(
+        "--iod", type=Path, help="Index of Deprivation parquet file (optional)"
+    )
+    parser.add_argument(
+        "--poi-proximity",
+        type=Path,
+        help="POI proximity counts parquet file (optional)",
+    )
+    parser.add_argument(
+        "--journey-times", type=Path, help="Journey times parquet file (optional)"
+    )
+    parser.add_argument(
+        "--output", type=Path, required=True, help="Output parquet file path"
+    )
    args = parser.parse_args()

    wide = _build_wide(
@@ -119,7 +143,7 @@ def main():

    wide.write_parquet(args.output)
    size_mb = args.output.stat().st_size / (1024 * 1024)
-    
+
    print(f"Wrote {args.output} ({size_mb:.1f} MB)")


--- a/pipeline/transform/transform_poi.py
+++ b/pipeline/transform/transform_poi.py
@@ -584,9 +584,7 @@ def transform(input_path: Path) -> pl.LazyFrame:
        if cat not in DROP_CATEGORIES and cat not in CATEGORY_MAP:
            unmapped.append(cat)
    if unmapped:
-        raise ValueError(
-            f"Categories missing from CATEGORY_MAP: {sorted(unmapped)}"
-        )
+        raise ValueError(f"Categories missing from CATEGORY_MAP: {sorted(unmapped)}")

    # Verify every CATEGORY_MAP key actually exists in the data (catch typos)
    mapped_but_absent = []
@@ -623,9 +621,15 @@ def transform(input_path: Path) -> pl.LazyFrame:


 def main():
-    parser = argparse.ArgumentParser(description="Transform raw POIs to filtered version with friendly names")
-    parser.add_argument("--input", type=Path, required=True, help="Raw POIs parquet file")
-    parser.add_argument("--output", type=Path, required=True, help="Output filtered POIs parquet file")
+    parser = argparse.ArgumentParser(
+        description="Transform raw POIs to filtered version with friendly names"
+    )
+    parser.add_argument(
+        "--input", type=Path, required=True, help="Raw POIs parquet file"
+    )
+    parser.add_argument(
+        "--output", type=Path, required=True, help="Output filtered POIs parquet file"
+    )
    args = parser.parse_args()

    df = transform(args.input).collect()