changes

2026-02-18 21:22:15 +00:00 · 2026-02-18 21:22:15 +00:00 · ffe080adef
commit ffe080adef
parent 524580eb25
82 changed files with 2652 additions and 2956 deletions
--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@ -30,7 +30,69 @@ def _join_journey_times(
    return wide.join(journey_times, on="postcode", how="left")


-def _build_wide(
+_AREA_COLUMNS = [  # columns routed to the postcode-level frame, in output order; all others stay per-property
+    "Postcode",  # grouping key; the only area column also kept in the properties frame
+    "lat",
+    "lon",
+    # Transport (travel time in minutes to Bank and Fitzrovia)
+    "Public transport to Bank (mins)",
+    "Cycling to Bank (mins)",
+    "Public transport to Fitzrovia (mins)",
+    "Cycling to Fitzrovia (mins)",
+    # Deprivation scores
+    "Income Score (rate)",
+    "Employment Score (rate)",
+    "Education, Skills and Training Score",
+    "Health Deprivation and Disability Score",
+    "Living Environment Score",
+    "Indoors Sub-domain Score",
+    "Outdoors Sub-domain Score",
+    # Ethnicity (percentage columns)
+    "% Asian",
+    "% Black",
+    "% Mixed",
+    "% White",
+    "% Other",
+    # Crime (average per year, one column per category plus serious/minor totals)
+    "Anti-social behaviour (avg/yr)",
+    "Violence and sexual offences (avg/yr)",
+    "Criminal damage and arson (avg/yr)",
+    "Burglary (avg/yr)",
+    "Vehicle crime (avg/yr)",
+    "Robbery (avg/yr)",
+    "Other theft (avg/yr)",
+    "Shoplifting (avg/yr)",
+    "Drugs (avg/yr)",
+    "Possession of weapons (avg/yr)",
+    "Public order (avg/yr)",
+    "Bicycle theft (avg/yr)",
+    "Theft from the person (avg/yr)",
+    "Other crime (avg/yr)",
+    "Serious crime (avg/yr)",
+    "Minor crime (avg/yr)",
+    # Amenities (counts within 2km)
+    "Number of restaurants within 2km",
+    "Number of grocery shops and supermarkets within 2km",
+    "Number of parks within 2km",
+    "Number of public transport stations within 2km",
+    # Environment and connectivity (noise level, broadband speed)
+    "Noise (dB)",
+    "Max available download speed (Mbps)",
+    # Schools (within 5km)
+    "Good+ primary schools within 5km",
+    "Good+ secondary schools within 5km",
+    # GeoSure risk columns
+    "Environmental risk",
+    "Collapsible deposits risk",
+    "Compressible ground risk",
+    "Landslide risk",
+    "Running sand risk",
+    "Shrink-swell risk",
+    "Soluble rocks risk",
+]
+
+
+def _build(
    epc_pp_path: Path,
    arcgis_path: Path,
    iod_path: Path,
@ -44,8 +106,11 @@ def _build_wide(
    broadband_path: Path,
    geosure_path: Path,
    rental_prices_path: Path,
-) -> pl.DataFrame:
-    """Build the wide dataframe by joining epc_pp with all auxiliary data."""
+) -> tuple[pl.DataFrame, pl.DataFrame]:
+    """Build postcode and properties dataframes from epc_pp + auxiliary data.
+
+    Returns (postcode_df, properties_df).
+    """
    wide = pl.scan_parquet(epc_pp_path).filter(
        pl.col("total_floor_area").is_null()
        | (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
@ -180,7 +245,7 @@ def _build_wide(
        .group_by("bb_postcode")
        .agg(pl.col("max_download_speed").max())
    )
-    wide = wide.join(broadband, left_on="postcode", right_on="bb_postcode", how="left").drop("bb_postcode")
+    wide = wide.join(broadband, left_on="postcode", right_on="bb_postcode", how="left")

    geosure = pl.scan_parquet(geosure_path)
    wide = wide.join(geosure, on="postcode", how="left")
@ -280,7 +345,18 @@ def _build_wide(
    )

    print("Collecting with streaming engine...")
-    return wide.collect(engine="streaming")
+    df = wide.collect(engine="streaming")
+
+    # Split into postcode-level and property-level dataframes
+    area_cols = [c for c in _AREA_COLUMNS if c in df.columns]
+    postcode_df = df.select(area_cols).group_by("Postcode").first()
+    print(f"Postcode rows: {postcode_df.height} (unique postcodes)")
+
+    property_cols = [c for c in df.columns if c not in _AREA_COLUMNS or c == "Postcode"]
+    properties_df = df.select(property_cols)
+    print(f"Property rows: {properties_df.height}")
+
+    return postcode_df, properties_df


 def main():
@ -356,11 +432,14 @@ def main():
        help="ONS rental prices by LA and bedroom count parquet file",
    )
    parser.add_argument(
-        "--output", type=Path, required=True, help="Output parquet file path"
+        "--output-postcodes", type=Path, required=True, help="Output postcode parquet file path"
+    )
+    parser.add_argument(
+        "--output-properties", type=Path, required=True, help="Output properties parquet file path"
    )
    args = parser.parse_args()

-    wide = _build_wide(
+    postcode_df, properties_df = _build(
        epc_pp_path=args.epc_pp,
        arcgis_path=args.arcgis,
        iod_path=args.iod,
@ -376,13 +455,17 @@ def main():
        rental_prices_path=args.rental_prices,
    )

-    print(f"Columns: {wide.columns}")
-    print(f"Rows: {wide.height}")
+    print(f"\nPostcode columns: {postcode_df.columns}")
+    print(f"Postcode rows: {postcode_df.height}")
+    postcode_df.write_parquet(args.output_postcodes)
+    size_mb = args.output_postcodes.stat().st_size / (1024 * 1024)
+    print(f"Wrote {args.output_postcodes} ({size_mb:.1f} MB)")

-    wide.write_parquet(args.output)
-    size_mb = args.output.stat().st_size / (1024 * 1024)
-
-    print(f"Wrote {args.output} ({size_mb:.1f} MB)")
+    print(f"\nProperty columns: {properties_df.columns}")
+    print(f"Property rows: {properties_df.height}")
+    properties_df.write_parquet(args.output_properties)
+    size_mb = args.output_properties.stat().st_size / (1024 * 1024)
+    print(f"Wrote {args.output_properties} ({size_mb:.1f} MB)")


 if __name__ == "__main__":