This commit is contained in:
Andras Schmelczer 2026-02-18 21:22:15 +00:00
parent 524580eb25
commit ffe080adef
82 changed files with 2652 additions and 2956 deletions

View file

@ -30,7 +30,69 @@ def _join_journey_times(
return wide.join(journey_times, on="postcode", how="left")
def _build_wide(
_AREA_COLUMNS = [
"Postcode",
"lat",
"lon",
# Transport
"Public transport to Bank (mins)",
"Cycling to Bank (mins)",
"Public transport to Fitzrovia (mins)",
"Cycling to Fitzrovia (mins)",
# Deprivation
"Income Score (rate)",
"Employment Score (rate)",
"Education, Skills and Training Score",
"Health Deprivation and Disability Score",
"Living Environment Score",
"Indoors Sub-domain Score",
"Outdoors Sub-domain Score",
# Ethnicity
"% Asian",
"% Black",
"% Mixed",
"% White",
"% Other",
# Crime
"Anti-social behaviour (avg/yr)",
"Violence and sexual offences (avg/yr)",
"Criminal damage and arson (avg/yr)",
"Burglary (avg/yr)",
"Vehicle crime (avg/yr)",
"Robbery (avg/yr)",
"Other theft (avg/yr)",
"Shoplifting (avg/yr)",
"Drugs (avg/yr)",
"Possession of weapons (avg/yr)",
"Public order (avg/yr)",
"Bicycle theft (avg/yr)",
"Theft from the person (avg/yr)",
"Other crime (avg/yr)",
"Serious crime (avg/yr)",
"Minor crime (avg/yr)",
# Amenities
"Number of restaurants within 2km",
"Number of grocery shops and supermarkets within 2km",
"Number of parks within 2km",
"Number of public transport stations within 2km",
# Environment
"Noise (dB)",
"Max available download speed (Mbps)",
# Schools
"Good+ primary schools within 5km",
"Good+ secondary schools within 5km",
# GeoSure
"Environmental risk",
"Collapsible deposits risk",
"Compressible ground risk",
"Landslide risk",
"Running sand risk",
"Shrink-swell risk",
"Soluble rocks risk",
]
def _build(
epc_pp_path: Path,
arcgis_path: Path,
iod_path: Path,
@ -44,8 +106,11 @@ def _build_wide(
broadband_path: Path,
geosure_path: Path,
rental_prices_path: Path,
) -> pl.DataFrame:
"""Build the wide dataframe by joining epc_pp with all auxiliary data."""
) -> tuple[pl.DataFrame, pl.DataFrame]:
"""Build postcode and properties dataframes from epc_pp + auxiliary data.
Returns (postcode_df, properties_df).
"""
wide = pl.scan_parquet(epc_pp_path).filter(
pl.col("total_floor_area").is_null()
| (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
@ -180,7 +245,7 @@ def _build_wide(
.group_by("bb_postcode")
.agg(pl.col("max_download_speed").max())
)
wide = wide.join(broadband, left_on="postcode", right_on="bb_postcode", how="left").drop("bb_postcode")
wide = wide.join(broadband, left_on="postcode", right_on="bb_postcode", how="left")
geosure = pl.scan_parquet(geosure_path)
wide = wide.join(geosure, on="postcode", how="left")
@ -280,7 +345,18 @@ def _build_wide(
)
print("Collecting with streaming engine...")
return wide.collect(engine="streaming")
df = wide.collect(engine="streaming")
# Split into postcode-level and property-level dataframes
area_cols = [c for c in _AREA_COLUMNS if c in df.columns]
postcode_df = df.select(area_cols).group_by("Postcode").first()
print(f"Postcode rows: {postcode_df.height} (unique postcodes)")
property_cols = [c for c in df.columns if c not in _AREA_COLUMNS or c == "Postcode"]
properties_df = df.select(property_cols)
print(f"Property rows: {properties_df.height}")
return postcode_df, properties_df
def main():
@ -356,11 +432,14 @@ def main():
help="ONS rental prices by LA and bedroom count parquet file",
)
parser.add_argument(
"--output", type=Path, required=True, help="Output parquet file path"
"--output-postcodes", type=Path, required=True, help="Output postcode parquet file path"
)
parser.add_argument(
"--output-properties", type=Path, required=True, help="Output properties parquet file path"
)
args = parser.parse_args()
wide = _build_wide(
postcode_df, properties_df = _build(
epc_pp_path=args.epc_pp,
arcgis_path=args.arcgis,
iod_path=args.iod,
@ -376,13 +455,17 @@ def main():
rental_prices_path=args.rental_prices,
)
print(f"Columns: {wide.columns}")
print(f"Rows: {wide.height}")
print(f"\nPostcode columns: {postcode_df.columns}")
print(f"Postcode rows: {postcode_df.height}")
postcode_df.write_parquet(args.output_postcodes)
size_mb = args.output_postcodes.stat().st_size / (1024 * 1024)
print(f"Wrote {args.output_postcodes} ({size_mb:.1f} MB)")
wide.write_parquet(args.output)
size_mb = args.output.stat().st_size / (1024 * 1024)
print(f"Wrote {args.output} ({size_mb:.1f} MB)")
print(f"\nProperty columns: {properties_df.columns}")
print(f"Property rows: {properties_df.height}")
properties_df.write_parquet(args.output_properties)
size_mb = args.output_properties.stat().st_size / (1024 * 1024)
print(f"Wrote {args.output_properties} ({size_mb:.1f} MB)")
if __name__ == "__main__":