changes
This commit is contained in:
parent
524580eb25
commit
ffe080adef
82 changed files with 2652 additions and 2956 deletions
|
|
@ -30,7 +30,69 @@ def _join_journey_times(
|
|||
return wide.join(journey_times, on="postcode", how="left")
|
||||
|
||||
|
||||
def _build_wide(
|
||||
# Columns that describe a postcode (area) rather than an individual property.
# The split step in this file selects these columns (those actually present in
# the collected frame) to build the postcode-level dataframe, and excludes
# them -- except "Postcode", which is kept in both outputs -- from the
# property-level dataframe.
_AREA_COLUMNS = [
    "Postcode",
    "lat",
    "lon",
    # Transport
    "Public transport to Bank (mins)",
    "Cycling to Bank (mins)",
    "Public transport to Fitzrovia (mins)",
    "Cycling to Fitzrovia (mins)",
    # Deprivation
    "Income Score (rate)",
    "Employment Score (rate)",
    "Education, Skills and Training Score",
    "Health Deprivation and Disability Score",
    "Living Environment Score",
    "Indoors Sub-domain Score",
    "Outdoors Sub-domain Score",
    # Ethnicity
    "% Asian",
    "% Black",
    "% Mixed",
    "% White",
    "% Other",
    # Crime
    "Anti-social behaviour (avg/yr)",
    "Violence and sexual offences (avg/yr)",
    "Criminal damage and arson (avg/yr)",
    "Burglary (avg/yr)",
    "Vehicle crime (avg/yr)",
    "Robbery (avg/yr)",
    "Other theft (avg/yr)",
    "Shoplifting (avg/yr)",
    "Drugs (avg/yr)",
    "Possession of weapons (avg/yr)",
    "Public order (avg/yr)",
    "Bicycle theft (avg/yr)",
    "Theft from the person (avg/yr)",
    "Other crime (avg/yr)",
    "Serious crime (avg/yr)",
    "Minor crime (avg/yr)",
    # Amenities
    "Number of restaurants within 2km",
    "Number of grocery shops and supermarkets within 2km",
    "Number of parks within 2km",
    "Number of public transport stations within 2km",
    # Environment
    "Noise (dB)",
    "Max available download speed (Mbps)",
    # Schools
    "Good+ primary schools within 5km",
    "Good+ secondary schools within 5km",
    # GeoSure
    "Environmental risk",
    "Collapsible deposits risk",
    "Compressible ground risk",
    "Landslide risk",
    "Running sand risk",
    "Shrink-swell risk",
    "Soluble rocks risk",
]
|
||||
|
||||
|
||||
def _build(
|
||||
epc_pp_path: Path,
|
||||
arcgis_path: Path,
|
||||
iod_path: Path,
|
||||
|
|
@ -44,8 +106,11 @@ def _build_wide(
|
|||
broadband_path: Path,
|
||||
geosure_path: Path,
|
||||
rental_prices_path: Path,
|
||||
) -> pl.DataFrame:
|
||||
"""Build the wide dataframe by joining epc_pp with all auxiliary data."""
|
||||
) -> tuple[pl.DataFrame, pl.DataFrame]:
|
||||
"""Build postcode and properties dataframes from epc_pp + auxiliary data.
|
||||
|
||||
Returns (postcode_df, properties_df).
|
||||
"""
|
||||
wide = pl.scan_parquet(epc_pp_path).filter(
|
||||
pl.col("total_floor_area").is_null()
|
||||
| (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
|
||||
|
|
@ -180,7 +245,7 @@ def _build_wide(
|
|||
.group_by("bb_postcode")
|
||||
.agg(pl.col("max_download_speed").max())
|
||||
)
|
||||
wide = wide.join(broadband, left_on="postcode", right_on="bb_postcode", how="left").drop("bb_postcode")
|
||||
wide = wide.join(broadband, left_on="postcode", right_on="bb_postcode", how="left")
|
||||
|
||||
geosure = pl.scan_parquet(geosure_path)
|
||||
wide = wide.join(geosure, on="postcode", how="left")
|
||||
|
|
@ -280,7 +345,18 @@ def _build_wide(
|
|||
)
|
||||
|
||||
print("Collecting with streaming engine...")
|
||||
return wide.collect(engine="streaming")
|
||||
df = wide.collect(engine="streaming")
|
||||
|
||||
# Split into postcode-level and property-level dataframes
|
||||
area_cols = [c for c in _AREA_COLUMNS if c in df.columns]
|
||||
postcode_df = df.select(area_cols).group_by("Postcode").first()
|
||||
print(f"Postcode rows: {postcode_df.height} (unique postcodes)")
|
||||
|
||||
property_cols = [c for c in df.columns if c not in _AREA_COLUMNS or c == "Postcode"]
|
||||
properties_df = df.select(property_cols)
|
||||
print(f"Property rows: {properties_df.height}")
|
||||
|
||||
return postcode_df, properties_df
|
||||
|
||||
|
||||
def main():
|
||||
|
|
@ -356,11 +432,14 @@ def main():
|
|||
help="ONS rental prices by LA and bedroom count parquet file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", type=Path, required=True, help="Output parquet file path"
|
||||
"--output-postcodes", type=Path, required=True, help="Output postcode parquet file path"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-properties", type=Path, required=True, help="Output properties parquet file path"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
wide = _build_wide(
|
||||
postcode_df, properties_df = _build(
|
||||
epc_pp_path=args.epc_pp,
|
||||
arcgis_path=args.arcgis,
|
||||
iod_path=args.iod,
|
||||
|
|
@ -376,13 +455,17 @@ def main():
|
|||
rental_prices_path=args.rental_prices,
|
||||
)
|
||||
|
||||
print(f"Columns: {wide.columns}")
|
||||
print(f"Rows: {wide.height}")
|
||||
print(f"\nPostcode columns: {postcode_df.columns}")
|
||||
print(f"Postcode rows: {postcode_df.height}")
|
||||
postcode_df.write_parquet(args.output_postcodes)
|
||||
size_mb = args.output_postcodes.stat().st_size / (1024 * 1024)
|
||||
print(f"Wrote {args.output_postcodes} ({size_mb:.1f} MB)")
|
||||
|
||||
wide.write_parquet(args.output)
|
||||
size_mb = args.output.stat().st_size / (1024 * 1024)
|
||||
|
||||
print(f"Wrote {args.output} ({size_mb:.1f} MB)")
|
||||
print(f"\nProperty columns: {properties_df.columns}")
|
||||
print(f"Property rows: {properties_df.height}")
|
||||
properties_df.write_parquet(args.output_properties)
|
||||
size_mb = args.output_properties.stat().st_size / (1024 * 1024)
|
||||
print(f"Wrote {args.output_properties} ({size_mb:.1f} MB)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue