From e0798b24f79b63cb9ee94d81c519711fea46a70b Mon Sep 17 00:00:00 2001
From: Andras Schmelczer
Date: Sun, 8 Mar 2026 21:02:29 +0000
Subject: [PATCH] Add crime per 1k people

---
 Makefile.data                        | 10 +++-
 pipeline/download/lsoa_population.py | 86 ++++++++++++++++++++++++++++
 pipeline/transform/merge.py          | 25 +++++++++
 server-rs/src/features.rs            | 34 +++++++++++++
 4 files changed, 153 insertions(+), 2 deletions(-)
 create mode 100644 pipeline/download/lsoa_population.py

diff --git a/Makefile.data b/Makefile.data
index 5eb8eb5..fcedd88 100644
--- a/Makefile.data
+++ b/Makefile.data
@@ -50,6 +50,7 @@ PBF := $(DATA_DIR)/great-britain-latest.osm.pbf
 PLACES := $(DATA_DIR)/places.parquet
 LISTINGS_BUY := $(DATA_DIR)/online_listings_buy.parquet
 LISTINGS_RENT := $(DATA_DIR)/online_listings_rent.parquet
+LSOA_POP := $(DATA_DIR)/lsoa_population.parquet
 
 # Sentinel files for directory targets (Make doesn't track directories well)
 GEOSURE_STAMP := $(GEOSURE_DIR)/.done
@@ -63,7 +64,7 @@ PMTILES_VERSION := 1.22.3
 	download-arcgis download-price-paid download-deprivation download-ethnicity \
 	download-naptan download-pois download-ofsted download-broadband download-rental-prices \
 	download-postcodes download-geosure download-noise download-inspire \
-	download-oa-boundaries download-uprn-lookup download-transit-network download-greenspace download-pbf download-places \
+	download-oa-boundaries download-uprn-lookup download-transit-network download-greenspace download-pbf download-places download-lsoa-population \
 	transform-pois transform-epc-pp transform-crime transform-poi-proximity \
 	transform-school-proximity transform-geosure transform-postcode-boundaries \
 	generate-postcode-boundaries
@@ -90,6 +91,7 @@ download-transit-network: $(TRANSIT_STAMP)
 download-greenspace: $(GREENSPACE)
 download-pbf: $(PBF)
 download-places: $(PLACES)
+download-lsoa-population: $(LSOA_POP)
 transform-pois: $(POIS_FILTERED)
 transform-epc-pp: $(EPC_PP)
 transform-crime: $(CRIME)
@@ -182,6 +184,9 @@ $(GREENSPACE): $(PBF)
 $(PLACES): $(PBF)
 	uv run python -m pipeline.download.places --output $@ --pbf $(PBF)
 
+$(LSOA_POP):
+	uv run python -m pipeline.download.lsoa_population --output $@
+
 # ── Transforms ────────────────────────────────────────────────────────────────
 
 $(POIS_FILTERED): $(POIS_RAW) $(NAPTAN)
@@ -228,7 +233,7 @@ $(PC_BOUNDARIES):
 # ── Final merge → postcode.parquet + properties.parquet ──────────────────────
 
 $(MERGE_STAMP): $(EPC_PP) $(ARCGIS) $(IOD) $(POI_PROXIMITY) \
-	$(ETHNICITY) $(CRIME) $(NOISE) $(SCHOOL_PROX) $(BROADBAND) $(GEOSURE) $(RENTAL)
+	$(ETHNICITY) $(CRIME) $(NOISE) $(SCHOOL_PROX) $(BROADBAND) $(GEOSURE) $(RENTAL) $(LSOA_POP)
 	uv run python -m pipeline.transform.merge \
 		--epc-pp $(EPC_PP) \
 		--arcgis $(ARCGIS) \
@@ -241,6 +246,7 @@ $(MERGE_STAMP): $(EPC_PP) $(ARCGIS) $(IOD) $(POI_PROXIMITY) \
 		--broadband $(BROADBAND) \
 		--geosure $(GEOSURE) \
 		--rental-prices $(RENTAL) \
+		--lsoa-population $(LSOA_POP) \
 		--output-postcodes $(POSTCODES_PQ) \
 		--output-properties $(PROPERTIES_PQ)
 	@touch $@
diff --git a/pipeline/download/lsoa_population.py b/pipeline/download/lsoa_population.py
new file mode 100644
index 0000000..1d29978
--- /dev/null
+++ b/pipeline/download/lsoa_population.py
@@ -0,0 +1,86 @@
+"""Download Census 2021 usual resident population by LSOA.
+
+Source: NOMIS (ONS Census 2021 — TS001 dataset)
+License: Open Government Licence v3.0
+"""
+
+import argparse
+from io import BytesIO
+from pathlib import Path
+
+import httpx
+import polars as pl
+
+# NOMIS API: Census 2021 TS001 (usual residents) by LSOA 2021 (TYPE151)
+# c2021_restype_3=0 selects "Total: All usual residents"
+# NOMIS paginates at 25,000 rows by default, so we paginate with recordoffset.
+BASE_URL = "https://www.nomisweb.co.uk/api/v01/dataset/NM_2021_1.data.csv?date=latest&geography=TYPE151&measures=20100&c2021_restype_3=0&select=GEOGRAPHY_CODE,OBS_VALUE"
+PAGE_SIZE = 25000
+
+
+def download_and_convert(output_path: Path) -> None:
+    """Download LSOA population counts page by page and save England's rows.
+
+    Args:
+        output_path: Destination parquet file; missing parent directories are created.
+
+    Raises:
+        httpx.HTTPStatusError: If NOMIS answers a page request with an error status.
+        RuntimeError: If NOMIS returns no rows at all.
+    """
+    print("Downloading Census 2021 LSOA population from NOMIS...")
+    frames = []
+    offset = 0
+    while True:
+        url = f"{BASE_URL}&recordoffset={offset}"
+        response = httpx.get(url, follow_redirects=True, timeout=120)
+        response.raise_for_status()
+        if not response.content:
+            break
+        chunk = pl.read_csv(BytesIO(response.content))
+        if chunk.height == 0:
+            break
+        frames.append(chunk)
+        print(f" Fetched {chunk.height} rows (offset={offset})")
+        # A page shorter than PAGE_SIZE is treated as the last one.
+        if chunk.height < PAGE_SIZE:
+            break
+        offset += PAGE_SIZE
+
+    # pl.concat rejects an empty list, so fail with a message that names the cause.
+    if not frames:
+        raise RuntimeError("NOMIS returned no rows for Census 2021 LSOA population")
+
+    df = pl.concat(frames)
+    print(f"Total rows: {df.height}")
+
+    result = df.rename({"GEOGRAPHY_CODE": "lsoa21", "OBS_VALUE": "population"}).with_columns(
+        pl.col("population").cast(pl.UInt32),
+    )
+
+    # Filter to England only (E prefix)
+    result = result.filter(pl.col("lsoa21").str.starts_with("E"))
+
+    print(f"England LSOAs: {result.height}")
+    print(f"Population range: {result['population'].min()} - {result['population'].max()}")
+    print(f"Mean population: {result['population'].mean():.0f}")
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    result.write_parquet(output_path, compression="zstd")
+    print(f"Saved to {output_path}")
+
+
+def main() -> None:
+    """Parse --output from the command line and run the download."""
+    parser = argparse.ArgumentParser(
+        description="Download Census 2021 population by LSOA"
+    )
+    parser.add_argument(
+        "--output", type=Path, required=True, help="Output parquet file path"
+    )
+    args = parser.parse_args()
+    download_and_convert(args.output)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pipeline/transform/merge.py b/pipeline/transform/merge.py
index 4dc520f..3ff9c81 100644
--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@@ -43,6 +43,8 @@ _AREA_COLUMNS = [
     "Other crime (avg/yr)",
     "Serious crime (avg/yr)",
     "Minor crime (avg/yr)",
+    "Serious crime per 1k residents (avg/yr)",
+    "Minor crime per 1k residents (avg/yr)",
     # Amenities
     "Number of restaurants within 2km",
     "Number of grocery shops and supermarkets within 2km",
@@ -77,6 +79,7 @@ def _build(
     broadband_path: Path,
     geosure_path: Path,
     rental_prices_path: Path,
+    lsoa_population_path: Path,
 ) -> tuple[pl.DataFrame, pl.DataFrame]:
     """Build postcode and properties dataframes from epc_pp + auxiliary data.
 
@@ -171,6 +174,19 @@ def _build(
         ).alias("minor_crime_avg_yr"),
     )
 
+    lsoa_pop = pl.scan_parquet(lsoa_population_path)
+    wide = wide.join(lsoa_pop, on="lsoa21", how="left")
+    # A zero population would divide to inf/NaN; leave those rates null instead.
+    residents = pl.when(pl.col("population") > 0).then(pl.col("population")).otherwise(None)
+    wide = wide.with_columns(
+        (pl.col("serious_crime_avg_yr") / residents * 1000)
+        .round(1)
+        .alias("serious_crime_per_1k"),
+        (pl.col("minor_crime_avg_yr") / residents * 1000)
+        .round(1)
+        .alias("minor_crime_per_1k"),
+    ).drop("population")
+
     poi_counts = pl.scan_parquet(poi_proximity_path)
     wide = wide.join(poi_counts, on="postcode", how="left")
 
@@ -301,6 +317,8 @@ def _build(
         "max_download_speed": "Max available download speed (Mbps)",
         "serious_crime_avg_yr": "Serious crime (avg/yr)",
         "minor_crime_avg_yr": "Minor crime (avg/yr)",
+        "serious_crime_per_1k": "Serious crime per 1k residents (avg/yr)",
+        "minor_crime_per_1k": "Minor crime per 1k residents (avg/yr)",
         "environmental_risk": "Environmental risk",
         "collapsible_deposits_risk": "Collapsible deposits risk",
         "compressible_ground_risk": "Compressible ground risk",
@@ -389,6 +407,12 @@ def main():
         required=True,
         help="ONS rental prices by LA and bedroom count parquet file",
     )
+    parser.add_argument(
+        "--lsoa-population",
+        type=Path,
+        required=True,
+        help="Census 2021 population by LSOA parquet file",
+    )
     parser.add_argument(
         "--output-postcodes", type=Path, required=True, help="Output postcode parquet file path"
     )
@@ -409,6 +433,7 @@ def main():
         broadband_path=args.broadband,
         geosure_path=args.geosure,
         rental_prices_path=args.rental_prices,
+        lsoa_population_path=args.lsoa_population,
     )
 
     print(f"\nPostcode columns: {postcode_df.columns}")
diff --git a/server-rs/src/features.rs b/server-rs/src/features.rs
index 5d747ad..c4f9140 100644
--- a/server-rs/src/features.rs
+++ b/server-rs/src/features.rs
@@ -606,6 +606,40 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
                 modes: &[],
                 linked: "",
             },
+            FeatureConfig {
+                name: "Serious crime per 1k residents (avg/yr)",
+                bounds: Bounds::Percentile {
+                    low: 2.0,
+                    high: 98.0,
+                },
+                step: 0.1,
+                description: "Serious crime rate per 1,000 residents per year",
+                detail: "Violence, robbery, burglary, and weapons possession per 1,000 usual residents per year in the LSOA. Uses police.uk street-level crime data (2023-2025) and Census 2021 population counts. Normalises for resident population so areas are comparable regardless of size.",
+                source: "crime",
+                prefix: "",
+                suffix: "/yr",
+                raw: false,
+                absolute: false,
+                modes: &[],
+                linked: "",
+            },
+            FeatureConfig {
+                name: "Minor crime per 1k residents (avg/yr)",
+                bounds: Bounds::Percentile {
+                    low: 2.0,
+                    high: 98.0,
+                },
+                step: 0.1,
+                description: "Minor crime rate per 1,000 residents per year",
+                detail: "Anti-social behaviour, shoplifting, bicycle theft, and other lower-severity crime per 1,000 usual residents per year in the LSOA. Uses police.uk street-level crime data (2023-2025) and Census 2021 population counts. Normalises for resident population so areas are comparable regardless of size.",
+                source: "crime",
+                prefix: "",
+                suffix: "/yr",
+                raw: false,
+                absolute: false,
+                modes: &[],
+                linked: "",
+            },
         ],
     },
     FeatureGroup {