Add crime per 1k people

This commit is contained in:
Andras Schmelczer 2026-03-08 21:02:29 +00:00
parent 245c16a212
commit e0798b24f7
4 changed files with 136 additions and 2 deletions

View file

@ -50,6 +50,7 @@ PBF := $(DATA_DIR)/great-britain-latest.osm.pbf
PLACES := $(DATA_DIR)/places.parquet
LISTINGS_BUY := $(DATA_DIR)/online_listings_buy.parquet
LISTINGS_RENT := $(DATA_DIR)/online_listings_rent.parquet
LSOA_POP := $(DATA_DIR)/lsoa_population.parquet
# Sentinel files for directory targets (Make doesn't track directories well)
GEOSURE_STAMP := $(GEOSURE_DIR)/.done
@ -63,7 +64,7 @@ PMTILES_VERSION := 1.22.3
download-arcgis download-price-paid download-deprivation download-ethnicity \
download-naptan download-pois download-ofsted download-broadband download-rental-prices \
download-postcodes download-geosure download-noise download-inspire \
download-oa-boundaries download-uprn-lookup download-transit-network download-greenspace download-pbf download-places \
download-oa-boundaries download-uprn-lookup download-transit-network download-greenspace download-pbf download-places download-lsoa-population \
transform-pois transform-epc-pp transform-crime transform-poi-proximity \
transform-school-proximity transform-geosure transform-postcode-boundaries \
generate-postcode-boundaries
@ -90,6 +91,7 @@ download-transit-network: $(TRANSIT_STAMP)
download-greenspace: $(GREENSPACE)
download-pbf: $(PBF)
download-places: $(PLACES)
download-lsoa-population: $(LSOA_POP)
transform-pois: $(POIS_FILTERED)
transform-epc-pp: $(EPC_PP)
transform-crime: $(CRIME)
@ -182,6 +184,9 @@ $(GREENSPACE): $(PBF)
$(PLACES): $(PBF)
uv run python -m pipeline.download.places --output $@ --pbf $(PBF)
# LSOA population parquet, produced by pipeline.download.lsoa_population.
# The rule has no prerequisites, so Make only runs the download when the
# output file does not exist yet; delete the file to force a refresh.
$(LSOA_POP):
uv run python -m pipeline.download.lsoa_population --output $@
# ── Transforms ────────────────────────────────────────────────────────────────
$(POIS_FILTERED): $(POIS_RAW) $(NAPTAN)
@ -228,7 +233,7 @@ $(PC_BOUNDARIES):
# ── Final merge → postcode.parquet + properties.parquet ──────────────────────
$(MERGE_STAMP): $(EPC_PP) $(ARCGIS) $(IOD) $(POI_PROXIMITY) \
$(ETHNICITY) $(CRIME) $(NOISE) $(SCHOOL_PROX) $(BROADBAND) $(GEOSURE) $(RENTAL)
$(ETHNICITY) $(CRIME) $(NOISE) $(SCHOOL_PROX) $(BROADBAND) $(GEOSURE) $(RENTAL) $(LSOA_POP)
uv run python -m pipeline.transform.merge \
--epc-pp $(EPC_PP) \
--arcgis $(ARCGIS) \
@ -241,6 +246,7 @@ $(MERGE_STAMP): $(EPC_PP) $(ARCGIS) $(IOD) $(POI_PROXIMITY) \
--broadband $(BROADBAND) \
--geosure $(GEOSURE) \
--rental-prices $(RENTAL) \
--lsoa-population $(LSOA_POP) \
--output-postcodes $(POSTCODES_PQ) \
--output-properties $(PROPERTIES_PQ)
@touch $@

View file

@ -0,0 +1,71 @@
"""Download Census 2021 usual resident population by LSOA.
Source: NOMIS (ONS Census 2021 TS001 dataset)
License: Open Government Licence v3.0
"""
import argparse
from io import BytesIO
from pathlib import Path
import httpx
import polars as pl
# NOMIS API: Census 2021 TS001 (usual residents) by LSOA 2021 (TYPE151)
# c2021_restype_3=0 selects "Total: All usual residents"
# select=GEOGRAPHY_CODE,OBS_VALUE: presumably limits the CSV to these two columns,
# which are the only ones download_and_convert renames and keeps.
# NOMIS paginates at 25,000 rows by default, so we paginate with recordoffset.
BASE_URL = "https://www.nomisweb.co.uk/api/v01/dataset/NM_2021_1.data.csv?date=latest&geography=TYPE151&measures=20100&c2021_restype_3=0&select=GEOGRAPHY_CODE,OBS_VALUE"
# Rows expected per page. The URL above does not request a page size itself, so
# this value has to match the server's default: it is used both as the
# recordoffset increment and as the "short page means last page" threshold.
PAGE_SIZE = 25000
def download_and_convert(output_path: Path) -> None:
    """Download Census 2021 population per LSOA from NOMIS and save it as parquet.

    Pages through the CSV endpoint at ``BASE_URL`` by increasing ``recordoffset``
    in steps of ``PAGE_SIZE`` until an empty or short page is returned. The
    combined rows are renamed to ``lsoa21`` / ``population`` (cast to UInt32),
    filtered to codes starting with "E" (England), and written with zstd
    compression.

    Args:
        output_path: Destination parquet file. Missing parent directories are
            created. The file only appears at this path once it has been
            written completely.

    Raises:
        httpx.HTTPStatusError: If any page request returns an error status.
        RuntimeError: If NOMIS returned no rows at all, or if no English LSOA
            rows remain after filtering.
    """
    print("Downloading Census 2021 LSOA population from NOMIS...")

    frames = []
    offset = 0
    # One client for every page so the underlying connection is reused.
    with httpx.Client(follow_redirects=True, timeout=120) as client:
        while True:
            response = client.get(f"{BASE_URL}&recordoffset={offset}")
            response.raise_for_status()

            # An empty (or whitespace-only) body means we ran past the last page.
            if not response.content.strip():
                break

            chunk = pl.read_csv(BytesIO(response.content))
            if chunk.height == 0:
                break

            frames.append(chunk)
            print(f"  Fetched {chunk.height} rows (offset={offset})")

            # A short page is the last page; skip the extra round trip.
            if chunk.height < PAGE_SIZE:
                break
            offset += PAGE_SIZE

    # pl.concat rejects an empty list with an unhelpful message; fail clearly.
    if not frames:
        raise RuntimeError("NOMIS returned no rows for the LSOA population query")

    df = pl.concat(frames)
    print(f"Total rows: {df.height}")

    result = df.rename({"GEOGRAPHY_CODE": "lsoa21", "OBS_VALUE": "population"}).with_columns(
        pl.col("population").cast(pl.UInt32),
    )

    # Filter to England only (E prefix)
    result = result.filter(pl.col("lsoa21").str.starts_with("E"))
    if result.height == 0:
        # Without this guard the summary below would fail formatting a None mean.
        raise RuntimeError("No English LSOA rows found in the NOMIS response")

    print(f"England LSOAs: {result.height}")
    print(f"Population range: {result['population'].min()} - {result['population'].max()}")
    print(f"Mean population: {result['population'].mean():.0f}")

    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Write to a sibling temp file and rename it into place, so an interrupted
    # run never leaves a truncated parquet that the build treats as up to date.
    tmp_path = output_path.with_name(f"{output_path.name}.tmp")
    try:
        result.write_parquet(tmp_path, compression="zstd")
        tmp_path.replace(output_path)
    finally:
        # No-op after a successful rename; cleans up after a failed write.
        tmp_path.unlink(missing_ok=True)

    print(f"Saved to {output_path}")
def main() -> None:
    """Command-line entry point: parse ``--output`` and run the download."""
    cli = argparse.ArgumentParser(description="Download Census 2021 population by LSOA")
    cli.add_argument(
        "--output",
        required=True,
        type=Path,
        help="Output parquet file path",
    )
    options = cli.parse_args()
    download_and_convert(options.output)


if __name__ == "__main__":
    main()

View file

@ -43,6 +43,8 @@ _AREA_COLUMNS = [
"Other crime (avg/yr)",
"Serious crime (avg/yr)",
"Minor crime (avg/yr)",
"Serious crime per 1k residents (avg/yr)",
"Minor crime per 1k residents (avg/yr)",
# Amenities
"Number of restaurants within 2km",
"Number of grocery shops and supermarkets within 2km",
@ -77,6 +79,7 @@ def _build(
broadband_path: Path,
geosure_path: Path,
rental_prices_path: Path,
lsoa_population_path: Path,
) -> tuple[pl.DataFrame, pl.DataFrame]:
"""Build postcode and properties dataframes from epc_pp + auxiliary data.
@ -171,6 +174,17 @@ def _build(
).alias("minor_crime_avg_yr"),
)
lsoa_pop = pl.scan_parquet(lsoa_population_path)
wide = wide.join(lsoa_pop, on="lsoa21", how="left")
wide = wide.with_columns(
(pl.col("serious_crime_avg_yr") / pl.col("population") * 1000)
.round(1)
.alias("serious_crime_per_1k"),
(pl.col("minor_crime_avg_yr") / pl.col("population") * 1000)
.round(1)
.alias("minor_crime_per_1k"),
).drop("population")
poi_counts = pl.scan_parquet(poi_proximity_path)
wide = wide.join(poi_counts, on="postcode", how="left")
@ -301,6 +315,8 @@ def _build(
"max_download_speed": "Max available download speed (Mbps)",
"serious_crime_avg_yr": "Serious crime (avg/yr)",
"minor_crime_avg_yr": "Minor crime (avg/yr)",
"serious_crime_per_1k": "Serious crime per 1k residents (avg/yr)",
"minor_crime_per_1k": "Minor crime per 1k residents (avg/yr)",
"environmental_risk": "Environmental risk",
"collapsible_deposits_risk": "Collapsible deposits risk",
"compressible_ground_risk": "Compressible ground risk",
@ -389,6 +405,12 @@ def main():
required=True,
help="ONS rental prices by LA and bedroom count parquet file",
)
parser.add_argument(
"--lsoa-population",
type=Path,
required=True,
help="Census 2021 population by LSOA parquet file",
)
parser.add_argument(
"--output-postcodes", type=Path, required=True, help="Output postcode parquet file path"
)
@ -409,6 +431,7 @@ def main():
broadband_path=args.broadband,
geosure_path=args.geosure,
rental_prices_path=args.rental_prices,
lsoa_population_path=args.lsoa_population,
)
print(f"\nPostcode columns: {postcode_df.columns}")

View file

@ -606,6 +606,40 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
modes: &[],
linked: "",
},
FeatureConfig {
name: "Serious crime per 1k residents (avg/yr)",
bounds: Bounds::Percentile {
low: 2.0,
high: 98.0,
},
step: 0.1,
description: "Serious crime rate per 1,000 residents per year",
detail: "Violence, robbery, burglary, and weapons possession per 1,000 usual residents per year in the LSOA. Uses police.uk street-level crime data (2023-2025) and Census 2021 population counts. Normalises for population density so areas are comparable regardless of size.",
source: "crime",
prefix: "",
suffix: "/yr",
raw: false,
absolute: false,
modes: &[],
linked: "",
},
FeatureConfig {
name: "Minor crime per 1k residents (avg/yr)",
bounds: Bounds::Percentile {
low: 2.0,
high: 98.0,
},
step: 0.1,
description: "Minor crime rate per 1,000 residents per year",
detail: "Anti-social behaviour, shoplifting, bicycle theft, and other lower-severity crime per 1,000 usual residents per year in the LSOA. Uses police.uk street-level crime data (2023-2025) and Census 2021 population counts. Normalises for population density so areas are comparable regardless of size.",
source: "crime",
prefix: "",
suffix: "/yr",
raw: false,
absolute: false,
modes: &[],
linked: "",
},
],
},
FeatureGroup {