Add crime per 1k people

This commit is contained in:
Andras Schmelczer 2026-03-08 21:02:29 +00:00
parent 245c16a212
commit e0798b24f7
4 changed files with 136 additions and 2 deletions

View file

@ -0,0 +1,71 @@
"""Download Census 2021 usual resident population by LSOA.
Source: NOMIS (ONS Census 2021 TS001 dataset)
License: Open Government Licence v3.0
"""
import argparse
from io import BytesIO
from pathlib import Path
import httpx
import polars as pl
# NOMIS API: Census 2021 TS001 (usual residents) by LSOA 2021 (TYPE151)
# c2021_restype_3=0 selects "Total: All usual residents"
# NOMIS paginates at 25,000 rows by default, so we paginate with recordoffset.
BASE_URL = "https://www.nomisweb.co.uk/api/v01/dataset/NM_2021_1.data.csv?date=latest&geography=TYPE151&measures=20100&c2021_restype_3=0&select=GEOGRAPHY_CODE,OBS_VALUE"
PAGE_SIZE = 25000
def download_and_convert(output_path: Path) -> None:
print("Downloading Census 2021 LSOA population from NOMIS...")
frames = []
offset = 0
while True:
url = f"{BASE_URL}&recordoffset={offset}"
response = httpx.get(url, follow_redirects=True, timeout=120)
response.raise_for_status()
if len(response.content) == 0:
break
chunk = pl.read_csv(BytesIO(response.content))
if chunk.height == 0:
break
frames.append(chunk)
print(f" Fetched {chunk.height} rows (offset={offset})")
if chunk.height < PAGE_SIZE:
break
offset += PAGE_SIZE
df = pl.concat(frames)
print(f"Total rows: {df.height}")
result = df.rename({"GEOGRAPHY_CODE": "lsoa21", "OBS_VALUE": "population"}).with_columns(
pl.col("population").cast(pl.UInt32),
)
# Filter to England only (E prefix)
result = result.filter(pl.col("lsoa21").str.starts_with("E"))
print(f"England LSOAs: {result.height}")
print(f"Population range: {result['population'].min()} - {result['population'].max()}")
print(f"Mean population: {result['population'].mean():.0f}")
output_path.parent.mkdir(parents=True, exist_ok=True)
result.write_parquet(output_path, compression="zstd")
print(f"Saved to {output_path}")
def main() -> None:
parser = argparse.ArgumentParser(
description="Download Census 2021 population by LSOA"
)
parser.add_argument(
"--output", type=Path, required=True, help="Output parquet file path"
)
args = parser.parse_args()
download_and_convert(args.output)
if __name__ == "__main__":
main()

View file

@ -43,6 +43,8 @@ _AREA_COLUMNS = [
"Other crime (avg/yr)",
"Serious crime (avg/yr)",
"Minor crime (avg/yr)",
"Serious crime per 1k residents (avg/yr)",
"Minor crime per 1k residents (avg/yr)",
# Amenities
"Number of restaurants within 2km",
"Number of grocery shops and supermarkets within 2km",
@ -77,6 +79,7 @@ def _build(
broadband_path: Path,
geosure_path: Path,
rental_prices_path: Path,
lsoa_population_path: Path,
) -> tuple[pl.DataFrame, pl.DataFrame]:
"""Build postcode and properties dataframes from epc_pp + auxiliary data.
@ -171,6 +174,17 @@ def _build(
).alias("minor_crime_avg_yr"),
)
lsoa_pop = pl.scan_parquet(lsoa_population_path)
wide = wide.join(lsoa_pop, on="lsoa21", how="left")
wide = wide.with_columns(
(pl.col("serious_crime_avg_yr") / pl.col("population") * 1000)
.round(1)
.alias("serious_crime_per_1k"),
(pl.col("minor_crime_avg_yr") / pl.col("population") * 1000)
.round(1)
.alias("minor_crime_per_1k"),
).drop("population")
poi_counts = pl.scan_parquet(poi_proximity_path)
wide = wide.join(poi_counts, on="postcode", how="left")
@ -301,6 +315,8 @@ def _build(
"max_download_speed": "Max available download speed (Mbps)",
"serious_crime_avg_yr": "Serious crime (avg/yr)",
"minor_crime_avg_yr": "Minor crime (avg/yr)",
"serious_crime_per_1k": "Serious crime per 1k residents (avg/yr)",
"minor_crime_per_1k": "Minor crime per 1k residents (avg/yr)",
"environmental_risk": "Environmental risk",
"collapsible_deposits_risk": "Collapsible deposits risk",
"compressible_ground_risk": "Compressible ground risk",
@ -389,6 +405,12 @@ def main():
required=True,
help="ONS rental prices by LA and bedroom count parquet file",
)
parser.add_argument(
"--lsoa-population",
type=Path,
required=True,
help="Census 2021 population by LSOA parquet file",
)
parser.add_argument(
"--output-postcodes", type=Path, required=True, help="Output postcode parquet file path"
)
@ -409,6 +431,7 @@ def main():
broadband_path=args.broadband,
geosure_path=args.geosure,
rental_prices_path=args.rental_prices,
lsoa_population_path=args.lsoa_population,
)
print(f"\nPostcode columns: {postcode_df.columns}")