85 lines
2.8 KiB
Python
85 lines
2.8 KiB
Python
"""Download the ONS LSOA 2011 → LSOA 2021 lookup.

Source: ONS Open Geography Portal (LSOA11_LSOA21_LAD22_EW_LU_v5)
License: Open Government Licence v3.0

The lookup tells us how 2011 LSOA boundaries map to 2021 ones. We use it to
remap older crime data (police.uk reported in 2011 codes pre-2022) into the
2021 codes the rest of the pipeline keys on, so the crime-over-time chart can
show the full history instead of only post-boundary-change years.

CHGIND values: U (unchanged), S (split into multiple 2021 LSOAs), M (multiple
2011 LSOAs merged into one 2021 LSOA), X (irregular reshape).
"""
|
|
|
|
import argparse
|
|
from pathlib import Path
|
|
|
|
import httpx
|
|
import polars as pl
|
|
|
|
# ArcGIS FeatureServer query endpoint serving the LSOA11 → LSOA21 lookup table.
BASE_URL = (
    "https://services1.arcgis.com/ESMARspQHYMw9BZ9"
    "/arcgis/rest/services"
    "/LSOA11_LSOA21_LAD22_EW_LU_v5"
    "/FeatureServer/0/query"
)

# Number of records requested per page (sent as ``resultRecordCount``).
PAGE_SIZE = 2_000
|
|
|
|
|
|
def download(output_path: Path) -> None:
    """Download the ONS LSOA 2011 → 2021 lookup and save it as Parquet.

    Pages through the ArcGIS query endpoint at ``BASE_URL``, ``PAGE_SIZE``
    records at a time, ordered by ``LSOA11CD`` so paging is stable. Rows
    missing either the 2011 or the 2021 code are dropped, and only rows whose
    2011 code starts with ``"E"`` are kept. The result has the columns
    ``lsoa11``, ``lsoa21`` and ``chgind`` and is written zstd-compressed to
    ``output_path``; missing parent directories are created.

    Args:
        output_path: Destination path of the Parquet file.

    Raises:
        httpx.HTTPStatusError: If a page request returns an error status.
        RuntimeError: If the service reports an error in the response body,
            or if no usable rows were returned at all.
    """
    print("Downloading ONS LSOA 2011 → 2021 lookup...")
    rows: list[dict[str, str]] = []
    offset = 0

    # One client for all pages so the underlying connection is reused.
    with httpx.Client(timeout=60) as client:
        while True:
            params = {
                "where": "1=1",
                "outFields": "LSOA11CD,LSOA21CD,CHGIND",
                "returnGeometry": "false",
                "orderByFields": "LSOA11CD",
                "f": "json",
                "resultRecordCount": str(PAGE_SIZE),
                "resultOffset": str(offset),
            }
            response = client.get(BASE_URL, params=params)
            response.raise_for_status()
            data = response.json()

            # ArcGIS can report a failed query as an ``error`` object in the
            # JSON body even when the HTTP status is 200. Without this check
            # such a page looks like "no more features" and the download would
            # silently stop with a truncated lookup.
            error = data.get("error")
            if error:
                raise RuntimeError(
                    f"ONS lookup query failed at offset={offset}: {error}"
                )

            features = data.get("features", [])
            if not features:
                break

            for feat in features:
                attrs = feat.get("attributes", {})
                if attrs.get("LSOA11CD") and attrs.get("LSOA21CD"):
                    rows.append(
                        {
                            "lsoa11": attrs["LSOA11CD"],
                            "lsoa21": attrs["LSOA21CD"],
                            # A missing change indicator is treated as "U"
                            # (unchanged).
                            "chgind": attrs.get("CHGIND") or "U",
                        }
                    )
            print(f" Fetched {len(features)} rows (offset={offset})")

            # Stop only when the server does not flag more records AND the
            # page came back short; otherwise keep paging.
            if not data.get("exceededTransferLimit") and len(features) < PAGE_SIZE:
                break
            offset += len(features)

    if not rows:
        raise RuntimeError("ONS lookup returned no rows")

    df = pl.DataFrame(rows)
    # England-only matches the rest of the pipeline.
    df = df.filter(pl.col("lsoa11").str.starts_with("E"))
    print(f"England LSOA mappings: {df.height}")
    print(f" CHGIND breakdown: {df.group_by('chgind').len().sort('chgind').to_dicts()}")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")
|
|
|
|
|
|
def main() -> None:
    """Parse the command line and download the lookup to ``--output``."""
    cli = argparse.ArgumentParser(
        description="Download ONS LSOA 2011 → 2021 lookup",
    )
    cli.add_argument("--output", type=Path, required=True)
    download(cli.parse_args().output)
|
|
|
|
|
|
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|