has issues
This commit is contained in:
parent
2e112d7398
commit
c645b0f1d4
96 changed files with 2147083 additions and 5787 deletions
85
pipeline/download/lsoa_2011_to_2021.py
Normal file
85
pipeline/download/lsoa_2011_to_2021.py
Normal file
|
|
@ -0,0 +1,85 @@
|
|||
"""Download the ONS LSOA 2011 → LSOA 2021 lookup.
|
||||
|
||||
Source: ONS Open Geography Portal (LSOA11_LSOA21_LAD22_EW_LU_v5)
|
||||
License: Open Government Licence v3.0
|
||||
|
||||
The lookup tells us how 2011 LSOA boundaries map to 2021 ones. We use it to
|
||||
remap older crime data (police.uk reported in 2011 codes pre-2022) into the
|
||||
2021 codes the rest of the pipeline keys on, so the crime-over-time chart can
|
||||
show the full history instead of only post-boundary-change years.
|
||||
|
||||
CHGIND values: U (unchanged), S (split into multiple 2021), M (multiple 2011
|
||||
merged into one 2021), X (irregular reshape).
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
import polars as pl
|
||||
|
||||
# ArcGIS FeatureServer query endpoint for the ONS LSOA11 -> LSOA21 lookup table.
BASE_URL = (
    "https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/"
    "LSOA11_LSOA21_LAD22_EW_LU_v5/FeatureServer/0/query"
)

# Number of rows requested per page (sent as ``resultRecordCount``).
PAGE_SIZE = 2000
|
||||
|
||||
|
||||
def download(output_path: Path) -> None:
    """Download the ONS LSOA 2011 → 2021 lookup and write it as Parquet.

    Pages through the FeatureServer query endpoint at ``BASE_URL`` in chunks
    of ``PAGE_SIZE`` rows, keeps the rows that carry both an ``LSOA11CD`` and
    an ``LSOA21CD``, filters to codes starting with ``"E"``, and writes the
    result (columns ``lsoa11``, ``lsoa21``, ``chgind``) to ``output_path``
    with zstd compression. Parent directories are created if missing.

    Args:
        output_path: Destination Parquet file.

    Raises:
        httpx.HTTPStatusError: If a page request returns a non-2xx status.
        RuntimeError: If the service reports an error in the response body,
            or if no usable rows were returned at all.
    """
    print("Downloading ONS LSOA 2011 → 2021 lookup...")
    rows: list[dict[str, str]] = []
    offset = 0

    # One client for the whole download so the connection is reused per page.
    with httpx.Client(timeout=60) as client:
        while True:
            params = {
                "where": "1=1",
                "outFields": "LSOA11CD,LSOA21CD,CHGIND",
                "returnGeometry": "false",
                # A stable ordering keeps offset-based paging consistent.
                "orderByFields": "LSOA11CD",
                "f": "json",
                "resultRecordCount": str(PAGE_SIZE),
                "resultOffset": str(offset),
            }
            response = client.get(BASE_URL, params=params)
            response.raise_for_status()
            data = response.json()

            # ArcGIS REST services can report a failed query as HTTP 200 with
            # an "error" object in the body. Without this check such a
            # response looks like an empty page and would silently truncate
            # the lookup part-way through.
            error = data.get("error")
            if error:
                raise RuntimeError(
                    f"ONS lookup query failed at offset {offset}: {error}"
                )

            features = data.get("features", [])
            if not features:
                break

            for feat in features:
                attrs = feat.get("attributes", {})
                # Skip rows missing either side of the mapping.
                if not (attrs.get("LSOA11CD") and attrs.get("LSOA21CD")):
                    continue
                rows.append(
                    {
                        "lsoa11": attrs["LSOA11CD"],
                        "lsoa21": attrs["LSOA21CD"],
                        # A missing/empty change indicator is recorded as "U".
                        "chgind": attrs.get("CHGIND") or "U",
                    }
                )
            print(f" Fetched {len(features)} rows (offset={offset})")

            # Stop only when the server does not flag more data AND the page
            # came back short; a short page alone is not treated as the end
            # because the server may return fewer rows than requested.
            if not data.get("exceededTransferLimit") and len(features) < PAGE_SIZE:
                break
            offset += len(features)

    if not rows:
        raise RuntimeError("ONS lookup returned no rows")

    df = pl.DataFrame(rows)
    # England-only matches the rest of the pipeline.
    df = df.filter(pl.col("lsoa11").str.starts_with("E"))
    print(f"England LSOA mappings: {df.height}")
    print(f" CHGIND breakdown: {df.group_by('chgind').len().sort('chgind').to_dicts()}")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point.

    Reads the single required ``--output`` option (parsed as a ``Path``) and
    passes it to ``download``.
    """
    cli = argparse.ArgumentParser(
        description="Download ONS LSOA 2011 → 2021 lookup",
    )
    cli.add_argument("--output", type=Path, required=True)
    options = cli.parse_args()
    download(options.output)
|
||||
|
||||
|
||||
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue