All good

2026-05-18 21:20:10 +01:00 · 2026-05-18 21:20:10 +01:00 · 6cc7288126
commit 6cc7288126
parent 6ea544a0f6
45 changed files with 929 additions and 1043 deletions
--- a/pipeline/init.py
+++ b/pipeline/init.py
@ -0,0 +1,3 @@
+from .local_temp import configure_tempfile_defaults
+
+configure_tempfile_defaults()
--- a/pipeline/download/arcgis.py
+++ b/pipeline/download/arcgis.py
@ -3,6 +3,7 @@ import tempfile
 import polars as pl
 from pathlib import Path

+from pipeline.local_temp import local_tmp_dir
 from pipeline.utils import download, extract_zip

 URL = "https://www.arcgis.com/sharing/rest/content/items/36b718ad00de49afb9ad364f8b815b9e/data"
@ -40,7 +41,7 @@ def main() -> None:
    )
    args = parser.parse_args()

-    with tempfile.TemporaryDirectory() as cache_dir:
+    with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
        download_path = Path(cache_dir) / "arcgis_data.zip"
        extract_path = Path(cache_dir) / "arcgis_extracted"

--- a/pipeline/download/broadband.py
+++ b/pipeline/download/broadband.py
@ -7,6 +7,7 @@ from pathlib import Path

 import httpx

+from pipeline.local_temp import local_tmp_dir
 from pipeline.utils import download, extract_zip

 # Ofcom Connected Nations 2025 - Fixed broadband performance (output area & local authority level)
@ -84,7 +85,7 @@ def main() -> None:
    )
    args = parser.parse_args()

-    with tempfile.TemporaryDirectory() as cache_dir:
+    with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
        cache = Path(cache_dir)
        zip_path = cache / "broadband_performance.zip"
        extract_dir = cache / "extracted"
--- a/pipeline/download/deprivation_data.py
+++ b/pipeline/download/deprivation_data.py
@ -3,6 +3,7 @@ import tempfile
 import polars as pl
 from pathlib import Path

+from pipeline.local_temp import local_tmp_dir
 from pipeline.utils import download

 URL = "https://assets.publishing.service.gov.uk/media/691ded34513046b952c500bd/File_5_IoD2025_Scores_for_the_Indices_of_Deprivation.xlsx"
@ -33,7 +34,7 @@ def main() -> None:
    )
    args = parser.parse_args()

-    with tempfile.TemporaryDirectory() as cache_dir:
+    with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
        xlsx_path = Path(cache_dir) / "IoD2025_Scores.xlsx"
        download(URL, xlsx_path, timeout=60)
        convert_to_parquet(xlsx_path, args.output)
--- a/pipeline/download/geolytix_retail_points.py
+++ b/pipeline/download/geolytix_retail_points.py
@ -8,6 +8,7 @@ from zipfile import ZipFile

 import polars as pl

+from pipeline.local_temp import local_tmp_dir
 from pipeline.utils.download import download


@ -70,7 +71,9 @@ def download_geolytix_retail_points(output_path: Path) -> None:
    """Download the GEOLYTIX ZIP, extract the latest CSV, and write parquet."""
    output_path.parent.mkdir(parents=True, exist_ok=True)

-    with TemporaryDirectory(prefix="geolytix_retail_points_") as tmp:
+    with TemporaryDirectory(
+        prefix="geolytix_retail_points_", dir=local_tmp_dir()
+    ) as tmp:
        zip_path = Path(tmp) / "geolytix_retail_points.zip"
        download(GEOLYTIX_RETAIL_POINTS_URL, zip_path, timeout=300)
        df = read_latest_csv(zip_path)
--- a/pipeline/download/noise.py
+++ b/pipeline/download/noise.py
@ -31,6 +31,8 @@ from pyproj import Transformer
 from rasterio.transform import rowcol
 from scipy.ndimage import maximum_filter

+from pipeline.local_temp import local_tmp_dir
+
 # Noise sources:
 # (label, column_name, WCS base URL, coverage ID, WCS version, allow_missing_tiles)
 # Road/rail work with WCS 1.0.0; airport requires WCS 2.0.1 and returns 500
@ -437,7 +439,7 @@ def main() -> None:

    result = postcodes.select("postcode")

-    with tempfile.TemporaryDirectory() as tmp:
+    with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as tmp:
        for (
            label,
            col_name,
--- a/pipeline/download/ofsted.py
+++ b/pipeline/download/ofsted.py
@ -3,6 +3,7 @@ import tempfile
 import polars as pl
 from pathlib import Path

+from pipeline.local_temp import local_tmp_dir
 from pipeline.utils import download

 # Management information - state-funded schools - latest inspections (as at 28 Feb 2026)
@ -36,7 +37,7 @@ def main() -> None:
    )
    args = parser.parse_args()

-    with tempfile.TemporaryDirectory() as cache_dir:
+    with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
        csv_path = Path(cache_dir) / "ofsted_latest_inspections.csv"
        download(URL, csv_path, timeout=60)
        convert_to_parquet(csv_path, args.output)
--- a/pipeline/download/os_greenspace.py
+++ b/pipeline/download/os_greenspace.py
@ -25,6 +25,7 @@ from pyproj import Transformer
 from shapely.errors import GEOSException
 from shapely.geometry import shape as to_shapely

+from pipeline.local_temp import local_tmp_dir
 from pipeline.utils.download import download, extract_zip

 logger = logging.getLogger(__name__)
@ -171,7 +172,7 @@ def _read_site_centroids(
 def download_greenspace(output: Path) -> None:
    output.parent.mkdir(parents=True, exist_ok=True)

-    with tempfile.TemporaryDirectory() as cache_dir:
+    with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
        zip_path = Path(cache_dir) / "greenspace.zip"
        extract_dir = Path(cache_dir) / "extracted"

--- a/pipeline/download/pois.py
+++ b/pipeline/download/pois.py
@ -11,6 +11,7 @@ from shapely.geometry import Point
 from shapely.wkb import loads as load_wkb
 from tqdm import tqdm

+from pipeline.local_temp import local_tmp_dir
 from pipeline.utils.england_geometry import (
    ENGLAND_BBOX_EAST,
    ENGLAND_BBOX_NORTH,
@ -184,7 +185,7 @@ def main() -> None:

    england_polygon = load_england_polygon(args.boundary)

-    tmp_dir = Path(mkdtemp(prefix="pois_"))
+    tmp_dir = Path(mkdtemp(prefix="pois_", dir=local_tmp_dir()))
    with tqdm(
        unit=" elements",
        unit_scale=True,
--- a/pipeline/download/postcodes.py
+++ b/pipeline/download/postcodes.py
@ -12,6 +12,7 @@ import tarfile
 import tempfile
 from pathlib import Path

+from pipeline.local_temp import local_tmp_dir
 from pipeline.utils import download

 URL = "https://postcodes-mapit-static.s3.eu-west-2.amazonaws.com/data/gb-postcodes-v5.tar.bz2"
@ -37,7 +38,7 @@ def main() -> None:
    )
    args = parser.parse_args()

-    with tempfile.TemporaryDirectory() as cache_dir:
+    with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
        cache = Path(cache_dir)
        archive_path = cache / "gb-postcodes-v5.tar.bz2"
        extract_dir = cache / "extracted"
--- a/pipeline/download/price_paid.py
+++ b/pipeline/download/price_paid.py
@ -3,6 +3,7 @@ import tempfile
 import polars as pl
 from pathlib import Path

+from pipeline.local_temp import local_tmp_dir
 from pipeline.utils import download

 URL = "http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-complete.csv"
@ -55,7 +56,7 @@ def main() -> None:
    )
    args = parser.parse_args()

-    with tempfile.TemporaryDirectory() as cache_dir:
+    with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
        csv_path = Path(cache_dir) / "price-paid-complete.csv"

        download(URL, csv_path)
--- a/pipeline/download/rental_prices.py
+++ b/pipeline/download/rental_prices.py
@ -13,6 +13,7 @@ from pathlib import Path

 import polars as pl

+from pipeline.local_temp import local_tmp_dir
 from pipeline.utils import download

 URL = "https://www.ons.gov.uk/file?uri=/economy/inflationandpriceindices/datasets/priceindexofprivaterentsukmonthlypricestatistics/25march2026/priceindexofprivaterentsukmonthlypricestatistics.xlsx"
@ -114,7 +115,7 @@ def main() -> None:
    )
    args = parser.parse_args()

-    with tempfile.TemporaryDirectory() as cache_dir:
+    with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
        xlsx_path = Path(cache_dir) / "pipr_monthly.xlsx"
        download(URL, xlsx_path, timeout=120)
        convert_to_parquet(xlsx_path, args.output)
--- a/pipeline/download/transit_network.py
+++ b/pipeline/download/transit_network.py
@ -36,6 +36,8 @@ from pathlib import Path

 from tqdm import tqdm

+from pipeline.local_temp import local_tmp_dir
+
 ENGLAND_PBF_URL = (
    "https://download.geofabrik.de/europe/united-kingdom/england-latest.osm.pbf"
 )
@ -164,7 +166,10 @@ def clean_gtfs(src: Path, dst: Path) -> None:
                    )

                    tmp = tempfile.NamedTemporaryFile(
-                        mode="wb", delete=False, suffix=".txt"
+                        mode="wb",
+                        delete=False,
+                        suffix=".txt",
+                        dir=local_tmp_dir(),
                    )
                    tmp.write(header)

@ -388,7 +393,10 @@ def convert_high_freq_to_frequency_based(
                    trip_id_idx = cols.index("trip_id")

                    tmp = tempfile.NamedTemporaryFile(
-                        mode="wb", delete=False, suffix=".txt"
+                        mode="wb",
+                        delete=False,
+                        suffix=".txt",
+                        dir=local_tmp_dir(),
                    )
                    tmp.write(header)
                    for line in f:
@ -408,7 +416,10 @@ def convert_high_freq_to_frequency_based(
                    trip_id_idx = cols.index("trip_id")

                    tmp = tempfile.NamedTemporaryFile(
-                        mode="wb", delete=False, suffix=".txt"
+                        mode="wb",
+                        delete=False,
+                        suffix=".txt",
+                        dir=local_tmp_dir(),
                    )
                    tmp.write(header)
                    for line in f:
@ -451,8 +462,8 @@ def download_tfl_transxchange(raw_dir: Path) -> Path:


 def download_naptan() -> None:
-    """Download NaPTAN stops to /tmp/Stops.csv (needed by transxchange2gtfs)."""
-    dest = Path("/tmp/Stops.csv")
+    """Download NaPTAN stops to the local temp dir for transxchange2gtfs."""
+    dest = local_tmp_dir() / "Stops.csv"
    if dest.exists():
        print(f"NaPTAN Stops.csv already exists: {dest}")
        return
@ -661,7 +672,10 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
                    )

                    tmp = tempfile.NamedTemporaryFile(
-                        mode="wb", delete=False, suffix=".txt"
+                        mode="wb",
+                        delete=False,
+                        suffix=".txt",
+                        dir=local_tmp_dir(),
                    )
                    tmp.write(header)

@ -718,7 +732,10 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
                    lon_idx = cols.index("stop_lon")

                    tmp = tempfile.NamedTemporaryFile(
-                        mode="wb", delete=False, suffix=".txt"
+                        mode="wb",
+                        delete=False,
+                        suffix=".txt",
+                        dir=local_tmp_dir(),
                    )
                    tmp.write(header)

@ -749,7 +766,10 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
                    rt_idx = cols.index("route_type")

                    tmp = tempfile.NamedTemporaryFile(
-                        mode="wb", delete=False, suffix=".txt"
+                        mode="wb",
+                        delete=False,
+                        suffix=".txt",
+                        dir=local_tmp_dir(),
                    )
                    tmp.write(header)

@ -774,7 +794,10 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
                    trip_id_idx = cols.index("trip_id")

                    tmp = tempfile.NamedTemporaryFile(
-                        mode="wb", delete=False, suffix=".txt"
+                        mode="wb",
+                        delete=False,
+                        suffix=".txt",
+                        dir=local_tmp_dir(),
                    )
                    tmp.write(header)

@ -797,7 +820,10 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
                    end_idx = cols.index("end_date")

                    tmp = tempfile.NamedTemporaryFile(
-                        mode="wb", delete=False, suffix=".txt"
+                        mode="wb",
+                        delete=False,
+                        suffix=".txt",
+                        dir=local_tmp_dir(),
                    )
                    tmp.write(header)

--- a/pipeline/download/transxchange2gtfs_shim.js
+++ b/pipeline/download/transxchange2gtfs_shim.js
@ -15,6 +15,16 @@ if (!pkgDirArg || converterArgs.length < 2) {
 }

 const pkgDir = path.resolve(pkgDirArg);
+const defaultTmpDir = path.resolve(__dirname, "..", "..", ".tmp");
+const localTmpDir =
+  process.env.TMPDIR || process.env.TEMP || process.env.TMP || defaultTmpDir;
+const stopsCsv = path.join(localTmpDir, "Stops.csv");
+const converterTmpPrefix = path.join(localTmpDir, "transxchange2gtfs_");
+const converterTmpPatch =
+  `static TMP = ${JSON.stringify(converterTmpPrefix)}` +
+  ` + process.pid + ${JSON.stringify(path.sep)};`;
+
+fs.mkdirSync(localTmpDir, { recursive: true });

 function replaceOnce(relativePath, before, after) {
  const file = path.join(pkgDir, relativePath);
@ -37,6 +47,26 @@ function replaceOnce(relativePath, before, after) {
 // GTFS shapes are optional for R5 routing. Clear shape references and omit
 // shapes.txt so missing route geometry does not drop otherwise usable trips.
 function patchPackage() {
+  replaceOnce(
+    "dist/Container.js",
+    "static TMP = `/tmp/transxchange2gtfs_${process.pid}/`;",
+    converterTmpPatch,
+  );
+  replaceOnce(
+    "dist/Container.js",
+    'fs.existsSync("/tmp/Stops.csv")',
+    `fs.existsSync(${JSON.stringify(stopsCsv)})`,
+  );
+  replaceOnce(
+    "dist/Container.js",
+    'fs.createReadStream("/tmp/Stops.csv", "utf8")',
+    `fs.createReadStream(${JSON.stringify(stopsCsv)}, "utf8")`,
+  );
+  replaceOnce(
+    "dist/converter/GetStopData.js",
+    'fs.createWriteStream("/tmp/Stops.csv")',
+    `fs.createWriteStream(${JSON.stringify(stopsCsv)})`,
+  );
  replaceOnce(
    "dist/transxchange/TransXChangeJourneyStream.js",
    "distanceSoFarM += routeLink.Distance;",
--- a/pipeline/download/uprn_lookup.py
+++ b/pipeline/download/uprn_lookup.py
@ -13,6 +13,7 @@ from pathlib import Path

 import polars as pl

+from pipeline.local_temp import local_tmp_dir
 from pipeline.utils import download, extract_zip

 URL = "https://www.arcgis.com/sharing/rest/content/items/4e0b4b3fbc2540caae27e7be532e61be/data"
@ -62,7 +63,7 @@ def main() -> None:
    )
    args = parser.parse_args()

-    with tempfile.TemporaryDirectory() as cache_dir:
+    with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
        zip_path = Path(cache_dir) / "uprn_lookup.zip"
        extract_path = Path(cache_dir) / "uprn_extracted"

--- a/pipeline/transform/join_epc_pp.py
+++ b/pipeline/transform/join_epc_pp.py
@ -10,6 +10,8 @@ import pyarrow as pa
 import pyarrow.csv as pa_csv
 import pyarrow.parquet as pq

+from pipeline.local_temp import local_tmp_dir
+
 from ..utils import (
    fuzzy_join_on_postcode,
    normalize_address_key,
@ -192,7 +194,9 @@ def main():
    )
    args = parser.parse_args()

-    with tempfile.TemporaryDirectory(prefix="epc_certificates_") as tmpdir:
+    with tempfile.TemporaryDirectory(
+        prefix="epc_certificates_", dir=local_tmp_dir()
+    ) as tmpdir:
        _run(args.epc, args.price_paid, args.output, Path(tmpdir))


--- a/pipeline/transform/postcode_boundaries/uprn.py
+++ b/pipeline/transform/postcode_boundaries/uprn.py
@ -3,6 +3,8 @@ from pathlib import Path
 import numpy as np
 import polars as pl

+from pipeline.local_temp import local_tmp_dir
+
 from .memory import release_memory


@ -17,7 +19,9 @@ def load_uprns(uprn_path: Path) -> tuple[pl.DataFrame, dict[str, tuple[int, int]
    print("Loading UPRN lookup...")

    # Sort via streaming sink to avoid polars doubling memory during in-memory sort
-    with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
+    with tempfile.NamedTemporaryFile(
+        suffix=".parquet", delete=False, dir=local_tmp_dir()
+    ) as tmp:
        tmp_path = Path(tmp.name)
    (
        pl.scan_parquet(uprn_path)
--- a/pipeline/transform/test_transform_poi.py
+++ b/pipeline/transform/test_transform_poi.py
@ -79,6 +79,39 @@ def test_transform_grocery_retail_points_keeps_fascia_icon_category():
    ]


+def test_transform_grocery_retail_points_merges_cooperative_societies():
+    raw = pl.DataFrame(
+        {
+            "id": [101, 102, 103],
+            "retailer": [
+                "Central England Co-operative",
+                "Lincolnshire Co-operative",
+                "The Southern Co-operative",
+            ],
+            "fascia": [
+                "Central England Co-operative",
+                "The Co-operative Food",
+                None,
+            ],
+            "store_name": [
+                "Central Co-op Test",
+                "Lincolnshire Co-op Test",
+                "Southern Co-op Test",
+            ],
+            "long_wgs": [-0.141, -0.142, -0.143],
+            "lat_wgs": [51.515, 51.516, 51.517],
+        }
+    )
+
+    pois = transform_grocery_retail_points(raw, min_chain_locations=1)
+
+    assert pois.select("category", "icon_category").to_dicts() == [
+        {"category": "Co-op", "icon_category": "Co-op"},
+        {"category": "Co-op", "icon_category": "Co-op"},
+        {"category": "Co-op", "icon_category": "Co-op"},
+    ]
+
+
 def test_transform_grocery_retail_points_accepts_base_fascias():
    raw = pl.DataFrame(
        {
--- a/pipeline/transform/transform_poi.py
+++ b/pipeline/transform/transform_poi.py
@ -623,6 +623,7 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
            "shop/outpost",
            "shop/pawnbroker",
            "shop/photo",
+            "shop/photo_studio",
            "shop/plant_hire",
            "shop/printer_ink",
            "shop/printing",
@ -843,6 +844,7 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
        [
            "healthcare/physiotherapist",
            "healthcare/podiatrist",
+            "healthcare/occupational_therapist",
        ],
    ),
    (
@ -1171,7 +1173,6 @@ GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES: dict[str, str] = {
    "Heron": "Heron Foods",
    "Marks and Spencer": "M&S",
    "Sainsburys": "Sainsbury's",
-    "The Co-operative Group": "Co-op",
 }


@ -1238,6 +1239,8 @@ def normalize_grocery_retailer(retailer: str | None) -> str:
    if retailer is None:
        return ""
    retailer = retailer.strip()
+    if retailer in COOP_RETAILERS:
+        return "Co-op"
    return GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES.get(retailer, retailer)


--- a/pipeline/utils/fuzzy_join.py
+++ b/pipeline/utils/fuzzy_join.py
@ -9,6 +9,8 @@ import polars as pl
 from thefuzz import fuzz
 from tqdm import tqdm

+from pipeline.local_temp import local_tmp_dir
+
 _NUMBER_RE = re.compile(r"\d+")
 _POSTCODE_RE = r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$"
 MIN_FUZZY_SCORE = 60
@ -57,7 +59,7 @@ def fuzzy_join_on_postcode(
    have null right columns.
    """

-    tmpdir = tempfile.mkdtemp(prefix="fuzzy_join_")
+    tmpdir = tempfile.mkdtemp(prefix="fuzzy_join_", dir=local_tmp_dir())
    left_path = Path(tmpdir) / "left.parquet"
    right_path = Path(tmpdir) / "right.parquet"