All good
Some checks failed
CI / Check (push) Has been cancelled
Build and publish Docker image / build-and-push (push) Has been cancelled

This commit is contained in:
Andras Schmelczer 2026-05-18 21:20:10 +01:00
parent 6ea544a0f6
commit 6cc7288126
45 changed files with 929 additions and 1043 deletions

View file

@ -0,0 +1,3 @@
from .local_temp import configure_tempfile_defaults
configure_tempfile_defaults()

View file

@ -3,6 +3,7 @@ import tempfile
import polars as pl
from pathlib import Path
from pipeline.local_temp import local_tmp_dir
from pipeline.utils import download, extract_zip
URL = "https://www.arcgis.com/sharing/rest/content/items/36b718ad00de49afb9ad364f8b815b9e/data"
@ -40,7 +41,7 @@ def main() -> None:
)
args = parser.parse_args()
with tempfile.TemporaryDirectory() as cache_dir:
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
download_path = Path(cache_dir) / "arcgis_data.zip"
extract_path = Path(cache_dir) / "arcgis_extracted"

View file

@ -7,6 +7,7 @@ from pathlib import Path
import httpx
from pipeline.local_temp import local_tmp_dir
from pipeline.utils import download, extract_zip
# Ofcom Connected Nations 2025 - Fixed broadband performance (output area & local authority level)
@ -84,7 +85,7 @@ def main() -> None:
)
args = parser.parse_args()
with tempfile.TemporaryDirectory() as cache_dir:
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
cache = Path(cache_dir)
zip_path = cache / "broadband_performance.zip"
extract_dir = cache / "extracted"

View file

@ -3,6 +3,7 @@ import tempfile
import polars as pl
from pathlib import Path
from pipeline.local_temp import local_tmp_dir
from pipeline.utils import download
URL = "https://assets.publishing.service.gov.uk/media/691ded34513046b952c500bd/File_5_IoD2025_Scores_for_the_Indices_of_Deprivation.xlsx"
@ -33,7 +34,7 @@ def main() -> None:
)
args = parser.parse_args()
with tempfile.TemporaryDirectory() as cache_dir:
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
xlsx_path = Path(cache_dir) / "IoD2025_Scores.xlsx"
download(URL, xlsx_path, timeout=60)
convert_to_parquet(xlsx_path, args.output)

View file

@ -8,6 +8,7 @@ from zipfile import ZipFile
import polars as pl
from pipeline.local_temp import local_tmp_dir
from pipeline.utils.download import download
@ -70,7 +71,9 @@ def download_geolytix_retail_points(output_path: Path) -> None:
"""Download the GEOLYTIX ZIP, extract the latest CSV, and write parquet."""
output_path.parent.mkdir(parents=True, exist_ok=True)
with TemporaryDirectory(prefix="geolytix_retail_points_") as tmp:
with TemporaryDirectory(
prefix="geolytix_retail_points_", dir=local_tmp_dir()
) as tmp:
zip_path = Path(tmp) / "geolytix_retail_points.zip"
download(GEOLYTIX_RETAIL_POINTS_URL, zip_path, timeout=300)
df = read_latest_csv(zip_path)

View file

@ -31,6 +31,8 @@ from pyproj import Transformer
from rasterio.transform import rowcol
from scipy.ndimage import maximum_filter
from pipeline.local_temp import local_tmp_dir
# Noise sources:
# (label, column_name, WCS base URL, coverage ID, WCS version, allow_missing_tiles)
# Road/rail work with WCS 1.0.0; airport requires WCS 2.0.1 and returns 500
@ -437,7 +439,7 @@ def main() -> None:
result = postcodes.select("postcode")
with tempfile.TemporaryDirectory() as tmp:
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as tmp:
for (
label,
col_name,

View file

@ -3,6 +3,7 @@ import tempfile
import polars as pl
from pathlib import Path
from pipeline.local_temp import local_tmp_dir
from pipeline.utils import download
# Management information - state-funded schools - latest inspections (as at 28 Feb 2026)
@ -36,7 +37,7 @@ def main() -> None:
)
args = parser.parse_args()
with tempfile.TemporaryDirectory() as cache_dir:
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
csv_path = Path(cache_dir) / "ofsted_latest_inspections.csv"
download(URL, csv_path, timeout=60)
convert_to_parquet(csv_path, args.output)

View file

@ -25,6 +25,7 @@ from pyproj import Transformer
from shapely.errors import GEOSException
from shapely.geometry import shape as to_shapely
from pipeline.local_temp import local_tmp_dir
from pipeline.utils.download import download, extract_zip
logger = logging.getLogger(__name__)
@ -171,7 +172,7 @@ def _read_site_centroids(
def download_greenspace(output: Path) -> None:
output.parent.mkdir(parents=True, exist_ok=True)
with tempfile.TemporaryDirectory() as cache_dir:
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
zip_path = Path(cache_dir) / "greenspace.zip"
extract_dir = Path(cache_dir) / "extracted"

View file

@ -11,6 +11,7 @@ from shapely.geometry import Point
from shapely.wkb import loads as load_wkb
from tqdm import tqdm
from pipeline.local_temp import local_tmp_dir
from pipeline.utils.england_geometry import (
ENGLAND_BBOX_EAST,
ENGLAND_BBOX_NORTH,
@ -184,7 +185,7 @@ def main() -> None:
england_polygon = load_england_polygon(args.boundary)
tmp_dir = Path(mkdtemp(prefix="pois_"))
tmp_dir = Path(mkdtemp(prefix="pois_", dir=local_tmp_dir()))
with tqdm(
unit=" elements",
unit_scale=True,

View file

@ -12,6 +12,7 @@ import tarfile
import tempfile
from pathlib import Path
from pipeline.local_temp import local_tmp_dir
from pipeline.utils import download
URL = "https://postcodes-mapit-static.s3.eu-west-2.amazonaws.com/data/gb-postcodes-v5.tar.bz2"
@ -37,7 +38,7 @@ def main() -> None:
)
args = parser.parse_args()
with tempfile.TemporaryDirectory() as cache_dir:
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
cache = Path(cache_dir)
archive_path = cache / "gb-postcodes-v5.tar.bz2"
extract_dir = cache / "extracted"

View file

@ -3,6 +3,7 @@ import tempfile
import polars as pl
from pathlib import Path
from pipeline.local_temp import local_tmp_dir
from pipeline.utils import download
URL = "http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-complete.csv"
@ -55,7 +56,7 @@ def main() -> None:
)
args = parser.parse_args()
with tempfile.TemporaryDirectory() as cache_dir:
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
csv_path = Path(cache_dir) / "price-paid-complete.csv"
download(URL, csv_path)

View file

@ -13,6 +13,7 @@ from pathlib import Path
import polars as pl
from pipeline.local_temp import local_tmp_dir
from pipeline.utils import download
URL = "https://www.ons.gov.uk/file?uri=/economy/inflationandpriceindices/datasets/priceindexofprivaterentsukmonthlypricestatistics/25march2026/priceindexofprivaterentsukmonthlypricestatistics.xlsx"
@ -114,7 +115,7 @@ def main() -> None:
)
args = parser.parse_args()
with tempfile.TemporaryDirectory() as cache_dir:
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
xlsx_path = Path(cache_dir) / "pipr_monthly.xlsx"
download(URL, xlsx_path, timeout=120)
convert_to_parquet(xlsx_path, args.output)

View file

@ -36,6 +36,8 @@ from pathlib import Path
from tqdm import tqdm
from pipeline.local_temp import local_tmp_dir
ENGLAND_PBF_URL = (
"https://download.geofabrik.de/europe/united-kingdom/england-latest.osm.pbf"
)
@ -164,7 +166,10 @@ def clean_gtfs(src: Path, dst: Path) -> None:
)
tmp = tempfile.NamedTemporaryFile(
mode="wb", delete=False, suffix=".txt"
mode="wb",
delete=False,
suffix=".txt",
dir=local_tmp_dir(),
)
tmp.write(header)
@ -388,7 +393,10 @@ def convert_high_freq_to_frequency_based(
trip_id_idx = cols.index("trip_id")
tmp = tempfile.NamedTemporaryFile(
mode="wb", delete=False, suffix=".txt"
mode="wb",
delete=False,
suffix=".txt",
dir=local_tmp_dir(),
)
tmp.write(header)
for line in f:
@ -408,7 +416,10 @@ def convert_high_freq_to_frequency_based(
trip_id_idx = cols.index("trip_id")
tmp = tempfile.NamedTemporaryFile(
mode="wb", delete=False, suffix=".txt"
mode="wb",
delete=False,
suffix=".txt",
dir=local_tmp_dir(),
)
tmp.write(header)
for line in f:
@ -451,8 +462,8 @@ def download_tfl_transxchange(raw_dir: Path) -> Path:
def download_naptan() -> None:
"""Download NaPTAN stops to /tmp/Stops.csv (needed by transxchange2gtfs)."""
dest = Path("/tmp/Stops.csv")
"""Download NaPTAN stops to the local temp dir for transxchange2gtfs."""
dest = local_tmp_dir() / "Stops.csv"
if dest.exists():
print(f"NaPTAN Stops.csv already exists: {dest}")
return
@ -661,7 +672,10 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
)
tmp = tempfile.NamedTemporaryFile(
mode="wb", delete=False, suffix=".txt"
mode="wb",
delete=False,
suffix=".txt",
dir=local_tmp_dir(),
)
tmp.write(header)
@ -718,7 +732,10 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
lon_idx = cols.index("stop_lon")
tmp = tempfile.NamedTemporaryFile(
mode="wb", delete=False, suffix=".txt"
mode="wb",
delete=False,
suffix=".txt",
dir=local_tmp_dir(),
)
tmp.write(header)
@ -749,7 +766,10 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
rt_idx = cols.index("route_type")
tmp = tempfile.NamedTemporaryFile(
mode="wb", delete=False, suffix=".txt"
mode="wb",
delete=False,
suffix=".txt",
dir=local_tmp_dir(),
)
tmp.write(header)
@ -774,7 +794,10 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
trip_id_idx = cols.index("trip_id")
tmp = tempfile.NamedTemporaryFile(
mode="wb", delete=False, suffix=".txt"
mode="wb",
delete=False,
suffix=".txt",
dir=local_tmp_dir(),
)
tmp.write(header)
@ -797,7 +820,10 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
end_idx = cols.index("end_date")
tmp = tempfile.NamedTemporaryFile(
mode="wb", delete=False, suffix=".txt"
mode="wb",
delete=False,
suffix=".txt",
dir=local_tmp_dir(),
)
tmp.write(header)

View file

@ -15,6 +15,16 @@ if (!pkgDirArg || converterArgs.length < 2) {
}
const pkgDir = path.resolve(pkgDirArg);
const defaultTmpDir = path.resolve(__dirname, "..", "..", ".tmp");
const localTmpDir =
process.env.TMPDIR || process.env.TEMP || process.env.TMP || defaultTmpDir;
const stopsCsv = path.join(localTmpDir, "Stops.csv");
const converterTmpPrefix = path.join(localTmpDir, "transxchange2gtfs_");
const converterTmpPatch =
`static TMP = ${JSON.stringify(converterTmpPrefix)}` +
` + process.pid + ${JSON.stringify(path.sep)};`;
fs.mkdirSync(localTmpDir, { recursive: true });
function replaceOnce(relativePath, before, after) {
const file = path.join(pkgDir, relativePath);
@ -37,6 +47,26 @@ function replaceOnce(relativePath, before, after) {
// GTFS shapes are optional for R5 routing. Clear shape references and omit
// shapes.txt so missing route geometry does not drop otherwise usable trips.
function patchPackage() {
replaceOnce(
"dist/Container.js",
"static TMP = `/tmp/transxchange2gtfs_${process.pid}/`;",
converterTmpPatch,
);
replaceOnce(
"dist/Container.js",
'fs.existsSync("/tmp/Stops.csv")',
`fs.existsSync(${JSON.stringify(stopsCsv)})`,
);
replaceOnce(
"dist/Container.js",
'fs.createReadStream("/tmp/Stops.csv", "utf8")',
`fs.createReadStream(${JSON.stringify(stopsCsv)}, "utf8")`,
);
replaceOnce(
"dist/converter/GetStopData.js",
'fs.createWriteStream("/tmp/Stops.csv")',
`fs.createWriteStream(${JSON.stringify(stopsCsv)})`,
);
replaceOnce(
"dist/transxchange/TransXChangeJourneyStream.js",
"distanceSoFarM += routeLink.Distance;",

View file

@ -13,6 +13,7 @@ from pathlib import Path
import polars as pl
from pipeline.local_temp import local_tmp_dir
from pipeline.utils import download, extract_zip
URL = "https://www.arcgis.com/sharing/rest/content/items/4e0b4b3fbc2540caae27e7be532e61be/data"
@ -62,7 +63,7 @@ def main() -> None:
)
args = parser.parse_args()
with tempfile.TemporaryDirectory() as cache_dir:
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
zip_path = Path(cache_dir) / "uprn_lookup.zip"
extract_path = Path(cache_dir) / "uprn_extracted"

View file

@ -10,6 +10,8 @@ import pyarrow as pa
import pyarrow.csv as pa_csv
import pyarrow.parquet as pq
from pipeline.local_temp import local_tmp_dir
from ..utils import (
fuzzy_join_on_postcode,
normalize_address_key,
@ -192,7 +194,9 @@ def main():
)
args = parser.parse_args()
with tempfile.TemporaryDirectory(prefix="epc_certificates_") as tmpdir:
with tempfile.TemporaryDirectory(
prefix="epc_certificates_", dir=local_tmp_dir()
) as tmpdir:
_run(args.epc, args.price_paid, args.output, Path(tmpdir))

View file

@ -3,6 +3,8 @@ from pathlib import Path
import numpy as np
import polars as pl
from pipeline.local_temp import local_tmp_dir
from .memory import release_memory
@ -17,7 +19,9 @@ def load_uprns(uprn_path: Path) -> tuple[pl.DataFrame, dict[str, tuple[int, int]
print("Loading UPRN lookup...")
# Sort via streaming sink to avoid polars doubling memory during in-memory sort
with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
with tempfile.NamedTemporaryFile(
suffix=".parquet", delete=False, dir=local_tmp_dir()
) as tmp:
tmp_path = Path(tmp.name)
(
pl.scan_parquet(uprn_path)

View file

@ -79,6 +79,39 @@ def test_transform_grocery_retail_points_keeps_fascia_icon_category():
]
def test_transform_grocery_retail_points_merges_cooperative_societies():
raw = pl.DataFrame(
{
"id": [101, 102, 103],
"retailer": [
"Central England Co-operative",
"Lincolnshire Co-operative",
"The Southern Co-operative",
],
"fascia": [
"Central England Co-operative",
"The Co-operative Food",
None,
],
"store_name": [
"Central Co-op Test",
"Lincolnshire Co-op Test",
"Southern Co-op Test",
],
"long_wgs": [-0.141, -0.142, -0.143],
"lat_wgs": [51.515, 51.516, 51.517],
}
)
pois = transform_grocery_retail_points(raw, min_chain_locations=1)
assert pois.select("category", "icon_category").to_dicts() == [
{"category": "Co-op", "icon_category": "Co-op"},
{"category": "Co-op", "icon_category": "Co-op"},
{"category": "Co-op", "icon_category": "Co-op"},
]
def test_transform_grocery_retail_points_accepts_base_fascias():
raw = pl.DataFrame(
{

View file

@ -623,6 +623,7 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"shop/outpost",
"shop/pawnbroker",
"shop/photo",
"shop/photo_studio",
"shop/plant_hire",
"shop/printer_ink",
"shop/printing",
@ -843,6 +844,7 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
[
"healthcare/physiotherapist",
"healthcare/podiatrist",
"healthcare/occupational_therapist",
],
),
(
@ -1171,7 +1173,6 @@ GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES: dict[str, str] = {
"Heron": "Heron Foods",
"Marks and Spencer": "M&S",
"Sainsburys": "Sainsbury's",
"The Co-operative Group": "Co-op",
}
@ -1238,6 +1239,8 @@ def normalize_grocery_retailer(retailer: str | None) -> str:
if retailer is None:
return ""
retailer = retailer.strip()
if retailer in COOP_RETAILERS:
return "Co-op"
return GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES.get(retailer, retailer)

View file

@ -9,6 +9,8 @@ import polars as pl
from thefuzz import fuzz
from tqdm import tqdm
from pipeline.local_temp import local_tmp_dir
_NUMBER_RE = re.compile(r"\d+")
_POSTCODE_RE = r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$"
MIN_FUZZY_SCORE = 60
@ -57,7 +59,7 @@ def fuzzy_join_on_postcode(
have null right columns.
"""
tmpdir = tempfile.mkdtemp(prefix="fuzzy_join_")
tmpdir = tempfile.mkdtemp(prefix="fuzzy_join_", dir=local_tmp_dir())
left_path = Path(tmpdir) / "left.parquet"
right_path = Path(tmpdir) / "right.parquet"