# Taskfile (go-task, schema v3) for the UK property-data pipeline.
#
# Layout:
#   download:*   fetch a raw dataset into DATA_DIR
#   prompt:*     fail with instructions for datasets that need manual steps
#   transform:*  derive intermediate parquet files from downloaded data
#   prepare      merge everything into the wide dataframe
#
# Most tasks declare a `status` check on their output path, so a task is
# skipped when that output already exists.
version: '3'

vars:
  # Root directory for every downloaded and derived artefact.
  DATA_DIR: /bulk/property-data
  TILES_OUTPUT: "{{.DATA_DIR}}/uk.pmtiles"
  ARCGIS_OUTPUT: "{{.DATA_DIR}}/arcgis_data.parquet"
  PRICE_PAID_OUTPUT: "{{.DATA_DIR}}/price-paid-complete.parquet"
  IOD_OUTPUT: "{{.DATA_DIR}}/IoD2025_Scores.parquet"
  POIS_RAW_OUTPUT: "{{.DATA_DIR}}/uk_pois.parquet"
  POIS_FILTERED_OUTPUT: "{{.DATA_DIR}}/filtered_uk_pois.parquet"
  POI_PROXIMITY_OUTPUT: "{{.DATA_DIR}}/poi_proximity.parquet"
  EPC_PP_OUTPUT: "{{.DATA_DIR}}/epc_pp.parquet"
  WIDE_OUTPUT: "{{.DATA_DIR}}/wide.parquet"
  EPC: "{{.DATA_DIR}}/certificates.csv"
  JOURNEY_TIMES_BANK: "{{.DATA_DIR}}/journey_times_bank.parquet"
  JOURNEY_TIMES_FITZROVIA: "{{.DATA_DIR}}/journey_times_fitzrovia.parquet"
  ETHNICITY_OUTPUT: "{{.DATA_DIR}}/ethnicity_by_la.parquet"
  CRIME_DIR: "{{.DATA_DIR}}/crime"
  CRIME_OUTPUT: "{{.DATA_DIR}}/crime_by_lsoa.parquet"
  NOISE_OUTPUT: "{{.DATA_DIR}}/road_noise.parquet"
  OFSTED_OUTPUT: "{{.DATA_DIR}}/ofsted.parquet"
  NAPTAN_OUTPUT: "{{.DATA_DIR}}/naptan.parquet"
  BROADBAND_OUTPUT: "{{.DATA_DIR}}/broadband.parquet"
  SCHOOL_PROXIMITY_OUTPUT: "{{.DATA_DIR}}/school_proximity.parquet"
  POSTCODES_OUTPUT: "{{.DATA_DIR}}/postcodes"
  GEOSURE_OUTPUT: "{{.DATA_DIR}}/geosure"
  GEOSURE_PARQUET: "{{.DATA_DIR}}/geosure.parquet"
  INSPIRE_OUTPUT: "{{.DATA_DIR}}/inspire"
  OA_BOUNDARIES_OUTPUT: "{{.DATA_DIR}}/oa_boundaries.gpkg"
  UPRN_LOOKUP_OUTPUT: "{{.DATA_DIR}}/uprn_lookup.parquet"
  POSTCODE_BOUNDARIES_OUTPUT: "{{.DATA_DIR}}/new_postcode_boundaries"

tasks:
  download:tiles:
    desc: Download UK map tiles (PMTiles format from Protomaps)
    vars:
      PMTILES_VERSION: "1.22.3"
      PMTILES_BIN: "{{.DATA_DIR}}/pmtiles"
    status:
      - test -f {{.TILES_OUTPUT}}
    cmds:
      - |
        # Abort on the first failing command (including inside the pipeline)
        # so a failed CLI download cannot fall through to chmod/extract.
        set -e
        set -o pipefail

        echo "Downloading UK PMTiles (~1.5GB)..."
        echo "This extracts UK tiles from the Protomaps planet file."
        echo ""

        # Download pmtiles CLI if not present
        if [ ! -f "{{.PMTILES_BIN}}" ]; then
          echo "Downloading pmtiles CLI v{{.PMTILES_VERSION}}..."
          # -f makes curl fail on HTTP errors instead of piping an error
          # page into tar.
          curl -fsSL \
            "https://github.com/protomaps/go-pmtiles/releases/download/v{{.PMTILES_VERSION}}/go-pmtiles_{{.PMTILES_VERSION}}_Linux_x86_64.tar.gz" \
            | tar -xz -C "{{.DATA_DIR}}" pmtiles
          chmod +x "{{.PMTILES_BIN}}"
        fi

        # Extract UK region (bbox: -10.5,49.5,2.5,61)
        # Using a recent daily build from Protomaps
        "{{.PMTILES_BIN}}" extract https://build.protomaps.com/20260201.pmtiles {{.TILES_OUTPUT}} --bbox=-10.5,49.5,2.5,61

  prompt:epc:
    desc: Prompt user to download EPC dataset (requires registration)
    status:
      - test -f {{.EPC}}
    cmds:
      # Always fails: the file cannot be fetched automatically, so stop
      # dependent tasks and tell the user how to obtain it.
      - |
        echo ""
        echo "=== EPC dataset not found ==="
        echo "The EPC certificates file is required: {{.EPC}}"
        echo ""
        echo "To obtain it, register at https://epc.opendatacommunities.org/login"
        echo ""
        exit 1

  prompt:journey-times:
    desc: Download TFL journey times if missing (requires API key registration)
    deps:
      - download:arcgis
    # Both files are passed to the merge in `prepare`, so both must exist
    # for this prompt to be skipped (every status entry has to succeed).
    status:
      - test -f {{.JOURNEY_TIMES_BANK}}
      - test -f {{.JOURNEY_TIMES_FITZROVIA}}
    cmds:
      - |
        echo ""
        echo "=== TFL journey times not found ==="
        echo "Register for a TFL API key at https://api-portal.tfl.gov.uk/signin"
        echo "Then set the TFL_API_KEY environment variable and run:"
        echo "  task download:journey-times -- bank"
        echo "  task download:journey-times -- fitzrovia"
        echo ""
        exit 1

  download:arcgis:
    desc: Download and convert ArcGIS postcode data
    status:
      - test -f {{.ARCGIS_OUTPUT}}
    cmds:
      - uv run python -m pipeline.download.arcgis --output {{.ARCGIS_OUTPUT}}

  download:price-paid:
    desc: Download and convert Land Registry price-paid data
    status:
      - test -f {{.PRICE_PAID_OUTPUT}}
    cmds:
      - uv run python -m pipeline.download.price_paid --output {{.PRICE_PAID_OUTPUT}}

  download:deprivation:
    desc: Download and convert Index of Deprivation data
    status:
      - test -f {{.IOD_OUTPUT}}
    cmds:
      - uv run python -m pipeline.download.deprivation_data --output {{.IOD_OUTPUT}}

  download:ethnicity:
    desc: Download ethnicity by local authority data
    status:
      - test -f {{.ETHNICITY_OUTPUT}}
    cmds:
      - uv run python -m pipeline.download.ethnicity --output {{.ETHNICITY_OUTPUT}}

  download:naptan:
    desc: Download NaPTAN station data
    status:
      - test -f {{.NAPTAN_OUTPUT}}
    cmds:
      - uv run python -m pipeline.download.naptan --output {{.NAPTAN_OUTPUT}}

  download:pois:
    desc: Download and extract POIs from OpenStreetMap
    status:
      - test -f {{.POIS_RAW_OUTPUT}}
    cmds:
      - uv run python -m pipeline.download.pois --output {{.POIS_RAW_OUTPUT}}

  download:ofsted:
    desc: Download Ofsted school inspection outcomes
    status:
      - test -f {{.OFSTED_OUTPUT}}
    cmds:
      - uv run python -m pipeline.download.ofsted --output {{.OFSTED_OUTPUT}}

  download:broadband:
    desc: Download Ofcom broadband performance data
    status:
      - test -f {{.BROADBAND_OUTPUT}}
    cmds:
      - uv run python -m pipeline.download.broadband --output {{.BROADBAND_OUTPUT}}

  download:postcodes:
    desc: Download GB postcodes data from MapIt
    # POSTCODES_OUTPUT has no file extension, like the directory outputs
    # GEOSURE_OUTPUT and INSPIRE_OUTPUT. `test -e` succeeds whether the
    # downloader writes a file or a directory there.
    # NOTE(review): confirm which one pipeline.download.postcodes produces.
    status:
      - test -e {{.POSTCODES_OUTPUT}}
    cmds:
      - uv run python -m pipeline.download.postcodes --output {{.POSTCODES_OUTPUT}}

  download:geosure:
    desc: Download OS GeoSure ground stability data (5km hex grid)
    status:
      - test -d {{.GEOSURE_OUTPUT}}
    cmds:
      - uv run python -m pipeline.download.geosure --output {{.GEOSURE_OUTPUT}}

  download:noise:
    desc: Download Defra noise data (road, rail, airport) sampled at postcode centroids
    deps:
      - download:arcgis
    status:
      - test -f {{.NOISE_OUTPUT}}
    cmds:
      - >-
        uv run python -m pipeline.download.noise
        --arcgis {{.ARCGIS_OUTPUT}}
        --output {{.NOISE_OUTPUT}}

  transform:pois:
    desc: Transform raw POIs to filtered version with friendly names
    deps:
      - download:pois
      - download:naptan
    status:
      - test -f {{.POIS_FILTERED_OUTPUT}}
    cmds:
      - >-
        uv run python -m pipeline.transform.transform_poi
        --input {{.POIS_RAW_OUTPUT}}
        --naptan {{.NAPTAN_OUTPUT}}
        --output {{.POIS_FILTERED_OUTPUT}}

  transform:epc-pp:
    desc: Fuzzy join EPC and Price Paid data
    deps:
      - download:price-paid
      - prompt:epc
    status:
      - test -f {{.EPC_PP_OUTPUT}}
    cmds:
      - >-
        uv run python -m pipeline.transform.join_epc_pp
        --epc {{.EPC}}
        --price-paid {{.PRICE_PAID_OUTPUT}}
        --output {{.EPC_PP_OUTPUT}}

  transform:crime:
    desc: Transform crime CSVs into yearly averages by LSOA
    # No deps: no task in this file populates CRIME_DIR.
    status:
      - test -f {{.CRIME_OUTPUT}}
    cmds:
      - >-
        uv run python -m pipeline.transform.crime
        --input {{.CRIME_DIR}}
        --output {{.CRIME_OUTPUT}}

  transform:poi-proximity:
    desc: Compute POI proximity counts per postcode
    deps:
      - download:arcgis
      - transform:pois
    status:
      - test -f {{.POI_PROXIMITY_OUTPUT}}
    cmds:
      - >-
        uv run python -m pipeline.transform.poi_proximity
        --arcgis {{.ARCGIS_OUTPUT}}
        --pois {{.POIS_FILTERED_OUTPUT}}
        --output {{.POI_PROXIMITY_OUTPUT}}

  transform:school-proximity:
    desc: Compute good+ school proximity counts per postcode
    deps:
      - download:ofsted
      - download:arcgis
    status:
      - test -f {{.SCHOOL_PROXIMITY_OUTPUT}}
    cmds:
      - >-
        uv run python -m pipeline.transform.school_proximity
        --ofsted {{.OFSTED_OUTPUT}}
        --arcgis {{.ARCGIS_OUTPUT}}
        --output {{.SCHOOL_PROXIMITY_OUTPUT}}

  transform:geosure:
    desc: Spatial-join GeoSure ground stability data to postcode centroids
    deps:
      - download:geosure
      - download:arcgis
    status:
      - test -f {{.GEOSURE_PARQUET}}
    cmds:
      - >-
        uv run python -m pipeline.transform.transform_geosure
        --geosure {{.GEOSURE_OUTPUT}}
        --arcgis {{.ARCGIS_OUTPUT}}
        --output {{.GEOSURE_PARQUET}}

  download:inspire:
    desc: Download INSPIRE Index Polygon GML files from HM Land Registry
    status:
      - test -d {{.INSPIRE_OUTPUT}}
    cmds:
      - uv run python -m pipeline.download.inspire --output {{.INSPIRE_OUTPUT}}

  download:oa-boundaries:
    desc: Download Output Areas (2021) boundary polygons (England & Wales)
    status:
      - test -f {{.OA_BOUNDARIES_OUTPUT}}
    cmds:
      - uv run python -m pipeline.download.oa_boundaries --output {{.OA_BOUNDARIES_OUTPUT}}

  download:uprn-lookup:
    desc: Download National Statistics UPRN Lookup and convert to parquet
    status:
      - test -f {{.UPRN_LOOKUP_OUTPUT}}
    cmds:
      - uv run python -m pipeline.download.uprn_lookup --output {{.UPRN_LOOKUP_OUTPUT}}

  transform:postcode-boundaries:
    desc: Generate postcode boundary polygons from OA boundaries + INSPIRE + UPRNs
    deps:
      - download:oa-boundaries
      - download:inspire
      - download:uprn-lookup
    # NOTE(review): unlike the other transform tasks there is no `status`
    # check here, so this task runs every time it is invoked. Confirm
    # whether that is intentional.
    cmds:
      - >-
        uv run python -m pipeline.transform.postcode_boundaries
        --uprn {{.UPRN_LOOKUP_OUTPUT}}
        --oa-boundaries {{.OA_BOUNDARIES_OUTPUT}}
        --inspire {{.INSPIRE_OUTPUT}}
        --output {{.POSTCODE_BOUNDARIES_OUTPUT}}

  download:journey-times:
    desc: "Fetch TfL journey times: task download:journey-times"
    deps:
      - download:arcgis
    # The destination is passed after `--`, e.g.
    #   task download:journey-times -- bank
    preconditions:
      - sh: test -n "{{.CLI_ARGS}}"
        msg: "Destination required, e.g. task download:journey-times -- bank"
    # Check only the file for the requested destination. A glob such as
    # journey_times_*.parquet would skip a new destination as soon as any
    # other destination had been fetched, and makes `test -f` error out
    # once two or more files match.
    # NOTE(review): assumes the script writes
    # journey_times_<destination>.parquet, matching JOURNEY_TIMES_BANK and
    # JOURNEY_TIMES_FITZROVIA above - confirm against pipeline.journey_times.
    status:
      - test -f {{.DATA_DIR}}/journey_times_{{.CLI_ARGS}}.parquet
    cmds:
      - >-
        uv run python -m pipeline.journey_times
        --destination {{.CLI_ARGS}}
        --output-dir {{.DATA_DIR}}
        --postcodes {{.ARCGIS_OUTPUT}}

  prepare:
    desc: Build wide property dataframe with all joins
    # Upstream dependencies are currently disabled, so every input passed
    # to the merge below must already exist. To re-enable, replace `[]`
    # with the entries listed here.
    deps: []
    # - transform:epc-pp
    # - download:arcgis
    # - download:deprivation
    # - download:ethnicity
    # - download:broadband
    # - download:noise
    # - transform:crime
    # - transform:poi-proximity
    # - transform:school-proximity
    # - transform:geosure
    # - prompt:journey-times
    status:
      - test -f {{.WIDE_OUTPUT}}
    cmds:
      - >-
        uv run python -m pipeline.transform.merge
        --epc-pp {{.EPC_PP_OUTPUT}}
        --arcgis {{.ARCGIS_OUTPUT}}
        --iod {{.IOD_OUTPUT}}
        --poi-proximity {{.POI_PROXIMITY_OUTPUT}}
        --journey-times-bank {{.JOURNEY_TIMES_BANK}}
        --journey-times-fitzrovia {{.JOURNEY_TIMES_FITZROVIA}}
        --ethnicity {{.ETHNICITY_OUTPUT}}
        --crime {{.CRIME_OUTPUT}}
        --noise {{.NOISE_OUTPUT}}
        --school-proximity {{.SCHOOL_PROXIMITY_OUTPUT}}
        --broadband {{.BROADBAND_OUTPUT}}
        --geosure {{.GEOSURE_PARQUET}}
        --output {{.WIDE_OUTPUT}}