# Source listing metadata: 298 lines, 10 KiB, YAML
# Taskfile schema version; quoted so it stays a string rather than an integer.
version: '3'
|
|
|
|
vars:
  # Root directory that every other path below is derived from.
  DATA_DIR: '/bulk/property-data'

  # Map tiles.
  TILES_OUTPUT: '{{.DATA_DIR}}/uk.pmtiles'

  # Inputs that are not produced by a download task in this file.
  # EPC is fetched manually (see prompt:epc); CRIME_DIR is read by
  # transform:crime.
  EPC: '{{.DATA_DIR}}/certificates.csv'
  CRIME_DIR: '{{.DATA_DIR}}/crime'

  # Outputs of the download:* tasks.
  ARCGIS_OUTPUT: '{{.DATA_DIR}}/arcgis_data.parquet'
  PRICE_PAID_OUTPUT: '{{.DATA_DIR}}/price-paid-complete.parquet'
  IOD_OUTPUT: '{{.DATA_DIR}}/IoD2025_Scores.parquet'
  ETHNICITY_OUTPUT: '{{.DATA_DIR}}/ethnicity_by_la.parquet'
  NAPTAN_OUTPUT: '{{.DATA_DIR}}/naptan.parquet'
  POIS_RAW_OUTPUT: '{{.DATA_DIR}}/uk_pois.parquet'
  OFSTED_OUTPUT: '{{.DATA_DIR}}/ofsted.parquet'
  BROADBAND_OUTPUT: '{{.DATA_DIR}}/broadband.parquet'
  POSTCODES_OUTPUT: '{{.DATA_DIR}}/postcodes'
  GEOSURE_OUTPUT: '{{.DATA_DIR}}/geosure'
  NOISE_OUTPUT: '{{.DATA_DIR}}/road_noise.parquet'
  INSPIRE_OUTPUT: '{{.DATA_DIR}}/inspire'
  OA_BOUNDARIES_OUTPUT: '{{.DATA_DIR}}/oa_boundaries.gpkg'
  UPRN_LOOKUP_OUTPUT: '{{.DATA_DIR}}/uprn_lookup.parquet'
  JOURNEY_TIMES_BANK: '{{.DATA_DIR}}/journey_times_bank.parquet'
  JOURNEY_TIMES_FITZROVIA: '{{.DATA_DIR}}/journey_times_fitzrovia.parquet'

  # Outputs of the transform:* tasks.
  POIS_FILTERED_OUTPUT: '{{.DATA_DIR}}/filtered_uk_pois.parquet'
  POI_PROXIMITY_OUTPUT: '{{.DATA_DIR}}/poi_proximity.parquet'
  EPC_PP_OUTPUT: '{{.DATA_DIR}}/epc_pp.parquet'
  CRIME_OUTPUT: '{{.DATA_DIR}}/crime_by_lsoa.parquet'
  SCHOOL_PROXIMITY_OUTPUT: '{{.DATA_DIR}}/school_proximity.parquet'
  GEOSURE_PARQUET: '{{.DATA_DIR}}/geosure.parquet'
  POSTCODE_BOUNDARIES_OUTPUT: '{{.DATA_DIR}}/new_postcode_boundaries'

  # Final merged dataset written by the prepare task.
  WIDE_OUTPUT: '{{.DATA_DIR}}/wide.parquet'
|
|
|
|
tasks:
|
|
download:tiles:
  # Extracts a UK-only PMTiles archive from a dated Protomaps planet build,
  # fetching the pmtiles CLI into DATA_DIR first when it is not there yet.
  desc: Download UK map tiles (PMTiles format from Protomaps)
  # Up to date once the finished archive exists. The extract is written to a
  # scratch file and renamed at the end, so an interrupted run cannot leave a
  # truncated archive that satisfies this check.
  status:
    - test -f "{{.TILES_OUTPUT}}"
  vars:
    PMTILES_VERSION: "1.22.3"
    PMTILES_BIN: "{{.DATA_DIR}}/pmtiles"
    # Scratch path for the in-progress extract; keeps the .pmtiles suffix.
    TILES_PARTIAL: "{{.TILES_OUTPUT}}.partial.pmtiles"
  cmds:
    - |
      # Stop at the first failure, including a failed curl inside the pipe.
      set -eo pipefail

      echo "Downloading UK PMTiles (~1.5GB)..."
      echo "This extracts UK tiles from the Protomaps planet file."
      echo ""

      # tar -C and the extract below both need the data directory to exist.
      mkdir -p "{{.DATA_DIR}}"

      # Download pmtiles CLI if not present.
      # NOTE: the release asset is hard-coded to Linux x86_64.
      if [ ! -f "{{.PMTILES_BIN}}" ]; then
        echo "Downloading pmtiles CLI v{{.PMTILES_VERSION}}..."
        # -f makes curl fail on HTTP errors instead of piping an error page to tar.
        curl -fsSL \
          "https://github.com/protomaps/go-pmtiles/releases/download/v{{.PMTILES_VERSION}}/go-pmtiles_{{.PMTILES_VERSION}}_Linux_x86_64.tar.gz" \
          | tar -xz -C "{{.DATA_DIR}}" pmtiles
        chmod +x "{{.PMTILES_BIN}}"
      fi

      # Extract UK region (bbox: -10.5,49.5,2.5,61)
      # Using a recent daily build from Protomaps
      "{{.PMTILES_BIN}}" extract https://build.protomaps.com/20260201.pmtiles "{{.TILES_PARTIAL}}" --bbox=-10.5,49.5,2.5,61

      # Publish the archive only after the extract completed successfully.
      mv "{{.TILES_PARTIAL}}" "{{.TILES_OUTPUT}}"
|
|
|
|
prompt:epc:
  # The EPC file cannot be fetched automatically: when it is missing this
  # task prints where to register and then fails (exit 1), which stops any
  # task that depends on it.
  desc: 'Prompt user to download EPC dataset (requires registration)'
  cmds:
    - |
      echo ""
      echo "=== EPC dataset not found ==="
      echo "The EPC certificates file is required: {{.EPC}}"
      echo ""
      echo "To obtain it, register at https://epc.opendatacommunities.org/login"
      echo ""
      exit 1
  # Nothing to do (and no failure) once the certificates file is present.
  status:
    - 'test -f {{.EPC}}'
|
|
|
|
prompt:journey-times:
  # Does not download anything itself: when a journey-times file is missing
  # it prints how to obtain it and fails (exit 1).
  desc: Prompt user to fetch TfL journey times if missing (requires API key registration)
  # Every status entry must succeed for the task to count as up to date, so
  # BOTH files are required. The prepare task passes both paths to the merge
  # step, so a single file is not enough.
  status:
    - test -f {{.JOURNEY_TIMES_BANK}}
    - test -f {{.JOURNEY_TIMES_FITZROVIA}}
  deps:
    - download:arcgis
  cmds:
    - |
      echo ""
      echo "=== TFL journey times not found ==="
      echo "Register for a TFL API key at https://api-portal.tfl.gov.uk/signin"
      echo "Then set the TFL_API_KEY environment variable and run:"
      echo " task download:journey-times -- bank"
      echo " task download:journey-times -- fitzrovia"
      echo ""
      exit 1
|
|
|
|
download:arcgis:
  # Runs the pipeline.download.arcgis module; skipped once ARCGIS_OUTPUT exists.
  desc: 'Download and convert ArcGIS postcode data'
  cmds:
    - >-
      uv run python -m pipeline.download.arcgis
      --output {{.ARCGIS_OUTPUT}}
  status:
    - 'test -f {{.ARCGIS_OUTPUT}}'
|
|
|
|
download:price-paid:
  # Runs the pipeline.download.price_paid module; skipped once
  # PRICE_PAID_OUTPUT exists.
  desc: 'Download and convert Land Registry price-paid data'
  cmds:
    - >-
      uv run python -m pipeline.download.price_paid
      --output {{.PRICE_PAID_OUTPUT}}
  status:
    - 'test -f {{.PRICE_PAID_OUTPUT}}'
|
|
|
|
download:deprivation:
  # Runs the pipeline.download.deprivation_data module; skipped once
  # IOD_OUTPUT exists.
  desc: 'Download and convert Index of Deprivation data'
  cmds:
    - >-
      uv run python -m pipeline.download.deprivation_data
      --output {{.IOD_OUTPUT}}
  status:
    - 'test -f {{.IOD_OUTPUT}}'
|
|
|
|
download:ethnicity:
  # Runs the pipeline.download.ethnicity module; skipped once
  # ETHNICITY_OUTPUT exists.
  desc: 'Download ethnicity by local authority data'
  cmds:
    - >-
      uv run python -m pipeline.download.ethnicity
      --output {{.ETHNICITY_OUTPUT}}
  status:
    - 'test -f {{.ETHNICITY_OUTPUT}}'
|
|
|
|
download:naptan:
  # Runs the pipeline.download.naptan module; skipped once NAPTAN_OUTPUT exists.
  desc: 'Download NaPTAN station data'
  cmds:
    - >-
      uv run python -m pipeline.download.naptan
      --output {{.NAPTAN_OUTPUT}}
  status:
    - 'test -f {{.NAPTAN_OUTPUT}}'
|
|
|
|
download:pois:
  # Runs the pipeline.download.pois module; skipped once POIS_RAW_OUTPUT exists.
  desc: 'Download and extract POIs from OpenStreetMap'
  cmds:
    - >-
      uv run python -m pipeline.download.pois
      --output {{.POIS_RAW_OUTPUT}}
  status:
    - 'test -f {{.POIS_RAW_OUTPUT}}'
|
|
|
|
download:ofsted:
  # Runs the pipeline.download.ofsted module; skipped once OFSTED_OUTPUT exists.
  desc: 'Download Ofsted school inspection outcomes'
  cmds:
    - >-
      uv run python -m pipeline.download.ofsted
      --output {{.OFSTED_OUTPUT}}
  status:
    - 'test -f {{.OFSTED_OUTPUT}}'
|
|
|
|
download:broadband:
  # Runs the pipeline.download.broadband module; skipped once
  # BROADBAND_OUTPUT exists.
  desc: 'Download Ofcom broadband performance data'
  cmds:
    - >-
      uv run python -m pipeline.download.broadband
      --output {{.BROADBAND_OUTPUT}}
  status:
    - 'test -f {{.BROADBAND_OUTPUT}}'
|
|
|
|
download:postcodes:
  # Runs the pipeline.download.postcodes module, writing to POSTCODES_OUTPUT.
  desc: Download GB postcodes data from MapIt
  # POSTCODES_OUTPUT has no file extension, like the directory outputs of
  # download:geosure and download:inspire. `test -e` accepts either a file or
  # a directory, so the task is not re-run forever if the downloader creates
  # a directory.
  # NOTE(review): confirm which of the two the downloader actually creates.
  status:
    - test -e {{.POSTCODES_OUTPUT}}
  cmds:
    - uv run python -m pipeline.download.postcodes --output {{.POSTCODES_OUTPUT}}
|
|
|
|
download:geosure:
  # Runs the pipeline.download.geosure module; skipped once the
  # GEOSURE_OUTPUT directory exists.
  desc: 'Download OS GeoSure ground stability data (5km hex grid)'
  cmds:
    - >-
      uv run python -m pipeline.download.geosure
      --output {{.GEOSURE_OUTPUT}}
  status:
    - 'test -d {{.GEOSURE_OUTPUT}}'
|
|
|
|
download:noise:
  # Runs the pipeline.download.noise module against the ArcGIS postcode data,
  # which is why download:arcgis is a dependency. Skipped once NOISE_OUTPUT
  # exists.
  desc: 'Download Defra noise data (road, rail, airport) sampled at postcode centroids'
  deps:
    - download:arcgis
  cmds:
    - >-
      uv run python -m pipeline.download.noise
      --arcgis {{.ARCGIS_OUTPUT}}
      --output {{.NOISE_OUTPUT}}
  status:
    - 'test -f {{.NOISE_OUTPUT}}'
|
|
|
|
transform:pois:
  # Reads the raw POI extract and the NaPTAN data (both ensured through deps)
  # and writes POIS_FILTERED_OUTPUT; skipped once that file exists.
  desc: 'Transform raw POIs to filtered version with friendly names'
  deps:
    - download:pois
    - download:naptan
  cmds:
    - >-
      uv run python -m pipeline.transform.transform_poi
      --input {{.POIS_RAW_OUTPUT}}
      --naptan {{.NAPTAN_OUTPUT}}
      --output {{.POIS_FILTERED_OUTPUT}}
  status:
    - 'test -f {{.POIS_FILTERED_OUTPUT}}'
|
|
|
|
transform:epc-pp:
  # Joins the EPC certificates with the price-paid data into EPC_PP_OUTPUT.
  # prompt:epc fails when the manually obtained EPC file is absent, so this
  # task cannot proceed without it. Skipped once EPC_PP_OUTPUT exists.
  desc: 'Fuzzy join EPC and Price Paid data'
  deps:
    - download:price-paid
    - prompt:epc
  cmds:
    - >-
      uv run python -m pipeline.transform.join_epc_pp
      --epc {{.EPC}}
      --price-paid {{.PRICE_PAID_OUTPUT}}
      --output {{.EPC_PP_OUTPUT}}
  status:
    - 'test -f {{.EPC_PP_OUTPUT}}'
|
|
|
|
transform:crime:
  # Reads CRIME_DIR and writes CRIME_OUTPUT; skipped once that file exists.
  # No dependency populates CRIME_DIR, so it has to be in place beforehand.
  desc: 'Transform crime CSVs into yearly averages by LSOA'
  cmds:
    - >-
      uv run python -m pipeline.transform.crime
      --input {{.CRIME_DIR}}
      --output {{.CRIME_OUTPUT}}
  status:
    - 'test -f {{.CRIME_OUTPUT}}'
|
|
|
|
transform:poi-proximity:
  # Combines the ArcGIS postcode data with the filtered POIs (both ensured
  # through deps) into POI_PROXIMITY_OUTPUT; skipped once that file exists.
  desc: 'Compute POI proximity counts per postcode'
  deps:
    - download:arcgis
    - transform:pois
  cmds:
    - >-
      uv run python -m pipeline.transform.poi_proximity
      --arcgis {{.ARCGIS_OUTPUT}}
      --pois {{.POIS_FILTERED_OUTPUT}}
      --output {{.POI_PROXIMITY_OUTPUT}}
  status:
    - 'test -f {{.POI_PROXIMITY_OUTPUT}}'
|
|
|
|
transform:school-proximity:
  # Combines the Ofsted data with the ArcGIS postcode data (both ensured
  # through deps) into SCHOOL_PROXIMITY_OUTPUT; skipped once that file exists.
  desc: 'Compute good+ school proximity counts per postcode'
  deps:
    - download:ofsted
    - download:arcgis
  cmds:
    - >-
      uv run python -m pipeline.transform.school_proximity
      --ofsted {{.OFSTED_OUTPUT}}
      --arcgis {{.ARCGIS_OUTPUT}}
      --output {{.SCHOOL_PROXIMITY_OUTPUT}}
  status:
    - 'test -f {{.SCHOOL_PROXIMITY_OUTPUT}}'
|
|
|
|
transform:geosure:
  # Reads the downloaded GeoSure directory and the ArcGIS postcode data (both
  # ensured through deps) and writes GEOSURE_PARQUET; skipped once it exists.
  desc: 'Spatial-join GeoSure ground stability data to postcode centroids'
  deps:
    - download:geosure
    - download:arcgis
  cmds:
    - >-
      uv run python -m pipeline.transform.transform_geosure
      --geosure {{.GEOSURE_OUTPUT}}
      --arcgis {{.ARCGIS_OUTPUT}}
      --output {{.GEOSURE_PARQUET}}
  status:
    - 'test -f {{.GEOSURE_PARQUET}}'
|
|
|
|
download:inspire:
  # Runs the pipeline.download.inspire module; skipped once the
  # INSPIRE_OUTPUT directory exists.
  desc: 'Download INSPIRE Index Polygon GML files from HM Land Registry'
  cmds:
    - >-
      uv run python -m pipeline.download.inspire
      --output {{.INSPIRE_OUTPUT}}
  status:
    - 'test -d {{.INSPIRE_OUTPUT}}'
|
|
|
|
download:oa-boundaries:
  # Runs the pipeline.download.oa_boundaries module; skipped once
  # OA_BOUNDARIES_OUTPUT exists.
  desc: 'Download Output Areas (2021) boundary polygons (England & Wales)'
  cmds:
    - >-
      uv run python -m pipeline.download.oa_boundaries
      --output {{.OA_BOUNDARIES_OUTPUT}}
  status:
    - 'test -f {{.OA_BOUNDARIES_OUTPUT}}'
|
|
|
|
download:uprn-lookup:
  # Runs the pipeline.download.uprn_lookup module; skipped once
  # UPRN_LOOKUP_OUTPUT exists.
  desc: 'Download National Statistics UPRN Lookup and convert to parquet'
  cmds:
    - >-
      uv run python -m pipeline.download.uprn_lookup
      --output {{.UPRN_LOOKUP_OUTPUT}}
  status:
    - 'test -f {{.UPRN_LOOKUP_OUTPUT}}'
|
|
|
|
transform:postcode-boundaries:
  # Runs pipeline.transform.postcode_boundaries over the UPRN lookup, the OA
  # boundaries and the INSPIRE polygons, all ensured through deps.
  desc: Generate postcode boundary polygons from OA boundaries + INSPIRE + UPRNs
  deps:
    - download:oa-boundaries
    - download:inspire
    - download:uprn-lookup
  # Every other generating task in this file is skipped once its output
  # exists; without a status check this one re-ran on every invocation.
  # `test -e` is used because the output path has no extension, so it may be
  # a file or a directory.
  # NOTE(review): confirm, and remove this check if an unconditional rebuild
  # was intentional.
  status:
    - test -e {{.POSTCODE_BOUNDARIES_OUTPUT}}
  cmds:
    - >-
      uv run python -m pipeline.transform.postcode_boundaries
      --uprn {{.UPRN_LOOKUP_OUTPUT}}
      --oa-boundaries {{.OA_BOUNDARIES_OUTPUT}}
      --inspire {{.INSPIRE_OUTPUT}}
      --output {{.POSTCODE_BOUNDARIES_OUTPUT}}
|
|
|
|
download:journey-times:
  # Fetches journey times for ONE destination, given after `--` on the
  # command line and exposed by Task as CLI_ARGS.
  desc: "Fetch TfL journey times: task download:journey-times -- <destination>"
  deps:
    - download:arcgis
  # Fail with a clear message when no destination was given, instead of
  # invoking the pipeline with an empty --destination.
  preconditions:
    - sh: test -n "{{.CLI_ARGS}}"
      msg: "No destination given. Usage: task download:journey-times -- bank"
  # Check the file for the requested destination only. The previous glob
  # (journey_times_*.parquet) wrongly skipped a second destination once any
  # file existed, and broke `test -f` when two files matched.
  # Assumes the pipeline writes journey_times_<destination>.parquet into
  # --output-dir, as JOURNEY_TIMES_BANK / JOURNEY_TIMES_FITZROVIA suggest.
  # TODO confirm; if the name differs the task merely re-runs every time.
  status:
    - test -f "{{.DATA_DIR}}/journey_times_{{.CLI_ARGS}}.parquet"
  cmds:
    - >-
      uv run python -m pipeline.journey_times
      --destination {{.CLI_ARGS}}
      --output-dir {{.DATA_DIR}}
      --postcodes {{.ARCGIS_OUTPUT}}
|
|
|
|
prepare:
  # Merges every intermediate dataset into WIDE_OUTPUT via
  # pipeline.transform.merge; skipped once that file exists.
  desc: 'Build wide property dataframe with all joins'
  # All dependencies are currently disabled, so nothing is built
  # automatically: every input passed to the merge below must already exist.
  # Disabled entries, kept for reference:
  #   - transform:epc-pp
  #   - download:arcgis
  #   - download:deprivation
  #   - download:ethnicity
  #   - download:broadband
  #   - download:noise
  #   - transform:crime
  #   - transform:poi-proximity
  #   - transform:school-proximity
  #   - transform:geosure
  #   - prompt:journey-times
  deps: []
  cmds:
    - >-
      uv run python -m pipeline.transform.merge
      --epc-pp {{.EPC_PP_OUTPUT}}
      --arcgis {{.ARCGIS_OUTPUT}}
      --iod {{.IOD_OUTPUT}}
      --poi-proximity {{.POI_PROXIMITY_OUTPUT}}
      --journey-times-bank {{.JOURNEY_TIMES_BANK}}
      --journey-times-fitzrovia {{.JOURNEY_TIMES_FITZROVIA}}
      --ethnicity {{.ETHNICITY_OUTPUT}}
      --crime {{.CRIME_OUTPUT}}
      --noise {{.NOISE_OUTPUT}}
      --school-proximity {{.SCHOOL_PROXIMITY_OUTPUT}}
      --broadband {{.BROADBAND_OUTPUT}}
      --geosure {{.GEOSURE_PARQUET}}
      --output {{.WIDE_OUTPUT}}
  status:
    - 'test -f {{.WIDE_OUTPUT}}'
|