# perfect-postcode/Makefile.data (235 lines, 9.4 KiB, text)

# Data pipeline — download sources and build wide.parquet
#
# Usage:
# make -f Makefile.data prepare # Build wide.parquet (+ all deps)
# make -f Makefile.data tiles # Download UK map tiles
# make -f Makefile.data download-pois # Download a single dataset
# make -f Makefile.data help # List all targets
#
# Or include from the main Makefile and use targets directly.
# Recipes use bash (not /bin/sh).
SHELL := /bin/bash

# Remove a target whose recipe failed, so a half-written file never looks up to date.
.DELETE_ON_ERROR:

# Root directory that holds every downloaded and derived dataset.
DATA_DIR := /bulk/property-data

# ── Output files ──────────────────────────────────────────────────────────────
TILES         := $(DATA_DIR)/uk.pmtiles
ARCGIS        := $(DATA_DIR)/arcgis_data.parquet
PRICE_PAID    := $(DATA_DIR)/price-paid-complete.parquet
IOD           := $(DATA_DIR)/IoD2025_Scores.parquet
POIS_RAW      := $(DATA_DIR)/uk_pois.parquet
POIS_FILTERED := $(DATA_DIR)/filtered_uk_pois.parquet
POI_PROXIMITY := $(DATA_DIR)/poi_proximity.parquet
EPC_PP        := $(DATA_DIR)/epc_pp.parquet
WIDE          := $(DATA_DIR)/wide.parquet
EPC           := $(DATA_DIR)/certificates.csv
JT_BANK       := $(DATA_DIR)/journey_times_bank.parquet
JT_FITZROVIA  := $(DATA_DIR)/journey_times_fitzrovia.parquet
ETHNICITY     := $(DATA_DIR)/ethnicity_by_la.parquet
CRIME_DIR     := $(DATA_DIR)/crime
CRIME         := $(DATA_DIR)/crime_by_lsoa.parquet
NOISE         := $(DATA_DIR)/road_noise.parquet
OFSTED        := $(DATA_DIR)/ofsted.parquet
NAPTAN        := $(DATA_DIR)/naptan.parquet
BROADBAND     := $(DATA_DIR)/broadband.parquet
SCHOOL_PROX   := $(DATA_DIR)/school_proximity.parquet
GEOSURE_DIR   := $(DATA_DIR)/geosure
GEOSURE       := $(DATA_DIR)/geosure.parquet
INSPIRE_DIR   := $(DATA_DIR)/inspire
OA_BOUNDARIES := $(DATA_DIR)/oa_boundaries.gpkg
UPRN_LOOKUP   := $(DATA_DIR)/uprn_lookup.parquet
PC_BOUNDARIES := $(DATA_DIR)/new_postcode_boundaries

# POSTCODES is used by the download-postcodes alias and by the $(POSTCODES)
# rule, but had no definition: both expanded to nothing, so download-postcodes
# was a no-op. `?=` keeps any value already set by an including Makefile.
# NOTE(review): the file name is an assumption — confirm against
# pipeline.download.postcodes and its consumers.
POSTCODES     ?= $(DATA_DIR)/postcodes.parquet

# Sentinel files for directory targets (Make doesn't track directories well)
GEOSURE_STAMP := $(GEOSURE_DIR)/.done
INSPIRE_STAMP := $(INSPIRE_DIR)/.done

# Hand-supplied inputs (e.g. pre-computed journey time files).
MANUAL_DATA := $(DATA_DIR)/manual_data

# pmtiles CLI release used to extract the map tiles, and where the binary lives.
PMTILES_VERSION := 1.22.3
PMTILES_BIN     := $(DATA_DIR)/pmtiles
# ── Phony aliases ─────────────────────────────────────────────────────────────
# Entry points. `prepare` is the first rule in the file and therefore the
# default goal. `journey-times` gets its recipe further down.
.PHONY: prepare tiles journey-times
prepare: $(WIDE)
tiles:   $(TILES)

# download-<name>: friendly alias for the file (or stamp) that download produces.
.PHONY: download-arcgis download-price-paid download-deprivation
.PHONY: download-ethnicity download-naptan download-pois download-ofsted
.PHONY: download-broadband download-postcodes download-geosure
.PHONY: download-noise download-inspire download-oa-boundaries
.PHONY: download-uprn-lookup
download-arcgis:        $(ARCGIS)
download-price-paid:    $(PRICE_PAID)
download-deprivation:   $(IOD)
download-ethnicity:     $(ETHNICITY)
download-naptan:        $(NAPTAN)
download-pois:          $(POIS_RAW)
download-ofsted:        $(OFSTED)
download-broadband:     $(BROADBAND)
download-postcodes:     $(POSTCODES)
download-geosure:       $(GEOSURE_STAMP)
download-noise:         $(NOISE)
download-inspire:       $(INSPIRE_STAMP)
download-oa-boundaries: $(OA_BOUNDARIES)
download-uprn-lookup:   $(UPRN_LOOKUP)

# transform-<name>: friendly alias for a derived dataset.
.PHONY: transform-pois transform-epc-pp transform-crime
.PHONY: transform-poi-proximity transform-school-proximity
.PHONY: transform-geosure transform-postcode-boundaries
transform-pois:                $(POIS_FILTERED)
transform-epc-pp:              $(EPC_PP)
transform-crime:               $(CRIME)
transform-poi-proximity:       $(POI_PROXIMITY)
transform-school-proximity:    $(SCHOOL_PROX)
transform-geosure:             $(GEOSURE)
transform-postcode-boundaries: $(PC_BOUNDARIES)
# ── Downloads ─────────────────────────────────────────────────────────────────
# Protomaps planet build to extract from. Overridable, e.g.
#   make -f Makefile.data tiles PMTILES_BUILD=YYYYMMDD
# NOTE(review): dated builds may be rotated off the server — confirm the
# default is still available before relying on it.
PMTILES_BUILD ?= 20260201

# pmtiles CLI (Linux x86_64 release tarball), fetched once into DATA_DIR.
# It is a real target so Make tracks it, instead of being installed as a
# hidden side effect of the tiles recipe.
#   -f          makes curl fail on HTTP errors rather than piping an error page to tar
#   pipefail    makes a failed download fail the recipe even if tar exits 0
$(PMTILES_BIN):
	@echo "Downloading pmtiles CLI v$(PMTILES_VERSION)..."
	@mkdir -p "$(@D)"
	set -o pipefail; \
	curl -fsSL "https://github.com/protomaps/go-pmtiles/releases/download/v$(PMTILES_VERSION)/go-pmtiles_$(PMTILES_VERSION)_Linux_x86_64.tar.gz" \
		| tar -xz -C "$(@D)" pmtiles
	chmod +x "$@"

# UK map tiles, cut from the Protomaps planet file by bounding box.
# The CLI is an order-only prerequisite: it must exist, but its timestamp
# (taken from the tarball) must never force a fresh ~1.5GB download.
$(TILES): | $(PMTILES_BIN)
	@echo "Downloading UK PMTiles (~1.5GB)..."
	@echo "This extracts UK tiles from the Protomaps planet file."
	"$(PMTILES_BIN)" extract https://build.protomaps.com/$(PMTILES_BUILD).pmtiles "$@" --bbox=-10.5,49.5,2.5,61
# The EPC certificates file cannot be fetched automatically (registration is
# required), so when it is missing this rule prints instructions and fails.
$(EPC):
	@printf '%s\n' \
		"" \
		"=== EPC dataset not found ===" \
		"The EPC certificates file is required: $@" \
		"" \
		"To obtain it, register at https://epc.opendatacommunities.org/login" \
		""
	@false
# $(call run_download,<module>) expands to the command that runs
# pipeline.download.<module> with --output set to the current target.
# Recursively expanded on purpose, so $@ is resolved inside each recipe.
run_download = uv run python -m pipeline.download.$(1) --output $@

# Single-file downloads: one module, one output file, no prerequisites.
$(ARCGIS):
	$(call run_download,arcgis)

$(PRICE_PAID):
	$(call run_download,price_paid)

$(IOD):
	$(call run_download,deprivation_data)

$(ETHNICITY):
	$(call run_download,ethnicity)

$(NAPTAN):
	$(call run_download,naptan)

$(POIS_RAW):
	$(call run_download,pois)

$(OFSTED):
	$(call run_download,ofsted)

$(BROADBAND):
	$(call run_download,broadband)

$(POSTCODES):
	$(call run_download,postcodes)

# Directory download: the module is pointed at the stamp's directory and the
# stamp file is touched afterwards to record completion.
$(GEOSURE_STAMP):
	uv run python -m pipeline.download.geosure --output $(@D)
	@touch $@

# The noise download takes the ArcGIS dataset (first prerequisite) as input.
$(NOISE): $(ARCGIS)
	uv run python -m pipeline.download.noise --arcgis $< --output $@

# Directory download, same stamp-file scheme as geosure.
$(INSPIRE_STAMP):
	uv run python -m pipeline.download.inspire --output $(@D)
	@touch $@

$(OA_BOUNDARIES):
	$(call run_download,oa_boundaries)

$(UPRN_LOOKUP):
	$(call run_download,uprn_lookup)
# ── Journey times (requires TFL_API_KEY) ──────────────────────────────────────
# Journey time files, one per destination. Both targets share a single recipe
# (previously two copy-pasted ones); every name is derived from the target:
#   $(@F)   journey_times_<dest>.parquet
#   dest    the <dest> part, e.g. bank or fitzrovia
# If a copy exists under MANUAL_DATA it is copied into place; otherwise the
# recipe prints how to obtain the file and fails.
$(JT_BANK) $(JT_FITZROVIA):
	@name="$(@F)"; \
	dest="$(patsubst journey_times_%.parquet,%,$(@F))"; \
	manual="$(MANUAL_DATA)/$$name"; \
	if [ -f "$$manual" ]; then \
		echo "Copying $$name from manual_data/"; \
		cp "$$manual" "$@"; \
	else \
		echo ""; \
		echo "=== TFL journey times ($$dest) not found ==="; \
		echo "Either place the file in $$manual"; \
		echo "or register for a TFL API key at https://api-portal.tfl.gov.uk/signin"; \
		echo "and run: TFL_API_KEY=... make -f Makefile.data journey-times DEST=$$dest"; \
		echo ""; \
		exit 1; \
	fi
# Compute journey times for one destination and write them under DATA_DIR.
# DEST is mandatory (e.g. DEST=bank). When it is unset, the $(error) line is
# kept in the recipe and aborts Make as soon as this target is about to run,
# so other targets are unaffected.
# The ArcGIS dataset (first prerequisite) is passed as the postcode list.
journey-times: $(ARCGIS)
ifndef DEST
	$(error DEST required — e.g. make journey-times DEST=bank)
endif
	uv run python -m pipeline.journey_times \
		--destination $(DEST) \
		--output-dir $(DATA_DIR) \
		--postcodes $<
# ── Transforms ────────────────────────────────────────────────────────────────
# Filter the raw POIs, using NaPTAN as a second input.
$(POIS_FILTERED): $(POIS_RAW) $(NAPTAN)
	uv run python -m pipeline.transform.transform_poi \
		--input $< \
		--naptan $(NAPTAN) \
		--output $@

# Join EPC certificates with price-paid records.
$(EPC_PP): $(PRICE_PAID) $(EPC)
	uv run python -m pipeline.transform.join_epc_pp \
		--epc $(EPC) \
		--price-paid $< \
		--output $@

# Crime transform, reading from CRIME_DIR.
# NOTE(review): this rule has no prerequisites and nothing in this file
# populates CRIME_DIR — presumably it is supplied manually; confirm.
$(CRIME):
	uv run python -m pipeline.transform.crime \
		--input $(CRIME_DIR) \
		--output $@

# POI proximity from the ArcGIS dataset and the filtered POIs.
$(POI_PROXIMITY): $(ARCGIS) $(POIS_FILTERED)
	uv run python -m pipeline.transform.poi_proximity \
		--arcgis $< \
		--pois $(POIS_FILTERED) \
		--output $@

# School proximity from Ofsted data and the ArcGIS dataset.
$(SCHOOL_PROX): $(OFSTED) $(ARCGIS)
	uv run python -m pipeline.transform.school_proximity \
		--ofsted $< \
		--arcgis $(ARCGIS) \
		--output $@

# GeoSure transform; depends on the download stamp but reads the directory.
$(GEOSURE): $(GEOSURE_STAMP) $(ARCGIS)
	uv run python -m pipeline.transform.transform_geosure \
		--geosure $(GEOSURE_DIR) \
		--arcgis $(ARCGIS) \
		--output $@

# Postcode boundaries from the UPRN lookup, OA boundaries and INSPIRE directory.
$(PC_BOUNDARIES): $(OA_BOUNDARIES) $(INSPIRE_STAMP) $(UPRN_LOOKUP)
	uv run python -m pipeline.transform.postcode_boundaries \
		--uprn $(UPRN_LOOKUP) \
		--oa-boundaries $< \
		--inspire $(INSPIRE_DIR) \
		--output $@
# ── Final merge ───────────────────────────────────────────────────────────────
# wide.parquet — the final merged dataset. Every input of the merge step is
# listed as a prerequisite and handed to pipeline.transform.merge under its
# own flag.
$(WIDE): \
		$(EPC_PP) \
		$(ARCGIS) \
		$(IOD) \
		$(POI_PROXIMITY) \
		$(JT_BANK) \
		$(JT_FITZROVIA) \
		$(ETHNICITY) \
		$(CRIME) \
		$(NOISE) \
		$(SCHOOL_PROX) \
		$(BROADBAND) \
		$(GEOSURE)
	uv run python -m pipeline.transform.merge \
		--epc-pp $(EPC_PP) --arcgis $(ARCGIS) \
		--iod $(IOD) --poi-proximity $(POI_PROXIMITY) \
		--journey-times-bank $(JT_BANK) --journey-times-fitzrovia $(JT_FITZROVIA) \
		--ethnicity $(ETHNICITY) --crime $(CRIME) \
		--noise $(NOISE) --school-proximity $(SCHOOL_PROX) \
		--broadband $(BROADBAND) --geosure $(GEOSURE) \
		--output $@