# Data-pipeline build graph: downloads raw datasets, transforms them into
# parquet files, merges them and runs price estimation.
#
# Usage:
#   make -f Makefile.data prepare   # Build all parquets (+ all deps)
#
# Or include from the main Makefile and use targets directly.
#
# Requires GNU Make >= 4.3 (a grouped target, `&:`, is used below).

SHELL := /bin/bash

# Remove a target whose recipe failed so a half-written file never looks
# up to date.
.DELETE_ON_ERROR:

DATA_DIR := ./property-data
MANUAL_DATA := ./manual-data

# ── Output files ──────────────────────────────────────────────────────────────

TILES := $(DATA_DIR)/uk.pmtiles
ARCGIS := $(DATA_DIR)/arcgis_data.parquet
PRICE_PAID := $(DATA_DIR)/price-paid-complete.parquet
IOD := $(DATA_DIR)/IoD2025_Scores.parquet
POIS_RAW := $(DATA_DIR)/uk_pois.parquet
GROCERY_RETAIL_POINTS := $(DATA_DIR)/geolytix_retail_points.parquet
POIS_FILTERED := $(DATA_DIR)/filtered_uk_pois.parquet
POI_PROXIMITY := $(DATA_DIR)/poi_proximity.parquet
EPC_PP := $(DATA_DIR)/epc_pp.parquet
POSTCODES_RAW := $(DATA_DIR)/gb-postcodes-v5
POSTCODES_PQ := $(DATA_DIR)/postcode.parquet
PROPERTIES_PQ := $(DATA_DIR)/properties.parquet
MERGE_STAMP := $(DATA_DIR)/.merge_done
PRICE_INDEX := $(DATA_DIR)/price_index.parquet
PRICES_STAMP := $(DATA_DIR)/.prices_done
EPC := $(MANUAL_DATA)/domestic-csv.zip
ETHNICITY := $(DATA_DIR)/ethnicity_by_la.parquet
CRIME_DIR := $(DATA_DIR)/crime
CRIME := $(DATA_DIR)/crime_by_lsoa.parquet
CRIME_STAMP := $(CRIME_DIR)/.downloaded
NOISE := $(DATA_DIR)/road_noise.parquet
OFSTED := $(DATA_DIR)/ofsted.parquet
NAPTAN := $(DATA_DIR)/naptan.parquet
BROADBAND := $(DATA_DIR)/broadband.parquet
SCHOOL_PROX := $(DATA_DIR)/school_proximity.parquet
RENTAL := $(DATA_DIR)/rental_prices.parquet
INSPIRE_DIR := $(DATA_DIR)/inspire
OA_BOUNDARIES := $(DATA_DIR)/oa_boundaries.gpkg
UPRN_LOOKUP := $(DATA_DIR)/uprn_lookup.parquet
PC_BOUNDARIES := $(DATA_DIR)/postcode_boundaries
TRANSIT_DIR := $(DATA_DIR)/transit
TRANSIT_STAMP := $(TRANSIT_DIR)/.done
R5_NETWORK_CACHE := $(DATA_DIR)/r5-network/network.dat
GREENSPACE := $(DATA_DIR)/greenspace_water.parquet
OS_GREENSPACE := $(DATA_DIR)/os_greenspace.parquet
PBF := $(DATA_DIR)/england-latest.osm.pbf
FR_TOW := $(DATA_DIR)/FR_TOW_V1_ALL.zip
TREE_DENSITY_PC := $(DATA_DIR)/tree_density_by_postcode.parquet
TREE_DENSITY_STREETS := $(DATA_DIR)/tree_density_by_street.parquet
TREE_DENSITY_ADDR := $(DATA_DIR)/tree_density_by_address.parquet
OFS_REGISTER := $(DATA_DIR)/ofs_register.xlsx
PLACES := $(DATA_DIR)/places.parquet
LSOA_POP := $(DATA_DIR)/lsoa_population.parquet
MEDIAN_AGE := $(DATA_DIR)/median_age.parquet
ELECTION := $(DATA_DIR)/election_results.parquet
ENGLAND_BOUNDARY := $(DATA_DIR)/england_boundary.geojson
RM_OUTCODES := frontend/src/lib/rightmove-outcodes.json
MAP_ASSETS_DIR := frontend/public/assets

# Sentinel files for directory targets (Make doesn't track directories well)
INSPIRE_STAMP := $(INSPIRE_DIR)/.done
MAP_ASSETS_STAMP := $(MAP_ASSETS_DIR)/.done

PMTILES_VERSION := 1.22.3

# Command prefix for the output validator; each use appends the paths to check.
VALIDATE_OUTPUTS := uv run python -m pipeline.validate_outputs

# Source files listed as prerequisites so that editing a script re-runs the
# step that uses it.
POI_PROXIMITY_DEPS := pipeline/transform/poi_proximity.py pipeline/utils/poi_counts.py
MERGE_DEPS := pipeline/transform/merge.py
PRICE_INDEX_DEPS := pipeline/transform/price_estimation/index.py pipeline/transform/price_estimation/shrinkage.py pipeline/transform/price_estimation/utils.py
PRICE_ESTIMATE_DEPS := pipeline/transform/price_estimation/estimate.py pipeline/transform/price_estimation/knn.py pipeline/transform/price_estimation/utils.py
TREE_DENSITY_DEPS := pipeline/transform/tree_density.py
CRIME_DOWNLOAD_DEPS := pipeline/download/crime.py
INSPIRE_DOWNLOAD_DEPS := pipeline/download/inspire.py
TRANSIT_DOWNLOAD_DEPS := pipeline/download/transit_network.py pipeline/download/transxchange2gtfs_shim.js
MAP_ASSETS_DEPS := pipeline/download/map_assets.py pipeline/transform/transform_poi.py

# ── Phony aliases ─────────────────────────────────────────────────────────────

# Every alias defined in this section must be listed here; otherwise a stray
# file with the same name would make Make consider the alias up to date.
.PHONY: prepare merge tiles \
        download-arcgis download-price-paid download-deprivation download-ethnicity \
        download-naptan download-pois download-grocery-retail-points download-ofsted \
        download-broadband download-rental-prices \
        download-postcodes download-noise download-inspire download-crime \
        download-oa-boundaries download-uprn-lookup download-transit-network \
        download-greenspace download-os-greenspace download-pbf download-fr-tow \
        download-ofs-register download-places download-lsoa-population \
        download-median-age download-election-results download-england-boundary \
        download-rightmove-outcodes \
        download-map-assets \
        transform-pois transform-epc-pp transform-crime transform-poi-proximity \
        transform-school-proximity transform-tree-density \
        generate-postcode-boundaries generate-travel-times

# Build everything, then validate the three final parquet outputs. The parquets
# are order-only prerequisites (after `|`): they must exist, but their
# timestamps are not used for out-of-date checks.
prepare: $(PRICES_STAMP) download-places tiles generate-postcode-boundaries download-map-assets generate-travel-times | $(POSTCODES_PQ) $(PROPERTIES_PQ) $(PRICE_INDEX)
	$(VALIDATE_OUTPUTS) --parquet $(POSTCODES_PQ) --parquet $(PROPERTIES_PQ) --parquet $(PRICE_INDEX)

# Run the merge step (via its stamp) and validate both merged parquets.
merge: $(MERGE_STAMP) | $(POSTCODES_PQ) $(PROPERTIES_PQ)
	$(VALIDATE_OUTPUTS) --parquet $(POSTCODES_PQ) --parquet $(PROPERTIES_PQ)

tiles: $(TILES)

download-arcgis: $(ARCGIS)

download-price-paid: $(PRICE_PAID)

download-deprivation: $(IOD)

download-ethnicity: $(ETHNICITY)

# Directory-style downloads are tracked by a stamp; the alias re-validates the
# downloaded contents on every invocation.
download-crime: $(CRIME_STAMP)
	$(VALIDATE_OUTPUTS) --file $(CRIME_DIR)/archive_manifest.json --glob "$(CRIME_DIR)::**/*.csv" --zip-glob "$(CRIME_DIR)/_archives::*.zip"

download-naptan: $(NAPTAN)

download-pois: $(POIS_RAW)

download-grocery-retail-points: $(GROCERY_RETAIL_POINTS)

download-ofsted: $(OFSTED)

download-broadband: $(BROADBAND)

download-postcodes: $(POSTCODES_RAW)

download-rental-prices: $(RENTAL)

download-noise: $(NOISE)

download-inspire: $(INSPIRE_STAMP)
	$(VALIDATE_OUTPUTS) --dir $(INSPIRE_DIR) --zip-glob "$(INSPIRE_DIR)::*.zip"

download-oa-boundaries: $(OA_BOUNDARIES)

download-uprn-lookup: $(UPRN_LOOKUP)

download-transit-network: $(TRANSIT_STAMP)
	$(VALIDATE_OUTPUTS) --file $(TRANSIT_DIR)/raw/england.osm.pbf --zip $(TRANSIT_DIR)/bods_gtfs.zip --zip $(TRANSIT_DIR)/tfl_gtfs.zip

download-greenspace: $(GREENSPACE)

download-os-greenspace: $(OS_GREENSPACE)

download-pbf: $(PBF)

download-fr-tow: $(FR_TOW)

download-ofs-register: $(OFS_REGISTER)

download-places: $(PLACES)

download-lsoa-population: $(LSOA_POP)

download-median-age: $(MEDIAN_AGE)

download-election-results: $(ELECTION)

download-england-boundary: $(ENGLAND_BOUNDARY)

download-rightmove-outcodes: $(RM_OUTCODES)

download-map-assets: $(MAP_ASSETS_STAMP)
	$(VALIDATE_OUTPUTS) --file $(MAP_ASSETS_DIR)/sprites/light.json --file $(MAP_ASSETS_DIR)/sprites/light.png --file $(MAP_ASSETS_DIR)/sprites/dark.json --file $(MAP_ASSETS_DIR)/sprites/dark.png --glob "$(MAP_ASSETS_DIR)/fonts::**/*.pbf" --glob "$(MAP_ASSETS_DIR)/twemoji::*.png" --glob "$(MAP_ASSETS_DIR)/poi-icons::**/*"

transform-pois: $(POIS_FILTERED)

transform-epc-pp: $(EPC_PP)

transform-crime: $(CRIME)

transform-poi-proximity: $(POI_PROXIMITY)

transform-school-proximity: $(SCHOOL_PROX)

transform-tree-density: $(TREE_DENSITY_PC)

# Validate the INSPIRE download, then generate postcode boundaries into
# $(PC_BOUNDARIES). Phony, so it runs every time it is requested.
generate-postcode-boundaries: $(OA_BOUNDARIES) $(INSPIRE_STAMP) $(UPRN_LOOKUP)
	$(VALIDATE_OUTPUTS) --dir $(INSPIRE_DIR) --zip-glob "$(INSPIRE_DIR)::*.zip"
	uv run python -m pipeline.transform.postcode_boundaries \
		--uprn $(UPRN_LOOKUP) \
		--oa-boundaries $(OA_BOUNDARIES) \
		--inspire $(INSPIRE_DIR) \
		--output $(PC_BOUNDARIES)

# Delete the cached R5 network when the PBF or the transit stamp is newer than
# it, then run the R5 wrapper script.
# NOTE(review): presumably run.sh rebuilds the cache when it is missing — confirm.
generate-travel-times: $(ARCGIS) $(PLACES) $(PBF) download-transit-network
	@if [ -f "$(R5_NETWORK_CACHE)" ] && { [ "$(PBF)" -nt "$(R5_NETWORK_CACHE)" ] || [ "$(TRANSIT_STAMP)" -nt "$(R5_NETWORK_CACHE)" ]; }; then \
		echo "R5 inputs are newer than $(R5_NETWORK_CACHE); deleting stale cache"; \
		rm -f "$(R5_NETWORK_CACHE)"; \
	fi
	./r5-java/run.sh

# ── Downloads ─────────────────────────────────────────────────────────────────

$(TILES):
	uv run -m pipeline.download.tiles --output $@ --pmtiles-version $(PMTILES_VERSION)

# EPC requires manual registration — fail with instructions
$(EPC):
	@echo ""
	@echo "=== EPC dataset not found ==="
	@echo "The EPC certificates archive is required: $@"
	@echo ""
	@echo "To obtain it, register at https://get-energy-performance-data.communities.gov.uk/filter-properties?property_type=domestic"
	@echo "and place domestic-csv.zip in manual-data/"
	@echo ""
	@exit 1

$(ARCGIS):
	uv run python -m pipeline.download.arcgis --output $@

$(PRICE_PAID):
	uv run python -m pipeline.download.price_paid --output $@

$(IOD):
	uv run python -m pipeline.download.deprivation_data --output $@

$(ETHNICITY): pipeline/download/ethnicity.py
	uv run python -m pipeline.download.ethnicity --output $@

# Stamp pattern: drop the stamp first so a failed download or validation leaves
# no stamp behind; it is re-created only after validation succeeds.
$(CRIME_STAMP): $(CRIME_DOWNLOAD_DEPS)
	@rm -f $@
	uv run python -m pipeline.download.crime --output $(CRIME_DIR)
	$(VALIDATE_OUTPUTS) --file $(CRIME_DIR)/archive_manifest.json --glob "$(CRIME_DIR)::**/*.csv" --zip-glob "$(CRIME_DIR)/_archives::*.zip"
	@touch $@

$(NAPTAN):
	uv run python -m pipeline.download.naptan --output $@

# The curl downloads below write to $@.tmp and rename on success, so an
# interrupted transfer never leaves a partial file at the target path.
# -f makes curl exit non-zero on an HTTP error instead of saving the error page.
$(PBF):
	@mkdir -p $(DATA_DIR)
	curl -fL -o $@.tmp https://download.geofabrik.de/europe/united-kingdom/england-latest.osm.pbf
	mv $@.tmp $@

$(FR_TOW):
	@mkdir -p $(DATA_DIR)
	curl -fL -A "Mozilla/5.0" -o $@.tmp "https://www.mediafire.com/file_premium/p5fve6wswwwjqrq/FR_TOW_V1_ALL.zip/file"
	mv $@.tmp $@

$(OFS_REGISTER):
	@mkdir -p $(DATA_DIR)
	curl -fL -A "Mozilla/5.0" -o $@.tmp https://register-api.officeforstudents.org.uk/api/Download/
	mv $@.tmp $@

$(POIS_RAW): $(PBF) $(ENGLAND_BOUNDARY) pipeline/download/pois.py
	uv run python -m pipeline.download.pois --output $@ --pbf $(PBF) --boundary $(ENGLAND_BOUNDARY)

$(GROCERY_RETAIL_POINTS):
	uv run python -m pipeline.download.geolytix_retail_points --output $@

$(OFSTED):
	uv run python -m pipeline.download.ofsted --output $@

$(BROADBAND):
	uv run python -m pipeline.download.broadband --output $@

$(POSTCODES_RAW):
	uv run python -m pipeline.download.postcodes --output $@

$(NOISE): $(ARCGIS) pipeline/download/noise.py
	uv run python -m pipeline.download.noise --arcgis $(ARCGIS) --output $@

$(INSPIRE_STAMP): $(INSPIRE_DOWNLOAD_DEPS)
	@rm -f $@
	uv run python -m pipeline.download.inspire --output $(INSPIRE_DIR)
	$(VALIDATE_OUTPUTS) --dir $(INSPIRE_DIR) --zip-glob "$(INSPIRE_DIR)::*.zip"
	@touch $@

$(OA_BOUNDARIES):
	uv run python -m pipeline.download.oa_boundaries --output $@

$(UPRN_LOOKUP):
	uv run python -m pipeline.download.uprn_lookup --output $@

$(TRANSIT_STAMP): $(TRANSIT_DOWNLOAD_DEPS)
	@rm -f $@
	uv run python -m pipeline.download.transit_network --output $(TRANSIT_DIR)
	$(VALIDATE_OUTPUTS) --file $(TRANSIT_DIR)/raw/england.osm.pbf --zip $(TRANSIT_DIR)/bods_gtfs.zip --zip $(TRANSIT_DIR)/tfl_gtfs.zip
	@touch $@

$(RENTAL): pipeline/download/rental_prices.py
	uv run python -m pipeline.download.rental_prices --output $@

$(GREENSPACE): $(PBF)
	uv run python -m pipeline.download.greenspace_water --output $@ --pbf $(PBF)

$(OS_GREENSPACE):
	uv run python -m pipeline.download.os_greenspace --output $@

$(PLACES): $(PBF) $(ENGLAND_BOUNDARY) $(NAPTAN) $(OFS_REGISTER) $(ARCGIS)
	uv run python -m pipeline.download.places --output $@ --pbf $(PBF) --boundary $(ENGLAND_BOUNDARY) --naptan $(NAPTAN) --university-register $(OFS_REGISTER) --postcodes $(ARCGIS)

$(LSOA_POP):
	uv run python -m pipeline.download.lsoa_population --output $@

$(MEDIAN_AGE):
	uv run python -m pipeline.download.median_age --output $@

$(ELECTION): pipeline/download/election_results.py
	uv run python -m pipeline.download.election_results --output $@

$(ENGLAND_BOUNDARY):
	uv run python -m pipeline.download.england_boundary --output $@

# Reads the merged postcode parquet, hence the dependency on the merge stamp.
$(RM_OUTCODES): $(MERGE_STAMP)
	uv run python -m pipeline.download.rightmove_outcodes --postcodes $(POSTCODES_PQ) --output $@

$(MAP_ASSETS_STAMP): $(MAP_ASSETS_DEPS)
	@rm -f $@
	uv run python -m pipeline.download.map_assets --output $(MAP_ASSETS_DIR)
	$(VALIDATE_OUTPUTS) --file $(MAP_ASSETS_DIR)/sprites/light.json --file $(MAP_ASSETS_DIR)/sprites/light.png --file $(MAP_ASSETS_DIR)/sprites/dark.json --file $(MAP_ASSETS_DIR)/sprites/dark.png --glob "$(MAP_ASSETS_DIR)/fonts::**/*.pbf" --glob "$(MAP_ASSETS_DIR)/twemoji::*.png" --glob "$(MAP_ASSETS_DIR)/poi-icons::**/*"
	@touch $@

# ── Transforms ────────────────────────────────────────────────────────────────

$(POIS_FILTERED): $(POIS_RAW) $(NAPTAN) $(GROCERY_RETAIL_POINTS) $(ENGLAND_BOUNDARY)
	uv run python -m pipeline.transform.transform_poi --input $(POIS_RAW) --naptan $(NAPTAN) --boundary $(ENGLAND_BOUNDARY) --grocery-retail-points $(GROCERY_RETAIL_POINTS) --output $@

$(EPC_PP): $(PRICE_PAID) $(EPC) pipeline/transform/join_epc_pp.py pipeline/utils/fuzzy_join.py
	uv run python -m pipeline.transform.join_epc_pp --epc $(EPC) --price-paid $(PRICE_PAID) --output $@

# Re-validate the raw crime download before aggregating it.
$(CRIME): $(CRIME_STAMP)
	$(VALIDATE_OUTPUTS) --file $(CRIME_DIR)/archive_manifest.json --glob "$(CRIME_DIR)::**/*.csv" --zip-glob "$(CRIME_DIR)/_archives::*.zip"
	uv run python -m pipeline.transform.crime --input $(CRIME_DIR) --output $@

$(POI_PROXIMITY): $(ARCGIS) $(POIS_FILTERED) $(OS_GREENSPACE) $(POI_PROXIMITY_DEPS)
	uv run python -m pipeline.transform.poi_proximity --arcgis $(ARCGIS) --pois $(POIS_FILTERED) --greenspace $(OS_GREENSPACE) --output $@

$(SCHOOL_PROX): $(OFSTED) $(ARCGIS)
	uv run python -m pipeline.transform.school_proximity --ofsted $(OFSTED) --arcgis $(ARCGIS) --output $@

# Only the postcode-level parquet is a tracked target. The recipe is also given
# the street- and address-level output paths, which Make does not track.
# NOTE(review): consider a grouped target if those must be rebuilt when missing.
$(TREE_DENSITY_PC): $(FR_TOW) $(ARCGIS) $(PRICE_PAID) $(TREE_DENSITY_DEPS)
	uv run python -m pipeline.transform.tree_density \
		--tow-zip $(FR_TOW) \
		--arcgis $(ARCGIS) \
		--price-paid $(PRICE_PAID) \
		--output-postcodes $(TREE_DENSITY_PC) \
		--output-streets $(TREE_DENSITY_STREETS) \
		--output-addresses $(TREE_DENSITY_ADDR)

# Postcode boundaries require manual generation — fail with instructions
$(PC_BOUNDARIES):
	@echo ""
	@echo "=== Postcode boundaries not found ==="
	@echo "The postcode boundaries directory is required: $@"
	@echo ""
	@echo "Generate it with:"
	@echo " uv run python -m pipeline.transform.postcode_boundaries \\"
	@echo " --uprn $(UPRN_LOOKUP) \\"
	@echo " --oa-boundaries $(OA_BOUNDARIES) \\"
	@echo " --inspire $(INSPIRE_DIR) \\"
	@echo " --output $@"
	@echo ""
	@exit 1

# ── Final merge → postcode.parquet + properties.parquet ──────────────────────

# One recipe writes both parquets, so completion is tracked with a stamp that
# is only touched after both outputs validate.
$(MERGE_STAMP): $(EPC_PP) $(ARCGIS) $(IOD) $(POI_PROXIMITY) \
                $(ETHNICITY) $(CRIME) $(NOISE) $(SCHOOL_PROX) $(BROADBAND) $(RENTAL) $(LSOA_POP) $(MEDIAN_AGE) $(ELECTION) $(TREE_DENSITY_PC) $(MERGE_DEPS)
	@rm -f $@
	uv run python -m pipeline.transform.merge \
		--epc-pp $(EPC_PP) \
		--arcgis $(ARCGIS) \
		--iod $(IOD) \
		--poi-proximity $(POI_PROXIMITY) \
		--ethnicity $(ETHNICITY) \
		--crime $(CRIME) \
		--noise $(NOISE) \
		--school-proximity $(SCHOOL_PROX) \
		--broadband $(BROADBAND) \
		--rental-prices $(RENTAL) \
		--lsoa-population $(LSOA_POP) \
		--median-age $(MEDIAN_AGE) \
		--election-results $(ELECTION) \
		--tree-density-postcodes $(TREE_DENSITY_PC) \
		--output-postcodes $(POSTCODES_PQ) \
		--output-properties $(PROPERTIES_PQ)
	$(VALIDATE_OUTPUTS) --parquet $(POSTCODES_PQ) --parquet $(PROPERTIES_PQ)
	@touch $@

# ── Price estimation (post-merge) ───────────────────────────────────────────

# Grouped target (`&:`): the two parquets are passed as output paths to the
# $(MERGE_STAMP) recipe; this rule only validates them after the merge has run.
$(POSTCODES_PQ) $(PROPERTIES_PQ) &: $(MERGE_STAMP)
	$(VALIDATE_OUTPUTS) --parquet $(POSTCODES_PQ) --parquet $(PROPERTIES_PQ)

# The parquets are order-only here; rebuilds are driven by the merge stamp and
# the script sources rather than by the parquet timestamps.
$(PRICE_INDEX): $(MERGE_STAMP) $(PRICE_INDEX_DEPS) | $(PROPERTIES_PQ) $(POSTCODES_PQ)
	uv run python -m pipeline.transform.price_estimation.index --input $(PROPERTIES_PQ) --postcodes $(POSTCODES_PQ) --output $@
	$(VALIDATE_OUTPUTS) --parquet $@

# The estimate step is given no separate output path, so its completion is
# recorded with a stamp that is touched only after all three parquets validate.
$(PRICES_STAMP): $(MERGE_STAMP) $(PRICE_INDEX) $(PRICE_ESTIMATE_DEPS) | $(PROPERTIES_PQ) $(POSTCODES_PQ)
	@rm -f $@
	uv run python -m pipeline.transform.price_estimation.estimate --properties $(PROPERTIES_PQ) --postcodes $(POSTCODES_PQ) --index $(PRICE_INDEX)
	$(VALIDATE_OUTPUTS) --parquet $(PROPERTIES_PQ) --parquet $(POSTCODES_PQ) --parquet $(PRICE_INDEX)
	@touch $@