# Data pipeline — download sources and build wide.parquet
#
# Usage:
#   make -f Makefile.data prepare   # Build wide.parquet, add online listings, run price estimation (+ all deps)
#   make -f Makefile.data wide      # Build wide.parquet only (+ all deps)
#   make -f Makefile.data tiles     # Download UK map tiles
#
# Or include from the main Makefile and use targets directly.

SHELL := /bin/bash

# Remove a target whose recipe failed so a partial file never looks up to date.
.DELETE_ON_ERROR:

DATA_DIR := ./property-data
MANUAL_DATA := ./manual-data

# ── Output files ──────────────────────────────────────────────────────────────

TILES := $(DATA_DIR)/uk.pmtiles
ARCGIS := $(DATA_DIR)/arcgis_data.parquet
PRICE_PAID := $(DATA_DIR)/price-paid-complete.parquet
IOD := $(DATA_DIR)/IoD2025_Scores.parquet
POIS_RAW := $(DATA_DIR)/uk_pois.parquet
POIS_FILTERED := $(DATA_DIR)/filtered_uk_pois.parquet
POI_PROXIMITY := $(DATA_DIR)/poi_proximity.parquet
EPC_PP := $(DATA_DIR)/epc_pp.parquet
WIDE := $(DATA_DIR)/wide.parquet
PRICE_INDEX := $(DATA_DIR)/price_index.parquet
PRICES_STAMP := $(DATA_DIR)/.prices_done
EPC := $(MANUAL_DATA)/certificates.csv
JT_BANK := $(MANUAL_DATA)/journey_times_bank.parquet
JT_FITZROVIA := $(MANUAL_DATA)/journey_times_fitzrovia.parquet
ETHNICITY := $(DATA_DIR)/ethnicity_by_la.parquet
CRIME_DIR := $(MANUAL_DATA)/crime
CRIME := $(DATA_DIR)/crime_by_lsoa.parquet
NOISE := $(DATA_DIR)/road_noise.parquet
OFSTED := $(DATA_DIR)/ofsted.parquet
NAPTAN := $(DATA_DIR)/naptan.parquet
BROADBAND := $(DATA_DIR)/broadband.parquet
SCHOOL_PROX := $(DATA_DIR)/school_proximity.parquet
RENTAL := $(DATA_DIR)/rental_prices.parquet
GEOSURE_DIR := $(DATA_DIR)/geosure
GEOSURE := $(DATA_DIR)/geosure.parquet
INSPIRE_DIR := $(DATA_DIR)/inspire
OA_BOUNDARIES := $(DATA_DIR)/oa_boundaries.gpkg
UPRN_LOOKUP := $(DATA_DIR)/uprn_lookup.parquet
PC_BOUNDARIES := $(MANUAL_DATA)/postcode_boundaries
TRANSIT_DIR := $(DATA_DIR)/transit
TRANSIT_STAMP := $(TRANSIT_DIR)/.done
GREENSPACE := $(DATA_DIR)/greenspace_water.parquet
PBF := $(DATA_DIR)/great-britain-latest.osm.pbf
PLACES := $(DATA_DIR)/places.parquet
RIGHTMOVE_BUY := $(DATA_DIR)/rightmove_buy.parquet
RIGHTMOVE_RENT := $(DATA_DIR)/rightmove_rent.parquet
ONLINE_STAMP := $(DATA_DIR)/.online_done

# POSTCODES was referenced by the download-postcodes alias and its rule but was
# never assigned in this file; an empty target makes GNU Make silently drop the
# rule, turning `make download-postcodes` into a no-op. `?=` keeps any value an
# including Makefile already set.
# NOTE(review): the file name below is an assumption — confirm against
# pipeline.download.postcodes.
POSTCODES ?= $(DATA_DIR)/postcodes.parquet

# Sentinel files for directory targets (Make doesn't track directories well)
GEOSURE_STAMP := $(GEOSURE_DIR)/.done
INSPIRE_STAMP := $(INSPIRE_DIR)/.done

PMTILES_VERSION := 1.22.3

# ── Phony aliases ─────────────────────────────────────────────────────────────

.PHONY: prepare wide tiles \
	download-arcgis download-price-paid download-deprivation download-ethnicity \
	download-naptan download-pois download-ofsted download-broadband download-rental-prices \
	download-postcodes download-geosure download-noise download-inspire \
	download-oa-boundaries download-uprn-lookup download-transit-network download-greenspace download-pbf download-places \
	transform-pois transform-epc-pp transform-crime transform-poi-proximity \
	transform-school-proximity transform-geosure transform-postcode-boundaries \
	generate-postcode-boundaries \
	journey-times

# Full pipeline: merge, online listings, then price estimation.
prepare: $(PRICES_STAMP)
wide: $(WIDE)
tiles: $(TILES)

download-arcgis: $(ARCGIS)
download-price-paid: $(PRICE_PAID)
download-deprivation: $(IOD)
download-ethnicity: $(ETHNICITY)
download-naptan: $(NAPTAN)
download-pois: $(POIS_RAW)
download-ofsted: $(OFSTED)
download-broadband: $(BROADBAND)
download-postcodes: $(POSTCODES)
download-geosure: $(GEOSURE_STAMP)
download-rental-prices: $(RENTAL)
download-noise: $(NOISE)
download-inspire: $(INSPIRE_STAMP)
download-oa-boundaries: $(OA_BOUNDARIES)
download-uprn-lookup: $(UPRN_LOOKUP)
download-transit-network: $(TRANSIT_STAMP)
download-greenspace: $(GREENSPACE)
download-pbf: $(PBF)
download-places: $(PLACES)

transform-pois: $(POIS_FILTERED)
transform-epc-pp: $(EPC_PP)
transform-crime: $(CRIME)
transform-poi-proximity: $(POI_PROXIMITY)
transform-school-proximity: $(SCHOOL_PROX)
transform-geosure: $(GEOSURE)
transform-postcode-boundaries: $(PC_BOUNDARIES)

# Always-run generator for the postcode boundaries directory; the file rule for
# $(PC_BOUNDARIES) further down only prints instructions and fails.
generate-postcode-boundaries: $(OA_BOUNDARIES) $(INSPIRE_STAMP) $(UPRN_LOOKUP)
	uv run python -m pipeline.transform.postcode_boundaries \
		--uprn $(UPRN_LOOKUP) \
		--oa-boundaries $(OA_BOUNDARIES) \
		--inspire $(INSPIRE_DIR) \
		--output $(PC_BOUNDARIES)

# ── Downloads ─────────────────────────────────────────────────────────────────

$(TILES):
	uv run python -m pipeline.download.tiles --output $@ --pmtiles-version $(PMTILES_VERSION)

# EPC requires manual registration — fail with instructions
$(EPC):
	@echo ""
	@echo "=== EPC dataset not found ==="
	@echo "The EPC certificates file is required: $@"
	@echo ""
	@echo "To obtain it, register at https://epc.opendatacommunities.org/login"
	@echo "and place certificates.csv in manual-data/"
	@echo ""
	@exit 1

$(ARCGIS):
	uv run python -m pipeline.download.arcgis --output $@

$(PRICE_PAID):
	uv run python -m pipeline.download.price_paid --output $@

$(IOD):
	uv run python -m pipeline.download.deprivation_data --output $@

$(ETHNICITY):
	uv run python -m pipeline.download.ethnicity --output $@

$(NAPTAN):
	uv run python -m pipeline.download.naptan --output $@

# Download to a temporary name and rename, so an interrupted transfer never
# leaves a truncated file under the real target name. --fail makes curl exit
# non-zero on an HTTP error instead of saving the server's error page, which
# the mv would otherwise install as the .pbf.
$(PBF):
	@mkdir -p $(DATA_DIR)
	curl --fail -L -o $@.tmp https://download.geofabrik.de/europe/great-britain-latest.osm.pbf
	mv $@.tmp $@

$(POIS_RAW): $(PBF)
	uv run python -m pipeline.download.pois --output $@ --pbf $(PBF)

$(OFSTED):
	uv run python -m pipeline.download.ofsted --output $@

$(BROADBAND):
	uv run python -m pipeline.download.broadband --output $@

$(POSTCODES):
	uv run python -m pipeline.download.postcodes --output $@

# Directory outputs are tracked through a stamp file touched after the download.
$(GEOSURE_STAMP):
	uv run python -m pipeline.download.geosure --output $(GEOSURE_DIR)
	@touch $@

$(NOISE): $(ARCGIS)
	uv run python -m pipeline.download.noise --arcgis $(ARCGIS) --output $@

$(INSPIRE_STAMP):
	uv run python -m pipeline.download.inspire --output $(INSPIRE_DIR)
	@touch $@

$(OA_BOUNDARIES):
	uv run python -m pipeline.download.oa_boundaries --output $@

$(UPRN_LOOKUP):
	uv run python -m pipeline.download.uprn_lookup --output $@

$(TRANSIT_STAMP):
	uv run python -m pipeline.download.transit_network --output $(TRANSIT_DIR)
	@touch $@

$(RENTAL):
	uv run python -m pipeline.download.rental_prices --output $@

$(GREENSPACE): $(PBF)
	uv run python -m pipeline.download.greenspace_water --output $@ --pbf $(PBF)

$(PLACES): $(PBF)
	uv run python -m pipeline.download.places --output $@ --pbf $(PBF)

# ── Journey times (requires TFL_API_KEY) ──────────────────────────────────────

$(JT_BANK):
	@echo ""
	@echo "=== TFL journey times (bank) not found ==="
	@echo "Place journey_times_bank.parquet in $(MANUAL_DATA)/"
	@echo "or register for a TFL API key at https://api-portal.tfl.gov.uk/signin"
	@echo "and run: TFL_API_KEY=... make -f Makefile.data journey-times DEST=bank"
	@echo ""
	@exit 1

$(JT_FITZROVIA):
	@echo ""
	@echo "=== TFL journey times (fitzrovia) not found ==="
	@echo "Place journey_times_fitzrovia.parquet in $(MANUAL_DATA)/"
	@echo "or register for a TFL API key at https://api-portal.tfl.gov.uk/signin"
	@echo "and run: TFL_API_KEY=... make -f Makefile.data journey-times DEST=fitzrovia"
	@echo ""
	@exit 1

# The $(error ...) line is a recipe line (tab-indented), so it is only expanded
# when this recipe actually runs; other goals are unaffected by a missing DEST.
# NOTE(review): this writes to $(DATA_DIR), while $(JT_BANK)/$(JT_FITZROVIA) are
# expected under $(MANUAL_DATA) — confirm where the script places its result.
journey-times: $(ARCGIS)
ifndef DEST
	$(error DEST required — e.g. make journey-times DEST=bank)
endif
	uv run python -m pipeline.journey_times --destination $(DEST) --output-dir $(DATA_DIR) --postcodes $(ARCGIS)

# ── Transforms ────────────────────────────────────────────────────────────────

$(POIS_FILTERED): $(POIS_RAW) $(NAPTAN)
	uv run python -m pipeline.transform.transform_poi --input $(POIS_RAW) --naptan $(NAPTAN) --output $@

$(EPC_PP): $(PRICE_PAID) $(EPC)
	uv run python -m pipeline.transform.join_epc_pp --epc $(EPC) --price-paid $(PRICE_PAID) --output $@

# The crime CSV directory is checked inside the recipe rather than listed as a
# prerequisite. NOTE(review): with no prerequisites this target is not rebuilt
# when the CSVs change — delete the parquet to force a refresh.
$(CRIME):
	@if [ ! -d "$(CRIME_DIR)" ]; then \
		echo ""; \
		echo "=== Crime dataset not found ==="; \
		echo "Place police.uk crime CSVs in $(CRIME_DIR)/"; \
		echo "Download from https://data.police.uk/data/"; \
		echo ""; \
		exit 1; \
	fi
	uv run python -m pipeline.transform.crime --input $(CRIME_DIR) --output $@

$(POI_PROXIMITY): $(ARCGIS) $(POIS_FILTERED)
	uv run python -m pipeline.transform.poi_proximity --arcgis $(ARCGIS) --pois $(POIS_FILTERED) --output $@

$(SCHOOL_PROX): $(OFSTED) $(ARCGIS)
	uv run python -m pipeline.transform.school_proximity --ofsted $(OFSTED) --arcgis $(ARCGIS) --output $@

$(GEOSURE): $(GEOSURE_STAMP) $(ARCGIS)
	uv run python -m pipeline.transform.transform_geosure --geosure $(GEOSURE_DIR) --arcgis $(ARCGIS) --output $@

# Postcode boundaries require manual generation — fail with instructions
$(PC_BOUNDARIES):
	@echo ""
	@echo "=== Postcode boundaries not found ==="
	@echo "The postcode boundaries directory is required: $@"
	@echo ""
	@echo "Generate it with:"
	@echo " uv run python -m pipeline.transform.postcode_boundaries \\"
	@echo " --uprn $(UPRN_LOOKUP) \\"
	@echo " --oa-boundaries $(OA_BOUNDARIES) \\"
	@echo " --inspire $(INSPIRE_DIR) \\"
	@echo " --output $@"
	@echo ""
	@exit 1

# ── Final merge ───────────────────────────────────────────────────────────────

$(WIDE): $(EPC_PP) $(ARCGIS) $(IOD) $(POI_PROXIMITY) $(JT_BANK) $(JT_FITZROVIA) \
		$(ETHNICITY) $(CRIME) $(NOISE) $(SCHOOL_PROX) $(BROADBAND) $(GEOSURE) $(RENTAL)
	uv run python -m pipeline.transform.merge \
		--epc-pp $(EPC_PP) \
		--arcgis $(ARCGIS) \
		--iod $(IOD) \
		--poi-proximity $(POI_PROXIMITY) \
		--journey-times-bank $(JT_BANK) \
		--journey-times-fitzrovia $(JT_FITZROVIA) \
		--ethnicity $(ETHNICITY) \
		--crime $(CRIME) \
		--noise $(NOISE) \
		--school-proximity $(SCHOOL_PROX) \
		--broadband $(BROADBAND) \
		--geosure $(GEOSURE) \
		--rental-prices $(RENTAL) \
		--output $@

# ── Online listings (post-merge, pre-pricing) ───────────────────────────────

# No rule in this file builds $(RIGHTMOVE_BUY) / $(RIGHTMOVE_RENT); they must
# already exist or be provided by an including Makefile.
# NOTE(review): the steps below take --input $(WIDE) and no output argument, so
# they presumably rewrite wide.parquet in place. If the later estimate step does
# so too, $(WIDE) ends up newer than $(ONLINE_STAMP) and these steps would rerun
# on every invocation — TODO confirm.
$(ONLINE_STAMP): $(WIDE) $(RIGHTMOVE_BUY) $(RIGHTMOVE_RENT)
	uv run python -m pipeline.transform.add_online_listings \
		--input $(WIDE) \
		--buy $(RIGHTMOVE_BUY) \
		--rent $(RIGHTMOVE_RENT)
	@touch $@

# ── Price estimation (post-merge + online) ──────────────────────────────────

$(PRICE_INDEX): $(ONLINE_STAMP)
	uv run python -m pipeline.transform.price_estimation.index --input $(WIDE) --output $@

$(PRICES_STAMP): $(ONLINE_STAMP) $(PRICE_INDEX)
	uv run python -m pipeline.transform.price_estimation.estimate --input $(WIDE) --index $(PRICE_INDEX)
	@touch $@