271 lines
11 KiB
Text
271 lines
11 KiB
Text
# Data pipeline — download sources and build postcode.parquet + properties.parquet
#
# Usage:
#   make -f Makefile.data prepare   # Build all parquets (+ all deps)
#   make -f Makefile.data tiles     # Download UK map tiles
#
# Or include from the main Makefile and use targets directly.
|
|
|
|
# Recipes are executed by bash rather than the default /bin/sh.
SHELL := /bin/bash

# When a recipe exits non-zero, make deletes the target it was building so a
# partially written file is not treated as up to date on the next run.
.DELETE_ON_ERROR:

# Root directories that the path variables in this file are built from.
DATA_DIR    := ./property-data
MANUAL_DATA := ./manual-data
|
|
|
|
# ── Output files ──────────────────────────────────────────────────────────────
|
|
|
|
# Paths referenced by the rules in this file. Values are relative to
# $(DATA_DIR) or $(MANUAL_DATA), except RM_OUTCODES which lives in the
# frontend tree.
TILES            := $(DATA_DIR)/uk.pmtiles
ARCGIS           := $(DATA_DIR)/arcgis_data.parquet
PRICE_PAID       := $(DATA_DIR)/price-paid-complete.parquet
IOD              := $(DATA_DIR)/IoD2025_Scores.parquet
POIS_RAW         := $(DATA_DIR)/uk_pois.parquet
POIS_FILTERED    := $(DATA_DIR)/filtered_uk_pois.parquet
POI_PROXIMITY    := $(DATA_DIR)/poi_proximity.parquet
EPC_PP           := $(DATA_DIR)/epc_pp.parquet

# Output of pipeline.download.postcodes (the target behind `download-postcodes`).
# The variable was referenced by the rules but never assigned; GNU make
# silently ignores a rule whose target list expands to nothing, so the
# download rule was dropped and `download-postcodes` did nothing.
# `?=` keeps a value already supplied by an including Makefile.
# NOTE(review): the file name below is an assumption — confirm the intended path.
POSTCODES        ?= $(DATA_DIR)/postcodes_raw.parquet

# The two final products, both written by the merge recipe; MERGE_STAMP marks
# that the merge has run.
POSTCODES_PQ     := $(DATA_DIR)/postcode.parquet
PROPERTIES_PQ    := $(DATA_DIR)/properties.parquet
MERGE_STAMP      := $(DATA_DIR)/.merge_done

PRICE_INDEX      := $(DATA_DIR)/price_index.parquet
PRICES_STAMP     := $(DATA_DIR)/.prices_done
EPC              := $(MANUAL_DATA)/certificates.csv
ETHNICITY        := $(DATA_DIR)/ethnicity_by_la.parquet
CRIME_DIR        := $(MANUAL_DATA)/crime
CRIME            := $(DATA_DIR)/crime_by_lsoa.parquet
NOISE            := $(DATA_DIR)/road_noise.parquet
OFSTED           := $(DATA_DIR)/ofsted.parquet
NAPTAN           := $(DATA_DIR)/naptan.parquet
BROADBAND        := $(DATA_DIR)/broadband.parquet
SCHOOL_PROX      := $(DATA_DIR)/school_proximity.parquet
RENTAL           := $(DATA_DIR)/rental_prices.parquet
GEOSURE_DIR      := $(DATA_DIR)/geosure
GEOSURE          := $(DATA_DIR)/geosure.parquet
INSPIRE_DIR      := $(DATA_DIR)/inspire
OA_BOUNDARIES    := $(DATA_DIR)/oa_boundaries.gpkg
UPRN_LOOKUP      := $(DATA_DIR)/uprn_lookup.parquet
PC_BOUNDARIES    := $(MANUAL_DATA)/postcode_boundaries
TRANSIT_DIR      := $(DATA_DIR)/transit
TRANSIT_STAMP    := $(TRANSIT_DIR)/.done
GREENSPACE       := $(DATA_DIR)/greenspace_water.parquet
PBF              := $(DATA_DIR)/england-latest.osm.pbf
PLACES           := $(DATA_DIR)/places.parquet
LISTINGS_BUY     := $(DATA_DIR)/online_listings_buy.parquet
LISTINGS_RENT    := $(DATA_DIR)/online_listings_rent.parquet
LSOA_POP         := $(DATA_DIR)/lsoa_population.parquet
ENGLAND_BOUNDARY := $(DATA_DIR)/england_boundary.geojson
RM_OUTCODES      := frontend/src/lib/rightmove-outcodes.json
|
|
|
|
# Stamp files stand in for directory outputs: a directory's timestamp changes
# whenever its contents change, so make cannot track it reliably as a target.
GEOSURE_STAMP := $(GEOSURE_DIR)/.done
INSPIRE_STAMP := $(INSPIRE_DIR)/.done

# Passed to pipeline.download.tiles as --pmtiles-version.
PMTILES_VERSION := 1.22.3
|
|
|
|
# ── Phony aliases ─────────────────────────────────────────────────────────────
|
|
|
|
# Command-style targets that do not name files. Repeated .PHONY lines
# accumulate, so the list is split by purpose.

# Entry points
.PHONY: prepare merge tiles

# Download aliases
.PHONY: download-arcgis download-price-paid download-deprivation download-ethnicity
.PHONY: download-naptan download-pois download-ofsted download-broadband download-rental-prices
.PHONY: download-postcodes download-geosure download-noise download-inspire
.PHONY: download-oa-boundaries download-uprn-lookup download-transit-network
.PHONY: download-greenspace download-pbf download-places download-lsoa-population
.PHONY: download-england-boundary download-rightmove-outcodes

# Transform aliases
.PHONY: transform-pois transform-epc-pp transform-crime transform-poi-proximity
.PHONY: transform-school-proximity transform-geosure transform-postcode-boundaries
.PHONY: generate-postcode-boundaries
|
|
|
|
# Each alias maps a short command name onto the file (or stamp file) whose
# rule does the work. `prepare` stays first so it remains the default goal.

# Entry points
prepare: $(PRICES_STAMP)
merge:   $(MERGE_STAMP)
tiles:   $(TILES)

# Downloads
download-arcgis:             $(ARCGIS)
download-price-paid:         $(PRICE_PAID)
download-deprivation:        $(IOD)
download-ethnicity:          $(ETHNICITY)
download-naptan:             $(NAPTAN)
download-pois:               $(POIS_RAW)
download-ofsted:             $(OFSTED)
download-broadband:          $(BROADBAND)
# NOTE(review): POSTCODES must be assigned before this line; if it expands to
# nothing this alias has no prerequisites and does nothing.
download-postcodes:          $(POSTCODES)
download-geosure:            $(GEOSURE_STAMP)
download-rental-prices:      $(RENTAL)
download-noise:              $(NOISE)
download-inspire:            $(INSPIRE_STAMP)
download-oa-boundaries:      $(OA_BOUNDARIES)
download-uprn-lookup:        $(UPRN_LOOKUP)
download-transit-network:    $(TRANSIT_STAMP)
download-greenspace:         $(GREENSPACE)
download-pbf:                $(PBF)
download-places:             $(PLACES)
download-lsoa-population:    $(LSOA_POP)
download-england-boundary:   $(ENGLAND_BOUNDARY)
download-rightmove-outcodes: $(RM_OUTCODES)

# Transforms
transform-pois:                $(POIS_FILTERED)
transform-epc-pp:              $(EPC_PP)
transform-crime:               $(CRIME)
transform-poi-proximity:       $(POI_PROXIMITY)
transform-school-proximity:    $(SCHOOL_PROX)
transform-geosure:             $(GEOSURE)
transform-postcode-boundaries: $(PC_BOUNDARIES)
|
|
# Run pipeline.transform.postcode_boundaries on the UPRN lookup, the OA
# boundaries and the INSPIRE directory, writing to $(PC_BOUNDARIES).
# The target is listed in .PHONY, so the recipe runs every time it is requested.
# $< is the first prerequisite, i.e. $(OA_BOUNDARIES).
generate-postcode-boundaries: $(OA_BOUNDARIES) $(INSPIRE_STAMP) $(UPRN_LOOKUP)
	uv run python -m pipeline.transform.postcode_boundaries \
		--uprn $(UPRN_LOOKUP) --oa-boundaries $< \
		--inspire $(INSPIRE_DIR) --output $(PC_BOUNDARIES)
|
|
|
|
# ── Downloads ─────────────────────────────────────────────────────────────────
|
|
|
|
# Run pipeline.download.tiles, writing the PMTiles archive to the target path.
# Invoked as `uv run python -m …` like every other recipe in this file; the
# previous `uv run -m …` form depended on uv's own module flag.
$(TILES):
	uv run python -m pipeline.download.tiles \
		--output $@ \
		--pmtiles-version $(PMTILES_VERSION)
|
|
|
|
# The EPC certificates file cannot be fetched automatically (it needs a manual
# registration), so this rule only prints instructions and fails.
# The whole message is emitted by a single shell invocation.
$(EPC):
	@echo ""; \
	echo "=== EPC dataset not found ==="; \
	echo "The EPC certificates file is required: $@"; \
	echo ""; \
	echo "To obtain it, register at https://epc.opendatacommunities.org/login"; \
	echo "and place certificates.csv in manual-data/"; \
	echo ""; \
	exit 1
|
|
|
|
# Run pipeline.download.arcgis, writing its output to the target path.
$(ARCGIS):
	uv run python -m pipeline.download.arcgis \
		--output $@
|
|
|
|
# Run pipeline.download.price_paid, writing its output to the target path.
$(PRICE_PAID):
	uv run python -m pipeline.download.price_paid \
		--output $@
|
|
|
|
# Run pipeline.download.deprivation_data, writing its output to the target path.
$(IOD):
	uv run python -m pipeline.download.deprivation_data \
		--output $@
|
|
|
|
# Run pipeline.download.ethnicity, writing its output to the target path.
$(ETHNICITY):
	uv run python -m pipeline.download.ethnicity \
		--output $@
|
|
|
|
# Run pipeline.download.naptan, writing its output to the target path.
$(NAPTAN):
	uv run python -m pipeline.download.naptan \
		--output $@
|
|
|
|
# Download the england-latest OSM extract from Geofabrik.
# curl writes to a temporary name and the file is renamed only after a
# successful transfer, so an interrupted download never leaves a truncated
# file at the target path.
#   --fail      exit non-zero on an HTTP error instead of saving the error
#               page, which would otherwise be renamed to the .pbf target
#   --location  follow redirects
# If the transfer fails, the temporary file is removed (.DELETE_ON_ERROR only
# cleans up $@ itself) and the recipe fails.
$(PBF):
	@mkdir -p $(@D)
	curl --fail --location --output $@.tmp \
		https://download.geofabrik.de/europe/united-kingdom/england-latest.osm.pbf \
		|| { rm -f $@.tmp; exit 1; }
	mv $@.tmp $@
|
|
|
|
# Run pipeline.download.pois on the OSM extract and the England boundary.
# $< is the first prerequisite, i.e. $(PBF).
$(POIS_RAW): $(PBF) $(ENGLAND_BOUNDARY)
	uv run python -m pipeline.download.pois \
		--output $@ \
		--pbf $< \
		--boundary $(ENGLAND_BOUNDARY)
|
|
|
|
# Run pipeline.download.ofsted, writing its output to the target path.
$(OFSTED):
	uv run python -m pipeline.download.ofsted \
		--output $@
|
|
|
|
# Run pipeline.download.broadband, writing its output to the target path.
$(BROADBAND):
	uv run python -m pipeline.download.broadband \
		--output $@
|
|
|
|
# Run pipeline.download.postcodes, writing its output to the target path.
# NOTE(review): POSTCODES has to be assigned earlier in the file (or by an
# including Makefile); when it expands to nothing GNU make ignores a rule that
# has no targets, so this recipe never runs.
$(POSTCODES):
	uv run python -m pipeline.download.postcodes \
		--output $@
|
|
|
|
# Run pipeline.download.geosure with $(GEOSURE_DIR) as its output location,
# then touch the stamp file that make tracks in place of the directory.
$(GEOSURE_STAMP):
	uv run python -m pipeline.download.geosure \
		--output $(GEOSURE_DIR)
	@touch $@
|
|
|
|
# Run pipeline.download.noise with the ArcGIS parquet as input.
# $< is the only prerequisite, i.e. $(ARCGIS).
$(NOISE): $(ARCGIS)
	uv run python -m pipeline.download.noise \
		--arcgis $< \
		--output $@
|
|
|
|
# Run pipeline.download.inspire with $(INSPIRE_DIR) as its output location,
# then touch the stamp file that make tracks in place of the directory.
$(INSPIRE_STAMP):
	uv run python -m pipeline.download.inspire \
		--output $(INSPIRE_DIR)
	@touch $@
|
|
|
|
# Run pipeline.download.oa_boundaries, writing its output to the target path.
$(OA_BOUNDARIES):
	uv run python -m pipeline.download.oa_boundaries \
		--output $@
|
|
|
|
# Run pipeline.download.uprn_lookup, writing its output to the target path.
$(UPRN_LOOKUP):
	uv run python -m pipeline.download.uprn_lookup \
		--output $@
|
|
|
|
# Run pipeline.download.transit_network with $(TRANSIT_DIR) as its output
# location, then touch the stamp file that make tracks in place of the directory.
$(TRANSIT_STAMP):
	uv run python -m pipeline.download.transit_network \
		--output $(TRANSIT_DIR)
	@touch $@
|
|
|
|
# Run pipeline.download.rental_prices, writing its output to the target path.
$(RENTAL):
	uv run python -m pipeline.download.rental_prices \
		--output $@
|
|
|
|
# Run pipeline.download.greenspace_water on the OSM extract.
# $< is the only prerequisite, i.e. $(PBF).
$(GREENSPACE): $(PBF)
	uv run python -m pipeline.download.greenspace_water \
		--output $@ \
		--pbf $<
|
|
|
|
# Run pipeline.download.places on the OSM extract and the England boundary.
# $< is the first prerequisite, i.e. $(PBF).
$(PLACES): $(PBF) $(ENGLAND_BOUNDARY)
	uv run python -m pipeline.download.places \
		--output $@ \
		--pbf $< \
		--boundary $(ENGLAND_BOUNDARY)
|
|
|
|
# Run pipeline.download.lsoa_population, writing its output to the target path.
$(LSOA_POP):
	uv run python -m pipeline.download.lsoa_population \
		--output $@
|
|
|
|
# Run pipeline.download.england_boundary, writing its output to the target path.
$(ENGLAND_BOUNDARY):
	uv run python -m pipeline.download.england_boundary \
		--output $@
|
|
|
|
# Run pipeline.download.rightmove_outcodes against the merged postcode parquet.
# The prerequisite is the merge stamp because $(POSTCODES_PQ) is written by the
# merge recipe rather than being a target of its own.
$(RM_OUTCODES): $(MERGE_STAMP)
	uv run python -m pipeline.download.rightmove_outcodes \
		--postcodes $(POSTCODES_PQ) \
		--output $@
|
|
|
|
# ── Transforms ────────────────────────────────────────────────────────────────
|
|
|
|
# Run pipeline.transform.transform_poi on the raw POIs, the NaPTAN data and
# the England boundary.
# $< is the first prerequisite, i.e. $(POIS_RAW).
$(POIS_FILTERED): $(POIS_RAW) $(NAPTAN) $(ENGLAND_BOUNDARY)
	uv run python -m pipeline.transform.transform_poi \
		--input $< \
		--naptan $(NAPTAN) \
		--boundary $(ENGLAND_BOUNDARY) \
		--output $@
|
|
|
|
# Run pipeline.transform.join_epc_pp on the EPC certificates and the
# price-paid data.
# $< is the first prerequisite, i.e. $(PRICE_PAID).
$(EPC_PP): $(PRICE_PAID) $(EPC)
	uv run python -m pipeline.transform.join_epc_pp \
		--epc $(EPC) \
		--price-paid $< \
		--output $@
|
|
|
|
# Run pipeline.transform.crime over $(CRIME_DIR).
# The directory has to be supplied manually: when it is missing the recipe
# prints instructions and fails before the transform is started.
# No prerequisites are declared, so an existing output file is not rebuilt
# when the contents of $(CRIME_DIR) change.
$(CRIME):
	@test -d "$(CRIME_DIR)" || { \
		echo ""; \
		echo "=== Crime dataset not found ==="; \
		echo "Place police.uk crime CSVs in $(CRIME_DIR)/"; \
		echo "Download from https://data.police.uk/data/"; \
		echo ""; \
		exit 1; \
	}
	uv run python -m pipeline.transform.crime \
		--input $(CRIME_DIR) \
		--output $@
|
|
|
|
# Run pipeline.transform.poi_proximity on the ArcGIS parquet and the filtered POIs.
# $< is the first prerequisite, i.e. $(ARCGIS).
$(POI_PROXIMITY): $(ARCGIS) $(POIS_FILTERED)
	uv run python -m pipeline.transform.poi_proximity \
		--arcgis $< \
		--pois $(POIS_FILTERED) \
		--output $@
|
|
|
|
# Run pipeline.transform.school_proximity on the Ofsted and ArcGIS parquets.
# $< is the first prerequisite, i.e. $(OFSTED).
$(SCHOOL_PROX): $(OFSTED) $(ARCGIS)
	uv run python -m pipeline.transform.school_proximity \
		--ofsted $< \
		--arcgis $(ARCGIS) \
		--output $@
|
|
|
|
# Run pipeline.transform.transform_geosure on the downloaded GeoSure directory
# and the ArcGIS parquet. The prerequisite is the stamp file, while the recipe
# is given the directory itself.
$(GEOSURE): $(GEOSURE_STAMP) $(ARCGIS)
	uv run python -m pipeline.transform.transform_geosure \
		--geosure $(GEOSURE_DIR) \
		--arcgis $(ARCGIS) \
		--output $@
|
|
|
|
# The postcode boundaries directory is not built automatically: this rule
# prints the command that generates it and then fails.
# The whole message is emitted by a single shell invocation; `\\` inside the
# double quotes prints one literal backslash.
$(PC_BOUNDARIES):
	@echo ""; \
	echo "=== Postcode boundaries not found ==="; \
	echo "The postcode boundaries directory is required: $@"; \
	echo ""; \
	echo "Generate it with:"; \
	echo " uv run python -m pipeline.transform.postcode_boundaries \\"; \
	echo " --uprn $(UPRN_LOOKUP) \\"; \
	echo " --oa-boundaries $(OA_BOUNDARIES) \\"; \
	echo " --inspire $(INSPIRE_DIR) \\"; \
	echo " --output $@"; \
	echo ""; \
	exit 1
|
|
|
|
# ── Final merge → postcode.parquet + properties.parquet ──────────────────────
|
|
|
|
# Run pipeline.transform.merge over every intermediate dataset.
# The recipe writes two files, $(POSTCODES_PQ) and $(PROPERTIES_PQ); the stamp
# is the single target that records that the merge has completed.
$(MERGE_STAMP): \
    $(EPC_PP) \
    $(ARCGIS) \
    $(IOD) \
    $(POI_PROXIMITY) \
    $(ETHNICITY) \
    $(CRIME) \
    $(NOISE) \
    $(SCHOOL_PROX) \
    $(BROADBAND) \
    $(GEOSURE) \
    $(RENTAL) \
    $(LSOA_POP)
	uv run python -m pipeline.transform.merge \
		--epc-pp $(EPC_PP) --arcgis $(ARCGIS) --iod $(IOD) \
		--poi-proximity $(POI_PROXIMITY) --ethnicity $(ETHNICITY) \
		--crime $(CRIME) --noise $(NOISE) \
		--school-proximity $(SCHOOL_PROX) --broadband $(BROADBAND) \
		--geosure $(GEOSURE) --rental-prices $(RENTAL) \
		--lsoa-population $(LSOA_POP) \
		--output-postcodes $(POSTCODES_PQ) \
		--output-properties $(PROPERTIES_PQ)
	@touch $@
|
|
|
|
# ── Price estimation (post-merge) ───────────────────────────────────────────
|
|
|
|
# Run pipeline.transform.price_estimation.index on the merged parquet files.
# The prerequisite is the merge stamp because both inputs are written by the
# merge recipe.
$(PRICE_INDEX): $(MERGE_STAMP)
	uv run python -m pipeline.transform.price_estimation.index \
		--input $(PROPERTIES_PQ) \
		--postcodes $(POSTCODES_PQ) \
		--output $@
|
|
|
|
# Run pipeline.transform.price_estimation.estimate and record completion in
# the stamp file.
# NOTE(review): the command is given no --output flag; presumably it updates
# the merged parquet files in place — confirm against the module.
$(PRICES_STAMP): $(MERGE_STAMP) $(PRICE_INDEX)
	uv run python -m pipeline.transform.price_estimation.estimate \
		--properties $(PROPERTIES_PQ) \
		--postcodes $(POSTCODES_PQ) \
		--index $(PRICE_INDEX)
	@touch $@
|