This commit is contained in:
Andras Schmelczer 2026-05-14 08:09:19 +01:00
parent a8165249a4
commit a4103b0896
64 changed files with 5376 additions and 3832 deletions

View file

@ -66,9 +66,17 @@ MAP_ASSETS_STAMP := $(MAP_ASSETS_DIR)/.done
# PMTiles version number (its consumer is outside this view).
PMTILES_VERSION := 1.22.3

# Command prefix for the output validator; recipes append --file/--dir/--glob/
# --zip/--zip-glob/--parquet arguments to it.
VALIDATE_OUTPUTS := uv run python -m pipeline.validate_outputs

# Script lists used as rule prerequisites, so that editing a script makes the
# artifact it produces out of date.

# Transform scripts.
POI_PROXIMITY_DEPS := \
  pipeline/transform/poi_proximity.py \
  pipeline/utils/poi_counts.py
MERGE_DEPS := pipeline/transform/merge.py
PRICE_INDEX_DEPS := $(addprefix pipeline/transform/price_estimation/,index.py shrinkage.py utils.py)
PRICE_ESTIMATE_DEPS := $(addprefix pipeline/transform/price_estimation/,estimate.py knn.py utils.py)
TREE_DENSITY_DEPS := pipeline/transform/tree_density.py

# Download scripts.
CRIME_DOWNLOAD_DEPS := pipeline/download/crime.py
INSPIRE_DOWNLOAD_DEPS := pipeline/download/inspire.py
TRANSIT_DOWNLOAD_DEPS := $(addprefix pipeline/download/,transit_network.py transxchange2gtfs_shim.js)
MAP_ASSETS_DEPS := \
  pipeline/download/map_assets.py \
  pipeline/transform/transform_poi.py
# ── Phony aliases ─────────────────────────────────────────────────────────────
@ -82,14 +90,17 @@ TREE_DENSITY_DEPS := pipeline/transform/tree_density.py
transform-school-proximity transform-tree-density \
generate-postcode-boundaries generate-travel-times
# Top-level aliases.
# `prepare` brings all of its listed prerequisites up to date, then runs the
# output validator over the postcode, property and price-index parquet files.
# `merge` brings the merge stamp up to date, then validates the two merged
# parquet files.
# The parquet files are listed after `|`, i.e. as order-only prerequisites:
# they are built if needed but their timestamps do not make the alias stale.
# NOTE(review): each alias header appears twice, once bare and once with
# order-only prerequisites and a recipe. This looks like the before/after
# lines of a diff; make would merge the prerequisite lists, so the bare forms
# are redundant. Confirm and keep only the second form.
prepare: $(PRICES_STAMP) download-places tiles generate-postcode-boundaries download-map-assets generate-travel-times
merge: $(MERGE_STAMP)
prepare: $(PRICES_STAMP) download-places tiles generate-postcode-boundaries download-map-assets generate-travel-times | $(POSTCODES_PQ) $(PROPERTIES_PQ) $(PRICE_INDEX)
$(VALIDATE_OUTPUTS) --parquet $(POSTCODES_PQ) --parquet $(PROPERTIES_PQ) --parquet $(PRICE_INDEX)
merge: $(MERGE_STAMP) | $(POSTCODES_PQ) $(PROPERTIES_PQ)
$(VALIDATE_OUTPUTS) --parquet $(POSTCODES_PQ) --parquet $(PROPERTIES_PQ)
# Short command-line aliases: each name depends only on the variable that
# holds the real target, so `make <alias>` brings that target up to date.
tiles: $(TILES)
download-arcgis: $(ARCGIS)
download-price-paid: $(PRICE_PAID)
download-deprivation: $(IOD)
download-ethnicity: $(ETHNICITY)
# Once the crime stamp is up to date, pass the validator the archive manifest,
# a `**/*.csv` glob for $(CRIME_DIR) and a `*.zip` glob for its _archives
# subdirectory (the `dir::pattern` form is presumably root::pattern — confirm
# against pipeline.validate_outputs).
download-crime: $(CRIME_STAMP)
$(VALIDATE_OUTPUTS) --file $(CRIME_DIR)/archive_manifest.json --glob "$(CRIME_DIR)::**/*.csv" --zip-glob "$(CRIME_DIR)/_archives::*.zip"
download-naptan: $(NAPTAN)
download-pois: $(POIS_RAW)
download-grocery-retail-points: $(GROCERY_RETAIL_POINTS)
@ -99,9 +110,11 @@ download-postcodes: $(POSTCODES_RAW)
# More download aliases: each name depends only on the variable that holds the
# real target. The two aliases with a recipe additionally run the output
# validator after their stamp is up to date.
download-rental-prices: $(RENTAL)
download-noise: $(NOISE)
# Validates the INSPIRE directory itself plus a `*.zip` glob inside it.
download-inspire: $(INSPIRE_STAMP)
$(VALIDATE_OUTPUTS) --dir $(INSPIRE_DIR) --zip-glob "$(INSPIRE_DIR)::*.zip"
download-oa-boundaries: $(OA_BOUNDARIES)
download-uprn-lookup: $(UPRN_LOOKUP)
# Validates the England OSM extract as a plain file and the BODS and TfL GTFS
# bundles as zips.
download-transit-network: $(TRANSIT_STAMP)
$(VALIDATE_OUTPUTS) --file $(TRANSIT_DIR)/raw/england.osm.pbf --zip $(TRANSIT_DIR)/bods_gtfs.zip --zip $(TRANSIT_DIR)/tfl_gtfs.zip
download-greenspace: $(GREENSPACE)
download-os-greenspace: $(OS_GREENSPACE)
download-pbf: $(PBF)
@ -114,13 +127,15 @@ download-election-results: $(ELECTION)
# Remaining download aliases followed by the transform aliases; each name
# depends only on the variable that holds the real target.
download-england-boundary: $(ENGLAND_BOUNDARY)
download-rightmove-outcodes: $(RM_OUTCODES)
# After the map-assets stamp is up to date, validates the light/dark sprite
# JSON and PNG files and globs over the fonts, twemoji and poi-icons
# subdirectories of $(MAP_ASSETS_DIR).
download-map-assets: $(MAP_ASSETS_STAMP)
$(VALIDATE_OUTPUTS) --file $(MAP_ASSETS_DIR)/sprites/light.json --file $(MAP_ASSETS_DIR)/sprites/light.png --file $(MAP_ASSETS_DIR)/sprites/dark.json --file $(MAP_ASSETS_DIR)/sprites/dark.png --glob "$(MAP_ASSETS_DIR)/fonts::**/*.pbf" --glob "$(MAP_ASSETS_DIR)/twemoji::*.png" --glob "$(MAP_ASSETS_DIR)/poi-icons::**/*"
transform-pois: $(POIS_FILTERED)
transform-epc-pp: $(EPC_PP)
transform-crime: $(CRIME)
transform-poi-proximity: $(POI_PROXIMITY)
transform-school-proximity: $(SCHOOL_PROX)
# NOTE(review): transform-tree-density is declared twice with different
# prerequisites (addresses file, then postcodes file). As written make merges
# them and the alias depends on both; this looks like the before/after lines
# of a diff. Confirm which single prerequisite is intended.
transform-tree-density: $(TREE_DENSITY_ADDR)
transform-tree-density: $(TREE_DENSITY_PC)
generate-postcode-boundaries: $(OA_BOUNDARIES) $(INSPIRE_STAMP) $(UPRN_LOOKUP)
$(VALIDATE_OUTPUTS) --dir $(INSPIRE_DIR) --zip-glob "$(INSPIRE_DIR)::*.zip"
uv run python -m pipeline.transform.postcode_boundaries \
--uprn $(UPRN_LOOKUP) \
--oa-boundaries $(OA_BOUNDARIES) \
@ -158,11 +173,13 @@ $(PRICE_PAID):
# Deprivation data: run the downloader with --output set to this rule's target.
$(IOD):
uv run python -m pipeline.download.deprivation_data --output $@
# Ethnicity data: same pattern. Listing the downloader script as a
# prerequisite makes the target out of date whenever the script is newer.
# NOTE(review): the bare `$(ETHNICITY):` header duplicates the one below and
# looks like the pre-change line of a diff; it is redundant. Confirm and drop.
$(ETHNICITY):
$(ETHNICITY): pipeline/download/ethnicity.py
uv run python -m pipeline.download.ethnicity --output $@
# Stamp-guarded crime download. The recipe deletes the stamp, runs the
# downloader into $(CRIME_DIR), validates what was downloaded, and only then
# touches the stamp again, so the stamp exists only after a run in which every
# step succeeded. $(CRIME_DOWNLOAD_DEPS) makes the stamp stale when the
# downloader script changes.
# NOTE(review): the bare `$(CRIME_STAMP):` header duplicates the one below and
# looks like the pre-change line of a diff; it is redundant. Confirm and drop.
$(CRIME_STAMP):
$(CRIME_STAMP): $(CRIME_DOWNLOAD_DEPS)
@rm -f $@
uv run python -m pipeline.download.crime --output $(CRIME_DIR)
$(VALIDATE_OUTPUTS) --file $(CRIME_DIR)/archive_manifest.json --glob "$(CRIME_DIR)::**/*.csv" --zip-glob "$(CRIME_DIR)/_archives::*.zip"
@touch $@
$(NAPTAN):
@ -183,7 +200,7 @@ $(OFS_REGISTER):
curl -fL -A "Mozilla/5.0" -o $@.tmp https://register-api.officeforstudents.org.uk/api/Download/
mv $@.tmp $@
# Raw POI download: runs pipeline.download.pois with the PBF and the England
# boundary as inputs and writes to this rule's target. The downloader script
# is listed as a prerequisite so that editing it makes the target stale.
# NOTE(review): the first header lacks the script prerequisite and looks like
# the pre-change line of a diff; make merges both lists, so it is redundant.
# Confirm and drop.
$(POIS_RAW): $(PBF) $(ENGLAND_BOUNDARY)
$(POIS_RAW): $(PBF) $(ENGLAND_BOUNDARY) pipeline/download/pois.py
uv run python -m pipeline.download.pois --output $@ --pbf $(PBF) --boundary $(ENGLAND_BOUNDARY)
$(GROCERY_RETAIL_POINTS):
@ -198,11 +215,13 @@ $(BROADBAND):
# Raw postcodes: run the downloader with --output set to this rule's target.
$(POSTCODES_RAW):
uv run python -m pipeline.download.postcodes --output $@
# Noise data: the downloader takes the ArcGIS file as input, so it is a
# prerequisite, as is the downloader script itself.
# NOTE(review): the first header lacks the script prerequisite and looks like
# the pre-change line of a diff; make merges both lists, so it is redundant.
# Confirm and drop.
$(NOISE): $(ARCGIS)
$(NOISE): $(ARCGIS) pipeline/download/noise.py
uv run python -m pipeline.download.noise --arcgis $(ARCGIS) --output $@
# Stamp-guarded INSPIRE download. The recipe deletes the stamp, downloads into
# $(INSPIRE_DIR), validates the directory and its zip files, and only then
# touches the stamp again, so the stamp exists only after a fully successful
# run. $(INSPIRE_DOWNLOAD_DEPS) makes the stamp stale when the downloader
# script changes.
# NOTE(review): the bare `$(INSPIRE_STAMP):` header duplicates the one below
# and looks like the pre-change line of a diff; it is redundant. Confirm and
# drop.
$(INSPIRE_STAMP):
$(INSPIRE_STAMP): $(INSPIRE_DOWNLOAD_DEPS)
@rm -f $@
uv run python -m pipeline.download.inspire --output $(INSPIRE_DIR)
$(VALIDATE_OUTPUTS) --dir $(INSPIRE_DIR) --zip-glob "$(INSPIRE_DIR)::*.zip"
@touch $@
$(OA_BOUNDARIES):
@ -211,11 +230,13 @@ $(OA_BOUNDARIES):
# UPRN lookup download; the downloader is told to write to this rule's target.
$(UPRN_LOOKUP):
	uv run python -m pipeline.download.uprn_lookup \
		--output $@
# Stamp-guarded transit-network download. The recipe deletes the stamp,
# downloads into $(TRANSIT_DIR), validates the England OSM extract and the
# BODS/TfL GTFS zips, and only then touches the stamp again, so the stamp
# exists only after a fully successful run. $(TRANSIT_DOWNLOAD_DEPS) makes the
# stamp stale when the downloader script or its JS shim changes.
# NOTE(review): the bare `$(TRANSIT_STAMP):` header duplicates the one below
# and looks like the pre-change line of a diff; it is redundant. Confirm and
# drop.
$(TRANSIT_STAMP):
$(TRANSIT_STAMP): $(TRANSIT_DOWNLOAD_DEPS)
@rm -f $@
uv run python -m pipeline.download.transit_network --output $(TRANSIT_DIR)
$(VALIDATE_OUTPUTS) --file $(TRANSIT_DIR)/raw/england.osm.pbf --zip $(TRANSIT_DIR)/bods_gtfs.zip --zip $(TRANSIT_DIR)/tfl_gtfs.zip
@touch $@
# Rental prices: run the downloader with --output set to this rule's target.
# The downloader script is a prerequisite so that editing it makes the target
# stale.
# NOTE(review): the bare `$(RENTAL):` header duplicates the one below and
# looks like the pre-change line of a diff; it is redundant. Confirm and drop.
$(RENTAL):
$(RENTAL): pipeline/download/rental_prices.py
uv run python -m pipeline.download.rental_prices --output $@
$(GREENSPACE): $(PBF)
@ -234,7 +255,7 @@ $(LSOA_POP):
# Median age: run the downloader with --output set to this rule's target.
$(MEDIAN_AGE):
uv run python -m pipeline.download.median_age --output $@
# Election results: same pattern, with the downloader script as a prerequisite
# so that editing it makes the target stale.
# NOTE(review): the bare `$(ELECTION):` header duplicates the one below and
# looks like the pre-change line of a diff; it is redundant. Confirm and drop.
$(ELECTION):
$(ELECTION): pipeline/download/election_results.py
uv run python -m pipeline.download.election_results --output $@
$(ENGLAND_BOUNDARY):
@ -243,8 +264,10 @@ $(ENGLAND_BOUNDARY):
# Rightmove outcodes: takes the postcode parquet as input and depends on the
# merge stamp rather than on the parquet file itself.
$(RM_OUTCODES): $(MERGE_STAMP)
	uv run python -m pipeline.download.rightmove_outcodes \
		--postcodes $(POSTCODES_PQ) \
		--output $@
# Stamp-guarded map-assets download. The recipe deletes the stamp, downloads
# into $(MAP_ASSETS_DIR), validates the sprite files and the fonts, twemoji
# and poi-icons globs, and only then touches the stamp again, so the stamp
# exists only after a fully successful run. $(MAP_ASSETS_DEPS) makes the stamp
# stale when either listed script changes.
# NOTE(review): the bare `$(MAP_ASSETS_STAMP):` header duplicates the one
# below and looks like the pre-change line of a diff; it is redundant. Confirm
# and drop.
$(MAP_ASSETS_STAMP):
$(MAP_ASSETS_STAMP): $(MAP_ASSETS_DEPS)
@rm -f $@
uv run python -m pipeline.download.map_assets --output $(MAP_ASSETS_DIR)
$(VALIDATE_OUTPUTS) --file $(MAP_ASSETS_DIR)/sprites/light.json --file $(MAP_ASSETS_DIR)/sprites/light.png --file $(MAP_ASSETS_DIR)/sprites/dark.json --file $(MAP_ASSETS_DIR)/sprites/dark.png --glob "$(MAP_ASSETS_DIR)/fonts::**/*.pbf" --glob "$(MAP_ASSETS_DIR)/twemoji::*.png" --glob "$(MAP_ASSETS_DIR)/poi-icons::**/*"
@touch $@
# ── Transforms ────────────────────────────────────────────────────────────────
@ -252,10 +275,11 @@ $(MAP_ASSETS_STAMP):
# Filtered POIs: runs pipeline.transform.transform_poi over the raw POIs with
# the NaPTAN, England-boundary and grocery-retail-points files as extra
# inputs. Every input file is a prerequisite.
$(POIS_FILTERED): $(POIS_RAW) $(NAPTAN) $(GROCERY_RETAIL_POINTS) $(ENGLAND_BOUNDARY)
	uv run python -m pipeline.transform.transform_poi \
		--input $(POIS_RAW) \
		--naptan $(NAPTAN) \
		--boundary $(ENGLAND_BOUNDARY) \
		--grocery-retail-points $(GROCERY_RETAIL_POINTS) \
		--output $@
# EPC / price-paid join: runs pipeline.transform.join_epc_pp on the two input
# files. The join script and pipeline/utils/fuzzy_join.py are prerequisites so
# that editing either makes the target stale.
# NOTE(review): the first header lacks the script prerequisites and looks like
# the pre-change line of a diff; make merges both lists, so it is redundant.
# Confirm and drop.
$(EPC_PP): $(PRICE_PAID) $(EPC)
$(EPC_PP): $(PRICE_PAID) $(EPC) pipeline/transform/join_epc_pp.py pipeline/utils/fuzzy_join.py
uv run python -m pipeline.transform.join_epc_pp --epc $(EPC) --price-paid $(PRICE_PAID) --output $@
# Crime transform: reads the raw download directory and writes this rule's
# target. The downloaded files are validated first, so the transform does not
# run if the validator exits non-zero.
$(CRIME): $(CRIME_STAMP)
	$(VALIDATE_OUTPUTS) \
		--file $(CRIME_DIR)/archive_manifest.json \
		--glob "$(CRIME_DIR)::**/*.csv" \
		--zip-glob "$(CRIME_DIR)/_archives::*.zip"
	uv run python -m pipeline.transform.crime \
		--input $(CRIME_DIR) \
		--output $@
$(POI_PROXIMITY): $(ARCGIS) $(POIS_FILTERED) $(OS_GREENSPACE) $(POI_PROXIMITY_DEPS)
@ -264,14 +288,14 @@ $(POI_PROXIMITY): $(ARCGIS) $(POIS_FILTERED) $(OS_GREENSPACE) $(POI_PROXIMITY_DE
# School proximity: runs pipeline.transform.school_proximity on the Ofsted and
# ArcGIS inputs, both of which are prerequisites.
$(SCHOOL_PROX): $(OFSTED) $(ARCGIS)
	uv run python -m pipeline.transform.school_proximity \
		--ofsted $(OFSTED) \
		--arcgis $(ARCGIS) \
		--output $@
# Tree-density transform: one run of pipeline.transform.tree_density takes the
# TOW zip, ArcGIS and price-paid inputs and is given three output paths
# (postcodes, streets, addresses).
# NOTE(review): two versions of this rule are interleaved here. The first
# header and the `--output-addresses $@` line belong to a rule targeting
# $(TREE_DENSITY_ADDR); the second header and the final
# `--output-addresses $(TREE_DENSITY_ADDR)` line belong to one targeting
# $(TREE_DENSITY_PC). Only one pair should survive: as written, the final line
# is not joined to the command, because the line before it has no trailing
# backslash.
# NOTE(review): the recipe names three output files but the rule has a single
# target, so make cannot see the other two. This file already uses a grouped
# target (`&:`) for the merged parquet files; consider the same here once it
# is confirmed that the script always writes all three outputs.
$(TREE_DENSITY_ADDR): $(FR_TOW) $(ARCGIS) $(PRICE_PAID) $(TREE_DENSITY_DEPS)
$(TREE_DENSITY_PC): $(FR_TOW) $(ARCGIS) $(PRICE_PAID) $(TREE_DENSITY_DEPS)
uv run python -m pipeline.transform.tree_density \
--tow-zip $(FR_TOW) \
--arcgis $(ARCGIS) \
--price-paid $(PRICE_PAID) \
--output-postcodes $(TREE_DENSITY_PC) \
--output-streets $(TREE_DENSITY_STREETS) \
--output-addresses $@
--output-addresses $(TREE_DENSITY_ADDR)
# Postcode boundaries require manual generation — fail with instructions
$(PC_BOUNDARIES):
@ -291,7 +315,8 @@ $(PC_BOUNDARIES):
# ── Final merge → postcode.parquet + properties.parquet ──────────────────────
$(MERGE_STAMP): $(EPC_PP) $(ARCGIS) $(IOD) $(POI_PROXIMITY) \
$(ETHNICITY) $(CRIME) $(NOISE) $(SCHOOL_PROX) $(BROADBAND) $(RENTAL) $(LSOA_POP) $(MEDIAN_AGE) $(ELECTION) $(TREE_DENSITY_ADDR) $(MERGE_DEPS)
$(ETHNICITY) $(CRIME) $(NOISE) $(SCHOOL_PROX) $(BROADBAND) $(RENTAL) $(LSOA_POP) $(MEDIAN_AGE) $(ELECTION) $(TREE_DENSITY_PC) $(MERGE_DEPS)
@rm -f $@
uv run python -m pipeline.transform.merge \
--epc-pp $(EPC_PP) \
--arcgis $(ARCGIS) \
@ -306,16 +331,23 @@ $(MERGE_STAMP): $(EPC_PP) $(ARCGIS) $(IOD) $(POI_PROXIMITY) \
--lsoa-population $(LSOA_POP) \
--median-age $(MEDIAN_AGE) \
--election-results $(ELECTION) \
--tree-density-addresses $(TREE_DENSITY_ADDR) \
--tree-density-postcodes $(TREE_DENSITY_PC) \
--output-postcodes $(POSTCODES_PQ) \
--output-properties $(PROPERTIES_PQ)
$(VALIDATE_OUTPUTS) --parquet $(POSTCODES_PQ) --parquet $(PROPERTIES_PQ)
@touch $@
# ── Price estimation (post-merge) ───────────────────────────────────────────
# NOTE(review): this section interleaves two versions of the price-estimation
# rules. The first `$(PRICE_INDEX)` rule and the short `$(PRICES_STAMP)` header
# further down look like pre-change lines of a diff. As written, make would
# see two recipes for $(PRICE_INDEX) and use the later one. Confirm and keep
# only the later forms.
$(PRICE_INDEX): $(MERGE_STAMP)
uv run python -m pipeline.transform.price_estimation.index --input $(PROPERTIES_PQ) --postcodes $(POSTCODES_PQ) --output $@
# Grouped target (`&:`): one recipe run stands for both parquet files. The
# recipe only runs the validator; nothing here updates the files themselves.
# NOTE(review): the merge recipe touches its stamp after writing these files,
# so the stamp is normally newer than they are. This rule will then look out
# of date, and re-run the validation, every time these targets are considered.
# Presumably intended as an always-run check; confirm.
$(POSTCODES_PQ) $(PROPERTIES_PQ) &: $(MERGE_STAMP)
$(VALIDATE_OUTPUTS) --parquet $(POSTCODES_PQ) --parquet $(PROPERTIES_PQ)
$(PRICES_STAMP): $(MERGE_STAMP) $(PRICE_INDEX)
# Price index: built from the merged property and postcode parquet files, then
# validated as a parquet file. The parquet inputs are order-only (after `|`),
# so staleness is driven by the merge stamp and the scripts in
# $(PRICE_INDEX_DEPS), not by the parquet timestamps.
$(PRICE_INDEX): $(MERGE_STAMP) $(PRICE_INDEX_DEPS) | $(PROPERTIES_PQ) $(POSTCODES_PQ)
uv run python -m pipeline.transform.price_estimation.index --input $(PROPERTIES_PQ) --postcodes $(POSTCODES_PQ) --output $@
$(VALIDATE_OUTPUTS) --parquet $@
# Stamp-guarded price estimation. The recipe deletes the stamp, runs the
# estimator on the property/postcode parquet files and the price index,
# validates all three parquet files, and only then touches the stamp again.
# The estimator is given no --output argument; presumably it updates its input
# parquet files in place — confirm against pipeline.transform.price_estimation.estimate.
$(PRICES_STAMP): $(MERGE_STAMP) $(PRICE_INDEX) $(PRICE_ESTIMATE_DEPS) | $(PROPERTIES_PQ) $(POSTCODES_PQ)
@rm -f $@
uv run python -m pipeline.transform.price_estimation.estimate --properties $(PROPERTIES_PQ) --postcodes $(POSTCODES_PQ) --index $(PRICE_INDEX)
$(VALIDATE_OUTPUTS) --parquet $(PROPERTIES_PQ) --parquet $(POSTCODES_PQ) --parquet $(PRICE_INDEX)
@touch $@