Progress
This commit is contained in:
parent
5b68c8da04
commit
536fd14378
28 changed files with 1683 additions and 313 deletions
117
Makefile.data
117
Makefile.data
|
|
@ -3,15 +3,14 @@
|
|||
# Usage:
|
||||
# make -f Makefile.data prepare # Build wide.parquet (+ all deps)
|
||||
# make -f Makefile.data tiles # Download UK map tiles
|
||||
# make -f Makefile.data download-pois # Download a single dataset
|
||||
# make -f Makefile.data help # List all targets
|
||||
#
|
||||
# Or include from the main Makefile and use targets directly.
|
||||
|
||||
SHELL := /bin/bash
|
||||
.DELETE_ON_ERROR:
|
||||
|
||||
DATA_DIR := /bulk/property-data
|
||||
DATA_DIR := ./property-data
|
||||
MANUAL_DATA := ./manual-data
|
||||
|
||||
# ── Output files ──────────────────────────────────────────────────────────────
|
||||
|
||||
|
|
@ -24,11 +23,13 @@ POIS_FILTERED := $(DATA_DIR)/filtered_uk_pois.parquet
|
|||
POI_PROXIMITY := $(DATA_DIR)/poi_proximity.parquet
|
||||
EPC_PP := $(DATA_DIR)/epc_pp.parquet
|
||||
WIDE := $(DATA_DIR)/wide.parquet
|
||||
EPC := $(DATA_DIR)/certificates.csv
|
||||
JT_BANK := $(DATA_DIR)/journey_times_bank.parquet
|
||||
JT_FITZROVIA := $(DATA_DIR)/journey_times_fitzrovia.parquet
|
||||
PRICE_INDEX := $(DATA_DIR)/price_index.parquet
|
||||
PRICES_STAMP := $(DATA_DIR)/.prices_done
|
||||
EPC := $(MANUAL_DATA)/certificates.csv
|
||||
JT_BANK := $(MANUAL_DATA)/journey_times_bank.parquet
|
||||
JT_FITZROVIA := $(MANUAL_DATA)/journey_times_fitzrovia.parquet
|
||||
ETHNICITY := $(DATA_DIR)/ethnicity_by_la.parquet
|
||||
CRIME_DIR := $(DATA_DIR)/crime
|
||||
CRIME_DIR := $(MANUAL_DATA)/crime
|
||||
CRIME := $(DATA_DIR)/crime_by_lsoa.parquet
|
||||
NOISE := $(DATA_DIR)/road_noise.parquet
|
||||
OFSTED := $(DATA_DIR)/ofsted.parquet
|
||||
|
|
@ -40,28 +41,28 @@ GEOSURE := $(DATA_DIR)/geosure.parquet
|
|||
INSPIRE_DIR := $(DATA_DIR)/inspire
|
||||
OA_BOUNDARIES := $(DATA_DIR)/oa_boundaries.gpkg
|
||||
UPRN_LOOKUP := $(DATA_DIR)/uprn_lookup.parquet
|
||||
PC_BOUNDARIES := $(DATA_DIR)/new_postcode_boundaries
|
||||
PC_BOUNDARIES := $(MANUAL_DATA)/postcode_boundaries
|
||||
|
||||
# Sentinel files for directory targets (Make doesn't track directories well)
|
||||
GEOSURE_STAMP := $(GEOSURE_DIR)/.done
|
||||
INSPIRE_STAMP := $(INSPIRE_DIR)/.done
|
||||
|
||||
MANUAL_DATA := $(DATA_DIR)/manual_data
|
||||
PMTILES_VERSION := 1.22.3
|
||||
PMTILES_BIN := $(DATA_DIR)/pmtiles
|
||||
|
||||
# ── Phony aliases ─────────────────────────────────────────────────────────────
|
||||
|
||||
.PHONY: prepare tiles \
|
||||
.PHONY: prepare wide tiles \
|
||||
download-arcgis download-price-paid download-deprivation download-ethnicity \
|
||||
download-naptan download-pois download-ofsted download-broadband \
|
||||
download-postcodes download-geosure download-noise download-inspire \
|
||||
download-oa-boundaries download-uprn-lookup \
|
||||
transform-pois transform-epc-pp transform-crime transform-poi-proximity \
|
||||
transform-school-proximity transform-geosure transform-postcode-boundaries \
|
||||
generate-postcode-boundaries \
|
||||
journey-times
|
||||
|
||||
prepare: $(WIDE)
|
||||
prepare: $(DATA_DIR)/.prices_done
|
||||
wide: $(WIDE)
|
||||
tiles: $(TILES)
|
||||
download-arcgis: $(ARCGIS)
|
||||
download-price-paid: $(PRICE_PAID)
|
||||
|
|
@ -84,19 +85,17 @@ transform-poi-proximity: $(POI_PROXIMITY)
|
|||
transform-school-proximity: $(SCHOOL_PROX)
|
||||
transform-geosure: $(GEOSURE)
|
||||
transform-postcode-boundaries: $(PC_BOUNDARIES)
|
||||
generate-postcode-boundaries: $(OA_BOUNDARIES) $(INSPIRE_STAMP) $(UPRN_LOOKUP)
|
||||
uv run python -m pipeline.transform.postcode_boundaries \
|
||||
--uprn $(UPRN_LOOKUP) \
|
||||
--oa-boundaries $(OA_BOUNDARIES) \
|
||||
--inspire $(INSPIRE_DIR) \
|
||||
--output $(PC_BOUNDARIES)
|
||||
|
||||
# ── Downloads ─────────────────────────────────────────────────────────────────
|
||||
|
||||
$(TILES):
|
||||
@echo "Downloading UK PMTiles (~1.5GB)..."
|
||||
@echo "This extracts UK tiles from the Protomaps planet file."
|
||||
@if [ ! -f "$(PMTILES_BIN)" ]; then \
|
||||
echo "Downloading pmtiles CLI v$(PMTILES_VERSION)..."; \
|
||||
curl -sL "https://github.com/protomaps/go-pmtiles/releases/download/v$(PMTILES_VERSION)/go-pmtiles_$(PMTILES_VERSION)_Linux_x86_64.tar.gz" \
|
||||
| tar -xz -C "$(DATA_DIR)" pmtiles; \
|
||||
chmod +x "$(PMTILES_BIN)"; \
|
||||
fi
|
||||
"$(PMTILES_BIN)" extract https://build.protomaps.com/20260201.pmtiles $@ --bbox=-10.5,49.5,2.5,61
|
||||
uv run -m pipeline.download.tiles --output $@ --pmtiles-version $(PMTILES_VERSION)
|
||||
|
||||
# EPC requires manual registration — fail with instructions
|
||||
$(EPC):
|
||||
|
|
@ -105,6 +104,7 @@ $(EPC):
|
|||
@echo "The EPC certificates file is required: $@"
|
||||
@echo ""
|
||||
@echo "To obtain it, register at https://epc.opendatacommunities.org/login"
|
||||
@echo "and place certificates.csv in manual-data/"
|
||||
@echo ""
|
||||
@exit 1
|
||||
|
||||
|
|
@ -155,32 +155,22 @@ $(UPRN_LOOKUP):
|
|||
# ── Journey times (requires TFL_API_KEY) ──────────────────────────────────────
|
||||
|
||||
$(JT_BANK):
|
||||
@if [ -f "$(MANUAL_DATA)/journey_times_bank.parquet" ]; then \
|
||||
echo "Copying journey_times_bank.parquet from manual_data/"; \
|
||||
cp "$(MANUAL_DATA)/journey_times_bank.parquet" $@; \
|
||||
else \
|
||||
echo ""; \
|
||||
echo "=== TFL journey times (bank) not found ==="; \
|
||||
echo "Either place the file in $(MANUAL_DATA)/journey_times_bank.parquet"; \
|
||||
echo "or register for a TFL API key at https://api-portal.tfl.gov.uk/signin"; \
|
||||
echo "and run: TFL_API_KEY=... make -f Makefile.data journey-times DEST=bank"; \
|
||||
echo ""; \
|
||||
exit 1; \
|
||||
fi
|
||||
@echo ""
|
||||
@echo "=== TFL journey times (bank) not found ==="
|
||||
@echo "Place journey_times_bank.parquet in $(MANUAL_DATA)/"
|
||||
@echo "or register for a TFL API key at https://api-portal.tfl.gov.uk/signin"
|
||||
@echo "and run: TFL_API_KEY=... make -f Makefile.data journey-times DEST=bank"
|
||||
@echo ""
|
||||
@exit 1
|
||||
|
||||
$(JT_FITZROVIA):
|
||||
@if [ -f "$(MANUAL_DATA)/journey_times_fitzrovia.parquet" ]; then \
|
||||
echo "Copying journey_times_fitzrovia.parquet from manual_data/"; \
|
||||
cp "$(MANUAL_DATA)/journey_times_fitzrovia.parquet" $@; \
|
||||
else \
|
||||
echo ""; \
|
||||
echo "=== TFL journey times (fitzrovia) not found ==="; \
|
||||
echo "Either place the file in $(MANUAL_DATA)/journey_times_fitzrovia.parquet"; \
|
||||
echo "or register for a TFL API key at https://api-portal.tfl.gov.uk/signin"; \
|
||||
echo "and run: TFL_API_KEY=... make -f Makefile.data journey-times DEST=fitzrovia"; \
|
||||
echo ""; \
|
||||
exit 1; \
|
||||
fi
|
||||
@echo ""
|
||||
@echo "=== TFL journey times (fitzrovia) not found ==="
|
||||
@echo "Place journey_times_fitzrovia.parquet in $(MANUAL_DATA)/"
|
||||
@echo "or register for a TFL API key at https://api-portal.tfl.gov.uk/signin"
|
||||
@echo "and run: TFL_API_KEY=... make -f Makefile.data journey-times DEST=fitzrovia"
|
||||
@echo ""
|
||||
@exit 1
|
||||
|
||||
journey-times: $(ARCGIS)
|
||||
ifndef DEST
|
||||
|
|
@ -197,6 +187,14 @@ $(EPC_PP): $(PRICE_PAID) $(EPC)
|
|||
uv run python -m pipeline.transform.join_epc_pp --epc $(EPC) --price-paid $(PRICE_PAID) --output $@
|
||||
|
||||
$(CRIME):
|
||||
@if [ ! -d "$(CRIME_DIR)" ]; then \
|
||||
echo ""; \
|
||||
echo "=== Crime dataset not found ==="; \
|
||||
echo "Place police.uk crime CSVs in $(CRIME_DIR)/"; \
|
||||
echo "Download from https://data.police.uk/data/"; \
|
||||
echo ""; \
|
||||
exit 1; \
|
||||
fi
|
||||
uv run python -m pipeline.transform.crime --input $(CRIME_DIR) --output $@
|
||||
|
||||
$(POI_PROXIMITY): $(ARCGIS) $(POIS_FILTERED)
|
||||
|
|
@ -208,12 +206,20 @@ $(SCHOOL_PROX): $(OFSTED) $(ARCGIS)
|
|||
$(GEOSURE): $(GEOSURE_STAMP) $(ARCGIS)
|
||||
uv run python -m pipeline.transform.transform_geosure --geosure $(GEOSURE_DIR) --arcgis $(ARCGIS) --output $@
|
||||
|
||||
$(PC_BOUNDARIES): $(OA_BOUNDARIES) $(INSPIRE_STAMP) $(UPRN_LOOKUP)
|
||||
uv run python -m pipeline.transform.postcode_boundaries \
|
||||
--uprn $(UPRN_LOOKUP) \
|
||||
--oa-boundaries $(OA_BOUNDARIES) \
|
||||
--inspire $(INSPIRE_DIR) \
|
||||
--output $@
|
||||
# Postcode boundaries require manual generation — fail with instructions
|
||||
$(PC_BOUNDARIES):
|
||||
@echo ""
|
||||
@echo "=== Postcode boundaries not found ==="
|
||||
@echo "The postcode boundaries directory is required: $@"
|
||||
@echo ""
|
||||
@echo "Generate it with:"
|
||||
@echo " uv run python -m pipeline.transform.postcode_boundaries \\"
|
||||
@echo " --uprn $(UPRN_LOOKUP) \\"
|
||||
@echo " --oa-boundaries $(OA_BOUNDARIES) \\"
|
||||
@echo " --inspire $(INSPIRE_DIR) \\"
|
||||
@echo " --output $@"
|
||||
@echo ""
|
||||
@exit 1
|
||||
|
||||
# ── Final merge ───────────────────────────────────────────────────────────────
|
||||
|
||||
|
|
@ -233,3 +239,12 @@ $(WIDE): $(EPC_PP) $(ARCGIS) $(IOD) $(POI_PROXIMITY) $(JT_BANK) $(JT_FITZROVIA)
|
|||
--broadband $(BROADBAND) \
|
||||
--geosure $(GEOSURE) \
|
||||
--output $@
|
||||
|
||||
# ── Price estimation (post-merge) ────────────────────────────────────────────
|
||||
|
||||
$(PRICE_INDEX): $(WIDE)
|
||||
uv run python -m pipeline.transform.price_index --input $(WIDE) --output $@
|
||||
|
||||
$(PRICES_STAMP): $(WIDE) $(PRICE_INDEX)
|
||||
uv run python -m pipeline.transform.price_estimate --input $(WIDE) --index $(PRICE_INDEX)
|
||||
@touch $@
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue