# perfect-postcode/Taskfile.data.yml
#
# 208 lines
# 6.9 KiB
# YAML

version: '3'

vars:
  # Root directory; every other path below is derived from it.
  DATA_DIR: /bulk/property-data

  # Outputs of the download:* tasks.
  ARCGIS_OUTPUT: '{{.DATA_DIR}}/arcgis_data.parquet'
  PRICE_PAID_OUTPUT: '{{.DATA_DIR}}/price-paid-complete.parquet'
  IOD_OUTPUT: '{{.DATA_DIR}}/IoD2025_Scores.parquet'
  POIS_RAW_OUTPUT: '{{.DATA_DIR}}/uk_pois.parquet'

  # Outputs of the transform:* tasks and the final merged dataset.
  POIS_FILTERED_OUTPUT: '{{.DATA_DIR}}/filtered_uk_pois.parquet'
  POI_PROXIMITY_OUTPUT: '{{.DATA_DIR}}/poi_proximity.parquet'
  EPC_PP_OUTPUT: '{{.DATA_DIR}}/epc_pp.parquet'
  WIDE_OUTPUT: '{{.DATA_DIR}}/wide.parquet'

  # EPC certificates CSV; prompt:epc checks for it and tells the user how to get it.
  EPC: '{{.DATA_DIR}}/certificates.csv'

  # TfL journey-time files passed to the merge step in `prepare`.
  JOURNEY_TIMES_BANK: '{{.DATA_DIR}}/journey_times_bank.parquet'
  JOURNEY_TIMES_FITZROVIA: '{{.DATA_DIR}}/journey_times_fitzrovia_checkpoint.parquet'

  ETHNICITY_OUTPUT: '{{.DATA_DIR}}/ethnicity_by_la.parquet'

  # CRIME_DIR is the input directory of transform:crime; CRIME_OUTPUT its result.
  CRIME_DIR: '{{.DATA_DIR}}/crime'
  CRIME_OUTPUT: '{{.DATA_DIR}}/crime_by_lsoa.parquet'

  NOISE_OUTPUT: '{{.DATA_DIR}}/road_noise.parquet'
  OFSTED_OUTPUT: '{{.DATA_DIR}}/ofsted.parquet'
  NAPTAN_OUTPUT: '{{.DATA_DIR}}/naptan.parquet'
  BROADBAND_OUTPUT: '{{.DATA_DIR}}/broadband.parquet'
  SCHOOL_PROXIMITY_OUTPUT: '{{.DATA_DIR}}/school_proximity.parquet'
tasks:
prompt:epc:
  # Guard task: does nothing when the EPC CSV is present; otherwise prints
  # instructions for obtaining it and fails (exit 1) so dependants stop.
  desc: Prompt user to download EPC dataset (requires registration)
  status:
    # Exit 0 here means "up to date", i.e. the prompt below is skipped.
    - test -f {{.EPC}}
  cmds:
    - |
      echo ""
      echo "=== EPC dataset not found ==="
      echo "The EPC certificates file is required: {{.EPC}}"
      echo ""
      echo "To obtain it, register at https://epc.opendatacommunities.org/login"
      echo ""
      exit 1
prompt:journey-times:
  # Guard task: this does NOT download anything. When journey-time data is
  # missing it prints how to fetch it and fails (exit 1) so dependants stop.
  desc: Prompt user to fetch TfL journey times (requires API key registration)
  deps:
    - download:arcgis
  status:
    # Both files must exist for the prompt to be skipped: `prepare` passes
    # both of them to the merge step. The previous `||` skipped the prompt
    # as soon as either one was present.
    # NOTE(review): assumes the merge step needs both files — confirm.
    - test -f {{.JOURNEY_TIMES_BANK}} && test -f {{.JOURNEY_TIMES_FITZROVIA}}
  cmds:
    - |
      echo ""
      echo "=== TFL journey times not found ==="
      echo "Register for a TFL API key at https://api-portal.tfl.gov.uk/signin"
      echo "Then set the TFL_API_KEY environment variable and run:"
      echo " task download:journey-times -- bank"
      echo " task download:journey-times -- fitzrovia"
      echo ""
      exit 1
download:arcgis:
  # Writes {{.ARCGIS_OUTPUT}}; skipped once that file exists.
  desc: Download and convert ArcGIS postcode data
  status:
    - test -f {{.ARCGIS_OUTPUT}}
  cmds:
    - >-
      uv run python -m pipeline.download.arcgis
      --output {{.ARCGIS_OUTPUT}}
download:price-paid:
  # Writes {{.PRICE_PAID_OUTPUT}}; skipped once that file exists.
  desc: Download and convert Land Registry price-paid data
  status:
    - test -f {{.PRICE_PAID_OUTPUT}}
  cmds:
    - >-
      uv run python -m pipeline.download.price_paid
      --output {{.PRICE_PAID_OUTPUT}}
download:deprivation:
  # Writes {{.IOD_OUTPUT}}; skipped once that file exists.
  desc: Download and convert Index of Deprivation data
  status:
    - test -f {{.IOD_OUTPUT}}
  cmds:
    - >-
      uv run python -m pipeline.download.deprivation_data
      --output {{.IOD_OUTPUT}}
download:ethnicity:
  # Writes {{.ETHNICITY_OUTPUT}}; skipped once that file exists.
  desc: Download ethnicity by local authority data
  status:
    - test -f {{.ETHNICITY_OUTPUT}}
  cmds:
    - >-
      uv run python -m pipeline.download.ethnicity
      --output {{.ETHNICITY_OUTPUT}}
download:naptan:
  # Writes {{.NAPTAN_OUTPUT}}; skipped once that file exists.
  desc: Download NaPTAN station data
  status:
    - test -f {{.NAPTAN_OUTPUT}}
  cmds:
    - >-
      uv run python -m pipeline.download.naptan
      --output {{.NAPTAN_OUTPUT}}
download:pois:
  # Writes the raw POI file {{.POIS_RAW_OUTPUT}}; skipped once it exists.
  desc: Download and extract POIs from OpenStreetMap
  status:
    - test -f {{.POIS_RAW_OUTPUT}}
  cmds:
    - >-
      uv run python -m pipeline.download.pois
      --output {{.POIS_RAW_OUTPUT}}
download:ofsted:
  # Writes {{.OFSTED_OUTPUT}}; skipped once that file exists.
  desc: Download Ofsted school inspection outcomes
  status:
    - test -f {{.OFSTED_OUTPUT}}
  cmds:
    - >-
      uv run python -m pipeline.download.ofsted
      --output {{.OFSTED_OUTPUT}}
download:broadband:
  # Writes {{.BROADBAND_OUTPUT}}; skipped once that file exists.
  desc: Download Ofcom broadband performance data
  status:
    - test -f {{.BROADBAND_OUTPUT}}
  cmds:
    - >-
      uv run python -m pipeline.download.broadband
      --output {{.BROADBAND_OUTPUT}}
download:noise:
  # Takes the ArcGIS postcode file as input (--arcgis), hence the dependency.
  # Writes {{.NOISE_OUTPUT}}; skipped once that file exists.
  desc: Download Defra noise data (road, rail, airport) sampled at postcode centroids
  deps:
    - download:arcgis
  status:
    - test -f {{.NOISE_OUTPUT}}
  cmds:
    - >-
      uv run python -m pipeline.download.noise
      --arcgis {{.ARCGIS_OUTPUT}}
      --output {{.NOISE_OUTPUT}}
transform:pois:
  # Inputs: raw POIs and NaPTAN stations, each produced by its own download task.
  # Writes {{.POIS_FILTERED_OUTPUT}}; skipped once that file exists.
  desc: Transform raw POIs to filtered version with friendly names
  deps:
    - download:pois
    - download:naptan
  status:
    - test -f {{.POIS_FILTERED_OUTPUT}}
  cmds:
    - >-
      uv run python -m pipeline.transform.transform_poi
      --input {{.POIS_RAW_OUTPUT}}
      --naptan {{.NAPTAN_OUTPUT}}
      --output {{.POIS_FILTERED_OUTPUT}}
transform:epc-pp:
  # Inputs: price-paid parquet (downloaded) and the EPC CSV. prompt:epc fails
  # with instructions when the CSV is missing, which stops this task too.
  # Writes {{.EPC_PP_OUTPUT}}; skipped once that file exists.
  desc: Fuzzy join EPC and Price Paid data
  deps:
    - download:price-paid
    - prompt:epc
  status:
    - test -f {{.EPC_PP_OUTPUT}}
  cmds:
    - >-
      uv run python -m pipeline.transform.join_epc_pp
      --epc {{.EPC}}
      --price-paid {{.PRICE_PAID_OUTPUT}}
      --output {{.EPC_PP_OUTPUT}}
transform:crime:
  # Reads from {{.CRIME_DIR}} and writes {{.CRIME_OUTPUT}}; skipped once the
  # output exists.
  # NOTE(review): no task visible here populates CRIME_DIR — presumably the
  # CSVs are placed there manually; confirm.
  desc: Transform crime CSVs into yearly averages by LSOA
  status:
    - test -f {{.CRIME_OUTPUT}}
  cmds:
    - >-
      uv run python -m pipeline.transform.crime
      --input {{.CRIME_DIR}}
      --output {{.CRIME_OUTPUT}}
transform:poi-proximity:
  # Inputs: ArcGIS postcodes and the filtered POIs from transform:pois.
  # Writes {{.POI_PROXIMITY_OUTPUT}}; skipped once that file exists.
  desc: Compute POI proximity counts per postcode
  deps:
    - download:arcgis
    - transform:pois
  status:
    - test -f {{.POI_PROXIMITY_OUTPUT}}
  cmds:
    - >-
      uv run python -m pipeline.transform.poi_proximity
      --arcgis {{.ARCGIS_OUTPUT}}
      --pois {{.POIS_FILTERED_OUTPUT}}
      --output {{.POI_PROXIMITY_OUTPUT}}
transform:school-proximity:
  # Inputs: Ofsted outcomes and ArcGIS postcodes, both from download tasks.
  # Writes {{.SCHOOL_PROXIMITY_OUTPUT}}; skipped once that file exists.
  desc: Compute good+ school proximity counts per postcode
  deps:
    - download:ofsted
    - download:arcgis
  status:
    - test -f {{.SCHOOL_PROXIMITY_OUTPUT}}
  cmds:
    - >-
      uv run python -m pipeline.transform.school_proximity
      --ofsted {{.OFSTED_OUTPUT}}
      --arcgis {{.ARCGIS_OUTPUT}}
      --output {{.SCHOOL_PROXIMITY_OUTPUT}}
download:journey-times:
  # Fetches journey times for ONE destination, given after `--` on the
  # command line (e.g. `task download:journey-times -- bank`).
  desc: "Fetch TfL journey times: task download:journey-times -- <destination>"
  deps:
    - download:arcgis
  preconditions:
    # Without a destination the command below would expand to
    # `--destination --output-dir ...`, so fail early with a usable message.
    - sh: test -n "{{.CLI_ARGS}}"
      msg: "Pass a destination, e.g.: task download:journey-times -- bank"
  status:
    # Look only at the requested destination's output. The former
    # `test -f .../journey_times_*.parquet` errored out when two files matched
    # and reported "up to date" for one destination when only the other's
    # file existed.
    # NOTE(review): assumes outputs are named journey_times_<destination>*.parquet,
    # as the JOURNEY_TIMES_* vars suggest — confirm against pipeline.journey_times.
    - ls {{.DATA_DIR}}/journey_times_{{.CLI_ARGS}}*.parquet >/dev/null 2>&1
  cmds:
    - >-
      uv run python -m pipeline.journey_times
      --destination {{.CLI_ARGS}}
      --output-dir {{.DATA_DIR}}
      --postcodes {{.ARCGIS_OUTPUT}}
prepare:
  # Merges every intermediate dataset into {{.WIDE_OUTPUT}}; skipped once that
  # file exists.
  desc: Build wide property dataframe with all joins
  # NOTE(review): all dependencies are currently disabled, so this task does
  # not build or check any of its inputs; every file passed to the merge
  # command below must already exist. The list is kept for re-enabling.
  # Written as an explicit empty list rather than a bare `deps:` (null).
  deps: []
  #   - transform:epc-pp
  #   - download:arcgis
  #   - download:deprivation
  #   - download:ethnicity
  #   - download:broadband
  #   - download:noise
  #   - transform:crime
  #   - transform:poi-proximity
  #   - transform:school-proximity
  #   - prompt:journey-times
  status:
    - test -f {{.WIDE_OUTPUT}}
  cmds:
    # Folded scalar: the lines below are joined with single spaces into one
    # command, so they must all stay at the same indentation.
    - >-
      uv run python -m pipeline.transform.merge
      --epc-pp {{.EPC_PP_OUTPUT}}
      --arcgis {{.ARCGIS_OUTPUT}}
      --iod {{.IOD_OUTPUT}}
      --poi-proximity {{.POI_PROXIMITY_OUTPUT}}
      --journey-times-bank {{.JOURNEY_TIMES_BANK}}
      --journey-times-fitzrovia {{.JOURNEY_TIMES_FITZROVIA}}
      --ethnicity {{.ETHNICITY_OUTPUT}}
      --crime {{.CRIME_OUTPUT}}
      --noise {{.NOISE_OUTPUT}}
      --school-proximity {{.SCHOOL_PROXIMITY_OUTPUT}}
      --broadband {{.BROADBAND_OUTPUT}}
      --output {{.WIDE_OUTPUT}}