From 5e210e14bd9acb02f89f9f0971171d8fb398724a Mon Sep 17 00:00:00 2001
From: Andras Schmelczer
Date: Sun, 1 Feb 2026 13:05:00 +0000
Subject: [PATCH] Break up taskfile

Move the data pipeline tasks (download:*, transform:*, prepare) out of
Taskfile.yml into a new Taskfile.data.yml, which the root Taskfile
includes with `flatten: true`.

Other changes carried by this patch:

- DATA_DIR changes from `data` to `/bulk/property-data`.
- Moved tasks drop `internal: true` and replace `sources`/`generates`
  with a `status: test -f <output>` check.
- New tasks: prompt:epc, prompt:journey-times, download:ethnicity,
  download:naptan, download:ofsted, download:broadband, download:noise,
  download:journey-times, transform:crime, transform:school-proximity.
- prepare depends on transform:epc-pp instead of join:epc-pp and passes
  the new datasets to pipeline.transform.merge.
- dev:server runs a debug build with --data/--pois flags;
  dev:server:release keeps the release build.
- build:server runs in server-rs instead of frontend.
- lint:python additionally runs deptry; lint:rust additionally runs
  cargo machete.

---
Review notes (this section is ignored by `git am`):

- prompt:journey-times only prints instructions and exits 1, so its
  description no longer says "Download" and its last instruction points
  at `task download:journey-times` instead of "re-run this task".
- A few inline YAML comments were added on added lines. Hunk line counts
  and the diffstat are unchanged; the blob id on the Taskfile.data.yml
  `index` line was computed for the content before these edits.
- NOTE(review): prompt:journey-times still depends on download:arcgis
  although it does not use the ArcGIS output - confirm this is intended.
- NOTE(review): transform:crime reads CRIME_DIR but no task in this
  patch creates or checks that directory - confirm how it is populated.

 Taskfile.data.yml | 191 ++++++++++++++++++++++++++++++++++++++++++++++
 Taskfile.yml      | 124 +++++-------------------------
 2 files changed, 211 insertions(+), 104 deletions(-)
 create mode 100644 Taskfile.data.yml

diff --git a/Taskfile.data.yml b/Taskfile.data.yml
new file mode 100644
index 0000000..5907545
--- /dev/null
+++ b/Taskfile.data.yml
@@ -0,0 +1,191 @@
+version: '3'
+
+vars:
+  DATA_DIR: /bulk/property-data
+  ARCGIS_OUTPUT: "{{.DATA_DIR}}/arcgis_data.parquet"
+  PRICE_PAID_OUTPUT: "{{.DATA_DIR}}/price-paid-complete.parquet"
+  IOD_OUTPUT: "{{.DATA_DIR}}/IoD2025_Scores.parquet"
+  POIS_RAW_OUTPUT: "{{.DATA_DIR}}/uk_pois.parquet"
+  POIS_FILTERED_OUTPUT: "{{.DATA_DIR}}/filtered_uk_pois.parquet"
+  POI_PROXIMITY_OUTPUT: "{{.DATA_DIR}}/poi_proximity.parquet"
+  EPC_PP_OUTPUT: "{{.DATA_DIR}}/epc_pp.parquet"
+  WIDE_OUTPUT: "{{.DATA_DIR}}/wide.parquet"
+  EPC: "{{.DATA_DIR}}/certificates.csv"
+  JOURNEY_TIMES: "./data_sources/processed/journey_times_bank_checkpoint.parquet"
+  ETHNICITY_OUTPUT: "{{.DATA_DIR}}/ethnicity_by_la.parquet"
+  CRIME_DIR: "{{.DATA_DIR}}/crime"
+  CRIME_OUTPUT: "{{.DATA_DIR}}/crime_by_lsoa.parquet"
+  NOISE_OUTPUT: "{{.DATA_DIR}}/road_noise.parquet"
+  OFSTED_OUTPUT: "{{.DATA_DIR}}/ofsted.parquet"
+  NAPTAN_OUTPUT: "{{.DATA_DIR}}/naptan.parquet"
+  BROADBAND_OUTPUT: "{{.DATA_DIR}}/broadband.parquet"
+  SCHOOL_PROXIMITY_OUTPUT: "{{.DATA_DIR}}/school_proximity.parquet"
+
+tasks:
+  prompt:epc:
+    desc: Prompt user to download EPC dataset (requires registration)
+    status:  # the task is skipped when every status command exits 0
+      - test -f {{.EPC}}
+    cmds:
+      - |
+        echo ""
+        echo "=== EPC dataset not found ==="
+        echo "The EPC certificates file is required: {{.EPC}}"
+        echo ""
+        echo "To obtain it, register at https://epc.opendatacommunities.org/login"
+        echo ""
+        exit 1
+
+  prompt:journey-times:
+    desc: Prompt user to fetch TFL journey times if missing (requires API key registration)
+    status:
+      - test -f {{.JOURNEY_TIMES}}
+    deps:
+      - download:arcgis
+    cmds:
+      - |
+        echo ""
+        echo "=== TFL journey times not found ==="
+        echo "Register for a TFL API key at https://api-portal.tfl.gov.uk/signin"
+        echo "Then set the TFL_API_KEY environment variable and run: task download:journey-times"
+        echo ""
+        exit 1
+
+  download:arcgis:
+    desc: Download and convert ArcGIS postcode data
+    status:
+      - test -f {{.ARCGIS_OUTPUT}}
+    cmds:
+      - uv run python -m pipeline.download.arcgis --output {{.ARCGIS_OUTPUT}}
+
+  download:price-paid:
+    desc: Download and convert Land Registry price-paid data
+    status:
+      - test -f {{.PRICE_PAID_OUTPUT}}
+    cmds:
+      - uv run python -m pipeline.download.price_paid --output {{.PRICE_PAID_OUTPUT}}
+
+  download:deprivation:
+    desc: Download and convert Index of Deprivation data
+    status:
+      - test -f {{.IOD_OUTPUT}}
+    cmds:
+      - uv run python -m pipeline.download.deprivation_data --output {{.IOD_OUTPUT}}
+
+  download:ethnicity:
+    desc: Download ethnicity by local authority data
+    status:
+      - test -f {{.ETHNICITY_OUTPUT}}
+    cmds:
+      - uv run python -m pipeline.download.ethnicity --output {{.ETHNICITY_OUTPUT}}
+
+  download:naptan:
+    desc: Download NaPTAN station data
+    status:
+      - test -f {{.NAPTAN_OUTPUT}}
+    cmds:
+      - uv run python -m pipeline.download.naptan --output {{.NAPTAN_OUTPUT}}
+
+  download:pois:
+    desc: Download and extract POIs from OpenStreetMap
+    status:
+      - test -f {{.POIS_RAW_OUTPUT}}
+    cmds:
+      - uv run python -m pipeline.download.pois --output {{.POIS_RAW_OUTPUT}}
+
+  download:ofsted:
+    desc: Download Ofsted school inspection outcomes
+    status:
+      - test -f {{.OFSTED_OUTPUT}}
+    cmds:
+      - uv run python -m pipeline.download.ofsted --output {{.OFSTED_OUTPUT}}
+
+  download:broadband:
+    desc: Download Ofcom broadband performance data
+    status:
+      - test -f {{.BROADBAND_OUTPUT}}
+    cmds:
+      - uv run python -m pipeline.download.broadband --output {{.BROADBAND_OUTPUT}}
+
+  download:noise:
+    desc: Download Defra noise data (road, rail, airport) sampled at postcode centroids
+    deps:
+      - download:arcgis
+    status:
+      - test -f {{.NOISE_OUTPUT}}
+    cmds:
+      - uv run python -m pipeline.download.noise --arcgis {{.ARCGIS_OUTPUT}} --output {{.NOISE_OUTPUT}}
+
+  transform:pois:
+    desc: Transform raw POIs to filtered version with friendly names
+    deps:
+      - download:pois
+      - download:naptan
+    status:
+      - test -f {{.POIS_FILTERED_OUTPUT}}
+    cmds:
+      - uv run python -m pipeline.transform.transform_poi --input {{.POIS_RAW_OUTPUT}} --naptan {{.NAPTAN_OUTPUT}} --output {{.POIS_FILTERED_OUTPUT}}
+
+  transform:epc-pp:
+    desc: Fuzzy join EPC and Price Paid data
+    deps:
+      - download:price-paid
+      - prompt:epc
+    status:
+      - test -f {{.EPC_PP_OUTPUT}}
+    cmds:
+      - uv run python -m pipeline.transform.join_epc_pp --epc {{.EPC}} --price-paid {{.PRICE_PAID_OUTPUT}} --output {{.EPC_PP_OUTPUT}}
+
+  transform:crime:
+    desc: Transform crime CSVs into yearly averages by LSOA
+    status:
+      - test -f {{.CRIME_OUTPUT}}
+    cmds:
+      - uv run python -m pipeline.transform.crime --input {{.CRIME_DIR}} --output {{.CRIME_OUTPUT}}
+
+  transform:poi-proximity:
+    desc: Compute POI proximity counts per postcode
+    deps:
+      - download:arcgis
+      - transform:pois
+    status:
+      - test -f {{.POI_PROXIMITY_OUTPUT}}
+    cmds:
+      - uv run python -m pipeline.transform.poi_proximity --arcgis {{.ARCGIS_OUTPUT}} --pois {{.POIS_FILTERED_OUTPUT}} --output {{.POI_PROXIMITY_OUTPUT}}
+
+  transform:school-proximity:
+    desc: Compute good+ school proximity counts per postcode
+    deps:
+      - download:ofsted
+      - download:arcgis
+    status:
+      - test -f {{.SCHOOL_PROXIMITY_OUTPUT}}
+    cmds:
+      - uv run python -m pipeline.transform.school_proximity --ofsted {{.OFSTED_OUTPUT}} --arcgis {{.ARCGIS_OUTPUT}} --output {{.SCHOOL_PROXIMITY_OUTPUT}}
+
+  download:journey-times:
+    desc: Fetch TfL journey times for all postcodes
+    deps:
+      - download:arcgis
+    status:
+      - test -f {{.JOURNEY_TIMES}}
+    cmds:
+      - uv run python -m pipeline.journey_times
+
+  prepare:
+    desc: Build wide property dataframe with all joins
+    deps:
+      - transform:epc-pp
+      - download:arcgis
+      - download:deprivation
+      - download:ethnicity
+      - download:broadband
+      - download:noise
+      - transform:crime
+      - transform:poi-proximity
+      - transform:school-proximity
+      - prompt:journey-times
+    status:
+      - test -f {{.WIDE_OUTPUT}}
+    cmds:
+      - uv run python -m pipeline.transform.merge --epc-pp {{.EPC_PP_OUTPUT}} --arcgis {{.ARCGIS_OUTPUT}} --iod {{.IOD_OUTPUT}} --poi-proximity {{.POI_PROXIMITY_OUTPUT}} --journey-times {{.JOURNEY_TIMES}} --ethnicity {{.ETHNICITY_OUTPUT}} --crime {{.CRIME_OUTPUT}} --noise {{.NOISE_OUTPUT}} --school-proximity {{.SCHOOL_PROXIMITY_OUTPUT}} --broadband {{.BROADBAND_OUTPUT}} --output {{.WIDE_OUTPUT}}
diff --git a/Taskfile.yml b/Taskfile.yml
index b6334c6..498cd36 100644
--- a/Taskfile.yml
+++ b/Taskfile.yml
@@ -1,17 +1,14 @@
 version: '3'
 
+includes:
+  data:
+    taskfile: ./Taskfile.data.yml
+    flatten: true  # expose the included tasks without a "data:" name prefix
+
 vars:
-  DATA_DIR: data
-  ARCGIS_OUTPUT: "{{.DATA_DIR}}/arcgis_data.parquet"
-  PRICE_PAID_OUTPUT: "{{.DATA_DIR}}/price-paid-complete.parquet"
-  IOD_OUTPUT: "{{.DATA_DIR}}/IoD2025_Scores.parquet"
-  POIS_RAW_OUTPUT: "{{.DATA_DIR}}/uk_pois.parquet"
-  POIS_FILTERED_OUTPUT: "{{.DATA_DIR}}/filtered_uk_pois.parquet"
-  POI_PROXIMITY_OUTPUT: "{{.DATA_DIR}}/poi_proximity.parquet"
-  EPC_PP_OUTPUT: "{{.DATA_DIR}}/epc_pp.parquet"
+  DATA_DIR: /bulk/property-data  # same value as DATA_DIR in Taskfile.data.yml
   WIDE_OUTPUT: "{{.DATA_DIR}}/wide.parquet"
-  EPC_CSV: "{{.DATA_DIR}}/epc/certificates.csv"
-  JOURNEY_TIMES: "{{.DATA_DIR}}/journey_times_bank_checkpoint.parquet"
+  POIS_FILTERED_OUTPUT: "{{.DATA_DIR}}/filtered_uk_pois.parquet"
 
 tasks:
   install:
@@ -20,94 +17,6 @@ tasks:
       - uv sync
       - cd frontend && npm install
 
-  download:arcgis:
-    internal: true
-    desc: Download and convert ArcGIS postcode data
-    generates:
-      - "{{.ARCGIS_OUTPUT}}"
-    cmds:
-      - uv run python -m pipeline.download.arcgis --output {{.ARCGIS_OUTPUT}}
-
-  download:price-paid:
-    internal: true
-    desc: Download and convert Land Registry price-paid data
-    generates:
-      - "{{.PRICE_PAID_OUTPUT}}"
-    cmds:
-      - uv run python -m pipeline.download.price_paid --output {{.PRICE_PAID_OUTPUT}}
-
-  download:deprivation:
-    internal: true
-    desc: Download and convert Index of Deprivation data
-    generates:
-      - "{{.IOD_OUTPUT}}"
-    cmds:
-      - uv run python -m pipeline.download.deprivation_data --output {{.IOD_OUTPUT}}
-
-  download:pois:
-    internal: true
-    desc: Download and extract POIs from OpenStreetMap
-    generates:
-      - "{{.POIS_RAW_OUTPUT}}"
-    cmds:
-      - uv run python -m pipeline.download.pois --output {{.POIS_RAW_OUTPUT}}
-
-  transform:pois:
-    internal: true
-    desc: Transform raw POIs to filtered version with friendly names
-    deps:
-      - download:pois
-    sources:
-      - "{{.POIS_RAW_OUTPUT}}"
-    generates:
-      - "{{.POIS_FILTERED_OUTPUT}}"
-    cmds:
-      - uv run python -m pipeline.transform.transform_poi --input {{.POIS_RAW_OUTPUT}} --output {{.POIS_FILTERED_OUTPUT}}
-
-  transform:epc-pp:
-    internal: true
-    desc: Fuzzy join EPC and Price Paid data
-    deps:
-      - download:price-paid
-    sources:
-      - "{{.PRICE_PAID_OUTPUT}}"
-      - "{{.EPC_CSV}}"
-    generates:
-      - "{{.EPC_PP_OUTPUT}}"
-    cmds:
-      - uv run python -m pipeline.transform.join_epc_pp --epc {{.EPC_CSV}} --price-paid {{.PRICE_PAID_OUTPUT}} --output {{.EPC_PP_OUTPUT}}
-
-  transform:poi-proximity:
-    internal: true
-    desc: Compute POI proximity counts per postcode
-    deps:
-      - download:arcgis
-      - transform:pois
-    sources:
-      - "{{.ARCGIS_OUTPUT}}"
-      - "{{.POIS_FILTERED_OUTPUT}}"
-    generates:
-      - "{{.POI_PROXIMITY_OUTPUT}}"
-    cmds:
-      - uv run python -m pipeline.transform.poi_proximity --arcgis {{.ARCGIS_OUTPUT}} --pois {{.POIS_FILTERED_OUTPUT}} --output {{.POI_PROXIMITY_OUTPUT}}
-
-  prepare:
-    desc: Build wide property dataframe with all joins
-    deps:
-      - join:epc-pp
-      - download:arcgis
-      - download:deprivation
-      - transform:poi-proximity
-    sources:
-      - "{{.EPC_PP_OUTPUT}}"
-      - "{{.ARCGIS_OUTPUT}}"
-      - "{{.IOD_OUTPUT}}"
-      - "{{.POI_PROXIMITY_OUTPUT}}"
-    generates:
-      - "{{.WIDE_OUTPUT}}"
-    cmds:
-      - uv run python -m pipeline.transform.merge --epc-pp {{.EPC_PP_OUTPUT}} --arcgis {{.ARCGIS_OUTPUT}} --iod {{.IOD_OUTPUT}} --poi-proximity {{.POI_PROXIMITY_OUTPUT}} --journey-times {{.JOURNEY_TIMES}} --output {{.WIDE_OUTPUT}}
-
   test:
     cmds:
       - uv run -m pipeline.utils.test_fuzzy_join
@@ -115,10 +24,16 @@ tasks:
       - uv run pytest pipeline/utils/test_poi_counts.py
 
   dev:server:
-    desc: Run Rust backend on port 8001
+    desc: Run Rust backend on port 8001 (debug build, fast compile)
     dir: server-rs
     cmds:
-      - cargo run --release -- {{.WIDE_OUTPUT}}
+      - cargo run -- --data {{.WIDE_OUTPUT}} --pois {{.POIS_FILTERED_OUTPUT}}
+
+  dev:server:release:
+    desc: Run Rust backend on port 8001 (release build)
+    dir: server-rs
+    cmds:
+      - cargo run --release -- --data {{.WIDE_OUTPUT}} --pois {{.POIS_FILTERED_OUTPUT}}
 
   dev:frontend:
     desc: Run frontend dev server on port 3030 (proxies /api to :8001)
@@ -128,7 +43,7 @@ tasks:
 
   build:server:
     desc: Build server for production
-    dir: frontend
+    dir: server-rs
     cmds:
       - cargo build --release
 
@@ -147,9 +62,10 @@ tasks:
       - task: lint:rust
 
   lint:python:
-    desc: Lint Python code with ruff
+    desc: Lint Python code with ruff and check for unused dependencies
     cmds:
       - uv run ruff check .
+      - uv run deptry .
 
   lint:frontend:
     desc: Lint frontend TypeScript code
@@ -159,11 +75,12 @@ tasks:
       - npm run format:check
 
   lint:rust:
-    desc: Lint Rust code with clippy and check formatting
+    desc: Lint Rust code with clippy, check formatting, and detect unused dependencies
     dir: server-rs
     cmds:
       - cargo clippy -- -D warnings
       - cargo fmt --check
+      - cargo machete
 
   format:
     desc: Format all code (Python, TypeScript, and Rust)
@@ -198,4 +115,3 @@ tasks:
       - task: build:server
       - task: build:frontend
       - task: test
-