Format python
This commit is contained in:
parent
85f5770e09
commit
4c258018c3
17 changed files with 348 additions and 248 deletions
54
Taskfile.yml
54
Taskfile.yml
|
|
@ -21,47 +21,43 @@ tasks:
|
||||||
- cd frontend && npm install
|
- cd frontend && npm install
|
||||||
|
|
||||||
download:arcgis:
|
download:arcgis:
|
||||||
|
internal: true
|
||||||
desc: Download and convert ArcGIS postcode data
|
desc: Download and convert ArcGIS postcode data
|
||||||
sources:
|
|
||||||
- pipeline/download/arcgis.py
|
|
||||||
generates:
|
generates:
|
||||||
- "{{.ARCGIS_OUTPUT}}"
|
- "{{.ARCGIS_OUTPUT}}"
|
||||||
cmds:
|
cmds:
|
||||||
- uv run python -m pipeline.download.arcgis --output {{.ARCGIS_OUTPUT}}
|
- uv run python -m pipeline.download.arcgis --output {{.ARCGIS_OUTPUT}}
|
||||||
|
|
||||||
download:price-paid:
|
download:price-paid:
|
||||||
|
internal: true
|
||||||
desc: Download and convert Land Registry price-paid data
|
desc: Download and convert Land Registry price-paid data
|
||||||
sources:
|
|
||||||
- pipeline/download/price_paid.py
|
|
||||||
generates:
|
generates:
|
||||||
- "{{.PRICE_PAID_OUTPUT}}"
|
- "{{.PRICE_PAID_OUTPUT}}"
|
||||||
cmds:
|
cmds:
|
||||||
- uv run python -m pipeline.download.price_paid --output {{.PRICE_PAID_OUTPUT}}
|
- uv run python -m pipeline.download.price_paid --output {{.PRICE_PAID_OUTPUT}}
|
||||||
|
|
||||||
download:deprivation:
|
download:deprivation:
|
||||||
|
internal: true
|
||||||
desc: Download and convert Index of Deprivation data
|
desc: Download and convert Index of Deprivation data
|
||||||
sources:
|
|
||||||
- pipeline/download/deprivation_data.py
|
|
||||||
generates:
|
generates:
|
||||||
- "{{.IOD_OUTPUT}}"
|
- "{{.IOD_OUTPUT}}"
|
||||||
cmds:
|
cmds:
|
||||||
- uv run python -m pipeline.download.deprivation_data --output {{.IOD_OUTPUT}}
|
- uv run python -m pipeline.download.deprivation_data --output {{.IOD_OUTPUT}}
|
||||||
|
|
||||||
download:pois:
|
download:pois:
|
||||||
|
internal: true
|
||||||
desc: Download and extract POIs from OpenStreetMap
|
desc: Download and extract POIs from OpenStreetMap
|
||||||
sources:
|
|
||||||
- pipeline/download/pois.py
|
|
||||||
generates:
|
generates:
|
||||||
- "{{.POIS_RAW_OUTPUT}}"
|
- "{{.POIS_RAW_OUTPUT}}"
|
||||||
cmds:
|
cmds:
|
||||||
- uv run python -m pipeline.download.pois --output {{.POIS_RAW_OUTPUT}}
|
- uv run python -m pipeline.download.pois --output {{.POIS_RAW_OUTPUT}}
|
||||||
|
|
||||||
transform:pois:
|
transform:pois:
|
||||||
|
internal: true
|
||||||
desc: Transform raw POIs to filtered version with friendly names
|
desc: Transform raw POIs to filtered version with friendly names
|
||||||
deps:
|
deps:
|
||||||
- download:pois
|
- download:pois
|
||||||
sources:
|
sources:
|
||||||
- pipeline/transform/transform_poi.py
|
|
||||||
- "{{.POIS_RAW_OUTPUT}}"
|
- "{{.POIS_RAW_OUTPUT}}"
|
||||||
generates:
|
generates:
|
||||||
- "{{.POIS_FILTERED_OUTPUT}}"
|
- "{{.POIS_FILTERED_OUTPUT}}"
|
||||||
|
|
@ -69,12 +65,11 @@ tasks:
|
||||||
- uv run python -m pipeline.transform.transform_poi --input {{.POIS_RAW_OUTPUT}} --output {{.POIS_FILTERED_OUTPUT}}
|
- uv run python -m pipeline.transform.transform_poi --input {{.POIS_RAW_OUTPUT}} --output {{.POIS_FILTERED_OUTPUT}}
|
||||||
|
|
||||||
transform:epc-pp:
|
transform:epc-pp:
|
||||||
|
internal: true
|
||||||
desc: Fuzzy join EPC and Price Paid data
|
desc: Fuzzy join EPC and Price Paid data
|
||||||
deps:
|
deps:
|
||||||
- download:price-paid
|
- download:price-paid
|
||||||
sources:
|
sources:
|
||||||
- pipeline/transform/join_epc_pp.py
|
|
||||||
- pipeline/utils/fuzzy_join.py
|
|
||||||
- "{{.PRICE_PAID_OUTPUT}}"
|
- "{{.PRICE_PAID_OUTPUT}}"
|
||||||
- "{{.EPC_CSV}}"
|
- "{{.EPC_CSV}}"
|
||||||
generates:
|
generates:
|
||||||
|
|
@ -83,13 +78,12 @@ tasks:
|
||||||
- uv run python -m pipeline.transform.join_epc_pp --epc {{.EPC_CSV}} --price-paid {{.PRICE_PAID_OUTPUT}} --output {{.EPC_PP_OUTPUT}}
|
- uv run python -m pipeline.transform.join_epc_pp --epc {{.EPC_CSV}} --price-paid {{.PRICE_PAID_OUTPUT}} --output {{.EPC_PP_OUTPUT}}
|
||||||
|
|
||||||
transform:poi-proximity:
|
transform:poi-proximity:
|
||||||
|
internal: true
|
||||||
desc: Compute POI proximity counts per postcode
|
desc: Compute POI proximity counts per postcode
|
||||||
deps:
|
deps:
|
||||||
- download:arcgis
|
- download:arcgis
|
||||||
- transform:pois
|
- transform:pois
|
||||||
sources:
|
sources:
|
||||||
- pipeline/transform/poi_proximity.py
|
|
||||||
- pipeline/utils/poi_counts.py
|
|
||||||
- "{{.ARCGIS_OUTPUT}}"
|
- "{{.ARCGIS_OUTPUT}}"
|
||||||
- "{{.POIS_FILTERED_OUTPUT}}"
|
- "{{.POIS_FILTERED_OUTPUT}}"
|
||||||
generates:
|
generates:
|
||||||
|
|
@ -97,7 +91,7 @@ tasks:
|
||||||
cmds:
|
cmds:
|
||||||
- uv run python -m pipeline.transform.poi_proximity --arcgis {{.ARCGIS_OUTPUT}} --pois {{.POIS_FILTERED_OUTPUT}} --output {{.POI_PROXIMITY_OUTPUT}}
|
- uv run python -m pipeline.transform.poi_proximity --arcgis {{.ARCGIS_OUTPUT}} --pois {{.POIS_FILTERED_OUTPUT}} --output {{.POI_PROXIMITY_OUTPUT}}
|
||||||
|
|
||||||
transform:wide:
|
prepare:
|
||||||
desc: Build wide property dataframe with all joins
|
desc: Build wide property dataframe with all joins
|
||||||
deps:
|
deps:
|
||||||
- join:epc-pp
|
- join:epc-pp
|
||||||
|
|
@ -105,7 +99,6 @@ tasks:
|
||||||
- download:deprivation
|
- download:deprivation
|
||||||
- transform:poi-proximity
|
- transform:poi-proximity
|
||||||
sources:
|
sources:
|
||||||
- pipeline/transform/merge.py
|
|
||||||
- "{{.EPC_PP_OUTPUT}}"
|
- "{{.EPC_PP_OUTPUT}}"
|
||||||
- "{{.ARCGIS_OUTPUT}}"
|
- "{{.ARCGIS_OUTPUT}}"
|
||||||
- "{{.IOD_OUTPUT}}"
|
- "{{.IOD_OUTPUT}}"
|
||||||
|
|
@ -115,36 +108,37 @@ tasks:
|
||||||
cmds:
|
cmds:
|
||||||
- uv run python -m pipeline.transform.merge --epc-pp {{.EPC_PP_OUTPUT}} --arcgis {{.ARCGIS_OUTPUT}} --iod {{.IOD_OUTPUT}} --poi-proximity {{.POI_PROXIMITY_OUTPUT}} --journey-times {{.JOURNEY_TIMES}} --output {{.WIDE_OUTPUT}}
|
- uv run python -m pipeline.transform.merge --epc-pp {{.EPC_PP_OUTPUT}} --arcgis {{.ARCGIS_OUTPUT}} --iod {{.IOD_OUTPUT}} --poi-proximity {{.POI_PROXIMITY_OUTPUT}} --journey-times {{.JOURNEY_TIMES}} --output {{.WIDE_OUTPUT}}
|
||||||
|
|
||||||
prepare:
|
|
||||||
desc: Prepare the application (install, download data, run pipeline)
|
|
||||||
deps:
|
|
||||||
- transform:wide
|
|
||||||
|
|
||||||
test:
|
test:
|
||||||
cmds:
|
cmds:
|
||||||
- uv run -m pipeline.utils.test_fuzzy_join
|
- uv run -m pipeline.utils.test_fuzzy_join
|
||||||
- uv run pytest pipeline/utils/test_haversine.py
|
- uv run pytest pipeline/utils/test_haversine.py
|
||||||
- uv run pytest pipeline/utils/test_poi_counts.py
|
- uv run pytest pipeline/utils/test_poi_counts.py
|
||||||
|
|
||||||
server:
|
dev:server:
|
||||||
desc: Run Rust backend on port 8001
|
desc: Run Rust backend on port 8001
|
||||||
dir: server-rs
|
dir: server-rs
|
||||||
cmds:
|
cmds:
|
||||||
- cargo run --release -- {{.WIDE_OUTPUT}}
|
- cargo run --release -- {{.WIDE_OUTPUT}}
|
||||||
|
|
||||||
frontend:
|
dev:frontend:
|
||||||
desc: Run frontend dev server on port 3030 (proxies /api to :8001)
|
desc: Run frontend dev server on port 3030 (proxies /api to :8001)
|
||||||
dir: frontend
|
dir: frontend
|
||||||
cmds:
|
cmds:
|
||||||
- npm run dev
|
- npm run dev
|
||||||
|
|
||||||
build:
|
build:server:
|
||||||
|
desc: Build server for production
|
||||||
|
dir: frontend
|
||||||
|
cmds:
|
||||||
|
- cargo build --release
|
||||||
|
|
||||||
|
build:frontend:
|
||||||
desc: Build frontend for production
|
desc: Build frontend for production
|
||||||
dir: frontend
|
dir: frontend
|
||||||
cmds:
|
cmds:
|
||||||
|
- npm run typecheck
|
||||||
- npm run build
|
- npm run build
|
||||||
|
|
||||||
|
|
||||||
lint:
|
lint:
|
||||||
desc: Lint all code (Python, TypeScript, and Rust)
|
desc: Lint all code (Python, TypeScript, and Rust)
|
||||||
cmds:
|
cmds:
|
||||||
|
|
@ -195,17 +189,13 @@ tasks:
|
||||||
desc: Format Rust code with cargo fmt
|
desc: Format Rust code with cargo fmt
|
||||||
dir: server-rs
|
dir: server-rs
|
||||||
cmds:
|
cmds:
|
||||||
- cargo fmt
|
- cargo fmt --all
|
||||||
|
|
||||||
check:
|
check:
|
||||||
desc: Run all checks (lint, typecheck, build)
|
desc: Run all checks (lint, typecheck, build)
|
||||||
cmds:
|
cmds:
|
||||||
- task: lint
|
- task: lint
|
||||||
- task: typecheck
|
- task: build:server
|
||||||
- task: build
|
- task: build:frontend
|
||||||
|
- task: test
|
||||||
|
|
||||||
typecheck:
|
|
||||||
desc: Type check frontend TypeScript code
|
|
||||||
dir: frontend
|
|
||||||
cmds:
|
|
||||||
- npm run typecheck
|
|
||||||
|
|
|
||||||
|
|
@ -35,7 +35,6 @@ def download_with_progress(url: str, output_path: Path) -> None:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def extract_zip(zip_path: Path, extract_path: Path) -> None:
|
def extract_zip(zip_path: Path, extract_path: Path) -> None:
|
||||||
extract_path.mkdir(exist_ok=True)
|
extract_path.mkdir(exist_ok=True)
|
||||||
|
|
||||||
|
|
@ -44,7 +43,7 @@ def extract_zip(zip_path: Path, extract_path: Path) -> None:
|
||||||
|
|
||||||
|
|
||||||
def convert_to_parquet(data_path: Path, parquet_path: Path) -> None:
|
def convert_to_parquet(data_path: Path, parquet_path: Path) -> None:
|
||||||
df = pl.scan_csv(data_path / 'Data/NSPL_MAY_2025_UK.csv', try_parse_dates=True)
|
df = pl.scan_csv(data_path / "Data/NSPL_MAY_2025_UK.csv", try_parse_dates=True)
|
||||||
print(f"Columns: {df.collect_schema().names()}")
|
print(f"Columns: {df.collect_schema().names()}")
|
||||||
parquet_path.parent.mkdir(parents=True, exist_ok=True)
|
parquet_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
df.sink_parquet(parquet_path, compression="zstd")
|
df.sink_parquet(parquet_path, compression="zstd")
|
||||||
|
|
@ -52,8 +51,12 @@ def convert_to_parquet(data_path: Path, parquet_path: Path) -> None:
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
parser = argparse.ArgumentParser(description="Download and convert ArcGIS postcode data")
|
parser = argparse.ArgumentParser(
|
||||||
parser.add_argument("--output", type=Path, required=True, help="Output parquet file path")
|
description="Download and convert ArcGIS postcode data"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--output", type=Path, required=True, help="Output parquet file path"
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as cache_dir:
|
with tempfile.TemporaryDirectory() as cache_dir:
|
||||||
|
|
@ -64,5 +67,6 @@ def main() -> None:
|
||||||
extract_zip(download_path, extract_path)
|
extract_zip(download_path, extract_path)
|
||||||
convert_to_parquet(extract_path, args.output)
|
convert_to_parquet(extract_path, args.output)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|
|
||||||
|
|
@ -41,8 +41,12 @@ def convert_to_parquet(xlsx_path: Path, parquet_path: Path) -> None:
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
parser = argparse.ArgumentParser(description="Download and convert Index of Deprivation data")
|
parser = argparse.ArgumentParser(
|
||||||
parser.add_argument("--output", type=Path, required=True, help="Output parquet file path")
|
description="Download and convert Index of Deprivation data"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--output", type=Path, required=True, help="Output parquet file path"
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as cache_dir:
|
with tempfile.TemporaryDirectory() as cache_dir:
|
||||||
|
|
|
||||||
|
|
@ -8,16 +8,12 @@ import osmium
|
||||||
import polars as pl
|
import polars as pl
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
|
|
||||||
BATCH_SIZE = 50_000
|
BATCH_SIZE = 50_000
|
||||||
|
|
||||||
MIN_OCCURENCE_COUNT = 20
|
MIN_OCCURENCE_COUNT = 20
|
||||||
|
|
||||||
GEOFABRIK_GB_URL = (
|
GEOFABRIK_GB_URL = "https://download.geofabrik.de/europe/great-britain-latest.osm.pbf"
|
||||||
"https://download.geofabrik.de/europe/great-britain-latest.osm.pbf"
|
|
||||||
)
|
|
||||||
|
|
||||||
UK_BBOX_WEST = -7.57
|
UK_BBOX_WEST = -7.57
|
||||||
UK_BBOX_SOUTH = 49.96
|
UK_BBOX_SOUTH = 49.96
|
||||||
|
|
@ -38,7 +34,6 @@ POI_TAG_KEYS: list[str] = [
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def download_pbf(pbf_file: Path) -> None:
|
def download_pbf(pbf_file: Path) -> None:
|
||||||
pbf_file.parent.mkdir(parents=True, exist_ok=True)
|
pbf_file.parent.mkdir(parents=True, exist_ok=True)
|
||||||
tmp = pbf_file.with_suffix(".pbf.tmp")
|
tmp = pbf_file.with_suffix(".pbf.tmp")
|
||||||
|
|
@ -91,7 +86,12 @@ class POIHandler(osmium.SimpleHandler):
|
||||||
self._batch.clear()
|
self._batch.clear()
|
||||||
|
|
||||||
def _add_poi(
|
def _add_poi(
|
||||||
self, osm_id: str, tags: osmium.osm.TagList, category: str, lat: float, lng: float
|
self,
|
||||||
|
osm_id: str,
|
||||||
|
tags: osmium.osm.TagList,
|
||||||
|
category: str,
|
||||||
|
lat: float,
|
||||||
|
lng: float,
|
||||||
) -> None:
|
) -> None:
|
||||||
self._batch.append(
|
self._batch.append(
|
||||||
{
|
{
|
||||||
|
|
@ -123,8 +123,12 @@ class POIHandler(osmium.SimpleHandler):
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
parser = argparse.ArgumentParser(description="Download and extract POIs from OpenStreetMap")
|
parser = argparse.ArgumentParser(
|
||||||
parser.add_argument("--output", type=Path, required=True, help="Output parquet file path")
|
description="Download and extract POIs from OpenStreetMap"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--output", type=Path, required=True, help="Output parquet file path"
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as cache_dir:
|
with tempfile.TemporaryDirectory() as cache_dir:
|
||||||
|
|
|
||||||
|
|
@ -73,8 +73,12 @@ def convert_to_parquet(csv_path: Path, parquet_path: Path) -> None:
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
parser = argparse.ArgumentParser(description="Download and convert Land Registry price-paid data")
|
parser = argparse.ArgumentParser(
|
||||||
parser.add_argument("--output", type=Path, required=True, help="Output parquet file path")
|
description="Download and convert Land Registry price-paid data"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--output", type=Path, required=True, help="Output parquet file path"
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as cache_dir:
|
with tempfile.TemporaryDirectory() as cache_dir:
|
||||||
|
|
|
||||||
|
|
@ -1,11 +1,7 @@
|
||||||
"""Configuration constants for journey times processing."""
|
"""Configuration constants for journey times processing."""
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from .models import Destination
|
from .models import Destination
|
||||||
|
|
||||||
DATA_DIR = Path("./data_sources")
|
|
||||||
OUTPUT_DIR = DATA_DIR / "processed"
|
|
||||||
|
|
||||||
MAX_DELAY = 10
|
MAX_DELAY = 10
|
||||||
REQUESTS_PER_MIN = 500
|
REQUESTS_PER_MIN = 500
|
||||||
|
|
|
||||||
|
|
@ -99,9 +99,7 @@ async def fetch_journey_for_mode(
|
||||||
journeys = data.get("journeys", [])
|
journeys = data.get("journeys", [])
|
||||||
if journeys:
|
if journeys:
|
||||||
durations = [
|
durations = [
|
||||||
j["duration"]
|
j["duration"] for j in journeys if j.get("duration") is not None
|
||||||
for j in journeys
|
|
||||||
if j.get("duration") is not None
|
|
||||||
]
|
]
|
||||||
if durations:
|
if durations:
|
||||||
return min(durations)
|
return min(durations)
|
||||||
|
|
|
||||||
|
|
@ -9,79 +9,108 @@ pl.Config.set_tbl_cols(-1)
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(description="Fuzzy join EPC and Price Paid data")
|
parser = argparse.ArgumentParser(description="Fuzzy join EPC and Price Paid data")
|
||||||
parser.add_argument("--epc", type=Path, required=True, help="EPC certificates CSV file")
|
parser.add_argument(
|
||||||
parser.add_argument("--price-paid", type=Path, required=True, help="Price paid parquet file")
|
"--epc", type=Path, required=True, help="EPC certificates CSV file"
|
||||||
parser.add_argument("--output", type=Path, required=True, help="Output parquet file path")
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--price-paid", type=Path, required=True, help="Price paid parquet file"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--output", type=Path, required=True, help="Output parquet file path"
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
epc = pl.scan_csv(args.epc).select(
|
epc = (
|
||||||
pl.col('ADDRESS').alias('epc_address'),
|
pl.scan_csv(args.epc)
|
||||||
'POSTCODE',
|
.select(
|
||||||
'CURRENT_ENERGY_RATING',
|
pl.col("ADDRESS").alias("epc_address"),
|
||||||
'POTENTIAL_ENERGY_RATING',
|
"POSTCODE",
|
||||||
pl.col('PROPERTY_TYPE').alias('epc_property_type'),
|
"CURRENT_ENERGY_RATING",
|
||||||
'BUILT_FORM',
|
"POTENTIAL_ENERGY_RATING",
|
||||||
'INSPECTION_DATE',
|
pl.col("PROPERTY_TYPE").alias("epc_property_type"),
|
||||||
'TOTAL_FLOOR_AREA',
|
"BUILT_FORM",
|
||||||
'NUMBER_HABITABLE_ROOMS',
|
"INSPECTION_DATE",
|
||||||
'FLOOR_HEIGHT',
|
"TOTAL_FLOOR_AREA",
|
||||||
'CONSTRUCTION_AGE_BAND'
|
"NUMBER_HABITABLE_ROOMS",
|
||||||
).filter(pl.col('epc_address').is_not_null()).sort('INSPECTION_DATE', descending=True).group_by('epc_address', 'POSTCODE').first()
|
"FLOOR_HEIGHT",
|
||||||
|
"CONSTRUCTION_AGE_BAND",
|
||||||
|
)
|
||||||
|
.filter(pl.col("epc_address").is_not_null())
|
||||||
|
.sort("INSPECTION_DATE", descending=True)
|
||||||
|
.group_by("epc_address", "POSTCODE")
|
||||||
|
.first()
|
||||||
|
)
|
||||||
|
|
||||||
print("EPC dataset")
|
print("EPC dataset")
|
||||||
print(epc.head().collect())
|
print(epc.head().collect())
|
||||||
|
|
||||||
# https://www.gov.uk/guidance/about-the-price-paid-data
|
# https://www.gov.uk/guidance/about-the-price-paid-data
|
||||||
property_type_map = {"D": "Detached", "S": "Semi-Detached", "T": "Terraced", "F": "Flats/Maisonettes", "O": "Other"}
|
property_type_map = {
|
||||||
|
"D": "Detached",
|
||||||
|
"S": "Semi-Detached",
|
||||||
|
"T": "Terraced",
|
||||||
|
"F": "Flats/Maisonettes",
|
||||||
|
"O": "Other",
|
||||||
|
}
|
||||||
duration_map = {"F": "Freehold", "L": "Leasehold"}
|
duration_map = {"F": "Freehold", "L": "Leasehold"}
|
||||||
|
|
||||||
price_paid = (pl.scan_parquet(args.price_paid).select(
|
price_paid = (
|
||||||
"price",
|
pl.scan_parquet(args.price_paid)
|
||||||
"date_of_transfer",
|
.select(
|
||||||
pl.col('property_type').alias("pp_property_type").replace(property_type_map),
|
"price",
|
||||||
"postcode",
|
"date_of_transfer",
|
||||||
'paon',
|
pl.col("property_type")
|
||||||
'saon',
|
.alias("pp_property_type")
|
||||||
'street',
|
.replace(property_type_map),
|
||||||
'locality',
|
"postcode",
|
||||||
'town_city',
|
"paon",
|
||||||
pl.col('duration').replace(duration_map)
|
"saon",
|
||||||
)
|
"street",
|
||||||
.filter(pl.col('pp_property_type') != 'Other').with_columns(
|
"locality",
|
||||||
pl.concat_str(
|
"town_city",
|
||||||
[pl.col('saon'), pl.col('paon'), pl.col('street')],
|
pl.col("duration").replace(duration_map),
|
||||||
separator=' ',
|
|
||||||
ignore_nulls=True,
|
|
||||||
).alias('pp_address'),
|
|
||||||
)
|
)
|
||||||
.sort('date_of_transfer')
|
.filter(pl.col("pp_property_type") != "Other")
|
||||||
.group_by('pp_address', 'postcode', maintain_order=True)
|
.with_columns(
|
||||||
|
pl.concat_str(
|
||||||
|
[pl.col("saon"), pl.col("paon"), pl.col("street")],
|
||||||
|
separator=" ",
|
||||||
|
ignore_nulls=True,
|
||||||
|
).alias("pp_address"),
|
||||||
|
)
|
||||||
|
.sort("date_of_transfer")
|
||||||
|
.group_by("pp_address", "postcode", maintain_order=True)
|
||||||
.agg(
|
.agg(
|
||||||
pl.struct(
|
pl.struct(
|
||||||
pl.col('date_of_transfer').dt.year().alias('year'),
|
pl.col("date_of_transfer").dt.year().alias("year"),
|
||||||
'price',
|
"price",
|
||||||
).alias('historical_prices'),
|
).alias("historical_prices"),
|
||||||
pl.col('pp_property_type').last(),
|
pl.col("pp_property_type").last(),
|
||||||
pl.col('duration').last(),
|
pl.col("duration").last(),
|
||||||
pl.col('price').last().alias('latest_price'),
|
pl.col("price").last().alias("latest_price"),
|
||||||
pl.col('date_of_transfer').last(),
|
pl.col("date_of_transfer").last(),
|
||||||
)
|
)
|
||||||
).filter(pl.col('pp_address').is_not_null())
|
).filter(pl.col("pp_address").is_not_null())
|
||||||
|
|
||||||
print("Price paid dataset")
|
print("Price paid dataset")
|
||||||
print(price_paid.head().collect())
|
print(price_paid.head().collect())
|
||||||
|
|
||||||
joined = fuzzy_join_on_postcode(
|
joined = (
|
||||||
left=price_paid,
|
fuzzy_join_on_postcode(
|
||||||
right=epc,
|
left=price_paid,
|
||||||
left_address_col='pp_address',
|
right=epc,
|
||||||
right_address_col='epc_address',
|
left_address_col="pp_address",
|
||||||
left_postcode_col='postcode',
|
right_address_col="epc_address",
|
||||||
right_postcode_col='POSTCODE',
|
left_postcode_col="postcode",
|
||||||
).drop('POSTCODE').collect()
|
right_postcode_col="POSTCODE",
|
||||||
|
)
|
||||||
|
.drop("POSTCODE")
|
||||||
|
.collect()
|
||||||
|
)
|
||||||
|
|
||||||
matched = joined.filter(pl.col('epc_address').is_not_null() & pl.col('pp_address').is_not_null())
|
matched = joined.filter(
|
||||||
|
pl.col("epc_address").is_not_null() & pl.col("pp_address").is_not_null()
|
||||||
|
)
|
||||||
total = joined.height
|
total = joined.height
|
||||||
print(f"Unique properties: {total}")
|
print(f"Unique properties: {total}")
|
||||||
print(f"Matched: {matched.height} ({100 * matched.height / total:.1f}%)")
|
print(f"Matched: {matched.height} ({100 * matched.height / total:.1f}%)")
|
||||||
|
|
|
||||||
|
|
@ -24,7 +24,9 @@ def _build_wide(
|
||||||
"lsoa21",
|
"lsoa21",
|
||||||
)
|
)
|
||||||
wide = wide.join(arcgis, on="postcode", how="inner")
|
wide = wide.join(arcgis, on="postcode", how="inner")
|
||||||
print(f" {wide.shape[0]:,} rows after GPS join, {wide.estimated_size('mb'):.1f} MB")
|
print(
|
||||||
|
f" {wide.shape[0]:,} rows after GPS join, {wide.estimated_size('mb'):.1f} MB"
|
||||||
|
)
|
||||||
|
|
||||||
# Journey times (optional)
|
# Journey times (optional)
|
||||||
if journey_times_path and journey_times_path.exists():
|
if journey_times_path and journey_times_path.exists():
|
||||||
|
|
@ -42,9 +44,7 @@ def _build_wide(
|
||||||
if iod_path and iod_path.exists():
|
if iod_path and iod_path.exists():
|
||||||
print("Joining IoD scores...")
|
print("Joining IoD scores...")
|
||||||
iod = pl.read_parquet(iod_path)
|
iod = pl.read_parquet(iod_path)
|
||||||
wide = wide.join(
|
wide = wide.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
|
||||||
iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left"
|
|
||||||
)
|
|
||||||
print(f" {wide.estimated_size('mb'):.1f} MB after IoD")
|
print(f" {wide.estimated_size('mb'):.1f} MB after IoD")
|
||||||
|
|
||||||
# POI proximity counts (pre-computed per postcode)
|
# POI proximity counts (pre-computed per postcode)
|
||||||
|
|
@ -66,44 +66,68 @@ def _build_wide(
|
||||||
)
|
)
|
||||||
|
|
||||||
# Derived columns
|
# Derived columns
|
||||||
wide = wide.with_columns(
|
wide = (
|
||||||
(pl.col("latest_price") / pl.col("total_floor_area")).alias("Price per sqm"),
|
wide.with_columns(
|
||||||
).drop(
|
(pl.col("latest_price") / pl.col("total_floor_area")).alias(
|
||||||
'date_of_transfer',
|
"Price per sqm"
|
||||||
'inspection_date',
|
),
|
||||||
'floor_height',
|
)
|
||||||
'lsoa21',
|
.drop(
|
||||||
'LSOA code (2021)',
|
"date_of_transfer",
|
||||||
'Local Authority District code (2024)',
|
"inspection_date",
|
||||||
'Local Authority District name (2024)',
|
"floor_height",
|
||||||
'imd_score',
|
"lsoa21",
|
||||||
'housing_barriers_score',
|
"LSOA code (2021)",
|
||||||
'idaci_score',
|
"Local Authority District code (2024)",
|
||||||
'idaopi_score',
|
"Local Authority District name (2024)",
|
||||||
'children_young_people_score',
|
"imd_score",
|
||||||
'adult_skills_score',
|
"housing_barriers_score",
|
||||||
'geographical_barriers_score',
|
"idaci_score",
|
||||||
'wider_barriers_score',
|
"idaopi_score",
|
||||||
).rename({
|
"children_young_people_score",
|
||||||
'construction_age_band': "Approximate construction age",
|
"adult_skills_score",
|
||||||
"income_score": "Income Score (rate)",
|
"geographical_barriers_score",
|
||||||
"employment_score": "Employment Score (rate)",
|
"wider_barriers_score",
|
||||||
"education_score": "Education, Skills and Training Score",
|
)
|
||||||
"health_score": "Health Deprivation and Disability Score",
|
.rename(
|
||||||
"crime_score": "Crime Score",
|
{
|
||||||
})
|
"construction_age_band": "Approximate construction age",
|
||||||
|
"income_score": "Income Score (rate)",
|
||||||
|
"employment_score": "Employment Score (rate)",
|
||||||
|
"education_score": "Education, Skills and Training Score",
|
||||||
|
"health_score": "Health Deprivation and Disability Score",
|
||||||
|
"crime_score": "Crime Score",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
return wide
|
return wide
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(description="Build wide property dataframe with all joins")
|
parser = argparse.ArgumentParser(
|
||||||
parser.add_argument("--epc-pp", type=Path, required=True, help="EPC-Price Paid joined parquet file")
|
description="Build wide property dataframe with all joins"
|
||||||
parser.add_argument("--arcgis", type=Path, required=True, help="ArcGIS postcode data parquet file")
|
)
|
||||||
parser.add_argument("--iod", type=Path, help="Index of Deprivation parquet file (optional)")
|
parser.add_argument(
|
||||||
parser.add_argument("--poi-proximity", type=Path, help="POI proximity counts parquet file (optional)")
|
"--epc-pp", type=Path, required=True, help="EPC-Price Paid joined parquet file"
|
||||||
parser.add_argument("--journey-times", type=Path, help="Journey times parquet file (optional)")
|
)
|
||||||
parser.add_argument("--output", type=Path, required=True, help="Output parquet file path")
|
parser.add_argument(
|
||||||
|
"--arcgis", type=Path, required=True, help="ArcGIS postcode data parquet file"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--iod", type=Path, help="Index of Deprivation parquet file (optional)"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--poi-proximity",
|
||||||
|
type=Path,
|
||||||
|
help="POI proximity counts parquet file (optional)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--journey-times", type=Path, help="Journey times parquet file (optional)"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--output", type=Path, required=True, help="Output parquet file path"
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
wide = _build_wide(
|
wide = _build_wide(
|
||||||
|
|
@ -119,7 +143,7 @@ def main():
|
||||||
|
|
||||||
wide.write_parquet(args.output)
|
wide.write_parquet(args.output)
|
||||||
size_mb = args.output.stat().st_size / (1024 * 1024)
|
size_mb = args.output.stat().st_size / (1024 * 1024)
|
||||||
|
|
||||||
print(f"Wrote {args.output} ({size_mb:.1f} MB)")
|
print(f"Wrote {args.output} ({size_mb:.1f} MB)")
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -584,9 +584,7 @@ def transform(input_path: Path) -> pl.LazyFrame:
|
||||||
if cat not in DROP_CATEGORIES and cat not in CATEGORY_MAP:
|
if cat not in DROP_CATEGORIES and cat not in CATEGORY_MAP:
|
||||||
unmapped.append(cat)
|
unmapped.append(cat)
|
||||||
if unmapped:
|
if unmapped:
|
||||||
raise ValueError(
|
raise ValueError(f"Categories missing from CATEGORY_MAP: {sorted(unmapped)}")
|
||||||
f"Categories missing from CATEGORY_MAP: {sorted(unmapped)}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Verify every CATEGORY_MAP key actually exists in the data (catch typos)
|
# Verify every CATEGORY_MAP key actually exists in the data (catch typos)
|
||||||
mapped_but_absent = []
|
mapped_but_absent = []
|
||||||
|
|
@ -623,9 +621,15 @@ def transform(input_path: Path) -> pl.LazyFrame:
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(description="Transform raw POIs to filtered version with friendly names")
|
parser = argparse.ArgumentParser(
|
||||||
parser.add_argument("--input", type=Path, required=True, help="Raw POIs parquet file")
|
description="Transform raw POIs to filtered version with friendly names"
|
||||||
parser.add_argument("--output", type=Path, required=True, help="Output filtered POIs parquet file")
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--input", type=Path, required=True, help="Raw POIs parquet file"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--output", type=Path, required=True, help="Output filtered POIs parquet file"
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
df = transform(args.input).collect()
|
df = transform(args.input).collect()
|
||||||
|
|
|
||||||
|
|
@ -2,4 +2,10 @@ from .fuzzy_join import fuzzy_join_on_postcode
|
||||||
from .haversine import haversine_km, haversine_km_expr
|
from .haversine import haversine_km, haversine_km_expr
|
||||||
from .poi_counts import POI_GROUPS, count_pois_within_radius
|
from .poi_counts import POI_GROUPS, count_pois_within_radius
|
||||||
|
|
||||||
__all__ = ["fuzzy_join_on_postcode", "haversine_km", "haversine_km_expr", "POI_GROUPS", "count_pois_within_radius"]
|
__all__ = [
|
||||||
|
"fuzzy_join_on_postcode",
|
||||||
|
"haversine_km",
|
||||||
|
"haversine_km_expr",
|
||||||
|
"POI_GROUPS",
|
||||||
|
"count_pois_within_radius",
|
||||||
|
]
|
||||||
|
|
|
||||||
|
|
@ -9,14 +9,14 @@ import polars as pl
|
||||||
from thefuzz import fuzz
|
from thefuzz import fuzz
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
_NUMBER_RE = re.compile(r'\d+')
|
_NUMBER_RE = re.compile(r"\d+")
|
||||||
|
|
||||||
|
|
||||||
def _normalize(s: pl.Expr) -> pl.Expr:
|
def _normalize(s: pl.Expr) -> pl.Expr:
|
||||||
return (
|
return (
|
||||||
s.str.to_uppercase()
|
s.str.to_uppercase()
|
||||||
.str.replace_all(r'[,.\-]', ' ')
|
.str.replace_all(r"[,.\-]", " ")
|
||||||
.str.replace_all(r'\s+', ' ')
|
.str.replace_all(r"\s+", " ")
|
||||||
.str.strip_chars()
|
.str.strip_chars()
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -40,22 +40,25 @@ def fuzzy_join_on_postcode(
|
||||||
have null right columns.
|
have null right columns.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
tmpdir = tempfile.mkdtemp(prefix='fuzzy_join_')
|
tmpdir = tempfile.mkdtemp(prefix="fuzzy_join_")
|
||||||
left_path = Path(tmpdir) / 'left.parquet'
|
left_path = Path(tmpdir) / "left.parquet"
|
||||||
right_path = Path(tmpdir) / 'right.parquet'
|
right_path = Path(tmpdir) / "right.parquet"
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Materialise each side exactly once, with a row index, to temp parquet.
|
# Materialise each side exactly once, with a row index, to temp parquet.
|
||||||
left.with_row_index('_left_idx').sink_parquet(left_path)
|
left.with_row_index("_left_idx").sink_parquet(left_path)
|
||||||
right.with_row_index('_right_idx').sink_parquet(right_path)
|
right.with_row_index("_right_idx").sink_parquet(right_path)
|
||||||
|
|
||||||
# Collect only the narrow columns needed for matching (projection pushdown).
|
# Collect only the narrow columns needed for matching (projection pushdown).
|
||||||
left_match = (
|
left_match = (
|
||||||
pl.scan_parquet(left_path)
|
pl.scan_parquet(left_path)
|
||||||
.select(
|
.select(
|
||||||
'_left_idx',
|
"_left_idx",
|
||||||
_normalize(pl.col(left_address_col)).alias('_left_address'),
|
_normalize(pl.col(left_address_col)).alias("_left_address"),
|
||||||
pl.col(left_postcode_col).str.strip_chars().str.to_uppercase().alias('_left_postcode'),
|
pl.col(left_postcode_col)
|
||||||
|
.str.strip_chars()
|
||||||
|
.str.to_uppercase()
|
||||||
|
.alias("_left_postcode"),
|
||||||
)
|
)
|
||||||
.collect()
|
.collect()
|
||||||
)
|
)
|
||||||
|
|
@ -63,18 +66,23 @@ def fuzzy_join_on_postcode(
|
||||||
right_match = (
|
right_match = (
|
||||||
pl.scan_parquet(right_path)
|
pl.scan_parquet(right_path)
|
||||||
.select(
|
.select(
|
||||||
'_right_idx',
|
"_right_idx",
|
||||||
_normalize(pl.col(right_address_col)).alias('_right_address'),
|
_normalize(pl.col(right_address_col)).alias("_right_address"),
|
||||||
pl.col(right_postcode_col).str.strip_chars().str.to_uppercase().alias('_right_postcode'),
|
pl.col(right_postcode_col)
|
||||||
|
.str.strip_chars()
|
||||||
|
.str.to_uppercase()
|
||||||
|
.alias("_right_postcode"),
|
||||||
)
|
)
|
||||||
.unique(subset=['_right_address', '_right_postcode'], keep='first')
|
.unique(subset=["_right_address", "_right_postcode"], keep="first")
|
||||||
.collect()
|
.collect()
|
||||||
)
|
)
|
||||||
|
|
||||||
# Group right side by postcode for fast lookup
|
# Group right side by postcode for fast lookup
|
||||||
right_by_postcode: dict[str, list[tuple[int, str]]] = {}
|
right_by_postcode: dict[str, list[tuple[int, str]]] = {}
|
||||||
for idx, postcode, address in zip(
|
for idx, postcode, address in zip(
|
||||||
right_match['_right_idx'], right_match['_right_postcode'], right_match['_right_address']
|
right_match["_right_idx"],
|
||||||
|
right_match["_right_postcode"],
|
||||||
|
right_match["_right_address"],
|
||||||
):
|
):
|
||||||
if postcode is not None:
|
if postcode is not None:
|
||||||
right_by_postcode.setdefault(postcode, []).append((idx, address))
|
right_by_postcode.setdefault(postcode, []).append((idx, address))
|
||||||
|
|
@ -82,7 +90,9 @@ def fuzzy_join_on_postcode(
|
||||||
# Group left side by postcode
|
# Group left side by postcode
|
||||||
left_by_postcode: dict[str, list[tuple[int, str]]] = {}
|
left_by_postcode: dict[str, list[tuple[int, str]]] = {}
|
||||||
for idx, postcode, address in zip(
|
for idx, postcode, address in zip(
|
||||||
left_match['_left_idx'], left_match['_left_postcode'], left_match['_left_address']
|
left_match["_left_idx"],
|
||||||
|
left_match["_left_postcode"],
|
||||||
|
left_match["_left_address"],
|
||||||
):
|
):
|
||||||
if address is not None and postcode is not None:
|
if address is not None and postcode is not None:
|
||||||
left_by_postcode.setdefault(postcode, []).append((idx, address))
|
left_by_postcode.setdefault(postcode, []).append((idx, address))
|
||||||
|
|
@ -103,7 +113,7 @@ def fuzzy_join_on_postcode(
|
||||||
for pairs in tqdm(
|
for pairs in tqdm(
|
||||||
executor.map(_score_bucket, tasks, chunksize=64),
|
executor.map(_score_bucket, tasks, chunksize=64),
|
||||||
total=len(tasks),
|
total=len(tasks),
|
||||||
desc='Fuzzy matching',
|
desc="Fuzzy matching",
|
||||||
):
|
):
|
||||||
all_pairs.extend(pairs)
|
all_pairs.extend(pairs)
|
||||||
|
|
||||||
|
|
@ -127,24 +137,27 @@ def fuzzy_join_on_postcode(
|
||||||
|
|
||||||
# Build a small mapping LazyFrame and join back to the cached parquets.
|
# Build a small mapping LazyFrame and join back to the cached parquets.
|
||||||
if matches:
|
if matches:
|
||||||
mapping = pl.LazyFrame({
|
mapping = pl.LazyFrame(
|
||||||
'_left_idx': pl.Series([m[0] for m in matches], dtype=pl.UInt32),
|
{
|
||||||
'_right_idx': pl.Series([m[1] for m in matches], dtype=pl.UInt32),
|
"_left_idx": pl.Series([m[0] for m in matches], dtype=pl.UInt32),
|
||||||
})
|
"_right_idx": pl.Series([m[1] for m in matches], dtype=pl.UInt32),
|
||||||
|
}
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
mapping = pl.LazyFrame({
|
mapping = pl.LazyFrame(
|
||||||
'_left_idx': pl.Series([], dtype=pl.UInt32),
|
{
|
||||||
'_right_idx': pl.Series([], dtype=pl.UInt32),
|
"_left_idx": pl.Series([], dtype=pl.UInt32),
|
||||||
})
|
"_right_idx": pl.Series([], dtype=pl.UInt32),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
left_cached = pl.scan_parquet(left_path)
|
left_cached = pl.scan_parquet(left_path)
|
||||||
right_cached = pl.scan_parquet(right_path)
|
right_cached = pl.scan_parquet(right_path)
|
||||||
|
|
||||||
return (
|
return (
|
||||||
left_cached
|
left_cached.join(mapping, on="_left_idx", how="left")
|
||||||
.join(mapping, on='_left_idx', how='left')
|
.join(right_cached, on="_right_idx", how="left")
|
||||||
.join(right_cached, on='_right_idx', how='left')
|
.drop("_left_idx", "_right_idx")
|
||||||
.drop('_left_idx', '_right_idx')
|
|
||||||
)
|
)
|
||||||
except BaseException:
|
except BaseException:
|
||||||
shutil.rmtree(tmpdir, ignore_errors=True)
|
shutil.rmtree(tmpdir, ignore_errors=True)
|
||||||
|
|
@ -158,7 +171,9 @@ def _numbers_compatible(a: str, b: str) -> bool:
|
||||||
"""
|
"""
|
||||||
nums_a = set(_NUMBER_RE.findall(a))
|
nums_a = set(_NUMBER_RE.findall(a))
|
||||||
nums_b = set(_NUMBER_RE.findall(b))
|
nums_b = set(_NUMBER_RE.findall(b))
|
||||||
smaller, larger = (nums_a, nums_b) if len(nums_a) <= len(nums_b) else (nums_b, nums_a)
|
smaller, larger = (
|
||||||
|
(nums_a, nums_b) if len(nums_a) <= len(nums_b) else (nums_b, nums_a)
|
||||||
|
)
|
||||||
if not smaller and larger:
|
if not smaller and larger:
|
||||||
return False
|
return False
|
||||||
return smaller.issubset(larger)
|
return smaller.issubset(larger)
|
||||||
|
|
|
||||||
|
|
@ -6,7 +6,9 @@ import polars as pl
|
||||||
_EARTH_RADIUS_KM = 6371.0
|
_EARTH_RADIUS_KM = 6371.0
|
||||||
|
|
||||||
|
|
||||||
def haversine_km(lat1: np.ndarray, lon1: np.ndarray, lat2: float, lon2: float) -> np.ndarray:
|
def haversine_km(
|
||||||
|
lat1: np.ndarray, lon1: np.ndarray, lat2: float, lon2: float
|
||||||
|
) -> np.ndarray:
|
||||||
"""Compute haversine distance in km between arrays (lat1, lon1) and a single point (lat2, lon2)."""
|
"""Compute haversine distance in km between arrays (lat1, lon1) and a single point (lat2, lon2)."""
|
||||||
lat1_rad = np.radians(lat1)
|
lat1_rad = np.radians(lat1)
|
||||||
lon1_rad = np.radians(lon1)
|
lon1_rad = np.radians(lon1)
|
||||||
|
|
@ -14,7 +16,10 @@ def haversine_km(lat1: np.ndarray, lon1: np.ndarray, lat2: float, lon2: float) -
|
||||||
lon2_rad = np.radians(lon2)
|
lon2_rad = np.radians(lon2)
|
||||||
dlat = lat2_rad - lat1_rad
|
dlat = lat2_rad - lat1_rad
|
||||||
dlon = lon2_rad - lon1_rad
|
dlon = lon2_rad - lon1_rad
|
||||||
a = np.sin(dlat / 2) ** 2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2) ** 2
|
a = (
|
||||||
|
np.sin(dlat / 2) ** 2
|
||||||
|
+ np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2) ** 2
|
||||||
|
)
|
||||||
c = 2 * np.arcsin(np.sqrt(a))
|
c = 2 * np.arcsin(np.sqrt(a))
|
||||||
return _EARTH_RADIUS_KM * c
|
return _EARTH_RADIUS_KM * c
|
||||||
|
|
||||||
|
|
@ -32,5 +37,7 @@ def haversine_km_expr(
|
||||||
dlat = pl.lit(dest_lat_rad) - lat_rad
|
dlat = pl.lit(dest_lat_rad) - lat_rad
|
||||||
dlon = pl.lit(dest_lon_rad) - lon_rad
|
dlon = pl.lit(dest_lon_rad) - lon_rad
|
||||||
|
|
||||||
a = (dlat / 2).sin() ** 2 + pl.lit(dest_lat_rad).cos() * lat_rad.cos() * (dlon / 2).sin() ** 2
|
a = (dlat / 2).sin() ** 2 + pl.lit(dest_lat_rad).cos() * lat_rad.cos() * (
|
||||||
|
dlon / 2
|
||||||
|
).sin() ** 2
|
||||||
return 2 * _EARTH_RADIUS_KM * a.sqrt().arcsin()
|
return 2 * _EARTH_RADIUS_KM * a.sqrt().arcsin()
|
||||||
|
|
|
||||||
|
|
@ -70,7 +70,9 @@ def _count_pois_per_postcode(
|
||||||
pc_codes = postcodes_df["postcode"].to_list()
|
pc_codes = postcodes_df["postcode"].to_list()
|
||||||
|
|
||||||
# Initialize result arrays
|
# Initialize result arrays
|
||||||
result_counts = {group: np.zeros(n_postcodes, dtype=np.int32) for group in POI_GROUPS}
|
result_counts = {
|
||||||
|
group: np.zeros(n_postcodes, dtype=np.int32) for group in POI_GROUPS
|
||||||
|
}
|
||||||
|
|
||||||
# Process in batches with progress
|
# Process in batches with progress
|
||||||
batch_size = 50000
|
batch_size = 50000
|
||||||
|
|
@ -83,7 +85,9 @@ def _count_pois_per_postcode(
|
||||||
end_idx = min(start_idx + batch_size, n_postcodes)
|
end_idx = min(start_idx + batch_size, n_postcodes)
|
||||||
|
|
||||||
if batch_idx % 5 == 0:
|
if batch_idx % 5 == 0:
|
||||||
print(f" Batch {batch_idx + 1}/{n_batches}: postcodes {start_idx:,} - {end_idx:,}")
|
print(
|
||||||
|
f" Batch {batch_idx + 1}/{n_batches}: postcodes {start_idx:,} - {end_idx:,}"
|
||||||
|
)
|
||||||
|
|
||||||
# Process batch
|
# Process batch
|
||||||
for i in range(start_idx, end_idx):
|
for i in range(start_idx, end_idx):
|
||||||
|
|
@ -109,12 +113,7 @@ def _count_pois_per_postcode(
|
||||||
nearby = np.concatenate(nearby_indices)
|
nearby = np.concatenate(nearby_indices)
|
||||||
|
|
||||||
# Vectorized distance calculation for all nearby POIs
|
# Vectorized distance calculation for all nearby POIs
|
||||||
distances = haversine_km(
|
distances = haversine_km(poi_lats[nearby], poi_lngs[nearby], pc_lat, pc_lon)
|
||||||
poi_lats[nearby],
|
|
||||||
poi_lngs[nearby],
|
|
||||||
pc_lat,
|
|
||||||
pc_lon
|
|
||||||
)
|
|
||||||
|
|
||||||
# Filter by radius
|
# Filter by radius
|
||||||
within_mask = distances <= radius_km
|
within_mask = distances <= radius_km
|
||||||
|
|
@ -147,13 +146,13 @@ def count_pois_within_radius(
|
||||||
"""
|
"""
|
||||||
# Get unique postcodes with coordinates
|
# Get unique postcodes with coordinates
|
||||||
print("Deduplicating postcodes...")
|
print("Deduplicating postcodes...")
|
||||||
unique_postcodes = (
|
unique_postcodes = properties.select(["postcode", "lat", "lon"]).unique(
|
||||||
properties
|
subset=["postcode"]
|
||||||
.select(["postcode", "lat", "lon"])
|
|
||||||
.unique(subset=["postcode"])
|
|
||||||
)
|
)
|
||||||
|
|
||||||
print(f" {len(properties):,} properties → {len(unique_postcodes):,} unique postcodes")
|
print(
|
||||||
|
f" {len(properties):,} properties → {len(unique_postcodes):,} unique postcodes"
|
||||||
|
)
|
||||||
|
|
||||||
# Count POIs per postcode
|
# Count POIs per postcode
|
||||||
postcode_counts = _count_pois_per_postcode(unique_postcodes, pois, radius_km)
|
postcode_counts = _count_pois_per_postcode(unique_postcodes, pois, radius_km)
|
||||||
|
|
@ -174,11 +173,7 @@ def count_pois_within_radius(
|
||||||
result_lazy = (
|
result_lazy = (
|
||||||
properties.lazy()
|
properties.lazy()
|
||||||
.select("postcode")
|
.select("postcode")
|
||||||
.join(
|
.join(pl.scan_parquet(tmp_path), on="postcode", how="left")
|
||||||
pl.scan_parquet(tmp_path),
|
|
||||||
on="postcode",
|
|
||||||
how="left"
|
|
||||||
)
|
|
||||||
.select(count_cols)
|
.select(count_cols)
|
||||||
.fill_null(0)
|
.fill_null(0)
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -41,6 +41,6 @@ result = fuzzy_join_on_postcode(
|
||||||
|
|
||||||
snapshot = result.select("pp_address", "ADDRESS").sort("pp_address")
|
snapshot = result.select("pp_address", "ADDRESS").sort("pp_address")
|
||||||
|
|
||||||
print('Testing the matching between EPC and PP addresses')
|
print("Testing the matching between EPC and PP addresses")
|
||||||
with pl.Config(tbl_rows=-1, tbl_cols=-1, fmt_str_lengths=80):
|
with pl.Config(tbl_rows=-1, tbl_cols=-1, fmt_str_lengths=80):
|
||||||
print(snapshot)
|
print(snapshot)
|
||||||
|
|
|
||||||
|
|
@ -73,29 +73,39 @@ class TestHaversineKmExpr:
|
||||||
def test_same_point(self):
|
def test_same_point(self):
|
||||||
"""Distance from a point to itself should be zero."""
|
"""Distance from a point to itself should be zero."""
|
||||||
df = pl.DataFrame({"lat": [51.5074], "lon": [-0.1278]})
|
df = pl.DataFrame({"lat": [51.5074], "lon": [-0.1278]})
|
||||||
result = df.select(haversine_km_expr("lat", "lon", 51.5074, -0.1278).alias("dist"))
|
result = df.select(
|
||||||
|
haversine_km_expr("lat", "lon", 51.5074, -0.1278).alias("dist")
|
||||||
|
)
|
||||||
assert result["dist"][0] == pytest.approx(0.0, abs=1e-10)
|
assert result["dist"][0] == pytest.approx(0.0, abs=1e-10)
|
||||||
|
|
||||||
def test_known_distance_london_to_paris(self):
|
def test_known_distance_london_to_paris(self):
|
||||||
"""Test distance from London to Paris (~344 km)."""
|
"""Test distance from London to Paris (~344 km)."""
|
||||||
df = pl.DataFrame({"lat": [51.5074], "lon": [-0.1278]})
|
df = pl.DataFrame({"lat": [51.5074], "lon": [-0.1278]})
|
||||||
result = df.select(haversine_km_expr("lat", "lon", 48.8566, 2.3522).alias("dist"))
|
result = df.select(
|
||||||
|
haversine_km_expr("lat", "lon", 48.8566, 2.3522).alias("dist")
|
||||||
|
)
|
||||||
assert result["dist"][0] == pytest.approx(344, rel=0.01)
|
assert result["dist"][0] == pytest.approx(344, rel=0.01)
|
||||||
|
|
||||||
def test_known_distance_new_york_to_london(self):
|
def test_known_distance_new_york_to_london(self):
|
||||||
"""Test distance from New York to London (~5570 km)."""
|
"""Test distance from New York to London (~5570 km)."""
|
||||||
df = pl.DataFrame({"lat": [40.7128], "lon": [-74.0060]})
|
df = pl.DataFrame({"lat": [40.7128], "lon": [-74.0060]})
|
||||||
result = df.select(haversine_km_expr("lat", "lon", 51.5074, -0.1278).alias("dist"))
|
result = df.select(
|
||||||
|
haversine_km_expr("lat", "lon", 51.5074, -0.1278).alias("dist")
|
||||||
|
)
|
||||||
assert result["dist"][0] == pytest.approx(5570, rel=0.01)
|
assert result["dist"][0] == pytest.approx(5570, rel=0.01)
|
||||||
|
|
||||||
def test_multiple_points(self):
|
def test_multiple_points(self):
|
||||||
"""Test calculating distances from multiple points to a single destination."""
|
"""Test calculating distances from multiple points to a single destination."""
|
||||||
df = pl.DataFrame({
|
df = pl.DataFrame(
|
||||||
"lat": [51.5074, 48.8566, 40.7128], # London, Paris, NYC
|
{
|
||||||
"lon": [-0.1278, 2.3522, -74.0060],
|
"lat": [51.5074, 48.8566, 40.7128], # London, Paris, NYC
|
||||||
})
|
"lon": [-0.1278, 2.3522, -74.0060],
|
||||||
|
}
|
||||||
|
)
|
||||||
# Distance to Edinburgh
|
# Distance to Edinburgh
|
||||||
result = df.select(haversine_km_expr("lat", "lon", 55.9533, -3.1883).alias("dist"))
|
result = df.select(
|
||||||
|
haversine_km_expr("lat", "lon", 55.9533, -3.1883).alias("dist")
|
||||||
|
)
|
||||||
|
|
||||||
dists = result["dist"].to_numpy()
|
dists = result["dist"].to_numpy()
|
||||||
# All distances should be positive
|
# All distances should be positive
|
||||||
|
|
@ -128,7 +138,9 @@ class TestHaversineConsistency:
|
||||||
|
|
||||||
# Polars version
|
# Polars version
|
||||||
df = pl.DataFrame({"lat": lats, "lon": lons})
|
df = pl.DataFrame({"lat": lats, "lon": lons})
|
||||||
polars_result = df.select(haversine_km_expr("lat", "lon", dest_lat, dest_lon).alias("dist"))
|
polars_result = df.select(
|
||||||
|
haversine_km_expr("lat", "lon", dest_lat, dest_lon).alias("dist")
|
||||||
|
)
|
||||||
polars_dists = polars_result["dist"].to_numpy()
|
polars_dists = polars_result["dist"].to_numpy()
|
||||||
|
|
||||||
# Should be identical (or at least very close due to floating point)
|
# Should be identical (or at least very close due to floating point)
|
||||||
|
|
|
||||||
|
|
@ -7,28 +7,32 @@ from pipeline.utils.poi_counts import POI_GROUPS, count_pois_within_radius
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def pois():
|
def pois():
|
||||||
"""POIs clustered around two locations: central London and 10km away."""
|
"""POIs clustered around two locations: central London and 10km away."""
|
||||||
return pl.DataFrame({
|
return pl.DataFrame(
|
||||||
"lat": [51.5074, 51.5075, 51.5080, 51.5076, 51.5073, 51.60],
|
{
|
||||||
"lng": [-0.1278, -0.1280, -0.1275, -0.1279, -0.1277, -0.20],
|
"lat": [51.5074, 51.5075, 51.5080, 51.5076, 51.5073, 51.60],
|
||||||
"category": [
|
"lng": [-0.1278, -0.1280, -0.1275, -0.1279, -0.1277, -0.20],
|
||||||
"Restaurant",
|
"category": [
|
||||||
"Fast Food",
|
"Restaurant",
|
||||||
"Supermarket",
|
"Fast Food",
|
||||||
"Park",
|
"Supermarket",
|
||||||
"Station",
|
"Park",
|
||||||
"Restaurant", # too far from any property
|
"Station",
|
||||||
],
|
"Restaurant", # too far from any property
|
||||||
})
|
],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def properties():
|
def properties():
|
||||||
"""Two properties at the same postcode near central London, one at a distant postcode."""
|
"""Two properties at the same postcode near central London, one at a distant postcode."""
|
||||||
return pl.DataFrame({
|
return pl.DataFrame(
|
||||||
"postcode": ["EC1A 1BB", "EC1A 1BB", "ZZ99 9ZZ"],
|
{
|
||||||
"lat": [51.5074, 51.5074, 55.0],
|
"postcode": ["EC1A 1BB", "EC1A 1BB", "ZZ99 9ZZ"],
|
||||||
"lon": [-0.1278, -0.1278, -3.0],
|
"lat": [51.5074, 51.5074, 55.0],
|
||||||
})
|
"lon": [-0.1278, -0.1278, -3.0],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_counts_pois_within_radius(properties, pois):
|
def test_counts_pois_within_radius(properties, pois):
|
||||||
|
|
@ -41,9 +45,9 @@ def test_counts_pois_within_radius(properties, pois):
|
||||||
assert len(series) == 3, f"{col} has {len(series)} rows, expected 3"
|
assert len(series) == 3, f"{col} has {len(series)} rows, expected 3"
|
||||||
|
|
||||||
# First two rows share a postcode near the central London cluster
|
# First two rows share a postcode near the central London cluster
|
||||||
assert result["restaurants_2km"][0] == 2 # Restaurant + Fast Food
|
assert result["restaurants_2km"][0] == 2 # Restaurant + Fast Food
|
||||||
assert result["groceries_2km"][0] == 1 # Supermarket
|
assert result["groceries_2km"][0] == 1 # Supermarket
|
||||||
assert result["parks_2km"][0] == 1 # Park
|
assert result["parks_2km"][0] == 1 # Park
|
||||||
assert result["public_transport_2km"][0] == 1 # Station
|
assert result["public_transport_2km"][0] == 1 # Station
|
||||||
|
|
||||||
# Second row is the same postcode, so same counts
|
# Second row is the same postcode, so same counts
|
||||||
|
|
@ -55,11 +59,13 @@ def test_counts_pois_within_radius(properties, pois):
|
||||||
|
|
||||||
|
|
||||||
def test_no_pois_returns_zeros(properties):
|
def test_no_pois_returns_zeros(properties):
|
||||||
empty_pois = pl.DataFrame({
|
empty_pois = pl.DataFrame(
|
||||||
"lat": pl.Series([], dtype=pl.Float64),
|
{
|
||||||
"lng": pl.Series([], dtype=pl.Float64),
|
"lat": pl.Series([], dtype=pl.Float64),
|
||||||
"category": pl.Series([], dtype=pl.String),
|
"lng": pl.Series([], dtype=pl.Float64),
|
||||||
})
|
"category": pl.Series([], dtype=pl.String),
|
||||||
|
}
|
||||||
|
)
|
||||||
result = count_pois_within_radius(properties, empty_pois, radius_km=2.0)
|
result = count_pois_within_radius(properties, empty_pois, radius_km=2.0)
|
||||||
|
|
||||||
for group in POI_GROUPS:
|
for group in POI_GROUPS:
|
||||||
|
|
@ -70,11 +76,13 @@ def test_no_pois_returns_zeros(properties):
|
||||||
|
|
||||||
def test_custom_radius(pois):
|
def test_custom_radius(pois):
|
||||||
"""A tiny radius should exclude POIs that are even slightly away."""
|
"""A tiny radius should exclude POIs that are even slightly away."""
|
||||||
properties = pl.DataFrame({
|
properties = pl.DataFrame(
|
||||||
"postcode": ["EC1A 1BB"],
|
{
|
||||||
"lat": [51.5074],
|
"postcode": ["EC1A 1BB"],
|
||||||
"lon": [-0.1278],
|
"lat": [51.5074],
|
||||||
})
|
"lon": [-0.1278],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
# 0.01 km = 10m — only the POI at the exact same location should match
|
# 0.01 km = 10m — only the POI at the exact same location should match
|
||||||
result = count_pois_within_radius(properties, pois, radius_km=0.01)
|
result = count_pois_within_radius(properties, pois, radius_km=0.01)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue