Add hexagon backend
This commit is contained in:
parent
a7cc4d9b2b
commit
ab704c0dc0
18 changed files with 1443 additions and 0 deletions
0
pipeline/sources/__init__.py
Normal file
0
pipeline/sources/__init__.py
Normal file
48
pipeline/sources/postcodes.py
Normal file
48
pipeline/sources/postcodes.py
Normal file
|
|
@ -0,0 +1,48 @@
|
|||
from pathlib import Path
|
||||
import polars as pl
|
||||
import h3
|
||||
|
||||
from pipeline.config import DATA_DIR, H3_RESOLUTIONS, PROCESSED_DIR
|
||||
|
||||
|
||||
def lat_long_to_h3(lat: float, long: float, resolution: int) -> str:
    """Return the H3 cell index for a coordinate pair.

    Thin wrapper that delegates to ``h3.latlng_to_cell``.

    Args:
        lat: Latitude of the point.
        long: Longitude of the point.
        resolution: H3 resolution to index the point at.

    Returns:
        The H3 cell index produced by ``h3.latlng_to_cell``.
    """
    cell = h3.latlng_to_cell(lat, long, resolution)
    return cell
|
||||
|
||||
|
||||
def load_postcodes() -> pl.LazyFrame:
    """Lazily scan the ArcGIS parquet file and keep the postcode coordinates.

    Returns:
        A LazyFrame with three columns: ``postcode`` (the source ``pcds``
        column, renamed), ``lat`` and ``long``.
    """
    source_file = DATA_DIR / "arcgis_data.parquet"
    wanted = [
        pl.col("pcds").alias("postcode"),
        pl.col("lat"),
        pl.col("long"),
    ]
    return pl.scan_parquet(source_file).select(wanted)
|
||||
|
||||
|
||||
def process_postcodes() -> pl.LazyFrame:
    """Process postcodes and add H3 indices at multiple resolutions.

    For every resolution in ``H3_RESOLUTIONS`` a string column named
    ``h3_res{resolution}`` is appended, holding the H3 cell index of the
    row's ``lat``/``long`` pair.

    Rows whose ``lat`` or ``long`` is null get a null H3 index rather than
    being passed to h3, so one incomplete record no longer aborts the run.

    Returns:
        A LazyFrame wrapping the already-materialized result: the postcode
        columns from ``load_postcodes`` plus one ``h3_res*`` column per
        configured resolution.
    """

    def _h3_expr(resolution: int) -> pl.Expr:
        # The resolution is bound as a parameter of this factory, so the
        # per-row callback never depends on a loop variable that may have
        # moved on by the time the expression is evaluated.
        def _to_cell(coords: dict) -> str | None:
            lat = coords["lat"]
            long = coords["long"]
            # A struct with null fields is itself non-null, so polars still
            # calls the UDF for such rows; guard before handing off to h3.
            if lat is None or long is None:
                return None
            return lat_long_to_h3(lat, long, resolution)

        return (
            pl.struct(["lat", "long"])
            .map_elements(_to_cell, return_dtype=pl.Utf8)
            .alias(f"h3_res{resolution}")
        )

    # Materialize once; the Python UDFs below run eagerly on this frame.
    df = load_postcodes().collect()

    for res in H3_RESOLUTIONS:
        df = df.with_columns(_h3_expr(res))

    return df.lazy()
|
||||
|
||||
|
||||
def save_postcodes(output_path: Path | None = None) -> Path:
    """Run the postcode processing step and write the result to parquet.

    Args:
        output_path: Destination file. When omitted (or falsy), the file
            ``postcodes_h3.parquet`` inside ``PROCESSED_DIR`` is used.

    Returns:
        The path the parquet file was written to.
    """
    target = output_path if output_path else PROCESSED_DIR / "postcodes_h3.parquet"

    # Make sure the destination directory exists before writing.
    target.parent.mkdir(parents=True, exist_ok=True)

    process_postcodes().collect().write_parquet(target)
    return target
|
||||
41
pipeline/sources/property_prices.py
Normal file
41
pipeline/sources/property_prices.py
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
import polars as pl
|
||||
|
||||
from pipeline.base import DataSource
|
||||
from pipeline.config import DATA_DIR, H3_RESOLUTIONS
|
||||
|
||||
|
||||
class PropertyPricesSource(DataSource):
    """Data source for the Land Registry property price records."""

    @property
    def name(self) -> str:
        """Identifier of this data source."""
        return "property_prices"

    def load(self) -> pl.LazyFrame:
        """Lazily scan the raw price-paid parquet file in ``DATA_DIR``."""
        raw_file = DATA_DIR / "pp-complete.parquet"
        return pl.scan_parquet(raw_file)

    def process(self, postcodes: pl.LazyFrame) -> pl.LazyFrame:
        """Attach coordinates and H3 indices to each price record.

        Args:
            postcodes: Frame keyed by ``postcode`` that provides ``lat``,
                ``long`` and one ``h3_res{res}`` column for every
                resolution in ``H3_RESOLUTIONS``.

        Returns:
            A LazyFrame with ``price``, ``year``, ``property_type``, ``lat``,
            ``long`` and the ``h3_res*`` columns. Records whose postcode has
            no match in ``postcodes`` are dropped by the inner join.
        """
        # Keep only the fields needed downstream; reduce the transfer date
        # to its calendar year.
        transfer_year = pl.col("date_of_transfer").dt.year().alias("year")
        trimmed = self.load().select(
            "price",
            transfer_year,
            "property_type",
            "postcode",
        )

        located = trimmed.join(postcodes, on="postcode", how="inner")

        h3_names = [f"h3_res{res}" for res in H3_RESOLUTIONS]
        output_columns = ["price", "year", "property_type", "lat", "long", *h3_names]
        return located.select(output_columns)
|
||||
Loading…
Add table
Add a link
Reference in a new issue