This commit is contained in:
Andras Schmelczer 2026-02-15 09:48:30 +00:00
parent 128b3191e7
commit 03445188ea
54 changed files with 596953 additions and 3577 deletions

65
finder/storage.py Normal file
View file

@ -0,0 +1,65 @@
import logging
from pathlib import Path
import polars as pl
# Module-level logger, registered under the fixed name "rightmove" (not __name__).
log = logging.getLogger("rightmove")
def write_parquet(properties: list[dict], path: Path) -> None:
    """Persist property records to a parquet file via Polars.

    Every record must provide each column named in the schema below; a
    missing key raises ``KeyError``. When ``properties`` is empty a warning
    is logged and no file is written.
    """
    if not properties:
        log.warning("No properties to write to %s", path)
        return

    # Column name -> Polars dtype. Insertion order here is also the order in
    # which the columns are extracted from the records.
    column_types = {
        "id": pl.Int64,
        "bedrooms": pl.Int32,
        "bathrooms": pl.Int32,
        "total_rooms": pl.Int32,
        "longitude": pl.Float64,
        "latitude": pl.Float64,
        "postcode": pl.Utf8,
        "address": pl.Utf8,
        "tenure": pl.Utf8,
        "property_type": pl.Utf8,
        "property_sub_type": pl.Utf8,
        "price": pl.Int64,
        "price_frequency": pl.Utf8,
        "price_qualifier": pl.Utf8,
        "floorspace_sqm": pl.Float64,
        "url": pl.Utf8,
        "features": pl.List(pl.Utf8),
        "first_visible_date": pl.Utf8,
        "update_date": pl.Utf8,
        "outcode": pl.Utf8,
        "house_share": pl.Boolean,
    }

    # Pivot the row-oriented records into one list of values per column.
    columns = {
        name: [record[name] for record in properties]
        for name in column_types
    }

    frame = pl.DataFrame(columns, schema=column_types)
    frame.write_parquet(path)
    log.info("Wrote %d properties to %s", len(frame), path)