changes
This commit is contained in:
parent
524580eb25
commit
ffe080adef
82 changed files with 2652 additions and 2956 deletions
|
|
@ -1,63 +1,94 @@
|
|||
import logging
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
import polars as pl
|
||||
|
||||
from transform import normalize_price
|
||||
|
||||
log = logging.getLogger("rightmove")
|
||||
|
||||
|
||||
def write_parquet(properties: list[dict], path: Path) -> None:
|
||||
"""Write properties list to parquet using Polars."""
|
||||
def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
|
||||
"""Write properties list to parquet with server-ready column names.
|
||||
|
||||
channel: "buy" or "rent"
|
||||
"""
|
||||
if not properties:
|
||||
log.warning("No properties to write to %s", path)
|
||||
return
|
||||
|
||||
# Parse first_visible_date to datetime
|
||||
listing_dates = []
|
||||
for p in properties:
|
||||
fvd = p.get("first_visible_date", "")
|
||||
if fvd:
|
||||
try:
|
||||
dt = datetime.fromisoformat(fvd.replace("Z", "+00:00"))
|
||||
listing_dates.append(dt.replace(tzinfo=None))
|
||||
except (ValueError, TypeError):
|
||||
listing_dates.append(None)
|
||||
else:
|
||||
listing_dates.append(None)
|
||||
|
||||
# Derive asking price / asking rent based on channel
|
||||
if channel == "buy":
|
||||
asking_prices = [p["price"] for p in properties]
|
||||
asking_rents = [None] * len(properties)
|
||||
listing_statuses = ["For sale"] * len(properties)
|
||||
else:
|
||||
asking_prices = [None] * len(properties)
|
||||
asking_rents = [
|
||||
normalize_price(p["price"], p["price_frequency"]) for p in properties
|
||||
]
|
||||
listing_statuses = ["For rent"] * len(properties)
|
||||
|
||||
df = pl.DataFrame(
|
||||
{
|
||||
"id": [p["id"] for p in properties],
|
||||
"bedrooms": [p["bedrooms"] for p in properties],
|
||||
"bathrooms": [p["bathrooms"] for p in properties],
|
||||
"total_rooms": [p["total_rooms"] for p in properties],
|
||||
"longitude": [p["longitude"] for p in properties],
|
||||
"latitude": [p["latitude"] for p in properties],
|
||||
"postcode": [p["postcode"] for p in properties],
|
||||
"address": [p["address"] for p in properties],
|
||||
"tenure": [p["tenure"] for p in properties],
|
||||
"property_type": [p["property_type"] for p in properties],
|
||||
"property_sub_type": [p["property_sub_type"] for p in properties],
|
||||
"price": [p["price"] for p in properties],
|
||||
"price_frequency": [p["price_frequency"] for p in properties],
|
||||
"price_qualifier": [p["price_qualifier"] for p in properties],
|
||||
"floorspace_sqm": [p["floorspace_sqm"] for p in properties],
|
||||
"url": [p["url"] for p in properties],
|
||||
"features": [p["features"] for p in properties],
|
||||
"first_visible_date": [p["first_visible_date"] for p in properties],
|
||||
"update_date": [p["update_date"] for p in properties],
|
||||
"outcode": [p["outcode"] for p in properties],
|
||||
"house_share": [p["house_share"] for p in properties],
|
||||
"Bedrooms": [p["Bedrooms"] for p in properties],
|
||||
"Bathrooms": [p["Bathrooms"] for p in properties],
|
||||
"Number of bedrooms & living rooms": [
|
||||
p["Number of bedrooms & living rooms"] for p in properties
|
||||
],
|
||||
"lon": [p["lon"] for p in properties],
|
||||
"lat": [p["lat"] for p in properties],
|
||||
"Postcode": [p["Postcode"] for p in properties],
|
||||
"Address per Property Register": [
|
||||
p["Address per Property Register"] for p in properties
|
||||
],
|
||||
"Leashold/Freehold": [p["Leashold/Freehold"] for p in properties],
|
||||
"Property type": [p["Property type"] for p in properties],
|
||||
"Property sub-type": [p["Property sub-type"] for p in properties],
|
||||
"Price qualifier": [p["Price qualifier"] for p in properties],
|
||||
"Total floor area (sqm)": [
|
||||
p["Total floor area (sqm)"] for p in properties
|
||||
],
|
||||
"Listing URL": [p["Listing URL"] for p in properties],
|
||||
"Listing features": [p["Listing features"] for p in properties],
|
||||
"Listing date": listing_dates,
|
||||
"Listing status": listing_statuses,
|
||||
"Asking price": asking_prices,
|
||||
"Asking rent (monthly)": asking_rents,
|
||||
},
|
||||
schema={
|
||||
"id": pl.Int64,
|
||||
"bedrooms": pl.Int32,
|
||||
"bathrooms": pl.Int32,
|
||||
"total_rooms": pl.Int32,
|
||||
"longitude": pl.Float64,
|
||||
"latitude": pl.Float64,
|
||||
"postcode": pl.Utf8,
|
||||
"address": pl.Utf8,
|
||||
"tenure": pl.Utf8,
|
||||
"property_type": pl.Utf8,
|
||||
"property_sub_type": pl.Utf8,
|
||||
"price": pl.Int64,
|
||||
"price_frequency": pl.Utf8,
|
||||
"price_qualifier": pl.Utf8,
|
||||
"floorspace_sqm": pl.Float64,
|
||||
"url": pl.Utf8,
|
||||
"features": pl.List(pl.Utf8),
|
||||
"first_visible_date": pl.Utf8,
|
||||
"update_date": pl.Utf8,
|
||||
"outcode": pl.Utf8,
|
||||
"house_share": pl.Boolean,
|
||||
"Bedrooms": pl.Int32,
|
||||
"Bathrooms": pl.Int32,
|
||||
"Number of bedrooms & living rooms": pl.Int32,
|
||||
"lon": pl.Float64,
|
||||
"lat": pl.Float64,
|
||||
"Postcode": pl.Utf8,
|
||||
"Address per Property Register": pl.Utf8,
|
||||
"Leashold/Freehold": pl.Utf8,
|
||||
"Property type": pl.Utf8,
|
||||
"Property sub-type": pl.Utf8,
|
||||
"Price qualifier": pl.Utf8,
|
||||
"Total floor area (sqm)": pl.Float64,
|
||||
"Listing URL": pl.Utf8,
|
||||
"Listing features": pl.List(pl.Utf8),
|
||||
"Listing date": pl.Datetime("us"),
|
||||
"Listing status": pl.Utf8,
|
||||
"Asking price": pl.Int64,
|
||||
"Asking rent (monthly)": pl.Int64,
|
||||
},
|
||||
)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue