all good
This commit is contained in:
parent
47d89f6fad
commit
017902b8e6
82 changed files with 331466 additions and 54841 deletions
|
|
@ -4,17 +4,14 @@ from pathlib import Path
|
|||
|
||||
import polars as pl
|
||||
|
||||
from constants import MAX_BEDROOMS, MAX_RENT_MONTHLY, MIN_RENT_MONTHLY
|
||||
from transform import map_property_type, normalize_postcode, normalize_price
|
||||
from constants import MAX_BEDROOMS
|
||||
from transform import map_property_type, normalize_postcode
|
||||
|
||||
log = logging.getLogger("rightmove")
|
||||
|
||||
|
||||
def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
|
||||
"""Write properties list to parquet with server-ready column names.
|
||||
|
||||
channel: "buy" or "rent"
|
||||
"""
|
||||
def write_parquet(properties: list[dict], path: Path) -> None:
|
||||
"""Write sale properties list to parquet with server-ready column names."""
|
||||
if not properties:
|
||||
log.warning("No properties to write to %s", path)
|
||||
return
|
||||
|
|
@ -69,7 +66,7 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
|
|||
dt = dt.astimezone(timezone.utc).replace(tzinfo=None)
|
||||
listing_dates.append(dt)
|
||||
except (ValueError, TypeError):
|
||||
# Try additional date formats (OpenRent: "DD Month, YYYY", "Today")
|
||||
# Try additional date formats used by scraped listing sources.
|
||||
parsed = None
|
||||
stripped = fvd.strip()
|
||||
lower = stripped.lower()
|
||||
|
|
@ -93,35 +90,9 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
|
|||
else:
|
||||
listing_dates.append(None)
|
||||
|
||||
# Derive asking price / asking rent based on channel
|
||||
# Zero prices indicate parsing failures or POA/auction listings — treat as null
|
||||
if channel == "buy":
|
||||
asking_prices = [p["price"] if p["price"] > 0 else None for p in properties]
|
||||
asking_rents = [None] * len(properties)
|
||||
listing_statuses = ["For sale"] * len(properties)
|
||||
else:
|
||||
asking_prices = [None] * len(properties)
|
||||
# Normalize to monthly, then apply sanity bounds. Rents outside
|
||||
# [MIN_RENT_MONTHLY, MAX_RENT_MONTHLY] are almost always total-stay
|
||||
# pricing (short lets), annual rents mislabelled as monthly, or £0
|
||||
# placeholders — null them out rather than polluting aggregates.
|
||||
rent_outliers = 0
|
||||
asking_rents = []
|
||||
for p in properties:
|
||||
monthly = normalize_price(p["price"], p["price_frequency"])
|
||||
if monthly < MIN_RENT_MONTHLY or monthly > MAX_RENT_MONTHLY:
|
||||
rent_outliers += 1
|
||||
asking_rents.append(None)
|
||||
else:
|
||||
asking_rents.append(monthly)
|
||||
if rent_outliers:
|
||||
log.warning(
|
||||
"Nulled %d rent outliers outside [£%d, £%d]/month",
|
||||
rent_outliers,
|
||||
MIN_RENT_MONTHLY,
|
||||
MAX_RENT_MONTHLY,
|
||||
)
|
||||
listing_statuses = ["For rent"] * len(properties)
|
||||
asking_prices = [p["price"] if p["price"] > 0 else None for p in properties]
|
||||
listing_statuses = ["For sale"] * len(properties)
|
||||
|
||||
df = pl.DataFrame(
|
||||
{
|
||||
|
|
@ -146,7 +117,6 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
|
|||
"Listing date": listing_dates,
|
||||
"Listing status": listing_statuses,
|
||||
"Asking price": asking_prices,
|
||||
"Asking rent (monthly)": asking_rents,
|
||||
},
|
||||
schema={
|
||||
"Bedrooms": pl.Int32,
|
||||
|
|
@ -166,18 +136,15 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
|
|||
"Listing date": pl.Datetime("us"),
|
||||
"Listing status": pl.Utf8,
|
||||
"Asking price": pl.Int64,
|
||||
"Asking rent (monthly)": pl.Int64,
|
||||
},
|
||||
)
|
||||
|
||||
# Derive asking price per sqm for buy listings
|
||||
if channel == "buy":
|
||||
df = df.with_columns(
|
||||
(pl.col("Asking price") / pl.col("Total floor area (sqm)"))
|
||||
.round(0)
|
||||
.cast(pl.Int32, strict=False)
|
||||
.alias("Asking price per sqm"),
|
||||
)
|
||||
df = df.with_columns(
|
||||
(pl.col("Asking price") / pl.col("Total floor area (sqm)"))
|
||||
.round(0)
|
||||
.cast(pl.Int32, strict=False)
|
||||
.alias("Asking price per sqm"),
|
||||
)
|
||||
|
||||
df.write_parquet(path)
|
||||
log.info("Wrote %d properties to %s", len(df), path)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue