Remove finder
This commit is contained in:
parent
55238f59aa
commit
cd778dd088
26 changed files with 0 additions and 57826 deletions
|
|
@ -1,183 +0,0 @@
|
|||
import logging
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
import polars as pl
|
||||
|
||||
from constants import MAX_BEDROOMS, MAX_RENT_MONTHLY, MIN_RENT_MONTHLY
|
||||
from transform import map_property_type, normalize_postcode, normalize_price
|
||||
|
||||
log = logging.getLogger("rightmove")
|
||||
|
||||
|
||||
def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
    """Write scraped property records to a parquet file.

    The records are cleaned before the frame is built:

    * "Bedrooms"/"Bathrooms" values above ``MAX_BEDROOMS`` are set to None
      and "Number of bedrooms & living rooms" is recomputed.
    * "Property type" is re-derived from "Property sub-type" with
      ``map_property_type``.
    * "first_visible_date" is parsed into a naive datetime ("Listing date").
    * "price" becomes "Asking price" (buy) or "Asking rent (monthly)"
      (rent); zero/missing prices and out-of-range rents become null.

    For the "buy" channel an extra "Asking price per sqm" column is added.

    Args:
        properties: Property dicts. They are mutated in place by the
            sanitization and type re-mapping steps.
        path: Destination of the parquet file.
        channel: "buy" or "rent". Any value other than "buy" is handled
            as a rental channel.

    Returns:
        None. When ``properties`` is empty a warning is logged and no file
        is written.
    """
    if not properties:
        log.warning("No properties to write to %s", path)
        return

    sanitized = _sanitize_room_counts(properties)
    if sanitized:
        log.warning(
            "Sanitized %d properties with bedroom/bathroom counts > %d (set to null)",
            sanitized,
            MAX_BEDROOMS,
        )

    remapped = _remap_property_types(properties)
    if remapped:
        log.info("Re-mapped %d property types from sub-types", remapped)

    listing_dates = [
        _parse_listing_date(p.get("first_visible_date", "")) for p in properties
    ]
    # A non-empty raw value that produced no datetime could not be parsed;
    # report it instead of dropping it silently.
    unparsed = sum(
        1
        for p, parsed in zip(properties, listing_dates)
        if parsed is None and p.get("first_visible_date")
    )
    if unparsed:
        log.warning(
            "Could not parse first_visible_date for %d properties (set to null)",
            unparsed,
        )

    asking_prices, asking_rents, listing_statuses = _derive_asking_columns(
        properties, channel
    )

    df = pl.DataFrame(
        {
            "Bedrooms": [p["Bedrooms"] for p in properties],
            "Bathrooms": [p["Bathrooms"] for p in properties],
            "Number of bedrooms & living rooms": [
                p["Number of bedrooms & living rooms"] for p in properties
            ],
            "lon": [p["lon"] for p in properties],
            "lat": [p["lat"] for p in properties],
            "Postcode": [normalize_postcode(p["Postcode"]) for p in properties],
            "Address per Property Register": [
                p["Address per Property Register"] for p in properties
            ],
            "Leasehold/Freehold": [p["Leasehold/Freehold"] for p in properties],
            "Property type": [p["Property type"] for p in properties],
            "Property sub-type": [p["Property sub-type"] for p in properties],
            "Price qualifier": [p["Price qualifier"] for p in properties],
            "Total floor area (sqm)": [p["Total floor area (sqm)"] for p in properties],
            "Listing URL": [p["Listing URL"] for p in properties],
            "Listing features": [p["Listing features"] for p in properties],
            "Listing date": listing_dates,
            "Listing status": listing_statuses,
            "Asking price": asking_prices,
            "Asking rent (monthly)": asking_rents,
        },
        schema={
            "Bedrooms": pl.Int32,
            "Bathrooms": pl.Int32,
            "Number of bedrooms & living rooms": pl.Int32,
            "lon": pl.Float64,
            "lat": pl.Float64,
            "Postcode": pl.Utf8,
            "Address per Property Register": pl.Utf8,
            "Leasehold/Freehold": pl.Utf8,
            "Property type": pl.Utf8,
            "Property sub-type": pl.Utf8,
            "Price qualifier": pl.Utf8,
            "Total floor area (sqm)": pl.Float64,
            "Listing URL": pl.Utf8,
            "Listing features": pl.List(pl.Utf8),
            "Listing date": pl.Datetime("us"),
            "Listing status": pl.Utf8,
            "Asking price": pl.Int64,
            "Asking rent (monthly)": pl.Int64,
        },
    )

    # Derive asking price per sqm for buy listings. The non-strict cast
    # is kept from the original so values that do not fit Int32 do not raise.
    if channel == "buy":
        df = df.with_columns(
            (pl.col("Asking price") / pl.col("Total floor area (sqm)"))
            .round(0)
            .cast(pl.Int32, strict=False)
            .alias("Asking price per sqm"),
        )

    df.write_parquet(path)
    log.info("Wrote %d properties to %s", len(df), path)


def _sanitize_room_counts(properties: list[dict]) -> int:
    """Null implausible bedroom/bathroom counts; return how many properties changed."""
    affected = 0
    for p in properties:
        nulled = False
        # Values above MAX_BEDROOMS are almost certainly prices or other
        # numeric fields mis-parsed as room counts.
        for key in ("Bedrooms", "Bathrooms"):
            val = p.get(key, 0) or 0
            if val > MAX_BEDROOMS:
                p[key] = None
                nulled = True
        if nulled:
            # Count each property once, even when both fields were bad,
            # so the number matches the "properties" wording of the log.
            affected += 1

        # Recompute the derived field after sanitization.
        # NOTE(review): the column name mentions living rooms but the value
        # is Bedrooms + Bathrooms; kept as in the original, confirm intent.
        beds = p.get("Bedrooms")
        baths = p.get("Bathrooms")
        if beds is None or baths is None:
            p["Number of bedrooms & living rooms"] = None
        else:
            p["Number of bedrooms & living rooms"] = beds + baths
    return affected


def _remap_property_types(properties: list[dict]) -> int:
    """Re-derive "Property type" from "Property sub-type"; return the number changed."""
    # Retroactively fixes data scraped with older versions of the type map.
    remapped = 0
    for p in properties:
        sub_type = p.get("Property sub-type", "")
        if not sub_type or sub_type == "Unknown":
            continue
        new_type = map_property_type(sub_type)
        if new_type != p.get("Property type"):
            p["Property type"] = new_type
            remapped += 1
    return remapped


def _parse_listing_date(raw):
    """Parse one raw first_visible_date value into a naive datetime, or None."""
    # Imported once here rather than inside the per-property loop.
    from datetime import timedelta, timezone

    if not raw:
        return None

    if isinstance(raw, datetime):
        parsed = raw
    elif isinstance(raw, str):
        try:
            parsed = datetime.fromisoformat(raw.replace("Z", "+00:00"))
        except (ValueError, TypeError):
            parsed = None
    else:
        # Neither text nor a datetime: nothing sensible to parse.
        return None

    if parsed is not None:
        # Convert to UTC naive datetime for consistent storage.
        if parsed.tzinfo is not None:
            parsed = parsed.astimezone(timezone.utc).replace(tzinfo=None)
        return parsed

    # Additional date formats (OpenRent: "DD Month, YYYY", "Today").
    text = raw.strip()
    keyword = text.lower()
    if keyword in ("today", "tomorrow"):
        # NOTE(review): datetime.now() is local time while ISO dates above
        # are stored as UTC; kept as in the original, confirm the
        # scraper host's timezone makes this equivalent.
        midnight = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
        if keyword == "tomorrow":
            return midnight + timedelta(days=1)
        return midnight

    for fmt in ("%d %B, %Y", "%d %B %Y"):
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            continue
    return None


def _derive_asking_columns(properties: list[dict], channel: str):
    """Build the (asking prices, asking rents, listing statuses) column lists."""
    count = len(properties)

    if channel == "buy":
        # Zero prices indicate parsing failures or POA/auction listings;
        # a missing (None) price is treated the same way: null.
        asking_prices = []
        for p in properties:
            price = p["price"]
            if price is not None and price > 0:
                asking_prices.append(price)
            else:
                asking_prices.append(None)
        return asking_prices, [None] * count, ["For sale"] * count

    # Normalize to monthly, then apply sanity bounds. Rents outside
    # [MIN_RENT_MONTHLY, MAX_RENT_MONTHLY] are almost always total-stay
    # pricing (short lets), annual rents mislabelled as monthly, or £0
    # placeholders; null them out rather than polluting aggregates.
    rent_outliers = 0
    asking_rents = []
    for p in properties:
        price = p["price"]
        if price is None:
            # A missing price is handled like a £0 placeholder.
            monthly = None
        else:
            monthly = normalize_price(price, p["price_frequency"])
        if monthly is None or monthly < MIN_RENT_MONTHLY or monthly > MAX_RENT_MONTHLY:
            rent_outliers += 1
            asking_rents.append(None)
        else:
            asking_rents.append(monthly)

    if rent_outliers:
        log.warning(
            "Nulled %d rent outliers outside [£%d, £%d]/month",
            rent_outliers,
            MIN_RENT_MONTHLY,
            MAX_RENT_MONTHLY,
        )
    return [None] * count, asking_rents, ["For rent"] * count
|
||||
Loading…
Add table
Add a link
Reference in a new issue