all is well
This commit is contained in:
parent
eac1bd0d13
commit
2f149503bb
53 changed files with 1543 additions and 354 deletions
|
|
@ -82,7 +82,7 @@ PROPERTY_TYPE_MAP = {
|
|||
"Farm / Barn": "Other",
|
||||
"Farm House": "Other",
|
||||
"House": "Detached",
|
||||
"House of Multiple Occupation": "Flats/Maisonettes",
|
||||
"House of Multiple Occupation": "Other",
|
||||
"House Share": "Other",
|
||||
"Not Specified": "Other",
|
||||
"Chalet": "Other",
|
||||
|
|
@ -90,15 +90,15 @@ PROPERTY_TYPE_MAP = {
|
|||
"Coach House": "Other",
|
||||
"Character Property": "Other",
|
||||
"Cluster House": "Other",
|
||||
"Retirement Property": "Flats/Maisonettes",
|
||||
"Retirement Property": "Other",
|
||||
"Parking": "Other",
|
||||
"Plot": "Other",
|
||||
"Garages": "Other",
|
||||
"Mews": "Terraced",
|
||||
"Property": "Other",
|
||||
"Flat Share": "Other",
|
||||
"Block of Apartments": "Flats/Maisonettes",
|
||||
"Private Halls": "Flats/Maisonettes",
|
||||
"Block of Apartments": "Other",
|
||||
"Private Halls": "Other",
|
||||
"Terraced Bungalow": "Terraced",
|
||||
"Equestrian Facility": "Other",
|
||||
"Ground Maisonette": "Flats/Maisonettes",
|
||||
|
|
@ -107,13 +107,13 @@ PROPERTY_TYPE_MAP = {
|
|||
"Farm Land": "Other",
|
||||
"House Boat": "Other",
|
||||
"Barn": "Other",
|
||||
"Serviced Apartments": "Flats/Maisonettes",
|
||||
"Serviced Apartments": "Other",
|
||||
# Space-separated variants (from home.co.uk underscore/hyphen normalization)
|
||||
"Semi Detached": "Semi-Detached",
|
||||
"Semi Detached Bungalow": "Semi-Detached",
|
||||
"End Of Terrace": "Terraced",
|
||||
"End Terrace": "Terraced",
|
||||
"Block Of Apartments": "Flats/Maisonettes",
|
||||
"Block Of Apartments": "Other",
|
||||
# Lowercase variants (from home.co.uk / Rightmove APIs)
|
||||
"house": "Detached",
|
||||
"bungalow": "Other",
|
||||
|
|
@ -121,7 +121,7 @@ PROPERTY_TYPE_MAP = {
|
|||
"land": "Other",
|
||||
"other": "Other",
|
||||
"not-specified": "Other",
|
||||
"retirement-property": "Flats/Maisonettes",
|
||||
"retirement-property": "Other",
|
||||
"equestrian-facility": "Other",
|
||||
"flat": "Flats/Maisonettes",
|
||||
"detached": "Detached",
|
||||
|
|
|
|||
|
|
@ -19,7 +19,12 @@ from constants import (
|
|||
RETRY_BASE_DELAY,
|
||||
)
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import normalize_postcode, normalize_sub_type, validate_floor_area
|
||||
from transform import (
|
||||
normalize_postcode,
|
||||
normalize_sub_type,
|
||||
parse_int_value,
|
||||
validate_floor_area,
|
||||
)
|
||||
|
||||
log = logging.getLogger("homecouk")
|
||||
|
||||
|
|
@ -170,11 +175,19 @@ def parse_floor_area(description: str | None) -> float | None:
|
|||
"""Try to extract floor area from description text like '789 sq.ft.' or '73 sq.m.'."""
|
||||
if not description:
|
||||
return None
|
||||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", description, re.IGNORECASE)
|
||||
m = re.search(
|
||||
r"([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*ft|square\s+feet|ft(?:\^?2|²))",
|
||||
description,
|
||||
re.IGNORECASE,
|
||||
)
|
||||
if m:
|
||||
sqft = float(m.group(1).replace(",", ""))
|
||||
return validate_floor_area(round(sqft * 0.092903, 1))
|
||||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", description, re.IGNORECASE)
|
||||
m = re.search(
|
||||
r"([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*m|square\s+met(?:er|re)s?|m(?:\^?2|²))",
|
||||
description,
|
||||
re.IGNORECASE,
|
||||
)
|
||||
if m:
|
||||
return validate_floor_area(round(float(m.group(1).replace(",", "")), 1))
|
||||
return None
|
||||
|
|
@ -237,6 +250,15 @@ def map_property_type(raw_type: str | None) -> str:
|
|||
# Home.co.uk uses types like "House", "Flat", "Apartment", "Detached", etc.
|
||||
# Try common patterns
|
||||
lower = raw_type.lower()
|
||||
excluded_flat_like = (
|
||||
"block of apartment",
|
||||
"house of multiple occupation",
|
||||
"private halls",
|
||||
"retirement",
|
||||
"serviced apartment",
|
||||
)
|
||||
if any(term in lower for term in excluded_flat_like):
|
||||
return "Other"
|
||||
if (
|
||||
"flat" in lower
|
||||
or "apartment" in lower
|
||||
|
|
@ -269,8 +291,10 @@ def transform_property(
|
|||
log.debug("Coords outside England: lat=%.4f lng=%.4f — skipping", lat, lng)
|
||||
return None
|
||||
|
||||
price = prop.get("price") or prop.get("latest_price")
|
||||
if not price or int(price) <= 0:
|
||||
price = parse_int_value(prop.get("price")) or parse_int_value(
|
||||
prop.get("latest_price")
|
||||
)
|
||||
if not price or price <= 0:
|
||||
return None
|
||||
|
||||
# Home.co.uk provides postcodes directly, but fall back to spatial index
|
||||
|
|
@ -281,10 +305,10 @@ def transform_property(
|
|||
log.debug("No postcode for property at %.4f, %.4f — skipping", lat, lng)
|
||||
return None
|
||||
|
||||
raw_beds = prop.get("bedrooms", 0) or 0
|
||||
raw_baths = prop.get("bathrooms", 0) or 0
|
||||
bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0
|
||||
bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0
|
||||
raw_beds = parse_int_value(prop.get("bedrooms")) or 0
|
||||
raw_baths = parse_int_value(prop.get("bathrooms")) or 0
|
||||
bedrooms = raw_beds if 0 <= raw_beds <= MAX_BEDROOMS else 0
|
||||
bathrooms = raw_baths if 0 <= raw_baths <= MAX_BEDROOMS else 0
|
||||
if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
|
||||
log.warning(
|
||||
"home.co.uk %s: implausible beds=%d baths=%d (capped to 0)",
|
||||
|
|
@ -318,7 +342,7 @@ def transform_property(
|
|||
"Leasehold/Freehold": parse_tenure(prop),
|
||||
"Property type": map_property_type(listing_type),
|
||||
"Property sub-type": normalize_sub_type(listing_type),
|
||||
"price": int(price),
|
||||
"price": price,
|
||||
"price_frequency": "",
|
||||
"Price qualifier": price_qualifier,
|
||||
"Total floor area (sqm)": parse_floor_area(prop.get("description")),
|
||||
|
|
@ -362,7 +386,16 @@ def search_outcode(
|
|||
break
|
||||
|
||||
for prop in raw_props:
|
||||
transformed = transform_property(prop, pc_index)
|
||||
try:
|
||||
transformed = transform_property(prop, pc_index)
|
||||
except Exception as exc:
|
||||
log.warning(
|
||||
"home.co.uk %s property %s failed to transform: %s",
|
||||
outcode,
|
||||
prop.get("listing_id") or prop.get("property_id") or "?",
|
||||
exc,
|
||||
)
|
||||
continue
|
||||
if transformed:
|
||||
properties.append(transformed)
|
||||
if max_properties is not None and len(properties) >= max_properties:
|
||||
|
|
|
|||
63
finder/listing_filters.py
Normal file
63
finder/listing_filters.py
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
"""Shared target filters for manual buy-listing scrapes."""
|
||||
|
||||
import math
|
||||
from typing import Any
|
||||
|
||||
BUY_MAX_PRICE = 1_000_000
|
||||
BUY_MIN_BEDROOMS = 2
|
||||
BUY_MAX_BEDROOMS = 5
|
||||
BUY_ALLOWED_BATHROOMS = frozenset({2, 3})
|
||||
BUY_MIN_FLOOR_AREA_SQM = 90.0
|
||||
BUY_MAX_FLOOR_AREA_SQM = 170.0
|
||||
BUY_PROPERTY_TYPES = frozenset({"Flats/Maisonettes"})
|
||||
|
||||
BUY_MIN_FLOOR_AREA_SQFT = round(BUY_MIN_FLOOR_AREA_SQM / 0.092903)
|
||||
BUY_MAX_FLOOR_AREA_SQFT = round(BUY_MAX_FLOOR_AREA_SQM / 0.092903)
|
||||
|
||||
|
||||
def _number(value: Any) -> float | None:
|
||||
if value is None:
|
||||
return None
|
||||
try:
|
||||
number = float(value)
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
if not math.isfinite(number):
|
||||
return None
|
||||
return number
|
||||
|
||||
|
||||
def _int(value: Any) -> int | None:
    """Return *value* as an int when it is a whole number, otherwise None.

    The value is first run through ``_number``; anything that helper rejects,
    or any result with a fractional part, yields None.
    """
    parsed = _number(value)
    if parsed is not None and parsed.is_integer():
        return int(parsed)
    return None
|
||||
|
||||
|
||||
def matches_strict_buy_listing_filter(prop: dict) -> bool:
    """Exact filter used to guard scraped/output datasets.

    A listing passes only when every check below holds:

    * price is a number strictly between 0 and ``BUY_MAX_PRICE``;
    * "Bedrooms" is a whole number within
      ``BUY_MIN_BEDROOMS``..``BUY_MAX_BEDROOMS`` (inclusive);
    * "Property type" is a member of ``BUY_PROPERTY_TYPES``;
    * "Bathrooms" is a whole number contained in ``BUY_ALLOWED_BATHROOMS``;
    * "Total floor area (sqm)" is a number within
      ``BUY_MIN_FLOOR_AREA_SQM``..``BUY_MAX_FLOOR_AREA_SQM`` (inclusive).

    Args:
        prop: Listing record keyed by the column names above.

    Returns:
        True when the listing satisfies all checks, False otherwise.
    """
    # "Asking price" is consulted only when the "price" key is absent; a
    # present-but-unusable "price" does not fall back.
    price_key = "price" if "price" in prop else "Asking price"
    price = _number(prop.get(price_key))
    if price is None or not (0 < price < BUY_MAX_PRICE):
        return False

    bedrooms = _int(prop.get("Bedrooms"))
    if bedrooms is None or not (BUY_MIN_BEDROOMS <= bedrooms <= BUY_MAX_BEDROOMS):
        return False

    if prop.get("Property type") not in BUY_PROPERTY_TYPES:
        return False

    # A missing or fractional bathroom count parses to None, which is never
    # in the allowed set, so it is rejected here as well.
    if _int(prop.get("Bathrooms")) not in BUY_ALLOWED_BATHROOMS:
        return False

    floor_area = _number(prop.get("Total floor area (sqm)"))
    return (
        floor_area is not None
        and BUY_MIN_FLOOR_AREA_SQM <= floor_area <= BUY_MAX_FLOOR_AREA_SQM
    )
|
||||
|
|
@ -10,6 +10,15 @@ from constants import (
|
|||
TYPEAHEAD_URL,
|
||||
)
|
||||
from http_client import fetch_with_retry
|
||||
from listing_filters import (
|
||||
BUY_ALLOWED_BATHROOMS,
|
||||
BUY_MAX_BEDROOMS,
|
||||
BUY_MAX_FLOOR_AREA_SQFT,
|
||||
BUY_MAX_PRICE,
|
||||
BUY_MIN_BEDROOMS,
|
||||
BUY_MIN_FLOOR_AREA_SQFT,
|
||||
matches_strict_buy_listing_filter,
|
||||
)
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import transform_property
|
||||
|
||||
|
|
@ -22,12 +31,23 @@ outcode_cache: dict[str, str] = {}
|
|||
# Requesting index >= 1008 returns HTTP 400.
|
||||
_MAX_INDEX = 1008
|
||||
|
||||
# Property type filters for splitting overcapped searches. Each sub-query
|
||||
# gets its own 1008 cap, so we can recover listings beyond the unfiltered limit.
|
||||
_PROPERTY_TYPES = [
|
||||
"detached", "semi-detached", "terraced", "flat",
|
||||
"bungalow", "park-home", "land",
|
||||
]
|
||||
_BASE_BUY_SEARCH_PARAMS = {
|
||||
"propertyTypes": "flat",
|
||||
"minBedrooms": str(BUY_MIN_BEDROOMS),
|
||||
"maxBedrooms": str(BUY_MAX_BEDROOMS),
|
||||
"minBathrooms": str(min(BUY_ALLOWED_BATHROOMS)),
|
||||
"maxBathrooms": str(max(BUY_ALLOWED_BATHROOMS)),
|
||||
"minSize": str(BUY_MIN_FLOOR_AREA_SQFT),
|
||||
"maxSize": str(BUY_MAX_FLOOR_AREA_SQFT),
|
||||
"maxPrice": str(BUY_MAX_PRICE - 1),
|
||||
}
|
||||
|
||||
|
||||
def _buy_search_params(extra_params: dict | None = None) -> dict:
    """Return a fresh copy of the base buy-search query parameters.

    Entries from *extra_params*, when given and non-empty, are layered on top
    and override base entries with the same key. The module-level base dict is
    never mutated.
    """
    merged = _BASE_BUY_SEARCH_PARAMS.copy()
    merged.update(extra_params or {})
    return merged
|
||||
|
||||
|
||||
def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
|
||||
|
|
@ -92,8 +112,18 @@ def _paginate(
|
|||
break
|
||||
|
||||
for prop in raw_props:
|
||||
transformed = transform_property(prop, outcode, pc_index)
|
||||
if transformed:
|
||||
try:
|
||||
transformed = transform_property(prop, outcode, pc_index)
|
||||
except Exception as exc:
|
||||
log.warning(
|
||||
"Rightmove %s/%s property %s failed to transform: %s",
|
||||
outcode,
|
||||
channel_cfg["channel"],
|
||||
prop.get("id", "?"),
|
||||
exc,
|
||||
)
|
||||
continue
|
||||
if transformed and matches_strict_buy_listing_filter(transformed):
|
||||
properties.append(transformed)
|
||||
if max_properties is not None and len(properties) >= max_properties:
|
||||
return properties, result_count
|
||||
|
|
@ -105,6 +135,15 @@ def _paginate(
|
|||
|
||||
if index >= result_count:
|
||||
break
|
||||
if index >= _MAX_INDEX:
|
||||
log.warning(
|
||||
"%s/%s: %d filtered results exceed Rightmove's %d-result page cap",
|
||||
outcode,
|
||||
channel_cfg["channel"],
|
||||
result_count,
|
||||
_MAX_INDEX,
|
||||
)
|
||||
break
|
||||
|
||||
time.sleep(DELAY_BETWEEN_PAGES)
|
||||
|
||||
|
|
@ -121,54 +160,20 @@ def search_outcode(
|
|||
) -> list[dict]:
|
||||
"""Paginate through search results for one outcode+channel. Returns transformed properties.
|
||||
|
||||
When the unfiltered result count exceeds 1008 (Rightmove's hard pagination cap),
|
||||
re-queries per property type to recover listings beyond the cap.
|
||||
Search requests set the supported Rightmove filters directly: flats,
|
||||
2-5 bedrooms, 2-3 bathrooms, 969-1830 sq ft, and asking price below £1m.
|
||||
"""
|
||||
properties, result_count = _paginate(
|
||||
client, outcode_id, outcode, channel_cfg, pc_index, max_properties=max_properties
|
||||
properties, _ = _paginate(
|
||||
client,
|
||||
outcode_id,
|
||||
outcode,
|
||||
channel_cfg,
|
||||
pc_index,
|
||||
extra_params=_buy_search_params(),
|
||||
max_properties=max_properties,
|
||||
)
|
||||
|
||||
if max_properties is not None and len(properties) >= max_properties:
|
||||
return properties[:max_properties]
|
||||
|
||||
if result_count <= _MAX_INDEX:
|
||||
return properties
|
||||
|
||||
# Hit the 1008 cap — re-search per property type to get full coverage
|
||||
ch = channel_cfg["channel"]
|
||||
log.info(
|
||||
"%s/%s: %d results exceed %d cap, splitting by property type",
|
||||
outcode, ch, result_count, _MAX_INDEX,
|
||||
)
|
||||
|
||||
all_by_id: dict[str, dict] = {p["id"]: p for p in properties}
|
||||
|
||||
for pt in _PROPERTY_TYPES:
|
||||
pt_props, _ = _paginate(
|
||||
client, outcode_id, outcode, channel_cfg, pc_index,
|
||||
extra_params={"propertyTypes": pt},
|
||||
max_properties=max_properties,
|
||||
)
|
||||
new = 0
|
||||
for p in pt_props:
|
||||
if p["id"] not in all_by_id:
|
||||
all_by_id[p["id"]] = p
|
||||
new += 1
|
||||
if (
|
||||
max_properties is not None
|
||||
and len(all_by_id) >= max_properties
|
||||
):
|
||||
break
|
||||
if new:
|
||||
log.debug("%s/%s type=%s: +%d new properties", outcode, ch, pt, new)
|
||||
if max_properties is not None and len(all_by_id) >= max_properties:
|
||||
break
|
||||
|
||||
log.info(
|
||||
"%s/%s: type split recovered %d → %d properties",
|
||||
outcode, ch, len(properties), len(all_by_id),
|
||||
)
|
||||
properties = list(all_by_id.values())
|
||||
if max_properties is not None:
|
||||
return properties[:max_properties]
|
||||
return properties
|
||||
|
|
|
|||
|
|
@ -19,6 +19,7 @@ from homecouk import load_cookies as load_homecouk_cookies
|
|||
from homecouk import make_client as make_homecouk_client
|
||||
from homecouk import search_outcode as homecouk_search_outcode
|
||||
from http_client import make_client
|
||||
from listing_filters import matches_strict_buy_listing_filter
|
||||
from rightmove import resolve_outcode_id
|
||||
from rightmove import search_outcode as rightmove_search_outcode
|
||||
from spatial import PostcodeSpatialIndex
|
||||
|
|
@ -181,11 +182,11 @@ def _source_names(sources: str | Iterable[str] | None) -> list[str]:
|
|||
requested = [str(source).strip().lower() for source in sources]
|
||||
|
||||
requested = [source for source in requested if source]
|
||||
if "all" in requested:
|
||||
return list(SOURCE_ORDER)
|
||||
unknown = sorted(set(requested) - set(SOURCE_ORDER))
|
||||
unknown = sorted(set(requested) - set(SOURCE_ORDER) - {"all"})
|
||||
if unknown:
|
||||
raise ValueError(f"Unknown source(s): {', '.join(unknown)}")
|
||||
if "all" in requested:
|
||||
return list(SOURCE_ORDER)
|
||||
return [source for source in SOURCE_ORDER if source in requested]
|
||||
|
||||
|
||||
|
|
@ -196,19 +197,28 @@ def _dedup_key(prop: dict) -> tuple:
|
|||
def _merge_properties(source_results: dict[str, list[dict]]) -> tuple[list[dict], dict, int]:
|
||||
merged: dict[str, dict] = {}
|
||||
seen_keys: set[tuple] = set()
|
||||
seen_ids: set[str] = set()
|
||||
counts = {source: 0 for source in SOURCE_ORDER}
|
||||
deduped = 0
|
||||
|
||||
for source in SOURCE_ORDER:
|
||||
for prop in source_results.get(source, []):
|
||||
prop_id = prop.get("id")
|
||||
key = _dedup_key(prop)
|
||||
if (prop_id is not None and prop_id in merged) or key in seen_keys:
|
||||
deduped += 1
|
||||
continue
|
||||
storage_key = prop_id if prop_id is not None else f"{source}:{len(merged)}"
|
||||
if prop_id is not None:
|
||||
prop_id = str(prop_id)
|
||||
if prop_id in seen_ids:
|
||||
deduped += 1
|
||||
continue
|
||||
seen_ids.add(prop_id)
|
||||
storage_key = prop_id
|
||||
else:
|
||||
key = _dedup_key(prop)
|
||||
if key in seen_keys:
|
||||
deduped += 1
|
||||
continue
|
||||
seen_keys.add(key)
|
||||
storage_key = f"{source}:{len(merged)}"
|
||||
merged[storage_key] = prop
|
||||
seen_keys.add(key)
|
||||
counts[source] += 1
|
||||
|
||||
return list(merged.values()), counts, deduped
|
||||
|
|
@ -241,13 +251,22 @@ def _store_properties(
|
|||
if remaining == 0:
|
||||
return 0
|
||||
|
||||
eligible = [prop for prop in props if _property_is_londonish(prop)]
|
||||
dropped = len(props) - len(eligible)
|
||||
if dropped:
|
||||
londonish = [prop for prop in props if _property_is_londonish(prop)]
|
||||
dropped_outside_area = len(props) - len(londonish)
|
||||
if dropped_outside_area:
|
||||
log.debug(
|
||||
"%s dropped %d properties outside the Greater London-ish postcode filter",
|
||||
source,
|
||||
dropped,
|
||||
dropped_outside_area,
|
||||
)
|
||||
|
||||
eligible = [prop for prop in londonish if matches_strict_buy_listing_filter(prop)]
|
||||
dropped_non_matching = len(londonish) - len(eligible)
|
||||
if dropped_non_matching:
|
||||
log.debug(
|
||||
"%s dropped %d properties outside the strict buy-listing filters",
|
||||
source,
|
||||
dropped_non_matching,
|
||||
)
|
||||
|
||||
selected = eligible if remaining is None else eligible[:remaining]
|
||||
|
|
@ -367,20 +386,16 @@ def _scrape_homecouk(
|
|||
log.info("home.co.uk cap reached")
|
||||
return
|
||||
|
||||
remaining = _source_remaining(
|
||||
results, "homecouk", max_properties_per_source
|
||||
)
|
||||
if remaining == 0:
|
||||
log.info("home.co.uk cap reached")
|
||||
return
|
||||
|
||||
for attempt in range(2):
|
||||
try:
|
||||
# home.co.uk cannot express the full filter set at source.
|
||||
# Fetch the outcode page set first; _store_properties applies
|
||||
# the strict filter and source cap after transformation.
|
||||
props = homecouk_search_outcode(
|
||||
client,
|
||||
outcode,
|
||||
pc_index,
|
||||
max_properties=remaining,
|
||||
max_properties=None,
|
||||
)
|
||||
added = _store_properties(
|
||||
results,
|
||||
|
|
@ -442,19 +457,17 @@ def _scrape_zoopla(
|
|||
log.info("Zoopla cap reached")
|
||||
return
|
||||
|
||||
remaining = _source_remaining(results, "zoopla", max_properties_per_source)
|
||||
if remaining == 0:
|
||||
log.info("Zoopla cap reached")
|
||||
return
|
||||
|
||||
for attempt in range(2):
|
||||
try:
|
||||
# Zoopla source-side filters are unverified here. Fetch the
|
||||
# outcode page set first; _store_properties applies the
|
||||
# strict filter and source cap after transformation.
|
||||
props, _ = zoopla_search_outcode(
|
||||
page,
|
||||
outcode,
|
||||
pc_index,
|
||||
pc_coords,
|
||||
max_properties=remaining,
|
||||
max_properties=None,
|
||||
)
|
||||
added = _store_properties(
|
||||
results,
|
||||
|
|
@ -506,9 +519,6 @@ def run_scrape(
|
|||
output_base = Path(output_dir) if output_dir is not None else DATA_DIR
|
||||
output_base.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
if "zoopla" in selected_sources and pc_coords is None:
|
||||
pc_coords = build_postcode_coords()
|
||||
|
||||
errors: list[str] = []
|
||||
results = {source: [] for source in SOURCE_ORDER}
|
||||
started_at = time.time()
|
||||
|
|
@ -539,7 +549,8 @@ def run_scrape(
|
|||
)
|
||||
|
||||
if "zoopla" in selected_sources:
|
||||
assert pc_coords is not None
|
||||
if pc_coords is None:
|
||||
pc_coords = build_postcode_coords()
|
||||
_scrape_zoopla(
|
||||
selected_outcodes,
|
||||
pc_index,
|
||||
|
|
@ -551,19 +562,36 @@ def run_scrape(
|
|||
|
||||
merged, source_counts, deduped = _merge_properties(results)
|
||||
output_path = output_base / "online_listings_buy.parquet"
|
||||
write_parquet(merged, output_path)
|
||||
if merged:
|
||||
write_parquet(merged, output_path)
|
||||
else:
|
||||
if output_path.exists():
|
||||
output_path.unlink()
|
||||
log.warning("No strict properties to write to %s", output_path)
|
||||
|
||||
filtered = [prop for prop in merged if matches_strict_buy_listing_filter(prop)]
|
||||
filtered_output_path = output_base / "online_listings_buy_filtered.parquet"
|
||||
if filtered:
|
||||
write_parquet(filtered, filtered_output_path)
|
||||
else:
|
||||
if filtered_output_path.exists():
|
||||
filtered_output_path.unlink()
|
||||
log.warning("No strict-filtered properties to write to %s", filtered_output_path)
|
||||
|
||||
counts = {
|
||||
"total": len(merged),
|
||||
"filtered_total": len(filtered),
|
||||
"deduped": deduped,
|
||||
"sources": source_counts,
|
||||
}
|
||||
source_summary = " ".join(
|
||||
f"{source}:{source_counts[source]}" for source in SOURCE_ORDER
|
||||
)
|
||||
log.info(
|
||||
"Sale scrape complete: %d unique (rightmove:%d homecouk:%d zoopla:%d deduped:%d)",
|
||||
"Sale scrape complete: %d unique, %d strict-filtered (%s deduped:%d)",
|
||||
len(merged),
|
||||
source_counts["rightmove"],
|
||||
source_counts["homecouk"],
|
||||
source_counts["zoopla"],
|
||||
len(filtered),
|
||||
source_summary,
|
||||
deduped,
|
||||
)
|
||||
|
||||
|
|
@ -575,6 +603,7 @@ def run_scrape(
|
|||
},
|
||||
"counts": counts,
|
||||
"path": str(output_path),
|
||||
"filtered_path": str(filtered_output_path),
|
||||
"errors": errors,
|
||||
"elapsed_seconds": round(time.time() - started_at, 3),
|
||||
}
|
||||
|
|
|
|||
|
|
@ -45,9 +45,10 @@ def write_parquet(properties: list[dict], path: Path) -> None:
|
|||
remapped = 0
|
||||
for p in properties:
|
||||
sub_type = p.get("Property sub-type", "")
|
||||
if sub_type and sub_type != "Unknown":
|
||||
current_type = p.get("Property type")
|
||||
if sub_type and sub_type != "Unknown" and current_type in (None, "", "Other"):
|
||||
new_type = map_property_type(sub_type)
|
||||
if new_type != p.get("Property type"):
|
||||
if new_type != current_type:
|
||||
p["Property type"] = new_type
|
||||
remapped += 1
|
||||
if remapped:
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
import logging
|
||||
import math
|
||||
import re
|
||||
|
||||
from constants import MAX_BEDROOMS, PROPERTY_TYPE_MAP, RIGHTMOVE_BASE
|
||||
|
|
@ -29,17 +30,43 @@ def validate_floor_area(sqm: float | None) -> float | None:
|
|||
return sqm
|
||||
|
||||
|
||||
def parse_int_value(value) -> int | None:
|
||||
"""Parse an integer-like API value without truncating decimals."""
|
||||
if value is None or isinstance(value, bool):
|
||||
return None
|
||||
if isinstance(value, int):
|
||||
return value
|
||||
if isinstance(value, float):
|
||||
if not math.isfinite(value) or not value.is_integer():
|
||||
return None
|
||||
return int(value)
|
||||
if isinstance(value, str):
|
||||
cleaned = value.strip().replace(",", "").replace("£", "")
|
||||
if not re.fullmatch(r"\d+", cleaned):
|
||||
return None
|
||||
return int(cleaned)
|
||||
return None
|
||||
|
||||
|
||||
def parse_display_size(display_size: str | None) -> float | None:
|
||||
"""Parse displaySize like '499 sq. ft.' or '4,124 sq. ft.' to sqm."""
|
||||
if not display_size:
|
||||
return None
|
||||
# Try sq. ft. first
|
||||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*ft", display_size, re.IGNORECASE)
|
||||
m = re.search(
|
||||
r"([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*ft|square\s+feet|ft(?:\^?2|²))",
|
||||
display_size,
|
||||
re.IGNORECASE,
|
||||
)
|
||||
if m:
|
||||
sqft = float(m.group(1).replace(",", ""))
|
||||
return validate_floor_area(round(sqft * 0.092903, 1))
|
||||
# Try sq. m.
|
||||
m = re.search(r"([\d,]+(?:\.\d+)?)\s*sq\.?\s*m", display_size, re.IGNORECASE)
|
||||
m = re.search(
|
||||
r"([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*m|square\s+met(?:er|re)s?|m(?:\^?2|²))",
|
||||
display_size,
|
||||
re.IGNORECASE,
|
||||
)
|
||||
if m:
|
||||
return validate_floor_area(round(float(m.group(1).replace(",", "")), 1))
|
||||
return None
|
||||
|
|
@ -86,7 +113,21 @@ def map_property_type(sub_type: str | None) -> str:
|
|||
return canonical
|
||||
# Keyword fallback for compound types not in the map
|
||||
lower = sub_type.lower()
|
||||
if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower:
|
||||
excluded_flat_like = (
|
||||
"block of apartment",
|
||||
"house of multiple occupation",
|
||||
"private halls",
|
||||
"retirement",
|
||||
"serviced apartment",
|
||||
)
|
||||
if any(term in lower for term in excluded_flat_like):
|
||||
return "Other"
|
||||
if (
|
||||
"flat" in lower
|
||||
or "apartment" in lower
|
||||
or "maisonette" in lower
|
||||
or "studio" in lower
|
||||
):
|
||||
return "Flats/Maisonettes"
|
||||
if "semi" in lower and "detach" in lower:
|
||||
return "Semi-Detached"
|
||||
|
|
@ -158,10 +199,10 @@ def transform_property(
|
|||
lat, lng = fix_coords(raw_lat, raw_lng)
|
||||
|
||||
price_obj = prop.get("price", {})
|
||||
amount = price_obj.get("amount")
|
||||
amount = parse_int_value(price_obj.get("amount"))
|
||||
if not amount:
|
||||
return None
|
||||
price = int(amount)
|
||||
price = amount
|
||||
if price <= 0:
|
||||
return None
|
||||
|
||||
|
|
@ -172,14 +213,23 @@ def transform_property(
|
|||
|
||||
# POA / Auction listings have unreliable prices — treat as no price
|
||||
pq_lower = price_qualifier.lower()
|
||||
if "poa" in pq_lower or "auction" in pq_lower:
|
||||
non_comparable_price_terms = (
|
||||
"poa",
|
||||
"auction",
|
||||
"shared ownership",
|
||||
"shared equity",
|
||||
"part buy",
|
||||
"part rent",
|
||||
"from",
|
||||
)
|
||||
if any(term in pq_lower for term in non_comparable_price_terms):
|
||||
return None
|
||||
|
||||
sub_type = prop.get("propertySubType", "")
|
||||
raw_beds = prop.get("bedrooms", 0) or 0
|
||||
raw_baths = prop.get("bathrooms", 0) or 0
|
||||
bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0
|
||||
bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0
|
||||
raw_beds = parse_int_value(prop.get("bedrooms")) or 0
|
||||
raw_baths = parse_int_value(prop.get("bathrooms")) or 0
|
||||
bedrooms = raw_beds if 0 <= raw_beds <= MAX_BEDROOMS else 0
|
||||
bathrooms = raw_baths if 0 <= raw_baths <= MAX_BEDROOMS else 0
|
||||
if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
|
||||
log.warning(
|
||||
"Rightmove %s: implausible beds=%d baths=%d (capped to 0)",
|
||||
|
|
@ -197,8 +247,15 @@ def transform_property(
|
|||
log.debug("No England postcode for property at %.4f, %.4f — skipping", lat, lng)
|
||||
return None
|
||||
|
||||
property_url = prop.get("propertyUrl") or ""
|
||||
if not isinstance(property_url, str):
|
||||
property_url = ""
|
||||
listing_id = prop.get("id") or property_url
|
||||
if not listing_id:
|
||||
return None
|
||||
|
||||
return {
|
||||
"id": prop.get("id"),
|
||||
"id": listing_id,
|
||||
"Bedrooms": bedrooms,
|
||||
"Bathrooms": bathrooms,
|
||||
"Number of bedrooms & living rooms": bedrooms + bathrooms,
|
||||
|
|
@ -213,7 +270,7 @@ def transform_property(
|
|||
"price_frequency": "",
|
||||
"Price qualifier": price_qualifier,
|
||||
"Total floor area (sqm)": parse_display_size(prop.get("displaySize")),
|
||||
"Listing URL": RIGHTMOVE_BASE + prop.get("propertyUrl", ""),
|
||||
"Listing URL": RIGHTMOVE_BASE + property_url if property_url else "",
|
||||
"Listing features": key_features,
|
||||
"first_visible_date": prop.get("firstVisibleDate", ""),
|
||||
}
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@ import time
|
|||
|
||||
from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import normalize_sub_type, validate_floor_area
|
||||
from transform import normalize_sub_type, parse_int_value, validate_floor_area
|
||||
|
||||
log = logging.getLogger("zoopla")
|
||||
|
||||
|
|
@ -106,7 +106,8 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
const bedsMatch = text.match(/(\d+)\s*beds?/i);
|
||||
const bathsMatch = text.match(/(\d+)\s*baths?/i);
|
||||
const recMatch = text.match(/(\d+)\s*reception/i);
|
||||
const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i);
|
||||
const areaSqftMatch = text.match(/([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*ft|square\s+feet|ft(?:\^?2|²))/i);
|
||||
const areaSqmMatch = text.match(/([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*m|square\s+met(?:er|re)s?|m(?:\^?2|²))/i);
|
||||
|
||||
let tenure = '';
|
||||
if (/leasehold/i.test(text)) tenure = 'Leasehold';
|
||||
|
|
@ -141,7 +142,8 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
beds: bedsMatch && parseInt(bedsMatch[1]) <= 20 ? parseInt(bedsMatch[1]) : null,
|
||||
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
|
||||
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
|
||||
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
|
||||
floor_area_sqft: areaSqftMatch ? parseInt(areaSqftMatch[1].replace(/,/g, '')) : null,
|
||||
floor_area_sqm: areaSqmMatch ? parseFloat(areaSqmMatch[1].replace(/,/g, '')) : null,
|
||||
address, tenure, property_type,
|
||||
});
|
||||
}
|
||||
|
|
@ -181,7 +183,8 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
const bedsMatch = text.match(/(\d+)\s*beds?/i);
|
||||
const bathsMatch = text.match(/(\d+)\s*baths?/i);
|
||||
const recMatch = text.match(/(\d+)\s*reception/i);
|
||||
const areaMatch = text.match(/([\d,]+)\s*sq\.?\s*ft/i);
|
||||
const areaSqftMatch = text.match(/([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*ft|square\s+feet|ft(?:\^?2|²))/i);
|
||||
const areaSqmMatch = text.match(/([\d,]+(?:\.\d+)?)\s*(?:sq\.?\s*m|square\s+met(?:er|re)s?|m(?:\^?2|²))/i);
|
||||
|
||||
let address = '';
|
||||
for (const line of lines) {
|
||||
|
|
@ -225,7 +228,8 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
beds: bedsMatch && parseInt(bedsMatch[1]) <= 20 ? parseInt(bedsMatch[1]) : null,
|
||||
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
|
||||
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
|
||||
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
|
||||
floor_area_sqft: areaSqftMatch ? parseInt(areaSqftMatch[1].replace(/,/g, '')) : null,
|
||||
floor_area_sqm: areaSqmMatch ? parseFloat(areaSqmMatch[1].replace(/,/g, '')) : null,
|
||||
address, tenure, property_type,
|
||||
});
|
||||
}
|
||||
|
|
@ -611,7 +615,22 @@ def _map_property_type(raw_type: str | None) -> str:
|
|||
return canonical
|
||||
# Keyword fallback
|
||||
lower = raw_type.lower()
|
||||
if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower or "penthouse" in lower:
|
||||
excluded_flat_like = (
|
||||
"block of apartment",
|
||||
"house of multiple occupation",
|
||||
"private halls",
|
||||
"retirement",
|
||||
"serviced apartment",
|
||||
)
|
||||
if any(term in lower for term in excluded_flat_like):
|
||||
return "Other"
|
||||
if (
|
||||
"flat" in lower
|
||||
or "apartment" in lower
|
||||
or "maisonette" in lower
|
||||
or "studio" in lower
|
||||
or "penthouse" in lower
|
||||
):
|
||||
return "Flats/Maisonettes"
|
||||
if "semi" in lower and "detach" in lower:
|
||||
return "Semi-Detached"
|
||||
|
|
@ -634,8 +653,8 @@ def transform_property(
|
|||
|
||||
Zoopla search cards do not include coordinates, so we resolve lat/lng
|
||||
from postcodes extracted from the address text."""
|
||||
price = raw.get("price")
|
||||
if not price or int(price) <= 0:
|
||||
price = parse_int_value(raw.get("price"))
|
||||
if not price or price <= 0:
|
||||
return None
|
||||
|
||||
address = raw.get("address", "")
|
||||
|
|
@ -670,10 +689,10 @@ def transform_property(
|
|||
if not (49 <= lat <= 56 and -7 <= lng <= 2):
|
||||
return None
|
||||
|
||||
raw_beds = raw.get("beds") or 0
|
||||
raw_baths = raw.get("baths") or 0
|
||||
bedrooms = raw_beds if raw_beds <= MAX_BEDROOMS else 0
|
||||
bathrooms = raw_baths if raw_baths <= MAX_BEDROOMS else 0
|
||||
raw_beds = parse_int_value(raw.get("beds")) or 0
|
||||
raw_baths = parse_int_value(raw.get("baths")) or 0
|
||||
bedrooms = raw_beds if 0 <= raw_beds <= MAX_BEDROOMS else 0
|
||||
bathrooms = raw_baths if 0 <= raw_baths <= MAX_BEDROOMS else 0
|
||||
if raw_beds > MAX_BEDROOMS or raw_baths > MAX_BEDROOMS:
|
||||
log.warning(
|
||||
"Zoopla %s: implausible beds=%d baths=%d (capped to 0)",
|
||||
|
|
@ -683,9 +702,13 @@ def transform_property(
|
|||
|
||||
# Floor area: convert sq ft to sq m
|
||||
floor_area_sqm = None
|
||||
sqft = raw.get("floor_area_sqft")
|
||||
if sqft:
|
||||
floor_area_sqm = validate_floor_area(round(sqft * 0.092903, 1))
|
||||
raw_sqm = raw.get("floor_area_sqm")
|
||||
if raw_sqm:
|
||||
floor_area_sqm = validate_floor_area(round(float(raw_sqm), 1))
|
||||
else:
|
||||
sqft = raw.get("floor_area_sqft")
|
||||
if sqft:
|
||||
floor_area_sqm = validate_floor_area(round(float(sqft) * 0.092903, 1))
|
||||
|
||||
listing_id = raw.get("id", "")
|
||||
listing_url = raw.get("url", "")
|
||||
|
|
@ -704,7 +727,7 @@ def transform_property(
|
|||
"Leasehold/Freehold": raw.get("tenure") or None,
|
||||
"Property type": _map_property_type(raw.get("property_type")),
|
||||
"Property sub-type": normalize_sub_type(raw.get("property_type")),
|
||||
"price": int(price),
|
||||
"price": price,
|
||||
"price_frequency": "",
|
||||
"Price qualifier": "",
|
||||
"Total floor area (sqm)": floor_area_sqm,
|
||||
|
|
@ -760,7 +783,18 @@ def search_outcode(
|
|||
properties = []
|
||||
dropped = 0
|
||||
for raw in raw_listings:
|
||||
transformed = transform_property(raw, pc_index, pc_coords, search_outcode=outcode)
|
||||
try:
|
||||
transformed = transform_property(
|
||||
raw, pc_index, pc_coords, search_outcode=outcode
|
||||
)
|
||||
except Exception as exc:
|
||||
log.warning(
|
||||
"Zoopla %s property %s failed to transform: %s",
|
||||
outcode,
|
||||
raw.get("id", "?"),
|
||||
exc,
|
||||
)
|
||||
transformed = None
|
||||
if transformed:
|
||||
properties.append(transformed)
|
||||
else:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue