Improve scraping

This commit is contained in:
Andras Schmelczer 2026-03-29 11:41:35 +01:00
parent 89a85e9a0c
commit c14d28f430
7 changed files with 91 additions and 25 deletions

View file

@ -117,6 +117,13 @@ PROPERTY_TYPE_MAP = {
"House Boat": "Other",
"Barn": "Other",
"Serviced Apartments": "Flats/Maisonettes",
# Space-separated variants (from home.co.uk underscore/hyphen normalization)
"Semi Detached": "Semi-Detached",
"Semi Detached Bungalow": "Semi-Detached",
"End Of Terrace": "Terraced",
"End Terrace": "Terraced",
"Block Of Apartments": "Flats/Maisonettes",
"Farm / Barn": "Other",
# Lowercase variants (from home.co.uk / Rightmove APIs)
"house": "Detached",
"bungalow": "Other",

View file

@ -26,7 +26,7 @@ from metrics import (
homecouk_requests_total,
)
from spatial import PostcodeSpatialIndex
from transform import validate_floor_area
from transform import normalize_postcode, normalize_sub_type, validate_floor_area
log = logging.getLogger("homecouk")
@ -359,11 +359,11 @@ def transform_property(
"Number of bedrooms & living rooms": bedrooms + bathrooms,
"lon": lng,
"lat": lat,
"Postcode": postcode,
"Postcode": normalize_postcode(postcode),
"Address per Property Register": address,
"Leasehold/Freehold": parse_tenure(prop),
"Property type": map_property_type(listing_type),
"Property sub-type": listing_type.title() if listing_type else "Unknown",
"Property sub-type": normalize_sub_type(listing_type),
"price": int(price),
"price_frequency": "" if channel == "BUY" else "monthly",
"Price qualifier": price_qualifier,

View file

@ -46,7 +46,7 @@ from metrics import (
openrent_requests_total,
)
from spatial import PostcodeSpatialIndex
from transform import validate_floor_area
from transform import normalize_postcode, normalize_sub_type, validate_floor_area
log = logging.getLogger("openrent")
@ -781,14 +781,14 @@ def transform_property(
"Number of bedrooms & living rooms": bedrooms,
"lon": lng,
"lat": lat,
"Postcode": postcode,
"Postcode": normalize_postcode(postcode),
"Address per Property Register": address,
# OpenRent is a rental-only platform — tenure (Freehold/Leasehold) is a
# property ownership concept that doesn't apply to rental listings. The
# landlord's tenure is not shown on OpenRent listing pages.
"Leasehold/Freehold": None,
"Property type": map_property_type(property_type),
"Property sub-type": property_type or "Unknown",
"Property sub-type": normalize_sub_type(property_type),
"price": int(price),
"price_frequency": frequency,
"Price qualifier": "",

View file

@ -338,7 +338,25 @@ def _load_checkpoint(
if rpath.exists():
try:
with open(rpath) as f:
loaded_results[source][channel.upper()] = json.load(f)
raw = json.load(f)
# Deduplicate by ID — concurrent workers (e.g. hk_worker's
# ThreadPoolExecutor) can cause in-flight outcodes to have
# results saved before their progress index is recorded.
# On resume those outcodes get re-scraped, duplicating results.
seen_ids: set[str] = set()
deduped: list[dict] = []
for p in raw:
pid = p.get("id")
if pid not in seen_ids:
seen_ids.add(pid)
deduped.append(p)
if len(deduped) < len(raw):
log.info(
"Checkpoint %s/%s: deduped %d%d (removed %d dupes)",
source, channel, len(raw), len(deduped),
len(raw) - len(deduped),
)
loaded_results[source][channel.upper()] = deduped
except Exception:
log.warning(
"Checkpoint results for %s/%s corrupt, restarting %s",

View file

@ -5,7 +5,7 @@ from pathlib import Path
import polars as pl
from constants import MAX_BEDROOMS, MAX_RENT_MONTHLY, MIN_RENT_MONTHLY
from transform import map_property_type, normalize_price
from transform import map_property_type, normalize_postcode, normalize_price
log = logging.getLogger("rightmove")
@ -132,7 +132,7 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
],
"lon": [p["lon"] for p in properties],
"lat": [p["lat"] for p in properties],
"Postcode": [p["Postcode"] for p in properties],
"Postcode": [normalize_postcode(p["Postcode"]) for p in properties],
"Address per Property Register": [
p["Address per Property Register"] for p in properties
],

View file

@ -7,21 +7,24 @@ from spatial import PostcodeSpatialIndex
log = logging.getLogger("rightmove")
# Maximum plausible floor area for a residential property listing (sqm).
# ~21,500 sq ft — covers even the largest UK mansions.
# Floor area bounds (sqm). Values outside this range are almost certainly
# data errors: sub-5 sqm catches garbled extractions (e.g., 0.1 sqm for a
# detached house), and >2000 sqm (~21,500 sq ft) exceeds even the largest
# UK mansions.
MIN_FLOOR_AREA_SQM = 5.0
MAX_FLOOR_AREA_SQM = 2000.0
def validate_floor_area(sqm: float | None) -> float | None:
"""Validate a floor area value. Returns None for nonsensical values.
Rejects zero/negative values and anything above MAX_FLOOR_AREA_SQM,
Rejects values below MIN_FLOOR_AREA_SQM and above MAX_FLOOR_AREA_SQM,
which catches parsing errors where prices or other large numbers are
mistakenly extracted as floor area from free-text descriptions or DOM text.
"""
if sqm is None:
return None
if sqm <= 0 or sqm > MAX_FLOOR_AREA_SQM:
if sqm < MIN_FLOOR_AREA_SQM or sqm > MAX_FLOOR_AREA_SQM:
return None
return sqm
@ -42,6 +45,25 @@ def parse_display_size(display_size: str | None) -> float | None:
return None
def normalize_sub_type(sub_type: str | None) -> str:
"""Normalize property sub-type for consistent storage.
Fixes delimiter inconsistencies (underscores/hyphens spaces) from
home.co.uk and truncates Zoopla description fragments that were
accidentally captured as sub-types.
"""
if not sub_type:
return "Unknown"
cleaned = sub_type.replace("_", " ").strip()
# Description fragments captured as sub-types are much longer than any
# real property type name (longest canonical is ~25 chars)
if len(cleaned) > 40:
return "Unknown"
# Collapse multiple spaces
cleaned = re.sub(r"\s+", " ", cleaned)
return cleaned.title()
def map_property_type(sub_type: str | None) -> str:
"""Map propertySubType to canonical type."""
if not sub_type:
@ -51,6 +73,15 @@ def map_property_type(sub_type: str | None) -> str:
return canonical
# Try title-case variant (e.g., "country house" → "Country House")
canonical = PROPERTY_TYPE_MAP.get(sub_type.title())
if canonical:
return canonical
# Try lowercase variant (e.g., "Townhouse" → "townhouse")
canonical = PROPERTY_TYPE_MAP.get(sub_type.lower())
if canonical:
return canonical
# Normalize delimiters (underscores/hyphens → spaces) and try again
normalized = re.sub(r"[-_]+", " ", sub_type).strip().title()
canonical = PROPERTY_TYPE_MAP.get(normalized)
if canonical:
return canonical
# Keyword fallback for compound types not in the map
@ -103,12 +134,13 @@ def fix_coords(lat: float, lng: float) -> tuple[float, float]:
def normalize_postcode(postcode: str) -> str:
"""Ensure UK postcode has a space before the 3-char incode.
E.g., 'SW1A1AA' 'SW1A 1AA', 'E1 4AB' unchanged."""
postcode = postcode.strip().upper()
if " " in postcode or len(postcode) < 5:
return postcode
return postcode[:-3] + " " + postcode[-3:]
"""Ensure UK postcode has exactly one space before the 3-char incode.
E.g., 'SW1A1AA' 'SW1A 1AA', 'N4 2HA' 'N4 2HA', 'E1 4AB' unchanged."""
# Strip all whitespace then re-insert the single canonical space
compact = re.sub(r"\s+", "", postcode).upper()
if len(compact) < 5:
return compact
return compact[:-3] + " " + compact[-3:]
def normalize_price(amount: int, frequency: str) -> int:
@ -187,7 +219,7 @@ def transform_property(
"Address per Property Register": prop.get("displayAddress", ""),
"Leasehold/Freehold": extract_tenure(prop.get("tenure")),
"Property type": map_property_type(sub_type),
"Property sub-type": sub_type or "Unknown",
"Property sub-type": normalize_sub_type(sub_type),
"price": price,
"price_frequency": frequency,
"Price qualifier": price_qualifier,

View file

@ -29,7 +29,7 @@ import time
from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE
from metrics import zoopla_errors_total, zoopla_pages_scraped, zoopla_properties_scraped
from spatial import PostcodeSpatialIndex
from transform import validate_floor_area
from transform import normalize_sub_type, validate_floor_area
log = logging.getLogger("zoopla")
@ -666,16 +666,25 @@ def _map_property_type(raw_type: str | None) -> str:
return canonical
# Title-case match (handles regex-extracted lowercase like "town house" → "Town House")
canonical = PROPERTY_TYPE_MAP.get(raw_type.title())
if canonical:
return canonical
# Lowercase match (e.g., "Townhouse" → "townhouse")
canonical = PROPERTY_TYPE_MAP.get(raw_type.lower())
if canonical:
return canonical
# Normalize delimiters (underscores/hyphens → spaces) and try again
normalized = re.sub(r"[-_]+", " ", raw_type).strip().title()
canonical = PROPERTY_TYPE_MAP.get(normalized)
if canonical:
return canonical
# Keyword fallback
lower = raw_type.lower()
if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower or "penthouse" in lower:
return "Flats/Maisonettes"
if "detached" in lower and "semi" not in lower:
return "Detached"
if "semi" in lower:
if "semi" in lower and "detach" in lower:
return "Semi-Detached"
if "detach" in lower:
return "Detached"
if "terrace" in lower or "mews" in lower:
return "Terraced"
if "house" in lower:
@ -792,7 +801,7 @@ def transform_property(
"Address per Property Register": address,
"Leasehold/Freehold": raw.get("tenure") or None,
"Property type": _map_property_type(raw.get("property_type")),
"Property sub-type": raw.get("property_type") or "",
"Property sub-type": normalize_sub_type(raw.get("property_type")),
"price": int(price),
"price_frequency": frequency,
"Price qualifier": "",