From c14d28f43049b31102c382d14fea3bac0bc94848 Mon Sep 17 00:00:00 2001 From: Andras Schmelczer Date: Sun, 29 Mar 2026 11:41:35 +0100 Subject: [PATCH] Improve scraping --- finder/constants.py | 7 ++++++ finder/homecouk.py | 6 ++--- finder/openrent.py | 6 ++--- finder/scraper.py | 20 ++++++++++++++++- finder/storage.py | 4 ++-- finder/transform.py | 54 ++++++++++++++++++++++++++++++++++++--------- finder/zoopla.py | 19 +++++++++++----- 7 files changed, 91 insertions(+), 25 deletions(-) diff --git a/finder/constants.py b/finder/constants.py index 9aee415..604646a 100644 --- a/finder/constants.py +++ b/finder/constants.py @@ -117,6 +117,13 @@ PROPERTY_TYPE_MAP = { "House Boat": "Other", "Barn": "Other", "Serviced Apartments": "Flats/Maisonettes", + # Space-separated variants (from home.co.uk underscore/hyphen normalization) + "Semi Detached": "Semi-Detached", + "Semi Detached Bungalow": "Semi-Detached", + "End Of Terrace": "Terraced", + "End Terrace": "Terraced", + "Block Of Apartments": "Flats/Maisonettes", + "Farm / Barn": "Other", # Lowercase variants (from home.co.uk / Rightmove APIs) "house": "Detached", "bungalow": "Other", diff --git a/finder/homecouk.py b/finder/homecouk.py index bace56d..51e3940 100644 --- a/finder/homecouk.py +++ b/finder/homecouk.py @@ -26,7 +26,7 @@ from metrics import ( homecouk_requests_total, ) from spatial import PostcodeSpatialIndex -from transform import validate_floor_area +from transform import normalize_postcode, normalize_sub_type, validate_floor_area log = logging.getLogger("homecouk") @@ -359,11 +359,11 @@ def transform_property( "Number of bedrooms & living rooms": bedrooms + bathrooms, "lon": lng, "lat": lat, - "Postcode": postcode, + "Postcode": normalize_postcode(postcode), "Address per Property Register": address, "Leasehold/Freehold": parse_tenure(prop), "Property type": map_property_type(listing_type), - "Property sub-type": listing_type.title() if listing_type else "Unknown", + "Property sub-type": normalize_sub_type(listing_type), "price": int(price), "price_frequency": "" if channel == "BUY" else "monthly", "Price qualifier": price_qualifier, diff --git a/finder/openrent.py b/finder/openrent.py index f08a3cd..8be4779 100644 --- a/finder/openrent.py +++ b/finder/openrent.py @@ -46,7 +46,7 @@ from metrics import ( openrent_requests_total, ) from spatial import PostcodeSpatialIndex -from transform import validate_floor_area +from transform import normalize_postcode, normalize_sub_type, validate_floor_area log = logging.getLogger("openrent") @@ -781,14 +781,14 @@ def transform_property( "Number of bedrooms & living rooms": bedrooms, "lon": lng, "lat": lat, - "Postcode": postcode, + "Postcode": normalize_postcode(postcode), "Address per Property Register": address, # OpenRent is a rental-only platform — tenure (Freehold/Leasehold) is a # property ownership concept that doesn't apply to rental listings. The # landlord's tenure is not shown on OpenRent listing pages. "Leasehold/Freehold": None, "Property type": map_property_type(property_type), - "Property sub-type": property_type or "Unknown", + "Property sub-type": normalize_sub_type(property_type), "price": int(price), "price_frequency": frequency, "Price qualifier": "", diff --git a/finder/scraper.py b/finder/scraper.py index 78db671..a06c354 100644 --- a/finder/scraper.py +++ b/finder/scraper.py @@ -338,7 +338,25 @@ def _load_checkpoint( if rpath.exists(): try: with open(rpath) as f: - loaded_results[source][channel.upper()] = json.load(f) + raw = json.load(f) + # Deduplicate by ID — concurrent workers (e.g. hk_worker's + # ThreadPoolExecutor) can cause in-flight outcodes to have + # results saved before their progress index is recorded. + # On resume those outcodes get re-scraped, duplicating results. + seen_ids: set[str] = set() + deduped: list[dict] = [] + for p in raw: + pid = p.get("id") + if pid not in seen_ids: + seen_ids.add(pid) + deduped.append(p) + if len(deduped) < len(raw): + log.info( + "Checkpoint %s/%s: deduped %d → %d (removed %d dupes)", + source, channel, len(raw), len(deduped), + len(raw) - len(deduped), + ) + loaded_results[source][channel.upper()] = deduped except Exception: log.warning( "Checkpoint results for %s/%s corrupt, restarting %s", diff --git a/finder/storage.py b/finder/storage.py index 487ee34..30ee9d1 100644 --- a/finder/storage.py +++ b/finder/storage.py @@ -5,7 +5,7 @@ from pathlib import Path import polars as pl from constants import MAX_BEDROOMS, MAX_RENT_MONTHLY, MIN_RENT_MONTHLY -from transform import map_property_type, normalize_price +from transform import map_property_type, normalize_postcode, normalize_price log = logging.getLogger("rightmove") @@ -132,7 +132,7 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None: ], "lon": [p["lon"] for p in properties], "lat": [p["lat"] for p in properties], - "Postcode": [p["Postcode"] for p in properties], + "Postcode": [normalize_postcode(p["Postcode"]) for p in properties], "Address per Property Register": [ p["Address per Property Register"] for p in properties ], diff --git a/finder/transform.py b/finder/transform.py index 301e0e6..94ec195 100644 --- a/finder/transform.py +++ b/finder/transform.py @@ -7,21 +7,24 @@ from spatial import PostcodeSpatialIndex log = logging.getLogger("rightmove") -# Maximum plausible floor area for a residential property listing (sqm). -# ~21,500 sq ft — covers even the largest UK mansions. +# Floor area bounds (sqm). Values outside this range are almost certainly +# data errors: sub-5 sqm catches garbled extractions (e.g., 0.1 sqm for a +# detached house), and >2000 sqm (~21,500 sq ft) exceeds even the largest +# UK mansions. +MIN_FLOOR_AREA_SQM = 5.0 MAX_FLOOR_AREA_SQM = 2000.0 def validate_floor_area(sqm: float | None) -> float | None: """Validate a floor area value. Returns None for nonsensical values. - Rejects zero/negative values and anything above MAX_FLOOR_AREA_SQM, + Rejects values below MIN_FLOOR_AREA_SQM and above MAX_FLOOR_AREA_SQM, which catches parsing errors where prices or other large numbers are mistakenly extracted as floor area from free-text descriptions or DOM text. """ if sqm is None: return None - if sqm <= 0 or sqm > MAX_FLOOR_AREA_SQM: + if sqm < MIN_FLOOR_AREA_SQM or sqm > MAX_FLOOR_AREA_SQM: return None return sqm @@ -42,6 +45,25 @@ def parse_display_size(display_size: str | None) -> float | None: return None +def normalize_sub_type(sub_type: str | None) -> str: + """Normalize property sub-type for consistent storage. + + Fixes delimiter inconsistencies (underscores/hyphens → spaces) from + home.co.uk and truncates Zoopla description fragments that were + accidentally captured as sub-types. + """ + if not sub_type: + return "Unknown" + cleaned = sub_type.replace("_", " ").strip() + # Description fragments captured as sub-types are much longer than any + # real property type name (longest canonical is ~25 chars) + if len(cleaned) > 40: + return "Unknown" + # Collapse multiple spaces + cleaned = re.sub(r"\s+", " ", cleaned) + return cleaned.title() + + def map_property_type(sub_type: str | None) -> str: """Map propertySubType to canonical type.""" if not sub_type: @@ -51,6 +73,15 @@ def map_property_type(sub_type: str | None) -> str: return canonical # Try title-case variant (e.g., "country house" → "Country House") canonical = PROPERTY_TYPE_MAP.get(sub_type.title()) + if canonical: + return canonical + # Try lowercase variant (e.g., "Townhouse" → "townhouse") + canonical = PROPERTY_TYPE_MAP.get(sub_type.lower()) + if canonical: + return canonical + # Normalize delimiters (underscores/hyphens → spaces) and try again + normalized = re.sub(r"[-_]+", " ", sub_type).strip().title() + canonical = PROPERTY_TYPE_MAP.get(normalized) if canonical: return canonical # Keyword fallback for compound types not in the map @@ -103,12 +134,13 @@ def fix_coords(lat: float, lng: float) -> tuple[float, float]: def normalize_postcode(postcode: str) -> str: - """Ensure UK postcode has a space before the 3-char incode. - E.g., 'SW1A1AA' → 'SW1A 1AA', 'E1 4AB' unchanged.""" - postcode = postcode.strip().upper() - if " " in postcode or len(postcode) < 5: - return postcode - return postcode[:-3] + " " + postcode[-3:] + """Ensure UK postcode has exactly one space before the 3-char incode. + E.g., 'SW1A1AA' → 'SW1A 1AA', 'N4 2HA' → 'N4 2HA', 'E1 4AB' unchanged.""" + # Strip all whitespace then re-insert the single canonical space + compact = re.sub(r"\s+", "", postcode).upper() + if len(compact) < 5: + return compact + return compact[:-3] + " " + compact[-3:] def normalize_price(amount: int, frequency: str) -> int: @@ -187,7 +219,7 @@ def transform_property( "Address per Property Register": prop.get("displayAddress", ""), "Leasehold/Freehold": extract_tenure(prop.get("tenure")), "Property type": map_property_type(sub_type), - "Property sub-type": sub_type or "Unknown", + "Property sub-type": normalize_sub_type(sub_type), "price": price, "price_frequency": frequency, "Price qualifier": price_qualifier, diff --git a/finder/zoopla.py b/finder/zoopla.py index f610704..60c2c5e 100644 --- a/finder/zoopla.py +++ b/finder/zoopla.py @@ -29,7 +29,7 @@ import time from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE from metrics import zoopla_errors_total, zoopla_pages_scraped, zoopla_properties_scraped from spatial import PostcodeSpatialIndex -from transform import validate_floor_area +from transform import normalize_sub_type, validate_floor_area log = logging.getLogger("zoopla") @@ -666,16 +666,25 @@ def _map_property_type(raw_type: str | None) -> str: return canonical # Title-case match (handles regex-extracted lowercase like "town house" → "Town House") canonical = PROPERTY_TYPE_MAP.get(raw_type.title()) + if canonical: + return canonical + # Lowercase match (e.g., "Townhouse" → "townhouse") + canonical = PROPERTY_TYPE_MAP.get(raw_type.lower()) + if canonical: + return canonical + # Normalize delimiters (underscores/hyphens → spaces) and try again + normalized = re.sub(r"[-_]+", " ", raw_type).strip().title() + canonical = PROPERTY_TYPE_MAP.get(normalized) if canonical: return canonical # Keyword fallback lower = raw_type.lower() if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower or "penthouse" in lower: return "Flats/Maisonettes" - if "detached" in lower and "semi" not in lower: - return "Detached" - if "semi" in lower: + if "semi" in lower and "detach" in lower: return "Semi-Detached" + if "detach" in lower: + return "Detached" if "terrace" in lower or "mews" in lower: return "Terraced" if "house" in lower: @@ -792,7 +801,7 @@ def transform_property( "Address per Property Register": address, "Leasehold/Freehold": raw.get("tenure") or None, "Property type": _map_property_type(raw.get("property_type")), - "Property sub-type": raw.get("property_type") or "", + "Property sub-type": normalize_sub_type(raw.get("property_type")), "price": int(price), "price_frequency": frequency, "Price qualifier": "",