Improve scraping
This commit is contained in:
parent
89a85e9a0c
commit
c14d28f430
7 changed files with 91 additions and 25 deletions
|
|
@ -117,6 +117,13 @@ PROPERTY_TYPE_MAP = {
|
|||
"House Boat": "Other",
|
||||
"Barn": "Other",
|
||||
"Serviced Apartments": "Flats/Maisonettes",
|
||||
# Space-separated variants (from home.co.uk underscore/hyphen normalization)
|
||||
"Semi Detached": "Semi-Detached",
|
||||
"Semi Detached Bungalow": "Semi-Detached",
|
||||
"End Of Terrace": "Terraced",
|
||||
"End Terrace": "Terraced",
|
||||
"Block Of Apartments": "Flats/Maisonettes",
|
||||
"Farm / Barn": "Other",
|
||||
# Lowercase variants (from home.co.uk / Rightmove APIs)
|
||||
"house": "Detached",
|
||||
"bungalow": "Other",
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ from metrics import (
|
|||
homecouk_requests_total,
|
||||
)
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import validate_floor_area
|
||||
from transform import normalize_postcode, normalize_sub_type, validate_floor_area
|
||||
|
||||
log = logging.getLogger("homecouk")
|
||||
|
||||
|
|
@ -359,11 +359,11 @@ def transform_property(
|
|||
"Number of bedrooms & living rooms": bedrooms + bathrooms,
|
||||
"lon": lng,
|
||||
"lat": lat,
|
||||
"Postcode": postcode,
|
||||
"Postcode": normalize_postcode(postcode),
|
||||
"Address per Property Register": address,
|
||||
"Leasehold/Freehold": parse_tenure(prop),
|
||||
"Property type": map_property_type(listing_type),
|
||||
"Property sub-type": listing_type.title() if listing_type else "Unknown",
|
||||
"Property sub-type": normalize_sub_type(listing_type),
|
||||
"price": int(price),
|
||||
"price_frequency": "" if channel == "BUY" else "monthly",
|
||||
"Price qualifier": price_qualifier,
|
||||
|
|
|
|||
|
|
@ -46,7 +46,7 @@ from metrics import (
|
|||
openrent_requests_total,
|
||||
)
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import validate_floor_area
|
||||
from transform import normalize_postcode, normalize_sub_type, validate_floor_area
|
||||
|
||||
log = logging.getLogger("openrent")
|
||||
|
||||
|
|
@ -781,14 +781,14 @@ def transform_property(
|
|||
"Number of bedrooms & living rooms": bedrooms,
|
||||
"lon": lng,
|
||||
"lat": lat,
|
||||
"Postcode": postcode,
|
||||
"Postcode": normalize_postcode(postcode),
|
||||
"Address per Property Register": address,
|
||||
# OpenRent is a rental-only platform — tenure (Freehold/Leasehold) is a
|
||||
# property ownership concept that doesn't apply to rental listings. The
|
||||
# landlord's tenure is not shown on OpenRent listing pages.
|
||||
"Leasehold/Freehold": None,
|
||||
"Property type": map_property_type(property_type),
|
||||
"Property sub-type": property_type or "Unknown",
|
||||
"Property sub-type": normalize_sub_type(property_type),
|
||||
"price": int(price),
|
||||
"price_frequency": frequency,
|
||||
"Price qualifier": "",
|
||||
|
|
|
|||
|
|
@ -338,7 +338,25 @@ def _load_checkpoint(
|
|||
if rpath.exists():
|
||||
try:
|
||||
with open(rpath) as f:
|
||||
loaded_results[source][channel.upper()] = json.load(f)
|
||||
raw = json.load(f)
|
||||
# Deduplicate by ID — concurrent workers (e.g. hk_worker's
|
||||
# ThreadPoolExecutor) can cause in-flight outcodes to have
|
||||
# results saved before their progress index is recorded.
|
||||
# On resume those outcodes get re-scraped, duplicating results.
|
||||
seen_ids: set[str] = set()
|
||||
deduped: list[dict] = []
|
||||
for p in raw:
|
||||
pid = p.get("id")
|
||||
if pid not in seen_ids:
|
||||
seen_ids.add(pid)
|
||||
deduped.append(p)
|
||||
if len(deduped) < len(raw):
|
||||
log.info(
|
||||
"Checkpoint %s/%s: deduped %d → %d (removed %d dupes)",
|
||||
source, channel, len(raw), len(deduped),
|
||||
len(raw) - len(deduped),
|
||||
)
|
||||
loaded_results[source][channel.upper()] = deduped
|
||||
except Exception:
|
||||
log.warning(
|
||||
"Checkpoint results for %s/%s corrupt, restarting %s",
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ from pathlib import Path
|
|||
import polars as pl
|
||||
|
||||
from constants import MAX_BEDROOMS, MAX_RENT_MONTHLY, MIN_RENT_MONTHLY
|
||||
from transform import map_property_type, normalize_price
|
||||
from transform import map_property_type, normalize_postcode, normalize_price
|
||||
|
||||
log = logging.getLogger("rightmove")
|
||||
|
||||
|
|
@ -132,7 +132,7 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
|
|||
],
|
||||
"lon": [p["lon"] for p in properties],
|
||||
"lat": [p["lat"] for p in properties],
|
||||
"Postcode": [p["Postcode"] for p in properties],
|
||||
"Postcode": [normalize_postcode(p["Postcode"]) for p in properties],
|
||||
"Address per Property Register": [
|
||||
p["Address per Property Register"] for p in properties
|
||||
],
|
||||
|
|
|
|||
|
|
@ -7,21 +7,24 @@ from spatial import PostcodeSpatialIndex
|
|||
log = logging.getLogger("rightmove")
|
||||
|
||||
|
||||
# Maximum plausible floor area for a residential property listing (sqm).
|
||||
# ~21,500 sq ft — covers even the largest UK mansions.
|
||||
# Floor area bounds (sqm). Values outside this range are almost certainly
|
||||
# data errors: sub-5 sqm catches garbled extractions (e.g., 0.1 sqm for a
|
||||
# detached house), and >2000 sqm (~21,500 sq ft) exceeds even the largest
|
||||
# UK mansions.
|
||||
MIN_FLOOR_AREA_SQM = 5.0
|
||||
MAX_FLOOR_AREA_SQM = 2000.0
|
||||
|
||||
|
||||
def validate_floor_area(sqm: float | None) -> float | None:
|
||||
"""Validate a floor area value. Returns None for nonsensical values.
|
||||
|
||||
Rejects zero/negative values and anything above MAX_FLOOR_AREA_SQM,
|
||||
Rejects values below MIN_FLOOR_AREA_SQM and above MAX_FLOOR_AREA_SQM,
|
||||
which catches parsing errors where prices or other large numbers are
|
||||
mistakenly extracted as floor area from free-text descriptions or DOM text.
|
||||
"""
|
||||
if sqm is None:
|
||||
return None
|
||||
if sqm <= 0 or sqm > MAX_FLOOR_AREA_SQM:
|
||||
if sqm < MIN_FLOOR_AREA_SQM or sqm > MAX_FLOOR_AREA_SQM:
|
||||
return None
|
||||
return sqm
|
||||
|
||||
|
|
@ -42,6 +45,25 @@ def parse_display_size(display_size: str | None) -> float | None:
|
|||
return None
|
||||
|
||||
|
||||
def normalize_sub_type(sub_type: str | None) -> str:
|
||||
"""Normalize property sub-type for consistent storage.
|
||||
|
||||
Fixes delimiter inconsistencies (underscores/hyphens → spaces) from
|
||||
home.co.uk and truncates Zoopla description fragments that were
|
||||
accidentally captured as sub-types.
|
||||
"""
|
||||
if not sub_type:
|
||||
return "Unknown"
|
||||
cleaned = sub_type.replace("_", " ").strip()
|
||||
# Description fragments captured as sub-types are much longer than any
|
||||
# real property type name (longest canonical is ~25 chars)
|
||||
if len(cleaned) > 40:
|
||||
return "Unknown"
|
||||
# Collapse multiple spaces
|
||||
cleaned = re.sub(r"\s+", " ", cleaned)
|
||||
return cleaned.title()
|
||||
|
||||
|
||||
def map_property_type(sub_type: str | None) -> str:
|
||||
"""Map propertySubType to canonical type."""
|
||||
if not sub_type:
|
||||
|
|
@ -51,6 +73,15 @@ def map_property_type(sub_type: str | None) -> str:
|
|||
return canonical
|
||||
# Try title-case variant (e.g., "country house" → "Country House")
|
||||
canonical = PROPERTY_TYPE_MAP.get(sub_type.title())
|
||||
if canonical:
|
||||
return canonical
|
||||
# Try lowercase variant (e.g., "Townhouse" → "townhouse")
|
||||
canonical = PROPERTY_TYPE_MAP.get(sub_type.lower())
|
||||
if canonical:
|
||||
return canonical
|
||||
# Normalize delimiters (underscores/hyphens → spaces) and try again
|
||||
normalized = re.sub(r"[-_]+", " ", sub_type).strip().title()
|
||||
canonical = PROPERTY_TYPE_MAP.get(normalized)
|
||||
if canonical:
|
||||
return canonical
|
||||
# Keyword fallback for compound types not in the map
|
||||
|
|
@ -103,12 +134,13 @@ def fix_coords(lat: float, lng: float) -> tuple[float, float]:
|
|||
|
||||
|
||||
def normalize_postcode(postcode: str) -> str:
|
||||
"""Ensure UK postcode has a space before the 3-char incode.
|
||||
E.g., 'SW1A1AA' → 'SW1A 1AA', 'E1 4AB' unchanged."""
|
||||
postcode = postcode.strip().upper()
|
||||
if " " in postcode or len(postcode) < 5:
|
||||
return postcode
|
||||
return postcode[:-3] + " " + postcode[-3:]
|
||||
"""Ensure UK postcode has exactly one space before the 3-char incode.
|
||||
E.g., 'SW1A1AA' → 'SW1A 1AA', 'N4 2HA' → 'N4 2HA', 'E1 4AB' unchanged."""
|
||||
# Strip all whitespace then re-insert the single canonical space
|
||||
compact = re.sub(r"\s+", "", postcode).upper()
|
||||
if len(compact) < 5:
|
||||
return compact
|
||||
return compact[:-3] + " " + compact[-3:]
|
||||
|
||||
|
||||
def normalize_price(amount: int, frequency: str) -> int:
|
||||
|
|
@ -187,7 +219,7 @@ def transform_property(
|
|||
"Address per Property Register": prop.get("displayAddress", ""),
|
||||
"Leasehold/Freehold": extract_tenure(prop.get("tenure")),
|
||||
"Property type": map_property_type(sub_type),
|
||||
"Property sub-type": sub_type or "Unknown",
|
||||
"Property sub-type": normalize_sub_type(sub_type),
|
||||
"price": price,
|
||||
"price_frequency": frequency,
|
||||
"Price qualifier": price_qualifier,
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ import time
|
|||
from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE
|
||||
from metrics import zoopla_errors_total, zoopla_pages_scraped, zoopla_properties_scraped
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import validate_floor_area
|
||||
from transform import normalize_sub_type, validate_floor_area
|
||||
|
||||
log = logging.getLogger("zoopla")
|
||||
|
||||
|
|
@ -666,16 +666,25 @@ def _map_property_type(raw_type: str | None) -> str:
|
|||
return canonical
|
||||
# Title-case match (handles regex-extracted lowercase like "town house" → "Town House")
|
||||
canonical = PROPERTY_TYPE_MAP.get(raw_type.title())
|
||||
if canonical:
|
||||
return canonical
|
||||
# Lowercase match (e.g., "Townhouse" → "townhouse")
|
||||
canonical = PROPERTY_TYPE_MAP.get(raw_type.lower())
|
||||
if canonical:
|
||||
return canonical
|
||||
# Normalize delimiters (underscores/hyphens → spaces) and try again
|
||||
normalized = re.sub(r"[-_]+", " ", raw_type).strip().title()
|
||||
canonical = PROPERTY_TYPE_MAP.get(normalized)
|
||||
if canonical:
|
||||
return canonical
|
||||
# Keyword fallback
|
||||
lower = raw_type.lower()
|
||||
if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower or "penthouse" in lower:
|
||||
return "Flats/Maisonettes"
|
||||
if "detached" in lower and "semi" not in lower:
|
||||
return "Detached"
|
||||
if "semi" in lower:
|
||||
if "semi" in lower and "detach" in lower:
|
||||
return "Semi-Detached"
|
||||
if "detach" in lower:
|
||||
return "Detached"
|
||||
if "terrace" in lower or "mews" in lower:
|
||||
return "Terraced"
|
||||
if "house" in lower:
|
||||
|
|
@ -792,7 +801,7 @@ def transform_property(
|
|||
"Address per Property Register": address,
|
||||
"Leasehold/Freehold": raw.get("tenure") or None,
|
||||
"Property type": _map_property_type(raw.get("property_type")),
|
||||
"Property sub-type": raw.get("property_type") or "",
|
||||
"Property sub-type": normalize_sub_type(raw.get("property_type")),
|
||||
"price": int(price),
|
||||
"price_frequency": frequency,
|
||||
"Price qualifier": "",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue