Improve scraping
This commit is contained in:
parent
89a85e9a0c
commit
c14d28f430
7 changed files with 91 additions and 25 deletions
|
|
@ -117,6 +117,13 @@ PROPERTY_TYPE_MAP = {
|
||||||
"House Boat": "Other",
|
"House Boat": "Other",
|
||||||
"Barn": "Other",
|
"Barn": "Other",
|
||||||
"Serviced Apartments": "Flats/Maisonettes",
|
"Serviced Apartments": "Flats/Maisonettes",
|
||||||
|
# Space-separated variants (from home.co.uk underscore/hyphen normalization)
|
||||||
|
"Semi Detached": "Semi-Detached",
|
||||||
|
"Semi Detached Bungalow": "Semi-Detached",
|
||||||
|
"End Of Terrace": "Terraced",
|
||||||
|
"End Terrace": "Terraced",
|
||||||
|
"Block Of Apartments": "Flats/Maisonettes",
|
||||||
|
"Farm / Barn": "Other",
|
||||||
# Lowercase variants (from home.co.uk / Rightmove APIs)
|
# Lowercase variants (from home.co.uk / Rightmove APIs)
|
||||||
"house": "Detached",
|
"house": "Detached",
|
||||||
"bungalow": "Other",
|
"bungalow": "Other",
|
||||||
|
|
|
||||||
|
|
@ -26,7 +26,7 @@ from metrics import (
|
||||||
homecouk_requests_total,
|
homecouk_requests_total,
|
||||||
)
|
)
|
||||||
from spatial import PostcodeSpatialIndex
|
from spatial import PostcodeSpatialIndex
|
||||||
from transform import validate_floor_area
|
from transform import normalize_postcode, normalize_sub_type, validate_floor_area
|
||||||
|
|
||||||
log = logging.getLogger("homecouk")
|
log = logging.getLogger("homecouk")
|
||||||
|
|
||||||
|
|
@ -359,11 +359,11 @@ def transform_property(
|
||||||
"Number of bedrooms & living rooms": bedrooms + bathrooms,
|
"Number of bedrooms & living rooms": bedrooms + bathrooms,
|
||||||
"lon": lng,
|
"lon": lng,
|
||||||
"lat": lat,
|
"lat": lat,
|
||||||
"Postcode": postcode,
|
"Postcode": normalize_postcode(postcode),
|
||||||
"Address per Property Register": address,
|
"Address per Property Register": address,
|
||||||
"Leasehold/Freehold": parse_tenure(prop),
|
"Leasehold/Freehold": parse_tenure(prop),
|
||||||
"Property type": map_property_type(listing_type),
|
"Property type": map_property_type(listing_type),
|
||||||
"Property sub-type": listing_type.title() if listing_type else "Unknown",
|
"Property sub-type": normalize_sub_type(listing_type),
|
||||||
"price": int(price),
|
"price": int(price),
|
||||||
"price_frequency": "" if channel == "BUY" else "monthly",
|
"price_frequency": "" if channel == "BUY" else "monthly",
|
||||||
"Price qualifier": price_qualifier,
|
"Price qualifier": price_qualifier,
|
||||||
|
|
|
||||||
|
|
@ -46,7 +46,7 @@ from metrics import (
|
||||||
openrent_requests_total,
|
openrent_requests_total,
|
||||||
)
|
)
|
||||||
from spatial import PostcodeSpatialIndex
|
from spatial import PostcodeSpatialIndex
|
||||||
from transform import validate_floor_area
|
from transform import normalize_postcode, normalize_sub_type, validate_floor_area
|
||||||
|
|
||||||
log = logging.getLogger("openrent")
|
log = logging.getLogger("openrent")
|
||||||
|
|
||||||
|
|
@ -781,14 +781,14 @@ def transform_property(
|
||||||
"Number of bedrooms & living rooms": bedrooms,
|
"Number of bedrooms & living rooms": bedrooms,
|
||||||
"lon": lng,
|
"lon": lng,
|
||||||
"lat": lat,
|
"lat": lat,
|
||||||
"Postcode": postcode,
|
"Postcode": normalize_postcode(postcode),
|
||||||
"Address per Property Register": address,
|
"Address per Property Register": address,
|
||||||
# OpenRent is a rental-only platform — tenure (Freehold/Leasehold) is a
|
# OpenRent is a rental-only platform — tenure (Freehold/Leasehold) is a
|
||||||
# property ownership concept that doesn't apply to rental listings. The
|
# property ownership concept that doesn't apply to rental listings. The
|
||||||
# landlord's tenure is not shown on OpenRent listing pages.
|
# landlord's tenure is not shown on OpenRent listing pages.
|
||||||
"Leasehold/Freehold": None,
|
"Leasehold/Freehold": None,
|
||||||
"Property type": map_property_type(property_type),
|
"Property type": map_property_type(property_type),
|
||||||
"Property sub-type": property_type or "Unknown",
|
"Property sub-type": normalize_sub_type(property_type),
|
||||||
"price": int(price),
|
"price": int(price),
|
||||||
"price_frequency": frequency,
|
"price_frequency": frequency,
|
||||||
"Price qualifier": "",
|
"Price qualifier": "",
|
||||||
|
|
|
||||||
|
|
@ -338,7 +338,25 @@ def _load_checkpoint(
|
||||||
if rpath.exists():
|
if rpath.exists():
|
||||||
try:
|
try:
|
||||||
with open(rpath) as f:
|
with open(rpath) as f:
|
||||||
loaded_results[source][channel.upper()] = json.load(f)
|
raw = json.load(f)
|
||||||
|
# Deduplicate by ID — concurrent workers (e.g. hk_worker's
|
||||||
|
# ThreadPoolExecutor) can cause in-flight outcodes to have
|
||||||
|
# results saved before their progress index is recorded.
|
||||||
|
# On resume those outcodes get re-scraped, duplicating results.
|
||||||
|
seen_ids: set[str] = set()
|
||||||
|
deduped: list[dict] = []
|
||||||
|
for p in raw:
|
||||||
|
pid = p.get("id")
|
||||||
|
if pid not in seen_ids:
|
||||||
|
seen_ids.add(pid)
|
||||||
|
deduped.append(p)
|
||||||
|
if len(deduped) < len(raw):
|
||||||
|
log.info(
|
||||||
|
"Checkpoint %s/%s: deduped %d → %d (removed %d dupes)",
|
||||||
|
source, channel, len(raw), len(deduped),
|
||||||
|
len(raw) - len(deduped),
|
||||||
|
)
|
||||||
|
loaded_results[source][channel.upper()] = deduped
|
||||||
except Exception:
|
except Exception:
|
||||||
log.warning(
|
log.warning(
|
||||||
"Checkpoint results for %s/%s corrupt, restarting %s",
|
"Checkpoint results for %s/%s corrupt, restarting %s",
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,7 @@ from pathlib import Path
|
||||||
import polars as pl
|
import polars as pl
|
||||||
|
|
||||||
from constants import MAX_BEDROOMS, MAX_RENT_MONTHLY, MIN_RENT_MONTHLY
|
from constants import MAX_BEDROOMS, MAX_RENT_MONTHLY, MIN_RENT_MONTHLY
|
||||||
from transform import map_property_type, normalize_price
|
from transform import map_property_type, normalize_postcode, normalize_price
|
||||||
|
|
||||||
log = logging.getLogger("rightmove")
|
log = logging.getLogger("rightmove")
|
||||||
|
|
||||||
|
|
@ -132,7 +132,7 @@ def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
|
||||||
],
|
],
|
||||||
"lon": [p["lon"] for p in properties],
|
"lon": [p["lon"] for p in properties],
|
||||||
"lat": [p["lat"] for p in properties],
|
"lat": [p["lat"] for p in properties],
|
||||||
"Postcode": [p["Postcode"] for p in properties],
|
"Postcode": [normalize_postcode(p["Postcode"]) for p in properties],
|
||||||
"Address per Property Register": [
|
"Address per Property Register": [
|
||||||
p["Address per Property Register"] for p in properties
|
p["Address per Property Register"] for p in properties
|
||||||
],
|
],
|
||||||
|
|
|
||||||
|
|
@ -7,21 +7,24 @@ from spatial import PostcodeSpatialIndex
|
||||||
log = logging.getLogger("rightmove")
|
log = logging.getLogger("rightmove")
|
||||||
|
|
||||||
|
|
||||||
# Maximum plausible floor area for a residential property listing (sqm).
|
# Floor area bounds (sqm). Values outside this range are almost certainly
|
||||||
# ~21,500 sq ft — covers even the largest UK mansions.
|
# data errors: sub-5 sqm catches garbled extractions (e.g., 0.1 sqm for a
|
||||||
|
# detached house), and >2000 sqm (~21,500 sq ft) exceeds even the largest
|
||||||
|
# UK mansions.
|
||||||
|
MIN_FLOOR_AREA_SQM = 5.0
|
||||||
MAX_FLOOR_AREA_SQM = 2000.0
|
MAX_FLOOR_AREA_SQM = 2000.0
|
||||||
|
|
||||||
|
|
||||||
def validate_floor_area(sqm: float | None) -> float | None:
|
def validate_floor_area(sqm: float | None) -> float | None:
|
||||||
"""Validate a floor area value. Returns None for nonsensical values.
|
"""Validate a floor area value. Returns None for nonsensical values.
|
||||||
|
|
||||||
Rejects zero/negative values and anything above MAX_FLOOR_AREA_SQM,
|
Rejects values below MIN_FLOOR_AREA_SQM and above MAX_FLOOR_AREA_SQM,
|
||||||
which catches parsing errors where prices or other large numbers are
|
which catches parsing errors where prices or other large numbers are
|
||||||
mistakenly extracted as floor area from free-text descriptions or DOM text.
|
mistakenly extracted as floor area from free-text descriptions or DOM text.
|
||||||
"""
|
"""
|
||||||
if sqm is None:
|
if sqm is None:
|
||||||
return None
|
return None
|
||||||
if sqm <= 0 or sqm > MAX_FLOOR_AREA_SQM:
|
if sqm < MIN_FLOOR_AREA_SQM or sqm > MAX_FLOOR_AREA_SQM:
|
||||||
return None
|
return None
|
||||||
return sqm
|
return sqm
|
||||||
|
|
||||||
|
|
@ -42,6 +45,25 @@ def parse_display_size(display_size: str | None) -> float | None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_sub_type(sub_type: str | None) -> str:
|
||||||
|
"""Normalize property sub-type for consistent storage.
|
||||||
|
|
||||||
|
Fixes delimiter inconsistencies (underscores/hyphens → spaces) from
|
||||||
|
home.co.uk and truncates Zoopla description fragments that were
|
||||||
|
accidentally captured as sub-types.
|
||||||
|
"""
|
||||||
|
if not sub_type:
|
||||||
|
return "Unknown"
|
||||||
|
cleaned = sub_type.replace("_", " ").strip()
|
||||||
|
# Description fragments captured as sub-types are much longer than any
|
||||||
|
# real property type name (longest canonical is ~25 chars)
|
||||||
|
if len(cleaned) > 40:
|
||||||
|
return "Unknown"
|
||||||
|
# Collapse multiple spaces
|
||||||
|
cleaned = re.sub(r"\s+", " ", cleaned)
|
||||||
|
return cleaned.title()
|
||||||
|
|
||||||
|
|
||||||
def map_property_type(sub_type: str | None) -> str:
|
def map_property_type(sub_type: str | None) -> str:
|
||||||
"""Map propertySubType to canonical type."""
|
"""Map propertySubType to canonical type."""
|
||||||
if not sub_type:
|
if not sub_type:
|
||||||
|
|
@ -51,6 +73,15 @@ def map_property_type(sub_type: str | None) -> str:
|
||||||
return canonical
|
return canonical
|
||||||
# Try title-case variant (e.g., "country house" → "Country House")
|
# Try title-case variant (e.g., "country house" → "Country House")
|
||||||
canonical = PROPERTY_TYPE_MAP.get(sub_type.title())
|
canonical = PROPERTY_TYPE_MAP.get(sub_type.title())
|
||||||
|
if canonical:
|
||||||
|
return canonical
|
||||||
|
# Try lowercase variant (e.g., "Townhouse" → "townhouse")
|
||||||
|
canonical = PROPERTY_TYPE_MAP.get(sub_type.lower())
|
||||||
|
if canonical:
|
||||||
|
return canonical
|
||||||
|
# Normalize delimiters (underscores/hyphens → spaces) and try again
|
||||||
|
normalized = re.sub(r"[-_]+", " ", sub_type).strip().title()
|
||||||
|
canonical = PROPERTY_TYPE_MAP.get(normalized)
|
||||||
if canonical:
|
if canonical:
|
||||||
return canonical
|
return canonical
|
||||||
# Keyword fallback for compound types not in the map
|
# Keyword fallback for compound types not in the map
|
||||||
|
|
@ -103,12 +134,13 @@ def fix_coords(lat: float, lng: float) -> tuple[float, float]:
|
||||||
|
|
||||||
|
|
||||||
def normalize_postcode(postcode: str) -> str:
|
def normalize_postcode(postcode: str) -> str:
|
||||||
"""Ensure UK postcode has a space before the 3-char incode.
|
"""Ensure UK postcode has exactly one space before the 3-char incode.
|
||||||
E.g., 'SW1A1AA' → 'SW1A 1AA', 'E1 4AB' unchanged."""
|
E.g., 'SW1A1AA' → 'SW1A 1AA', 'N4 2HA' → 'N4 2HA', 'E1 4AB' unchanged."""
|
||||||
postcode = postcode.strip().upper()
|
# Strip all whitespace then re-insert the single canonical space
|
||||||
if " " in postcode or len(postcode) < 5:
|
compact = re.sub(r"\s+", "", postcode).upper()
|
||||||
return postcode
|
if len(compact) < 5:
|
||||||
return postcode[:-3] + " " + postcode[-3:]
|
return compact
|
||||||
|
return compact[:-3] + " " + compact[-3:]
|
||||||
|
|
||||||
|
|
||||||
def normalize_price(amount: int, frequency: str) -> int:
|
def normalize_price(amount: int, frequency: str) -> int:
|
||||||
|
|
@ -187,7 +219,7 @@ def transform_property(
|
||||||
"Address per Property Register": prop.get("displayAddress", ""),
|
"Address per Property Register": prop.get("displayAddress", ""),
|
||||||
"Leasehold/Freehold": extract_tenure(prop.get("tenure")),
|
"Leasehold/Freehold": extract_tenure(prop.get("tenure")),
|
||||||
"Property type": map_property_type(sub_type),
|
"Property type": map_property_type(sub_type),
|
||||||
"Property sub-type": sub_type or "Unknown",
|
"Property sub-type": normalize_sub_type(sub_type),
|
||||||
"price": price,
|
"price": price,
|
||||||
"price_frequency": frequency,
|
"price_frequency": frequency,
|
||||||
"Price qualifier": price_qualifier,
|
"Price qualifier": price_qualifier,
|
||||||
|
|
|
||||||
|
|
@ -29,7 +29,7 @@ import time
|
||||||
from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE
|
from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE
|
||||||
from metrics import zoopla_errors_total, zoopla_pages_scraped, zoopla_properties_scraped
|
from metrics import zoopla_errors_total, zoopla_pages_scraped, zoopla_properties_scraped
|
||||||
from spatial import PostcodeSpatialIndex
|
from spatial import PostcodeSpatialIndex
|
||||||
from transform import validate_floor_area
|
from transform import normalize_sub_type, validate_floor_area
|
||||||
|
|
||||||
log = logging.getLogger("zoopla")
|
log = logging.getLogger("zoopla")
|
||||||
|
|
||||||
|
|
@ -666,16 +666,25 @@ def _map_property_type(raw_type: str | None) -> str:
|
||||||
return canonical
|
return canonical
|
||||||
# Title-case match (handles regex-extracted lowercase like "town house" → "Town House")
|
# Title-case match (handles regex-extracted lowercase like "town house" → "Town House")
|
||||||
canonical = PROPERTY_TYPE_MAP.get(raw_type.title())
|
canonical = PROPERTY_TYPE_MAP.get(raw_type.title())
|
||||||
|
if canonical:
|
||||||
|
return canonical
|
||||||
|
# Lowercase match (e.g., "Townhouse" → "townhouse")
|
||||||
|
canonical = PROPERTY_TYPE_MAP.get(raw_type.lower())
|
||||||
|
if canonical:
|
||||||
|
return canonical
|
||||||
|
# Normalize delimiters (underscores/hyphens → spaces) and try again
|
||||||
|
normalized = re.sub(r"[-_]+", " ", raw_type).strip().title()
|
||||||
|
canonical = PROPERTY_TYPE_MAP.get(normalized)
|
||||||
if canonical:
|
if canonical:
|
||||||
return canonical
|
return canonical
|
||||||
# Keyword fallback
|
# Keyword fallback
|
||||||
lower = raw_type.lower()
|
lower = raw_type.lower()
|
||||||
if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower or "penthouse" in lower:
|
if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower or "penthouse" in lower:
|
||||||
return "Flats/Maisonettes"
|
return "Flats/Maisonettes"
|
||||||
if "detached" in lower and "semi" not in lower:
|
if "semi" in lower and "detach" in lower:
|
||||||
return "Detached"
|
|
||||||
if "semi" in lower:
|
|
||||||
return "Semi-Detached"
|
return "Semi-Detached"
|
||||||
|
if "detach" in lower:
|
||||||
|
return "Detached"
|
||||||
if "terrace" in lower or "mews" in lower:
|
if "terrace" in lower or "mews" in lower:
|
||||||
return "Terraced"
|
return "Terraced"
|
||||||
if "house" in lower:
|
if "house" in lower:
|
||||||
|
|
@ -792,7 +801,7 @@ def transform_property(
|
||||||
"Address per Property Register": address,
|
"Address per Property Register": address,
|
||||||
"Leasehold/Freehold": raw.get("tenure") or None,
|
"Leasehold/Freehold": raw.get("tenure") or None,
|
||||||
"Property type": _map_property_type(raw.get("property_type")),
|
"Property type": _map_property_type(raw.get("property_type")),
|
||||||
"Property sub-type": raw.get("property_type") or "",
|
"Property sub-type": normalize_sub_type(raw.get("property_type")),
|
||||||
"price": int(price),
|
"price": int(price),
|
||||||
"price_frequency": frequency,
|
"price_frequency": frequency,
|
||||||
"Price qualifier": "",
|
"Price qualifier": "",
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue