Improve scraping

This commit is contained in:
Andras Schmelczer 2026-03-29 11:41:35 +01:00
parent 89a85e9a0c
commit c14d28f430
7 changed files with 91 additions and 25 deletions

View file

@@ -29,7 +29,7 @@ import time
from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE
from metrics import zoopla_errors_total, zoopla_pages_scraped, zoopla_properties_scraped
from spatial import PostcodeSpatialIndex
from transform import validate_floor_area
from transform import normalize_sub_type, validate_floor_area
log = logging.getLogger("zoopla")
@@ -666,16 +666,25 @@ def _map_property_type(raw_type: str | None) -> str:
return canonical
# Title-case match (handles regex-extracted lowercase like "town house" → "Town House")
canonical = PROPERTY_TYPE_MAP.get(raw_type.title())
if canonical:
return canonical
# Lowercase match (e.g., "Townhouse" → "townhouse")
canonical = PROPERTY_TYPE_MAP.get(raw_type.lower())
if canonical:
return canonical
# Normalize delimiters (underscores/hyphens → spaces) and try again
normalized = re.sub(r"[-_]+", " ", raw_type).strip().title()
canonical = PROPERTY_TYPE_MAP.get(normalized)
if canonical:
return canonical
# Keyword fallback
lower = raw_type.lower()
if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower or "penthouse" in lower:
return "Flats/Maisonettes"
if "detached" in lower and "semi" not in lower:
return "Detached"
if "semi" in lower:
if "semi" in lower and "detach" in lower:
return "Semi-Detached"
if "detach" in lower:
return "Detached"
if "terrace" in lower or "mews" in lower:
return "Terraced"
if "house" in lower:
@@ -792,7 +801,7 @@ def transform_property(
"Address per Property Register": address,
"Leasehold/Freehold": raw.get("tenure") or None,
"Property type": _map_property_type(raw.get("property_type")),
"Property sub-type": raw.get("property_type") or "",
"Property sub-type": normalize_sub_type(raw.get("property_type")),
"price": int(price),
"price_frequency": frequency,
"Price qualifier": "",