Improve scraping
This commit is contained in:
parent
89a85e9a0c
commit
c14d28f430
7 changed files with 91 additions and 25 deletions
|
|
@ -29,7 +29,7 @@ import time
|
|||
from constants import DELAY_BETWEEN_PAGES, MAX_BEDROOMS, PROPERTY_TYPE_MAP, ZOOPLA_BASE
|
||||
from metrics import zoopla_errors_total, zoopla_pages_scraped, zoopla_properties_scraped
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import validate_floor_area
|
||||
from transform import normalize_sub_type, validate_floor_area
|
||||
|
||||
log = logging.getLogger("zoopla")
|
||||
|
||||
|
|
@ -666,16 +666,25 @@ def _map_property_type(raw_type: str | None) -> str:
|
|||
return canonical
|
||||
# Title-case match (handles regex-extracted lowercase like "town house" → "Town House")
|
||||
canonical = PROPERTY_TYPE_MAP.get(raw_type.title())
|
||||
if canonical:
|
||||
return canonical
|
||||
# Lowercase match (e.g., "Townhouse" → "townhouse")
|
||||
canonical = PROPERTY_TYPE_MAP.get(raw_type.lower())
|
||||
if canonical:
|
||||
return canonical
|
||||
# Normalize delimiters (underscores/hyphens → spaces) and try again
|
||||
normalized = re.sub(r"[-_]+", " ", raw_type).strip().title()
|
||||
canonical = PROPERTY_TYPE_MAP.get(normalized)
|
||||
if canonical:
|
||||
return canonical
|
||||
# Keyword fallback
|
||||
lower = raw_type.lower()
|
||||
if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower or "penthouse" in lower:
|
||||
return "Flats/Maisonettes"
|
||||
if "detached" in lower and "semi" not in lower:
|
||||
return "Detached"
|
||||
if "semi" in lower:
|
||||
if "semi" in lower and "detach" in lower:
|
||||
return "Semi-Detached"
|
||||
if "detach" in lower:
|
||||
return "Detached"
|
||||
if "terrace" in lower or "mews" in lower:
|
||||
return "Terraced"
|
||||
if "house" in lower:
|
||||
|
|
@ -792,7 +801,7 @@ def transform_property(
|
|||
"Address per Property Register": address,
|
||||
"Leasehold/Freehold": raw.get("tenure") or None,
|
||||
"Property type": _map_property_type(raw.get("property_type")),
|
||||
"Property sub-type": raw.get("property_type") or "",
|
||||
"Property sub-type": normalize_sub_type(raw.get("property_type")),
|
||||
"price": int(price),
|
||||
"price_frequency": frequency,
|
||||
"Price qualifier": "",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue