scraping and data

This commit is contained in:
Andras Schmelczer 2026-05-31 15:36:33 +01:00
parent d98819b569
commit 8688b7475e
43 changed files with 4920 additions and 531 deletions

288
finder/test_zoopla.py Normal file
View file

@ -0,0 +1,288 @@
from zoopla import _detail_cache_key, parse_detail_geo, transform_property
def test_detail_cache_key_uses_listing_id() -> None:
    """Cache key is the listing id from the URL, or the URL itself when no id is present."""
    listing_id = "59888978"
    urls_with_id = (
        "/for-sale/details/59888978/",
        "https://www.zoopla.co.uk/for-sale/details/59888978/",
    )
    for url in urls_with_id:
        assert _detail_cache_key(url) == listing_id

    # A URL that carries no listing id is used verbatim as its own key.
    url_without_id = "/for-sale/property/br1/"
    assert _detail_cache_key(url_without_id) == url_without_id
class StubPostcodeIndex:
    """Fake spatial index that answers every nearest-lookup with one preset postcode."""

    def __init__(self, postcode: str = "BR1 2AB") -> None:
        """Remember the postcode that every lookup will hand back."""
        self._postcode = postcode

    def nearest(self, lat: float, lng: float) -> str:
        """Return the preset postcode; the coordinates play no part in the answer."""
        del lat, lng  # deliberately unused: the stub's answer is fixed
        return self._postcode
# Postcode -> (latitude, longitude) lookup handed to transform_property by the
# tests below. Holds two London-ish postcodes plus the Norfolk sample used by
# the verified detail-page snippet (well inside the England bounds check); the
# NR29 4RG entry matches the coordinates embedded in that snippet.
PC_COORDS = {
    "BR1 2AB": (51.40, 0.01),
    "SW1A 1AA": (51.50, -0.14),
    "NR29 4RG": (52.716014, 1.614495),
}
# Verified RSC `location` object (listing 59888978), as it appears escaped inside
# a self.__next_f flight chunk in page.content().
# Each `\\"` in the Python source is a literal backslash followed by a double
# quote in the resulting string, i.e. the JSON is quoted a second time inside
# the surrounding JavaScript string. The `...` runs are placeholders
# (presumably for the elided remainder of the chunk).
_LOCATION_ESCAPED = (
    '<script>self.__next_f.push([1,"...'
    '\\"location\\":{\\"outcode\\":\\"NR29\\",'
    '\\"coordinates\\":{\\"latitude\\":52.716014,\\"longitude\\":1.614495},'
    '\\"uprn\\":\\"10023461458\\",\\"postalCode\\":\\"NR29 4RG\\",'
    '\\"propertyNumberOrName\\":\\"Martham Mill\\"}'
    '..."])</script>'
)
def test_parse_detail_geo_location_object_escaped() -> None:
    """The escaped flight-chunk `location` object parses into the complete geo dict."""
    expected = {
        "lat": 52.716014,
        "lng": 1.614495,
        "postcode": "NR29 4RG",
        "outcode": "NR29",
        "source": "detail_location",
        "uprn": "10023461458",
        "number_or_name": "Martham Mill",
        # The snippet has no `address` twin, hence no full street address.
        "full_address": None,
    }
    assert parse_detail_geo(_LOCATION_ESCAPED, search_outcode="NR29") == expected
def test_parse_detail_geo_location_object_unescaped() -> None:
    """A plain (unescaped) `location` object is parsed too."""
    page_source = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"uprn":"10023461458","postalCode":"NR29 4RG"}'
    )
    parsed = parse_detail_geo(page_source)
    assert parsed is not None
    assert (parsed["source"], parsed["postcode"]) == ("detail_location", "NR29 4RG")
def test_parse_detail_geo_address_twin() -> None:
    """A lone `address` object supplies coordinates, postcode, uprn and full address."""
    payload = (
        '"address":{"fullAddress":"Riverside, Martham NR29",'
        '"latitude":52.716014,"longitude":1.614495,'
        '"outcode":"NR29","postcode":"NR29 4RG","uprn":"10023461458"}'
    )
    parsed = parse_detail_geo(payload)
    assert parsed is not None
    assert parsed["source"] == "detail_address_obj"
    # Position and postcode, checked field by field.
    assert parsed["lat"] == 52.716014
    assert parsed["lng"] == 1.614495
    assert parsed["postcode"] == "NR29 4RG"
    assert parsed["uprn"] == "10023461458"
    assert parsed["full_address"] == "Riverside, Martham NR29"
def test_parse_detail_geo_merges_location_uprn_with_address_full_address() -> None:
    """`location` wins as the source, but the same-uprn `address` twin adds fullAddress."""
    # Real detail pages carry both wrappers: the `location` object holds the
    # uprn + house number/name, the `address` twin holds the full street
    # address. They share a uprn, so the twin's fullAddress is attached.
    page_source = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"uprn":"10023461458","postalCode":"NR29 4RG",'
        '"propertyNumberOrName":"Martham Mill"}'
        '"address":{"fullAddress":"Riverside, Martham NR29",'
        '"latitude":52.716014,"longitude":1.614495,'
        '"outcode":"NR29","postcode":"NR29 4RG","uprn":"10023461458"}'
    )
    parsed = parse_detail_geo(page_source)
    assert parsed is not None

    expected_fields = {
        "source": "detail_location",
        "uprn": "10023461458",
        "number_or_name": "Martham Mill",
        "full_address": "Riverside, Martham NR29",
    }
    for field, value in expected_fields.items():
        assert parsed[field] == value
def test_parse_detail_geo_does_not_borrow_comparable_full_address() -> None:
    """An `address` twin with a different uprn never donates its fullAddress."""
    # The only `address` twin on the page belongs to a different uprn (a
    # comparable listing). With a uprn to match on, an unrelated twin is never
    # borrowed — full_address stays None rather than grabbing the wrong street.
    own_location = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"uprn":"10023461458","postalCode":"NR29 4RG"}'
    )
    comparable_address = (
        '"address":{"fullAddress":"Some Comparable, Elsewhere EN2",'
        '"latitude":51.65,"longitude":-0.08,"uprn":"99999999"}'
    )
    parsed = parse_detail_geo(own_location + comparable_address)
    assert parsed is not None
    assert parsed["uprn"] == "10023461458"
    assert parsed["full_address"] is None
def test_parse_detail_geo_ignores_poi_coordinates() -> None:
    """Coordinates outside a `location` wrapper (a POI) are not taken for the property's."""
    # A charger POI (its coordinates NOT wrapped in a "location" object) followed
    # by the property's own "location" wrapper. Anchoring on the wrapper means
    # the POI's coordinates are ignored and the property's are returned.
    charger_poi = (
        '"name":"Martham Community Centre","numberOfConnectors":2,'
        '"postcode":"NR29 4SN","coordinates":{"latitude":52.699379,"longitude":1.62921}'
    )
    listing_location = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"uprn":"10023461458","postalCode":"NR29 4RG"}'
    )
    parsed = parse_detail_geo(charger_poi + listing_location)
    assert parsed is not None
    assert parsed["source"] == "detail_location"
    # The property's coords win, not the community centre's.
    assert parsed["lat"] == 52.716014
    assert parsed["lng"] == 1.614495
    assert parsed["postcode"] == "NR29 4RG"
def test_parse_detail_geo_prefers_location_matching_search_outcode() -> None:
    """The search outcode picks among several `location` objects; otherwise the first wins."""
    # Page embeds two location objects (e.g. a comparable then the property).
    # With a search outcode, the one in that outcode is preferred; without one,
    # the first (document order = primary listing) is returned.
    comparable = (
        '"location":{"outcode":"EN2",'
        '"coordinates":{"latitude":51.65,"longitude":-0.08},'
        '"postalCode":"EN2 6SN"}'
    )
    target = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"postalCode":"NR29 4RG"}'
    )
    page_source = comparable + target

    with_outcode = parse_detail_geo(page_source, search_outcode="NR29")
    assert with_outcode is not None
    assert with_outcode["postcode"] == "NR29 4RG"

    without_outcode = parse_detail_geo(page_source)
    assert without_outcode is not None
    assert without_outcode["postcode"] == "EN2 6SN"
def test_parse_detail_geo_rejects_out_of_england() -> None:
    """A `location` object at latitude 10, longitude 10 yields no result at all."""
    far_away_location = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":10.0,"longitude":10.0},'
        '"uprn":"1","postalCode":"NR29 4RG"}'
    )
    parsed = parse_detail_geo(far_away_location)
    assert parsed is None
def test_parse_detail_geo_drops_inconsistent_postcode() -> None:
    """A postalCode contradicting the object's outcode is dropped; coordinates are kept."""
    # postalCode outcode (AB12) disagrees with the object's own outcode (NR29):
    # keep the coordinates, drop the untrustworthy postcode.
    contradictory_location = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"uprn":"1","postalCode":"AB12 3CD"}'
    )
    parsed = parse_detail_geo(contradictory_location)
    assert parsed is not None
    latitude, postcode = parsed["lat"], parsed["postcode"]
    assert latitude == 52.716014
    assert postcode is None
def test_parse_detail_geo_returns_none_for_garbage() -> None:
    """Input without a property location/address wrapper parses to None."""
    unusable_inputs = (
        "<html><body>no data here</body></html>",
        "",
        # Coordinates that are not inside a property location/address wrapper
        # (e.g. only an unwrapped POI) yield nothing — safe degradation to the
        # outcode.
        '"name":"X","coordinates":{"latitude":51.5,"longitude":-0.1}',
    )
    for page_source in unusable_inputs:
        assert parse_detail_geo(page_source) is None
def _raw(**overrides) -> dict:
    """Return a baseline raw listing dict; keyword arguments replace or add fields."""
    defaults = {
        "id": "123",
        "url": "/for-sale/details/123/",
        "address": "South Street, Bromley BR1",
        "price": 500000,
        "beds": 2,
        "baths": 1,
        "property_type": "Flat",
    }
    # Later entries win, so an override shadows the default of the same name.
    return {**defaults, **overrides}
def test_transform_uses_detail_coordinates_with_agreeing_postcode() -> None:
    """A detail postcode in the same outcode as the nearest postcode is trusted."""
    detail_geo = {"lat": 51.401, "lng": 0.011, "postcode": "BR1 3CD", "outcode": "BR1"}
    row = transform_property(
        _raw(),
        StubPostcodeIndex("BR1 2AB"),
        PC_COORDS,
        search_outcode="BR1",
        detail=detail_geo,
    )
    assert row is not None
    # Extracted detail postcode agrees with the coordinate-nearest outcode -> trusted.
    assert row["Postcode"] == "BR1 3CD"
    assert row["Postcode source"] == "detail_address"
    assert row["Inferred postcode"] == "BR1 2AB"
    assert row["lat"] == 51.401
    assert row["lon"] == 0.011
def test_transform_uses_nearest_when_detail_postcode_mismatches() -> None:
    """A detail postcode from another outcode loses to the coordinate-nearest postcode."""
    detail_geo = {"lat": 51.401, "lng": 0.011, "postcode": "E14 9SS", "outcode": "E14"}
    row = transform_property(
        _raw(),
        StubPostcodeIndex("BR1 2AB"),
        PC_COORDS,
        search_outcode="BR1",
        detail=detail_geo,
    )
    assert row is not None
    # Mismatching detail postcode is rejected in favour of the spatial value.
    chosen = (row["Postcode"], row["Postcode source"])
    assert chosen == ("BR1 2AB", "detail_coordinates")
def test_transform_geocodes_detail_postcode_without_coordinates() -> None:
    """With no detail coordinates, the detail postcode is located via PC_COORDS."""
    detail_geo = {"lat": None, "lng": None, "postcode": "SW1A 1AA", "outcode": "SW1A"}
    row = transform_property(
        _raw(),
        StubPostcodeIndex(),
        PC_COORDS,
        search_outcode="BR1",
        detail=detail_geo,
    )
    assert row is not None
    assert row["Postcode"] == "SW1A 1AA"
    assert row["Postcode source"] == "detail_address"
    position = (row["lat"], row["lon"])
    assert position == PC_COORDS["SW1A 1AA"]
def test_transform_without_detail_falls_back_to_search_outcode() -> None:
    """No detail and no outcode in the address: the search outcode decides the postcode."""
    # No detail, address has no recognizable outcode -> coarse search-outcode centroid.
    listing = _raw(address="A street with no postcode")
    row = transform_property(
        listing,
        StubPostcodeIndex(),
        PC_COORDS,
        search_outcode="BR1",
        detail=None,
    )
    assert row is not None
    assert row["Postcode"] == "BR1 2AB"
    assert row["Postcode source"] == "search_outcode"
    # No detail page -> no UPRN / house number recovered.
    for column in ("UPRN", "Property number or name"):
        assert row[column] is None
def test_transform_emits_uprn_and_house_numbered_address_from_detail() -> None:
    """UPRN, house number and full address from the detail dict reach the output row."""
    detail_geo = {
        "lat": 51.401,
        "lng": 0.011,
        "postcode": "BR1 3CD",
        "outcode": "BR1",
        "uprn": "100023461458",
        "number_or_name": "12",
        "full_address": "South Street, Bromley BR1",
    }
    row = transform_property(
        _raw(),
        StubPostcodeIndex("BR1 2AB"),
        PC_COORDS,
        search_outcode="BR1",
        detail=detail_geo,
    )
    assert row is not None

    expected_columns = {
        "UPRN": "100023461458",
        "Property number or name": "12",
        # The detail full address replaces the outcode-level card address, and
        # the house number is prepended for a near-exact Property Register match.
        "Listing raw address": "South Street, Bromley BR1",
        "Address per Property Register": "12, South Street, Bromley",
    }
    for column, value in expected_columns.items():
        assert row[column] == value
def test_transform_ignores_out_of_england_detail_coords() -> None:
    """Detail coordinates at (10, 10) are not used; the address outcode decides instead."""
    bogus_detail = {"lat": 10.0, "lng": 10.0, "postcode": "ZZ9 9ZZ", "outcode": "ZZ9"}
    row = transform_property(
        _raw(),
        StubPostcodeIndex("BR1 2AB"),
        PC_COORDS,
        search_outcode="BR1",
        detail=bogus_detail,
    )
    assert row is not None
    # Bad detail coords are discarded; falls through to the address outcode (BR1).
    assert row["Postcode source"] == "address_outcode"
    latitude = row["lat"]
    assert 49 <= latitude <= 56