scraping and data
This commit is contained in:
parent
d98819b569
commit
8688b7475e
43 changed files with 4920 additions and 531 deletions
288
finder/test_zoopla.py
Normal file
288
finder/test_zoopla.py
Normal file
|
|
@ -0,0 +1,288 @@
|
|||
from zoopla import _detail_cache_key, parse_detail_geo, transform_property
|
||||
|
||||
|
||||
def test_detail_cache_key_uses_listing_id() -> None:
    """Relative and absolute detail URLs both key on the numeric listing id."""
    for detail_url in (
        "/for-sale/details/59888978/",
        "https://www.zoopla.co.uk/for-sale/details/59888978/",
    ):
        assert _detail_cache_key(detail_url) == "59888978"

    # A URL carrying no listing id is expected to serve as its own key.
    id_less_url = "/for-sale/property/br1/"
    assert _detail_cache_key(id_less_url) == id_less_url
|
||||
|
||||
|
||||
class StubPostcodeIndex:
    """Test double for the postcode spatial index.

    Every ``nearest`` lookup answers with the single postcode supplied at
    construction time, whatever coordinates are asked about.
    """

    def __init__(self, postcode: str = "BR1 2AB") -> None:
        # The canned answer handed back by every lookup.
        self._postcode = postcode

    def nearest(self, lat: float, lng: float) -> str:
        """Return the fixed postcode; ``lat`` and ``lng`` are ignored."""
        return self._postcode
|
||||
|
||||
|
||||
# Postcode -> (latitude, longitude) lookup handed to transform_property.
# Two London-area entries, plus NR29 4RG: the Norfolk postcode that appears in
# the detail-page location snippet and sits well inside the England bounds
# check.
PC_COORDS = {
    "BR1 2AB": (51.40, 0.01),
    "SW1A 1AA": (51.50, -0.14),
    "NR29 4RG": (52.716014, 1.614495),
}
|
||||
|
||||
# Verified RSC `location` object (listing 59888978), as it appears escaped inside
|
||||
# a self.__next_f flight chunk in page.content().
|
||||
_LOCATION_ESCAPED = (
|
||||
'<script>self.__next_f.push([1,"...'
|
||||
'\\"location\\":{\\"outcode\\":\\"NR29\\",'
|
||||
'\\"coordinates\\":{\\"latitude\\":52.716014,\\"longitude\\":1.614495},'
|
||||
'\\"uprn\\":\\"10023461458\\",\\"postalCode\\":\\"NR29 4RG\\",'
|
||||
'\\"propertyNumberOrName\\":\\"Martham Mill\\"}'
|
||||
'..."])</script>'
|
||||
)
|
||||
|
||||
|
||||
def test_parse_detail_geo_location_object_escaped() -> None:
    """The backslash-escaped `location` object yields every geo field."""
    expected = {
        "lat": 52.716014,
        "lng": 1.614495,
        "postcode": "NR29 4RG",
        "outcode": "NR29",
        "source": "detail_location",
        "uprn": "10023461458",
        "number_or_name": "Martham Mill",
        # The snippet has no `address` twin, hence no full street address.
        "full_address": None,
    }
    assert parse_detail_geo(_LOCATION_ESCAPED, search_outcode="NR29") == expected
|
||||
|
||||
|
||||
def test_parse_detail_geo_location_object_unescaped() -> None:
    """A plain (unescaped) `location` object is recognised as well."""
    payload = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"uprn":"10023461458","postalCode":"NR29 4RG"}'
    )
    parsed = parse_detail_geo(payload)
    assert parsed is not None
    for field, wanted in (("source", "detail_location"), ("postcode", "NR29 4RG")):
        assert parsed[field] == wanted
|
||||
|
||||
|
||||
def test_parse_detail_geo_address_twin() -> None:
    """An `address` object on its own is parsed, full street address included."""
    payload = (
        '"address":{"fullAddress":"Riverside, Martham NR29",'
        '"latitude":52.716014,"longitude":1.614495,'
        '"outcode":"NR29","postcode":"NR29 4RG","uprn":"10023461458"}'
    )
    parsed = parse_detail_geo(payload)
    assert parsed is not None

    expected_fields = {
        "source": "detail_address_obj",
        "lat": 52.716014,
        "lng": 1.614495,
        "postcode": "NR29 4RG",
        "uprn": "10023461458",
        "full_address": "Riverside, Martham NR29",
    }
    for field, wanted in expected_fields.items():
        assert parsed[field] == wanted
|
||||
|
||||
|
||||
def test_parse_detail_geo_merges_location_uprn_with_address_full_address() -> None:
    """A same-uprn `address` twin lends its fullAddress to the `location` hit."""
    # Both wrappers are present: `location` carries the uprn and the house
    # number/name, `address` carries the full street address. Their uprns are
    # equal, so the result is expected to combine the two.
    location_obj = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"uprn":"10023461458","postalCode":"NR29 4RG",'
        '"propertyNumberOrName":"Martham Mill"}'
    )
    address_twin = (
        '"address":{"fullAddress":"Riverside, Martham NR29",'
        '"latitude":52.716014,"longitude":1.614495,'
        '"outcode":"NR29","postcode":"NR29 4RG","uprn":"10023461458"}'
    )
    parsed = parse_detail_geo(location_obj + address_twin)
    assert parsed is not None

    expected_fields = {
        "source": "detail_location",
        "uprn": "10023461458",
        "number_or_name": "Martham Mill",
        "full_address": "Riverside, Martham NR29",
    }
    for field, wanted in expected_fields.items():
        assert parsed[field] == wanted
|
||||
|
||||
|
||||
def test_parse_detail_geo_does_not_borrow_comparable_full_address() -> None:
    """An `address` twin with a different uprn never supplies full_address."""
    # The page's only `address` object has uprn 99999999 (a comparable
    # listing), not the property's 10023461458. Because there is a uprn to
    # match on, full_address must stay None instead of taking the wrong street.
    own_location = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"uprn":"10023461458","postalCode":"NR29 4RG"}'
    )
    foreign_address = (
        '"address":{"fullAddress":"Some Comparable, Elsewhere EN2",'
        '"latitude":51.65,"longitude":-0.08,"uprn":"99999999"}'
    )
    parsed = parse_detail_geo(own_location + foreign_address)
    assert parsed is not None
    assert parsed["uprn"] == "10023461458"
    assert parsed["full_address"] is None
|
||||
|
||||
|
||||
def test_parse_detail_geo_ignores_poi_coordinates() -> None:
    """Bare POI coordinates ahead of the property's wrapper are skipped."""
    # A charger point of interest whose coordinates are NOT inside a
    # "location" object ...
    charger_poi = (
        '"name":"Martham Community Centre","numberOfConnectors":2,'
        '"postcode":"NR29 4SN","coordinates":{"latitude":52.699379,"longitude":1.62921}'
    )
    # ... followed by the property's own "location" wrapper.
    property_location = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"uprn":"10023461458","postalCode":"NR29 4RG"}'
    )
    parsed = parse_detail_geo(charger_poi + property_location)
    assert parsed is not None
    assert parsed["source"] == "detail_location"
    # The property's coordinates are returned, not the community centre's.
    property_coords = (52.716014, 1.614495)
    assert (parsed["lat"], parsed["lng"]) == property_coords
    assert parsed["postcode"] == "NR29 4RG"
|
||||
|
||||
|
||||
def test_parse_detail_geo_prefers_location_matching_search_outcode() -> None:
    """With two `location` objects, the search outcode decides which one wins."""
    # The page holds an EN2 object followed by an NR29 object. Given
    # search_outcode="NR29" the NR29 object is expected; given no search
    # outcode, the first object in document order is expected.
    en2_location = (
        '"location":{"outcode":"EN2",'
        '"coordinates":{"latitude":51.65,"longitude":-0.08},'
        '"postalCode":"EN2 6SN"}'
    )
    nr29_location = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"postalCode":"NR29 4RG"}'
    )
    page = en2_location + nr29_location

    by_outcode = parse_detail_geo(page, search_outcode="NR29")
    assert by_outcode is not None
    assert by_outcode["postcode"] == "NR29 4RG"

    by_position = parse_detail_geo(page)
    assert by_position is not None
    assert by_position["postcode"] == "EN2 6SN"
|
||||
|
||||
|
||||
def test_parse_detail_geo_rejects_out_of_england() -> None:
    """A `location` object at latitude 10.0 / longitude 10.0 yields no result."""
    far_away_location = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":10.0,"longitude":10.0},'
        '"uprn":"1","postalCode":"NR29 4RG"}'
    )
    outcome = parse_detail_geo(far_away_location)
    assert outcome is None
|
||||
|
||||
|
||||
def test_parse_detail_geo_drops_inconsistent_postcode() -> None:
    """A postalCode contradicting the object's outcode is dropped; coords stay."""
    # The postalCode's outcode (AB12) does not match the object's own outcode
    # (NR29), so the postcode is not trusted while the coordinates still are.
    conflicting_location = (
        '"location":{"outcode":"NR29",'
        '"coordinates":{"latitude":52.716014,"longitude":1.614495},'
        '"uprn":"1","postalCode":"AB12 3CD"}'
    )
    parsed = parse_detail_geo(conflicting_location)
    assert parsed is not None
    assert parsed["lat"] == 52.716014
    assert parsed["postcode"] is None
|
||||
|
||||
|
||||
def test_parse_detail_geo_returns_none_for_garbage() -> None:
    """Pages without a property location/address wrapper produce None."""
    unusable_pages = (
        "<html><body>no data here</body></html>",
        "",
        # Coordinates that sit outside any property location/address wrapper
        # (here just an unwrapped POI) give nothing, so the caller degrades
        # safely to the outcode.
        '"name":"X","coordinates":{"latitude":51.5,"longitude":-0.1}',
    )
    for page in unusable_pages:
        assert parse_detail_geo(page) is None
|
||||
|
||||
|
||||
def _raw(**overrides) -> dict:
|
||||
raw = {
|
||||
"id": "123",
|
||||
"url": "/for-sale/details/123/",
|
||||
"address": "South Street, Bromley BR1",
|
||||
"price": 500000,
|
||||
"beds": 2,
|
||||
"baths": 1,
|
||||
"property_type": "Flat",
|
||||
}
|
||||
raw.update(overrides)
|
||||
return raw
|
||||
|
||||
|
||||
def test_transform_uses_detail_coordinates_with_agreeing_postcode() -> None:
    """A detail postcode in the coordinate-nearest outcode is kept as-is."""
    detail_geo = {
        "lat": 51.401,
        "lng": 0.011,
        "postcode": "BR1 3CD",
        "outcode": "BR1",
    }
    row = transform_property(
        _raw(),
        StubPostcodeIndex("BR1 2AB"),
        PC_COORDS,
        search_outcode="BR1",
        detail=detail_geo,
    )
    assert row is not None

    # The detail postcode (BR1 3CD) shares its outcode with the nearest
    # postcode from the index (BR1 2AB), so it is trusted; the nearest one is
    # still reported as the inferred postcode.
    expected_fields = {
        "Postcode": "BR1 3CD",
        "Postcode source": "detail_address",
        "Inferred postcode": "BR1 2AB",
    }
    for column, wanted in expected_fields.items():
        assert row[column] == wanted
    assert (row["lat"], row["lon"]) == (51.401, 0.011)
|
||||
|
||||
|
||||
def test_transform_uses_nearest_when_detail_postcode_mismatches() -> None:
    """A detail postcode in a different outcode loses to the spatial lookup."""
    detail_geo = {
        "lat": 51.401,
        "lng": 0.011,
        "postcode": "E14 9SS",
        "outcode": "E14",
    }
    row = transform_property(
        _raw(),
        StubPostcodeIndex("BR1 2AB"),
        PC_COORDS,
        search_outcode="BR1",
        detail=detail_geo,
    )
    assert row is not None
    # The E14 postcode disagrees with the index's BR1 answer, so the spatial
    # value is used and the source records that it came from the coordinates.
    assert row["Postcode"] == "BR1 2AB"
    assert row["Postcode source"] == "detail_coordinates"
|
||||
|
||||
|
||||
def test_transform_geocodes_detail_postcode_without_coordinates() -> None:
    """With no detail coordinates, lat/lon come from the postcode lookup table."""
    detail_geo = {
        "lat": None,
        "lng": None,
        "postcode": "SW1A 1AA",
        "outcode": "SW1A",
    }
    row = transform_property(
        _raw(),
        StubPostcodeIndex(),
        PC_COORDS,
        search_outcode="BR1",
        detail=detail_geo,
    )
    assert row is not None
    assert row["Postcode"] == "SW1A 1AA"
    assert row["Postcode source"] == "detail_address"
    assert (row["lat"], row["lon"]) == PC_COORDS["SW1A 1AA"]
|
||||
|
||||
|
||||
def test_transform_without_detail_falls_back_to_search_outcode() -> None:
    """No detail and no outcode in the address: the search outcode is used."""
    # The card address contains nothing that looks like an outcode and there
    # is no detail data, leaving only the coarse search-outcode centroid.
    listing = _raw(address="A street with no postcode")
    row = transform_property(
        listing,
        StubPostcodeIndex(),
        PC_COORDS,
        search_outcode="BR1",
        detail=None,
    )
    assert row is not None
    assert row["Postcode"] == "BR1 2AB"
    assert row["Postcode source"] == "search_outcode"
    # Without a detail page neither a UPRN nor a house number is recovered.
    for column in ("UPRN", "Property number or name"):
        assert row[column] is None
|
||||
|
||||
|
||||
def test_transform_emits_uprn_and_house_numbered_address_from_detail() -> None:
    """Detail uprn, house number and full address all reach the output row."""
    detail_geo = {
        "lat": 51.401,
        "lng": 0.011,
        "postcode": "BR1 3CD",
        "outcode": "BR1",
        "uprn": "100023461458",
        "number_or_name": "12",
        "full_address": "South Street, Bromley BR1",
    }
    row = transform_property(
        _raw(),
        StubPostcodeIndex("BR1 2AB"),
        PC_COORDS,
        search_outcode="BR1",
        detail=detail_geo,
    )
    assert row is not None

    expected_fields = {
        "UPRN": "100023461458",
        "Property number or name": "12",
        # The detail full address takes the place of the outcode-level card
        # address, and the house number is put in front of it so the Property
        # Register match is near-exact.
        "Listing raw address": "South Street, Bromley BR1",
        "Address per Property Register": "12, South Street, Bromley",
    }
    for column, wanted in expected_fields.items():
        assert row[column] == wanted
|
||||
|
||||
|
||||
def test_transform_ignores_out_of_england_detail_coords() -> None:
    """Detail coordinates of (10.0, 10.0) are not used for the output row."""
    detail_geo = {
        "lat": 10.0,
        "lng": 10.0,
        "postcode": "ZZ9 9ZZ",
        "outcode": "ZZ9",
    }
    row = transform_property(
        _raw(),
        StubPostcodeIndex("BR1 2AB"),
        PC_COORDS,
        search_outcode="BR1",
        detail=detail_geo,
    )
    assert row is not None
    # The unusable detail coordinates are thrown away, so resolution falls
    # through to the outcode found in the card address (BR1).
    assert row["Postcode source"] == "address_outcode"
    latitude = row["lat"]
    assert 49 <= latitude <= 56
|
||||
Loading…
Add table
Add a link
Reference in a new issue