# perfect-postcode/finder/test_onthemarket.py
#
# 206 lines
# 7.4 KiB
# Python

"""Tests for the OnTheMarket scraper's detail-page postcode recovery.
`parse_detail_postcode` is pure (takes the detail-page HTML, returns a postcode
or None), so these tests use a trimmed but faithful copy of a real OnTheMarket
detail page's `__NEXT_DATA__` payload. The fixture mirrors the live structure:
the property's own postcode lives in the analytics dataLayer
(`props.initialReduxState.metadata.dataLayer.postcode`) while the agent's office
postcode sits separately under `…property.agent.postcode` — the trap we must not
fall into.
"""
import json
import onthemarket
from onthemarket import parse_detail_postcode, transform_property
class _StubIndex:
"""Minimal stand-in for PostcodeSpatialIndex returning a fixed postcode."""
def __init__(self, postcode: str | None):
self._postcode = postcode
def nearest(self, lat: float, lng: float) -> str | None:
return self._postcode
def _detail_html(
    *,
    property_id: int = 19522441,
    datalayer_postcode: str = "SE5 9AA",
    agent_postcode: str = "SE5 8RS",
) -> str:
    """Render a minimal detail page whose ``__NEXT_DATA__`` script holds the fixture.

    Keyword-only arguments:
        property_id: Value stored under the dataLayer's ``property-id`` key.
        datalayer_postcode: Value stored under the dataLayer's ``postcode``
            key (the property's own postcode).
        agent_postcode: Value stored under ``property.agent.postcode`` (the
            agent's office postcode).

    Returns:
        An HTML string containing a single ``<script id="__NEXT_DATA__">``
        element whose body is the JSON-encoded payload.
    """
    # Analytics block: this is where the property's own unit postcode lives.
    data_layer = {
        "page-type": "details-section",
        "property-type": "homes",
        "postcode": datalayer_postcode,
        "property-id": property_id,
        "price": "275,000",
        "addressline_2": "Padfield Road",
    }
    # Agent block: carries the AGENT'S office postcode — the trap that
    # parse_detail_postcode must not return.
    agent = {
        "address": "29 Denmark Hill, Camberwell\nLondon\nSE5 8RS",
        "postcode": agent_postcode,
    }
    listing = {
        "displayAddress": "Padfield Road, London, SE5",
        "location": {"lon": -0.100233, "lat": 51.466129},
        "agent": agent,
    }
    redux_state = {"metadata": {"dataLayer": data_layer}, "property": listing}
    encoded = json.dumps({"props": {"initialReduxState": redux_state}})
    script_open = '<script id="__NEXT_DATA__" type="application/json">'
    return "".join(("<html><body>", script_open, encoded, "</script></body></html>"))
# ---------------------------------------------------------------------------
# parse_detail_postcode
# ---------------------------------------------------------------------------
def test_parse_returns_property_postcode_not_agent():
    """The dataLayer postcode is returned, not the agent's office postcode."""
    page = _detail_html(datalayer_postcode="SE5 9AA", agent_postcode="SE5 8RS")
    recovered = parse_detail_postcode(page, "19522441")
    assert recovered == "SE5 9AA"
def test_parse_normalizes_spacing():
    """A lower-case, unspaced dataLayer postcode comes back as ``SE5 9AA``."""
    page = _detail_html(datalayer_postcode="se59aa")
    recovered = parse_detail_postcode(page, "19522441")
    assert recovered == "SE5 9AA"
def test_parse_ignores_mismatched_property_id():
    """A postcode tagged with another property's id must not be returned."""
    # The page describes property 19522441, but we ask about a different
    # listing id, so the parser has to refuse the dataLayer postcode.
    page = _detail_html(property_id=19522441)
    recovered = parse_detail_postcode(page, "99999999")
    assert recovered is None
def test_parse_accepts_when_no_listing_id_given():
    """With ``None`` as the listing id the dataLayer postcode is still returned."""
    page = _detail_html(datalayer_postcode="SE5 9AA")
    recovered = parse_detail_postcode(page, None)
    assert recovered == "SE5 9AA"
def test_parse_handles_missing_postcode():
    """An empty dataLayer postcode yields ``None``."""
    page = _detail_html(datalayer_postcode="")
    recovered = parse_detail_postcode(page, "19522441")
    assert recovered is None
def test_parse_handles_no_next_data():
    """HTML without a ``__NEXT_DATA__`` script yields ``None``."""
    page = "<html><body>no script here</body></html>"
    recovered = parse_detail_postcode(page, "1")
    assert recovered is None
def test_parse_handles_empty_html():
    """An empty document yields ``None``."""
    recovered = parse_detail_postcode("", "1")
    assert recovered is None
def test_parse_handles_malformed_json():
    """A ``__NEXT_DATA__`` script whose body is not valid JSON yields ``None``."""
    broken_page = '<script id="__NEXT_DATA__" type="application/json">{not json}</script>'
    recovered = parse_detail_postcode(broken_page, "1")
    assert recovered is None
def test_parse_handles_missing_datalayer():
    """A payload whose ``metadata`` has no ``dataLayer`` key yields ``None``."""
    payload = json.dumps({"props": {"initialReduxState": {"metadata": {}}}})
    page = '<script id="__NEXT_DATA__" type="application/json">' + payload + "</script>"
    recovered = parse_detail_postcode(page, "1")
    assert recovered is None
# ---------------------------------------------------------------------------
# transform_property — detail postcode wiring + trust rule
# ---------------------------------------------------------------------------
# Raw listing dict fed to transform_property by the tests below. The address
# ends in the bare outcode "SE5" (no full postcode), and the id matches the
# default property_id used by _detail_html.
_RAW_LISTING = {
    "id": "19522441",
    "address": "Padfield Road, London, SE5",
    "location": {"lon": -0.100233, "lat": 51.466129},
    "bedrooms": 2,
    "bathrooms": 1,
    "price": "£275,000",
    "humanised-property-type": "Apartment",
    "features": ["Tenure: Leasehold (99 years remaining)"],
    "details-url": "/details/19522441/",
}
def test_transform_uses_trusted_detail_postcode():
    """A detail postcode sharing the coordinate outcode is kept and labelled."""
    # Detail postcode SE5 9AA vs coordinate-nearest SE5 1AA: the outcodes match,
    # so the (more precise) detail postcode is trusted as detail_address.
    coordinate_index = _StubIndex("SE5 1AA")
    row = transform_property(_RAW_LISTING, coordinate_index, detail_postcode="SE5 9AA")
    assert row is not None
    assert row["Postcode"] == "SE5 9AA"
    assert row["Postcode source"] == "detail_address"
def test_transform_rejects_detail_postcode_on_outcode_mismatch():
    """A detail postcode in a different outcode is discarded for the coordinate one."""
    # Detail postcode SW9 6BZ vs coordinate-nearest SE5 1AA: the outcodes
    # differ, so the detail postcode is rejected and coordinate logic applies.
    coordinate_index = _StubIndex("SE5 1AA")
    row = transform_property(_RAW_LISTING, coordinate_index, detail_postcode="SW9 6BZ")
    assert row is not None
    assert row["Postcode"] == "SE5 1AA"
    assert row["Postcode source"] == "coordinates"
def test_transform_without_detail_postcode_uses_coordinates():
    """Without a detail postcode the coordinate-nearest postcode is reported."""
    row = transform_property(_RAW_LISTING, _StubIndex("SE5 1AA"), detail_postcode=None)
    assert row is not None
    assert row["Postcode"] == "SE5 1AA"
    assert row["Postcode source"] == "coordinates"
    # OnTheMarket exposes neither a UPRN nor a house number, so both stay None.
    for empty_field in ("UPRN", "Property number or name"):
        assert row[empty_field] is None
def test_transform_detail_postcode_via_search_address_outcode():
    """A full postcode in the card address keeps the ``address`` source.

    When the card address already carries a full postcode that agrees with the
    coordinates, the existing "address" source still wins absent a detail
    postcode — detail recovery never regresses that path.
    """
    raw = dict(_RAW_LISTING, address="Padfield Road, London, SE5 1AA")
    index = _StubIndex("SE5 1AA")
    out = transform_property(raw, index, detail_postcode=None)
    # Guard first, as the other transform tests do, so a dropped listing fails
    # with a clear assertion rather than a TypeError on subscripting None.
    assert out is not None
    assert out["Postcode"] == "SE5 1AA"
    assert out["Postcode source"] == "address"
# ---------------------------------------------------------------------------
# _fetch_detail_postcode caching (no real network)
# ---------------------------------------------------------------------------
def test_fetch_detail_postcode_is_cached(monkeypatch):
    """A cached listing id is answered from the cache without using the client.

    The cache is seeded by hand and the client's only method raises, so any
    attempt to fetch fails the test. ``monkeypatch`` is not needed by the body;
    the parameter is kept so the test's signature stays as it was.
    """
    onthemarket._detail_postcode_cache.clear()
    onthemarket._detail_postcode_cache["19522441"] = "SE5 9AA"

    def _boom(*args, **kwargs):  # pragma: no cover - must never be called
        raise AssertionError("network was hit despite a cached value")

    # Any httpx use would explode; the cache hit must short-circuit first.
    exploding_client = type("C", (), {"get": _boom})()
    try:
        result = onthemarket._fetch_detail_postcode(
            client=exploding_client,
            details_url="/details/19522441/",
            listing_id="19522441",
        )
        assert result == "SE5 9AA"
    finally:
        # Always drop the seeded entry, even on failure, so cached state
        # cannot leak into other tests.
        onthemarket._detail_postcode_cache.clear()