206 lines
7.4 KiB
Python
206 lines
7.4 KiB
Python
"""Tests for the OnTheMarket scraper's detail-page postcode recovery.

`parse_detail_postcode` is pure (it takes the detail-page HTML and returns a
postcode or None), so these tests use a trimmed but faithful copy of a real
OnTheMarket detail page's `__NEXT_DATA__` payload. The fixture mirrors the live
structure: the property's own postcode lives in the analytics dataLayer
(`props.initialReduxState.metadata.dataLayer.postcode`), while the agent's
office postcode sits separately under `…property.agent.postcode` — the trap we
must not fall into.
"""
|
|
|
|
import json
|
|
|
|
import onthemarket
|
|
from onthemarket import parse_detail_postcode, transform_property
|
|
|
|
|
|
class _StubIndex:
|
|
"""Minimal stand-in for PostcodeSpatialIndex returning a fixed postcode."""
|
|
|
|
def __init__(self, postcode: str | None):
|
|
self._postcode = postcode
|
|
|
|
def nearest(self, lat: float, lng: float) -> str | None:
|
|
return self._postcode
|
|
|
|
|
|
def _detail_html(
    *,
    property_id: int = 19522441,
    datalayer_postcode: str = "SE5 9AA",
    agent_postcode: str = "SE5 8RS",
) -> str:
    """Build detail-page HTML with a real-shaped __NEXT_DATA__ payload.

    Args:
        property_id: Value stored under the dataLayer's ``property-id`` key.
        datalayer_postcode: The property's own postcode, stored under
            ``props.initialReduxState.metadata.dataLayer.postcode``.
        agent_postcode: The agent's office postcode, stored under
            ``props.initialReduxState.property.agent.postcode`` and as the
            last line of the agent's free-text address.

    Returns:
        A minimal HTML document whose only content is a
        ``<script id="__NEXT_DATA__">`` element holding the JSON payload.
    """
    next_data = {
        "props": {
            "initialReduxState": {
                "metadata": {
                    "dataLayer": {
                        "page-type": "details-section",
                        "property-type": "homes",
                        # The property's own unit postcode.
                        "postcode": datalayer_postcode,
                        "property-id": property_id,
                        "price": "275,000",
                        "addressline_2": "Padfield Road",
                    }
                },
                "property": {
                    "displayAddress": "Padfield Road, London, SE5",
                    "location": {"lon": -0.100233, "lat": 51.466129},
                    # The agent block carries the AGENT'S office postcode — the
                    # trap. parse_detail_postcode must not return this.
                    "agent": {
                        # Keep the free-text address in step with the
                        # structured field so an overridden agent postcode
                        # appears in both places.
                        "address": (
                            "29 Denmark Hill, Camberwell\nLondon\n"
                            f"{agent_postcode}"
                        ),
                        "postcode": agent_postcode,
                    },
                },
            }
        }
    }
    payload = json.dumps(next_data)
    return (
        "<html><body>"
        '<script id="__NEXT_DATA__" type="application/json">'
        f"{payload}"
        "</script></body></html>"
    )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# parse_detail_postcode
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_parse_returns_property_postcode_not_agent():
    """The dataLayer postcode is returned, not the agent's office postcode."""
    property_postcode = "SE5 9AA"
    page = _detail_html(
        datalayer_postcode=property_postcode,
        agent_postcode="SE5 8RS",
    )
    recovered = parse_detail_postcode(page, "19522441")
    assert recovered == property_postcode
|
|
|
|
|
|
def test_parse_normalizes_spacing():
    """A lower-case, unspaced postcode comes back as 'SE5 9AA'."""
    page = _detail_html(datalayer_postcode="se59aa")
    recovered = parse_detail_postcode(page, "19522441")
    assert recovered == "SE5 9AA"
|
|
|
|
|
|
def test_parse_ignores_mismatched_property_id():
    """A postcode belonging to a different listing id must not be returned."""
    # The page's dataLayer describes property 19522441 ...
    page = _detail_html(property_id=19522441)
    # ... so a lookup for any other listing id has to come back empty.
    recovered = parse_detail_postcode(page, "99999999")
    assert recovered is None
|
|
|
|
|
|
def test_parse_accepts_when_no_listing_id_given():
    """Passing None as the listing id still yields the dataLayer postcode."""
    expected = "SE5 9AA"
    page = _detail_html(datalayer_postcode=expected)
    recovered = parse_detail_postcode(page, None)
    assert recovered == expected
|
|
|
|
|
|
def test_parse_handles_missing_postcode():
    """An empty dataLayer postcode is reported as None."""
    page = _detail_html(datalayer_postcode="")
    recovered = parse_detail_postcode(page, "19522441")
    assert recovered is None
|
|
|
|
|
|
def test_parse_handles_no_next_data():
    """HTML without a __NEXT_DATA__ script yields None."""
    page = "<html><body>no script here</body></html>"
    recovered = parse_detail_postcode(page, "1")
    assert recovered is None
|
|
|
|
|
|
def test_parse_handles_empty_html():
    """An empty document yields None."""
    recovered = parse_detail_postcode("", "1")
    assert recovered is None
|
|
|
|
|
|
def test_parse_handles_malformed_json():
    """A __NEXT_DATA__ script whose body is not valid JSON yields None."""
    broken_page = '<script id="__NEXT_DATA__" type="application/json">{not json}</script>'
    recovered = parse_detail_postcode(broken_page, "1")
    assert recovered is None
|
|
|
|
|
|
def test_parse_handles_missing_datalayer():
    """Valid JSON whose metadata has no dataLayer key yields None."""
    # metadata is present but empty, so there is no dataLayer to read.
    payload = json.dumps({"props": {"initialReduxState": {"metadata": {}}}})
    opening_tag = '<script id="__NEXT_DATA__" type="application/json">'
    page = opening_tag + payload + "</script>"
    recovered = parse_detail_postcode(page, "1")
    assert recovered is None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# transform_property — detail postcode wiring + trust rule
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# Raw listing dict shared by the transform_property tests below. Its address
# carries only the outcode ("SE5"), not a full postcode.
_RAW_LISTING = {
    "id": "19522441",
    "address": "Padfield Road, London, SE5",
    "location": {"lon": -0.100233, "lat": 51.466129},
    "bedrooms": 2,
    "bathrooms": 1,
    "price": "£275,000",
    "humanised-property-type": "Apartment",
    "features": ["Tenure: Leasehold (99 years remaining)"],
    "details-url": "/details/19522441/",
}
|
|
|
|
|
|
def test_transform_uses_trusted_detail_postcode():
    """A detail postcode sharing the coordinate outcode is used and labelled."""
    detail_postcode = "SE5 9AA"
    # The coordinate lookup answers SE5 1AA: same outcode as the detail
    # postcode, so the (more precise) detail postcode is trusted and the
    # source is labelled detail_address.
    nearest_lookup = _StubIndex("SE5 1AA")
    row = transform_property(
        _RAW_LISTING,
        nearest_lookup,
        detail_postcode=detail_postcode,
    )
    assert row is not None
    assert row["Postcode"] == detail_postcode
    assert row["Postcode source"] == "detail_address"
|
|
|
|
|
|
def test_transform_rejects_detail_postcode_on_outcode_mismatch():
    """A detail postcode in a different outcode is dropped for coordinates."""
    coordinate_postcode = "SE5 1AA"
    # The detail page says SW9 6BZ but the coordinates resolve to SE5 1AA: the
    # outcodes differ, so the detail postcode is rejected and the coordinate
    # logic supplies the result.
    nearest_lookup = _StubIndex(coordinate_postcode)
    row = transform_property(
        _RAW_LISTING,
        nearest_lookup,
        detail_postcode="SW9 6BZ",
    )
    assert row is not None
    assert row["Postcode"] == coordinate_postcode
    assert row["Postcode source"] == "coordinates"
|
|
|
|
|
|
def test_transform_without_detail_postcode_uses_coordinates():
    """With no detail postcode the coordinate-nearest postcode is used."""
    coordinate_postcode = "SE5 1AA"
    nearest_lookup = _StubIndex(coordinate_postcode)
    row = transform_property(_RAW_LISTING, nearest_lookup, detail_postcode=None)
    assert row is not None
    assert row["Postcode"] == coordinate_postcode
    assert row["Postcode source"] == "coordinates"
    # No UPRN / house number is recoverable from OnTheMarket.
    assert row["UPRN"] is None
    assert row["Property number or name"] is None
|
|
|
|
|
|
def test_transform_detail_postcode_via_search_address_outcode():
    """A full postcode in the card address keeps the "address" source.

    When the card address already carries a full postcode that agrees with the
    coordinates, the existing "address" source still wins absent a detail
    postcode — detail recovery never regresses that path.
    """
    # Same listing as _RAW_LISTING, but with a full postcode in the address.
    raw = dict(_RAW_LISTING, address="Padfield Road, London, SE5 1AA")
    index = _StubIndex("SE5 1AA")
    out = transform_property(raw, index, detail_postcode=None)
    # Guard first, as the sibling transform tests do: a None result then fails
    # as a clear assertion rather than a TypeError on subscripting.
    assert out is not None
    assert out["Postcode"] == "SE5 1AA"
    assert out["Postcode source"] == "address"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# _fetch_detail_postcode caching (no real network)
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_fetch_detail_postcode_is_cached(monkeypatch):
    """A listing id already in the cache is answered without using the client."""
    # Seed the cache through monkeypatch so pytest restores the previous state
    # at teardown even when the assertion below fails. This avoids leaking the
    # seeded entry into other tests and leaves unrelated entries alone.
    monkeypatch.setitem(onthemarket._detail_postcode_cache, "19522441", "SE5 9AA")

    def _boom(*args, **kwargs):  # pragma: no cover - must never be called
        raise AssertionError("network was hit despite a cached value")

    # Any httpx use would explode; the cache hit must short-circuit first.
    exploding_client = type("C", (), {"get": _boom})()
    result = onthemarket._fetch_detail_postcode(
        client=exploding_client,
        details_url="/details/19522441/",
        listing_id="19522441",
    )
    assert result == "SE5 9AA"
|