"""Tests for the OnTheMarket scraper's detail-page postcode recovery. `parse_detail_postcode` is pure (takes the detail-page HTML, returns a postcode or None), so these tests use a trimmed but faithful copy of a real OnTheMarket detail page's `__NEXT_DATA__` payload. The fixture mirrors the live structure: the property's own postcode lives in the analytics dataLayer (`props.initialReduxState.metadata.dataLayer.postcode`) while the agent's office postcode sits separately under `…property.agent.postcode` — the trap we must not fall into. """ import json import onthemarket from onthemarket import parse_detail_postcode, transform_property class _StubIndex: """Minimal stand-in for PostcodeSpatialIndex returning a fixed postcode.""" def __init__(self, postcode: str | None): self._postcode = postcode def nearest(self, lat: float, lng: float) -> str | None: return self._postcode def _detail_html( *, property_id: int = 19522441, datalayer_postcode: str = "SE5 9AA", agent_postcode: str = "SE5 8RS", ) -> str: """Build detail-page HTML with a real-shaped __NEXT_DATA__ payload.""" next_data = { "props": { "initialReduxState": { "metadata": { "dataLayer": { "page-type": "details-section", "property-type": "homes", # The property's own unit postcode. "postcode": datalayer_postcode, "property-id": property_id, "price": "275,000", "addressline_2": "Padfield Road", } }, "property": { "displayAddress": "Padfield Road, London, SE5", "location": {"lon": -0.100233, "lat": 51.466129}, # The agent block carries the AGENT'S office postcode — the # trap. parse_detail_postcode must not return this. "agent": { "address": "29 Denmark Hill, Camberwell\nLondon\nSE5 8RS", "postcode": agent_postcode, }, }, } } } payload = json.dumps(next_data) return ( "
" '" ) # --------------------------------------------------------------------------- # parse_detail_postcode # --------------------------------------------------------------------------- def test_parse_returns_property_postcode_not_agent(): html = _detail_html(datalayer_postcode="SE5 9AA", agent_postcode="SE5 8RS") assert parse_detail_postcode(html, "19522441") == "SE5 9AA" def test_parse_normalizes_spacing(): html = _detail_html(datalayer_postcode="se59aa") assert parse_detail_postcode(html, "19522441") == "SE5 9AA" def test_parse_ignores_mismatched_property_id(): # dataLayer postcode belongs to property 19522441; asking for a different # listing id must refuse to return it. html = _detail_html(property_id=19522441) assert parse_detail_postcode(html, "99999999") is None def test_parse_accepts_when_no_listing_id_given(): html = _detail_html(datalayer_postcode="SE5 9AA") assert parse_detail_postcode(html, None) == "SE5 9AA" def test_parse_handles_missing_postcode(): html = _detail_html(datalayer_postcode="") assert parse_detail_postcode(html, "19522441") is None def test_parse_handles_no_next_data(): assert parse_detail_postcode("no script here", "1") is None def test_parse_handles_empty_html(): assert parse_detail_postcode("", "1") is None def test_parse_handles_malformed_json(): html = ( '' ) assert parse_detail_postcode(html, "1") is None def test_parse_handles_missing_datalayer(): next_data = {"props": {"initialReduxState": {"metadata": {}}}} html = ( '" ) assert parse_detail_postcode(html, "1") is None # --------------------------------------------------------------------------- # transform_property — detail postcode wiring + trust rule # --------------------------------------------------------------------------- _RAW_LISTING = { "id": "19522441", "address": "Padfield Road, London, SE5", "location": {"lon": -0.100233, "lat": 51.466129}, "bedrooms": 2, "bathrooms": 1, "price": "£275,000", "humanised-property-type": "Apartment", "features": ["Tenure: Leasehold (99 years remaining)"], "details-url": "/details/19522441/", } def test_transform_uses_trusted_detail_postcode(): # Detail postcode SE5 9AA, coordinate-nearest SE5 1AA: same outcode -> trust # the (more precise) detail postcode and label it detail_address. index = _StubIndex("SE5 1AA") out = transform_property(_RAW_LISTING, index, detail_postcode="SE5 9AA") assert out is not None assert out["Postcode"] == "SE5 9AA" assert out["Postcode source"] == "detail_address" def test_transform_rejects_detail_postcode_on_outcode_mismatch(): # Detail postcode SW9 6BZ but coordinate-nearest is SE5 1AA: different # outcode -> reject the detail postcode, fall back to coordinate logic. index = _StubIndex("SE5 1AA") out = transform_property(_RAW_LISTING, index, detail_postcode="SW9 6BZ") assert out is not None assert out["Postcode"] == "SE5 1AA" assert out["Postcode source"] == "coordinates" def test_transform_without_detail_postcode_uses_coordinates(): index = _StubIndex("SE5 1AA") out = transform_property(_RAW_LISTING, index, detail_postcode=None) assert out is not None assert out["Postcode"] == "SE5 1AA" assert out["Postcode source"] == "coordinates" # No UPRN / house number is recoverable from OnTheMarket. assert out["UPRN"] is None assert out["Property number or name"] is None def test_transform_detail_postcode_via_search_address_outcode(): # When the card address already carries a full postcode that agrees with the # coordinates, the existing "address" source still wins absent a detail # postcode — detail recovery never regresses that path. raw = dict(_RAW_LISTING, address="Padfield Road, London, SE5 1AA") index = _StubIndex("SE5 1AA") out = transform_property(raw, index, detail_postcode=None) assert out["Postcode"] == "SE5 1AA" assert out["Postcode source"] == "address" # --------------------------------------------------------------------------- # _fetch_detail_postcode caching (no real network) # --------------------------------------------------------------------------- def test_fetch_detail_postcode_is_cached(monkeypatch): onthemarket._detail_postcode_cache.clear() onthemarket._detail_postcode_cache["19522441"] = "SE5 9AA" def _boom(*args, **kwargs): # pragma: no cover - must never be called raise AssertionError("network was hit despite a cached value") # Any httpx use would explode; the cache hit must short-circuit first. result = onthemarket._fetch_detail_postcode( client=type("C", (), {"get": _boom})(), details_url="/details/19522441/", listing_id="19522441", ) assert result == "SE5 9AA" onthemarket._detail_postcode_cache.clear()