import csv
import io
import zipfile
from datetime import date
from pathlib import Path

import polars as pl

from pipeline.transform.join_epc_pp import (
    EPC_SOURCE_COLUMNS,
    _join_address_parts,
    _run,
    _scan_epc_certificates,
    epc_band_to_year,
    flag_price_outliers,
)


# ---------------------------------------------------------------------------
# Fixture helpers
# ---------------------------------------------------------------------------


def _write_csv(path: Path, fieldnames: list[str], rows: list[dict[str, str]]) -> None:
    """Write ``rows`` to ``path`` as a CSV file preceded by a header row."""
    # Explicit encoding so the fixture does not depend on the platform locale.
    with path.open("w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def _row(**overrides: str) -> dict[str, str]:
    """Return the default EPC certificate row, with ``overrides`` applied.

    The defaults describe "1 Example Street" with a deliberately padded,
    lower-case postcode and lower-case energy ratings.
    """
    row = {
        "address": "1 Example Street",
        "address1": "1 Example Street",
        "address2": "Hale",
        "postcode": " aa1 1aa ",
        "uprn": "100012345678",
        "current_energy_rating": "c",
        "potential_energy_rating": "b",
        "property_type": "House",
        "built_form": "Mid-Terrace",
        "inspection_date": "2024-01-02",
        "total_floor_area": "84.5",
        "number_habitable_rooms": "5",
        "floor_height": "2.4",
        "construction_age_band": "England and Wales: 1950-1966",
        "tenure": "owner-occupied",
    }
    row.update(overrides)
    return row


def _write_epc_zip(zip_path: Path, rows: list[dict[str, str]] | None = None) -> None:
    """Write a minimal domestic zip holding one ``certificates-2024.csv``.

    Args:
        zip_path: Destination of the archive.
        rows: Certificate rows to write; defaults to the single ``_row()``.
    """
    csv_buffer = io.StringIO()
    writer = csv.DictWriter(csv_buffer, fieldnames=EPC_SOURCE_COLUMNS)
    writer.writeheader()
    writer.writerows([_row()] if rows is None else rows)
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        archive.writestr("certificates-2024.csv", csv_buffer.getvalue())


def _price_paid_frame(
    prices: list[int],
    dates: list[date],
    ppd_categories: list[str] | None = None,
    **column_overrides: list,
) -> pl.DataFrame:
    """One property ("1 Example Street, AA1 1AA") with the given sales.

    Args:
        prices: Sale prices, one per transaction.
        dates: Transfer dates, parallel to ``prices``.
        ppd_categories: Per-sale category; defaults to "A" for every sale.
        **column_overrides: Replacement values for any other column (for
            example ``saon=[""]`` or ``old_new=["Y", "Y"]``), each a list
            parallel to ``prices``.

    Raises:
        TypeError: If an override names a column this frame does not have.
    """
    count = len(prices)
    columns: dict[str, list] = {
        "price": prices,
        "date_of_transfer": dates,
        "property_type": ["T"] * count,
        "postcode": ["AA1 1AA"] * count,
        "paon": ["1"] * count,
        "saon": [None] * count,
        "street": ["Example Street"] * count,
        "locality": [None] * count,
        "town_city": ["Exampletown"] * count,
        "duration": ["F"] * count,
        "old_new": ["N"] * count,
        "ppd_category": ppd_categories or ["A"] * count,
    }
    unknown = sorted(set(column_overrides) - set(columns))
    if unknown:
        # A misspelt override would otherwise silently add a stray column.
        raise TypeError(f"unknown price-paid column(s): {', '.join(unknown)}")
    columns.update(column_overrides)
    return pl.DataFrame(columns)


def _run_join(
    tmp_path: Path,
    price_paid: pl.DataFrame,
    epc_rows: list[dict[str, str]] | None = None,
) -> pl.DataFrame:
    """Run ``_run`` on fixture files under ``tmp_path`` and read its output.

    Args:
        tmp_path: Scratch directory for the zip, the parquet input and the
            parquet output; also passed to ``_run`` as its fourth argument.
        price_paid: Price-paid rows, written to ``price-paid.parquet``.
        epc_rows: Certificate rows for the zip; defaults to one ``_row()``.

    Returns:
        The frame ``_run`` wrote to ``epc-pp.parquet``.
    """
    zip_path = tmp_path / "domestic-csv.zip"
    _write_epc_zip(zip_path, epc_rows)
    price_paid_path = tmp_path / "price-paid.parquet"
    price_paid.write_parquet(price_paid_path)
    output_path = tmp_path / "epc-pp.parquet"
    _run(zip_path, price_paid_path, output_path, tmp_path)
    return pl.read_parquet(output_path)


def _slim_sales(rows: list[tuple[str, date, int]]) -> pl.DataFrame:
    """Build a ``flag_price_outliers`` input from (address, date, price) rows.

    Every row shares the group postcode "AA11AA".
    """
    return pl.DataFrame(
        {
            "_pp_group_address": [address for address, _, _ in rows],
            "_pp_group_postcode": ["AA11AA"] * len(rows),
            "date_of_transfer": [transfer_date for _, transfer_date, _ in rows],
            "price": [price for _, _, price in rows],
        }
    )


# ---------------------------------------------------------------------------
# _scan_epc_certificates
# ---------------------------------------------------------------------------


def test_scan_epc_certificates_supports_legacy_uppercase_csv(tmp_path: Path):
    # A plain CSV with upper-case headers must scan into the normalised schema
    # below: postcode trimmed and upper-cased, ratings upper-cased, address1 and
    # address2 joined, and a "0" habitable-room count read as null.
    csv_path = tmp_path / "certificates.csv"
    fieldnames = [column.upper() for column in EPC_SOURCE_COLUMNS]
    row = {column.upper(): value for column, value in _row().items()}
    row["NUMBER_HABITABLE_ROOMS"] = "0"
    _write_csv(csv_path, fieldnames, [row])

    df = _scan_epc_certificates(csv_path, tmp_path).collect()

    assert df.to_dicts() == [
        {
            "epc_address": "1 Example Street",
            "epc_address_a1": "1 Example Street",
            "epc_address_a12": "1 Example Street Hale",
            "epc_postcode": "AA1 1AA",
            "uprn": "100012345678",
            "current_energy_rating": "C",
            "potential_energy_rating": "B",
            "epc_property_type": "House",
            "built_form": "Mid-Terrace",
            "inspection_date": date(2024, 1, 2),
            "total_floor_area": 84.5,
            "number_habitable_rooms": None,
            "floor_height": 2.4,
            "construction_age_band": "England and Wales: 1950-1966",
            "tenure": "owner-occupied",
        }
    ]


def test_scan_epc_certificates_supports_domestic_zip(tmp_path: Path):
    # The zip holds two certificate members (one in a nested folder) plus a
    # recommendations file; only the two certificate rows must come back.
    zip_path = tmp_path / "domestic-csv.zip"
    rows_2023 = [_row(address="2 Example Street", inspection_date="2023-03-04")]
    rows_2024 = [
        _row(
            address="3 Example Street",
            postcode="BB2 2BB",
            inspection_date="2024-05-06",
            total_floor_area="",
            tenure="Rented (social)",
        )
    ]

    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        for member_name, rows in [
            ("certificates-2023.csv", rows_2023),
            ("nested/certificates-2024.csv", rows_2024),
        ]:
            # No fixture value contains a comma, so a plain join is valid CSV.
            csv_text = [",".join(EPC_SOURCE_COLUMNS)]
            csv_text.extend(
                ",".join(row[column] for column in EPC_SOURCE_COLUMNS) for row in rows
            )
            archive.writestr(member_name, "\n".join(csv_text) + "\n")
        archive.writestr("recommendations-2024.csv", "address,postcode\nignored,X\n")

    df = _scan_epc_certificates(zip_path, tmp_path).sort("inspection_date").collect()

    assert df.select("epc_address", "epc_postcode", "total_floor_area").to_dicts() == [
        {
            "epc_address": "2 Example Street",
            "epc_postcode": "AA1 1AA",
            "total_floor_area": 84.5,
        },
        {
            "epc_address": "3 Example Street",
            "epc_postcode": "BB2 2BB",
            "total_floor_area": None,
        },
    ]
    assert df.get_column("tenure").to_list() == ["owner-occupied", "Rented (social)"]
    assert df.schema["number_habitable_rooms"] == pl.Int16


# ---------------------------------------------------------------------------
# _join_address_parts
# ---------------------------------------------------------------------------


def test_join_address_parts_empty_string_components():
    # Price-paid SAON/PAON/STREET are empty strings (not null) when absent;
    # concat_str(ignore_nulls=True) alone leaked the separator into the
    # display address (' 10 PALACE GREEN') and doubled it for empty middle
    # components. Empty/whitespace-only parts must contribute nothing.
    df = pl.DataFrame(
        {
            "saon": ["", "FLAT 1", "FLAT 1", "FLAT 21", "", None, " ", " FLAT 2"],
            "paon": ["10", "10", "", "82", "", None, "10", "11 "],
            "street": [
                "PALACE GREEN",
                "HIGH STREET",
                "HIGH STREET",
                "",
                "",
                None,
                "PALACE GREEN",
                "STATION ROAD",
            ],
        }
    )

    out = df.select(
        _join_address_parts("saon", "paon", "street").alias("address")
    ).get_column("address")

    assert out.to_list() == [
        "10 PALACE GREEN",  # empty saon -> no leading space
        "FLAT 1 10 HIGH STREET",  # normal three-part address is unchanged
        "FLAT 1 HIGH STREET",  # empty middle component -> no double space
        "FLAT 21 82",  # empty street -> no trailing space
        None,  # all-empty -> null, not whitespace junk
        None,  # all-null -> null
        "10 PALACE GREEN",  # whitespace-only component treated as empty
        "FLAT 2 11 STATION ROAD",  # per-component padding is stripped
    ]

    # Invariant: every produced address is trimmed and single-spaced.
    produced = out.drop_nulls()
    assert produced.str.starts_with(" ").sum() == 0
    assert produced.str.ends_with(" ").sum() == 0
    # Two consecutive spaces: a single-space needle would match every
    # multi-word address and make this check fail on valid output.
    assert produced.str.contains("  ", literal=True).sum() == 0


# ---------------------------------------------------------------------------
# _run (end to end)
# ---------------------------------------------------------------------------


def test_run_builds_clean_pp_address_from_empty_string_saon(tmp_path: Path):
    # Real price-paid rows carry saon == "" (not null) on ~88% of rows; the
    # published pp_address must not inherit a leading separator from it.
    df = _run_join(
        tmp_path,
        _price_paid_frame(
            prices=[250_000],
            dates=[date(2024, 2, 3)],
            saon=[""],
            locality=[""],
        ),
    )

    assert df.height == 1
    # No leading space, and the clean address still matches its EPC record.
    assert df.select("pp_address", "epc_address").to_dicts() == [
        {"pp_address": "1 Example Street", "epc_address": "1 Example Street"}
    ]


def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
    # Two certificates and two sales (one spelt "Example-Street") for the same
    # property must collapse to one row carrying the 2024 certificate's facts.
    df = _run_join(
        tmp_path,
        _price_paid_frame(
            prices=[200_000, 250_000],
            dates=[date(2020, 2, 3), date(2024, 2, 3)],
            street=["Example-Street", "Example Street"],
        ),
        epc_rows=[
            _row(
                current_energy_rating="d",
                inspection_date="2023-01-01",
                total_floor_area="80",
                tenure="Rented (social)",
            ),
            _row(
                current_energy_rating="c",
                inspection_date="2024-01-01",
                total_floor_area="85",
                tenure="owner-occupied",
            ),
        ],
    )

    assert df.height == 1
    assert df.select(
        "epc_address",
        "current_energy_rating",
        "total_floor_area",
        "construction_age_band",
        "was_council_house",
    ).to_dicts() == [
        {
            "epc_address": "1 Example Street",
            "current_energy_rating": "C",
            "total_floor_area": 85.0,
            # Band midpoint of 1950-1966, not the lower bound.
            "construction_age_band": 1958,
            "was_council_house": "Yes",
        }
    ]
    assert df.get_column("renovation_history").list.len().to_list() == [1]
    assert df.get_column("historical_prices").list.len().to_list() == [2]
    # Audit trail: the accepted fuzzy match's score is published (100 = exact
    # post-normalisation address match).
    assert df.get_column("epc_match_score").to_list() == [100]


def test_run_dedup_prefers_valid_dated_cert_over_garbled_date(tmp_path: Path):
    # Two certificates for the same property. The cert with the garbled,
    # unparseable inspection_date must NOT be chosen as "latest": a string sort
    # nulls-first would have picked it, attaching a stale rating/floor area. The
    # valid-dated cert wins, so its rating ("C") and floor area (85) survive.
    df = _run_join(
        tmp_path,
        _price_paid_frame(prices=[250_000], dates=[date(2024, 2, 3)]),
        epc_rows=[
            _row(
                current_energy_rating="c",
                inspection_date="2024-01-01",
                total_floor_area="85",
            ),
            # Same property; an unparseable date (OCR/garbled). Under a raw
            # string descending sort "not-a-date" outranks the ISO date and
            # wins the dedup, but as a null Date it loses.
            _row(
                current_energy_rating="g",
                inspection_date="not-a-date",
                total_floor_area="40",
            ),
        ],
    )

    assert df.height == 1
    # The valid-dated cert's facts are kept; the garbled-date cert is NOT chosen.
    assert df.select("current_energy_rating", "total_floor_area").to_dicts() == [
        {"current_energy_rating": "C", "total_floor_area": 85.0}
    ]


def test_run_excludes_price_paid_rows_without_full_postcode(tmp_path: Path):
    # The second sale has an empty postcode; only the fully postcoded sale may
    # reach the output.
    df = _run_join(
        tmp_path,
        _price_paid_frame(
            prices=[250_000, 300_000],
            dates=[date(2024, 2, 3), date(2024, 2, 4)],
            postcode=["AA1 1AA", ""],
            paon=["1", "2"],
        ),
    )

    assert df["postcode"].to_list() == ["AA1 1AA"]


def test_run_does_not_attach_epc_facts_to_low_score_address_match(tmp_path: Path):
    # The only certificate in the postcode is for a different road, so the sale
    # row must be published with every EPC-derived field left null.
    df = _run_join(
        tmp_path,
        _price_paid_frame(prices=[250_000], dates=[date(2024, 2, 3)]),
        epc_rows=[_row(address="1 Totally Different Road")],
    )

    assert df.height == 1
    assert df.select(
        "pp_address",
        "epc_address",
        "total_floor_area",
        "current_energy_rating",
        "epc_match_score",
    ).to_dicts() == [
        {
            "pp_address": "1 Example Street",
            "epc_address": None,
            "total_floor_area": None,
            "current_energy_rating": None,
            # No accepted match -> no score.
            "epc_match_score": None,
        }
    ]


def test_run_excludes_category_b_sales_from_price_aggregations(tmp_path: Path):
    # Category B entries (repossessions, bulk/portfolio, power-of-sale) must not
    # pollute latest_price / historical_prices, but the property still survives
    # via its standard Category A sales.
    df = _run_join(
        tmp_path,
        _price_paid_frame(
            prices=[200_000, 250_000, 5_000_000],
            dates=[date(2020, 2, 3), date(2022, 2, 3), date(2024, 2, 3)],
            # The latest (5M) sale is a Category B bulk/portfolio transfer.
            ppd_categories=["A", "A", "B"],
        ),
    )

    assert df.height == 1
    # Only the two Category A sales survive; the 5M Category B transfer is dropped.
    assert df.get_column("latest_price").to_list() == [250_000]
    assert df.get_column("historical_prices").list.len().to_list() == [2]


def test_run_new_build_keeps_early_first_transfer_when_sub_min_price(tmp_path: Path):
    # A new-build whose earliest sale is below MIN_PRICE must still take that early
    # year as its EXACT construction date, while latest_price uses only the
    # quality-passing (>=MIN_PRICE) sale.
    df = _run_join(
        tmp_path,
        _price_paid_frame(
            # 5_000 is below MIN_PRICE (10_000) — a nominal/junk transfer that
            # must still anchor the construction year but stay out of the price
            # aggregations.
            prices=[5_000, 300_000],
            dates=[date(2015, 2, 3), date(2022, 2, 3)],
            old_new=["Y", "Y"],
        ),
    )

    assert df.height == 1
    # Construction year is the genuine earliest transfer (2015), flagged EXACT,
    # even though that sale is below MIN_PRICE.
    assert df.get_column("construction_age_band").to_list() == [2015]
    assert df.get_column("is_construction_date_approximate").to_list() == [0]
    # latest_price uses only the >=MIN_PRICE sale; the sub-MIN sale is excluded.
    assert df.get_column("latest_price").to_list() == [300_000]
    assert df.get_column("historical_prices").list.len().to_list() == [1]


def test_run_keeps_sale_above_lowered_min_price(tmp_path: Path):
    # A genuine cheap sale of 30_000 sits between the OLD floor (50k) and the
    # NEW floor (10k): it must now be RETAINED in the price aggregations. This
    # pins the 50k->10k change — it fails on the pre-fix 50k floor (where 30k was
    # excluded, giving historical_prices length 1 / latest_price 250_000).
    df = _run_join(
        tmp_path,
        _price_paid_frame(
            prices=[250_000, 30_000],
            dates=[date(2018, 2, 3), date(2022, 2, 3)],
        ),
    )

    assert df.height == 1
    # Both sales now survive the 10k floor; the 30_000 (2022) is the most recent.
    assert df.get_column("historical_prices").list.len().to_list() == [2]
    assert df.get_column("latest_price").to_list() == [30_000]


def test_run_collapses_duplicate_transactions(tmp_path: Path):
    # Price-paid lodges the same completed sale twice under distinct
    # transaction ids; the duplicate must appear ONCE in historical_prices
    # rather than double-counting the sale.
    df = _run_join(
        tmp_path,
        _price_paid_frame(
            prices=[200_000, 250_000, 250_000],
            dates=[date(2020, 2, 3), date(2024, 2, 3), date(2024, 2, 3)],
        ),
    )

    assert df.height == 1
    # The duplicated 250_000 sale collapses to one entry; two distinct sales.
    assert df.get_column("historical_prices").to_list() == [
        [
            {"year": 2020, "month": 2, "price": 200_000},
            {"year": 2024, "month": 2, "price": 250_000},
        ]
    ]
    assert df.get_column("latest_price").to_list() == [250_000]


def test_run_excludes_implausible_price_jump_but_keeps_property(tmp_path: Path):
    # The 13 QUICKSETTS HR2 7PP case: £140,000 in 2016 then "£207,500,000" in
    # 2026 (clearly £207,500 with extra digits, lodged as category A). The
    # garbage sale must vanish from latest_price / historical_prices while the
    # property row itself survives on its genuine sale.
    df = _run_join(
        tmp_path,
        _price_paid_frame(
            prices=[140_000, 207_500_000],
            dates=[date(2016, 6, 1), date(2026, 6, 1)],
        ),
    )

    assert df.height == 1
    assert df.get_column("latest_price").to_list() == [140_000]
    assert df.get_column("historical_prices").to_list() == [
        [{"year": 2016, "month": 6, "price": 140_000}]
    ]


def test_run_keeps_genuine_long_horizon_appreciation(tmp_path: Path):
    # x30 over 31 years is extreme but genuine (prime-London territory); the
    # time-aware threshold (12 * 1.10**31 ≈ 230) must leave it untouched.
    df = _run_join(
        tmp_path,
        _price_paid_frame(
            prices=[20_000, 600_000],
            dates=[date(1995, 3, 1), date(2026, 3, 1)],
        ),
    )

    assert df.height == 1
    assert df.get_column("historical_prices").list.len().to_list() == [2]
    assert df.get_column("latest_price").to_list() == [600_000]


def test_run_keeps_right_to_buy_style_jump(tmp_path: Path):
    # A x12 jump on a cheap property (discounted right-to-buy purchase then an
    # open-market resale) is legitimate; the JUMP_MIN_PRICE floor keeps such
    # sales safe from the jump guard.
    df = _run_join(
        tmp_path,
        _price_paid_frame(
            prices=[15_000, 180_000],
            dates=[date(1998, 5, 1), date(2003, 5, 1)],
        ),
    )

    assert df.height == 1
    assert df.get_column("historical_prices").list.len().to_list() == [2]
    assert df.get_column("latest_price").to_list() == [180_000]


# ---------------------------------------------------------------------------
# flag_price_outliers
# ---------------------------------------------------------------------------


def test_flag_price_outliers_up_rule_flags_spike_after_normal_sale():
    # x1,482 over 10 years against a threshold of 12 * 1.10**10 ≈ 31: the
    # HIGHER sale is flagged, the genuine earlier sale is left alone.
    outliers = flag_price_outliers(
        _slim_sales(
            [
                ("13 QUICKSETTS", date(2016, 6, 1), 140_000),
                ("13 QUICKSETTS", date(2026, 6, 1), 207_500_000),
            ]
        )
    )

    assert outliers.to_dicts() == [
        {
            "_pp_group_address": "13 QUICKSETTS",
            "_pp_group_postcode": "AA11AA",
            "date_of_transfer": date(2026, 6, 1),
            "price": 207_500_000,
            "_price_outlier": True,
        }
    ]


def test_flag_price_outliers_down_rule_flags_spike_before_normal_sale():
    # The garbage sale comes FIRST, so it has no previous sale to compare
    # against; the down rule (next sale collapses to under 1/threshold of this
    # one) must catch it instead.
    outliers = flag_price_outliers(
        _slim_sales(
            [
                ("5 EXAMPLE ROAD", date(2016, 6, 1), 250_000_000),
                ("5 EXAMPLE ROAD", date(2017, 6, 1), 140_000),
            ]
        )
    )

    assert outliers.get_column("price").to_list() == [250_000_000]


def test_flag_price_outliers_min_price_floor_protects_cheap_properties():
    # x40 in under six months exceeds the relative threshold (~12.6 at the
    # half-year floor), but the flagged price (600k) is below JUMP_MIN_PRICE,
    # so nothing is flagged: the absolute floor is load-bearing here.
    outliers = flag_price_outliers(
        _slim_sales(
            [
                ("9 CHEAP STREET", date(2000, 1, 1), 15_000),
                ("9 CHEAP STREET", date(2000, 6, 1), 600_000),
            ]
        )
    )

    assert outliers.height == 0


def test_flag_price_outliers_spares_expensive_long_horizon_growth():
    # x30 over 31 years on a now-£4.5M property clears the £2M floor but stays
    # under the time-aware threshold (12 * 1.10**31 ≈ 230): not flagged.
    outliers = flag_price_outliers(
        _slim_sales(
            [
                ("1 PRIME PLACE", date(1995, 1, 1), 150_000),
                ("1 PRIME PLACE", date(2026, 1, 1), 4_500_000),
            ]
        )
    )

    assert outliers.height == 0


# ---------------------------------------------------------------------------
# epc_band_to_year
# ---------------------------------------------------------------------------


def test_epc_band_to_year_uses_midpoint_and_clamps():
    # Each input band is paired with the year (or null) it must map to.
    df = pl.DataFrame(
        {
            "b": [
                "England and Wales: 1950-1966",  # midpoint 1958
                "1900-1929",  # midpoint 1914
                "England and Wales: before 1900",  # too wide -> null
                "2012 onwards",  # single year
                "1012",  # implausible -> null
                "2202",  # implausible -> null
                None,  # null -> null
                "1958",  # already-numeric-as-string -> pass through
            ]
        }
    )

    years = df.select(epc_band_to_year(pl.col("b")).alias("y"))["y"].to_list()

    assert years == [1958, 1914, None, 2012, None, None, None, 1958]