scraping and data

This commit is contained in:
Andras Schmelczer 2026-05-31 15:36:33 +01:00
parent d98819b569
commit 8688b7475e
43 changed files with 4920 additions and 531 deletions

View file

@@ -121,6 +121,50 @@ class TestWhitespacePostcodes:
loaded_df, _ = load_uprns(path)
assert len(loaded_df) == 0
def test_non_english_oas_excluded(self, tmp_path):
    """Only the row with the "E"-prefixed OA code should survive loading.

    Two UPRN rows are written to a parquet file: one tagged with
    OA21CD "E00000001" and one tagged with "W00000001". After
    ``load_uprns`` the returned offsets must be keyed solely by the
    "E" code, and the only remaining postcode must be the one that
    belonged to that row.
    """
    columns = {
        "GRIDGB1E": [500010, 300010],
        "GRIDGB1N": [180010, 220010],
        "PCDS": ["AA1 1AA", "CF1 1AA"],
        "OA21CD": ["E00000001", "W00000001"],
    }
    parquet_file = tmp_path / "uprn.parquet"
    pl.DataFrame(columns).write_parquet(parquet_file)

    result, oa_offsets = load_uprns(parquet_file)

    # The "W" output area must not appear among the offsets at all.
    assert set(oa_offsets) == {"E00000001"}
    # ...and its row must be gone from the loaded frame.
    assert result["PCDS"].to_list() == ["AA1 1AA"]
def test_terminated_postcodes_are_remapped(self, tmp_path):
    """A UPRN on a terminated postcode should come back with a live one.

    The single UPRN row carries postcode "aa1 1aa". The second parquet
    file passed to ``load_uprns`` lists "AA1 1AA" with a ``doterm`` of
    "2020-01-01" and "AA1 1AB" with no ``doterm``. The loaded frame is
    expected to report "AA1 1AB" for that UPRN.

    NOTE(review): the UPRN postcode is lower-case while the lookup table
    is upper-case, so this presumably also relies on case-insensitive
    matching inside ``load_uprns`` -- confirm against the loader.
    """
    uprn_path = tmp_path / "uprn.parquet"
    uprn_columns = {
        "GRIDGB1E": [500010],
        "GRIDGB1N": [180010],
        "PCDS": ["aa1 1aa"],
        "OA21CD": ["E00000001"],
    }
    pl.DataFrame(uprn_columns).write_parquet(uprn_path)

    arcgis_path = tmp_path / "arcgis.parquet"
    postcode_columns = {
        "pcds": ["AA1 1AA", "AA1 1AB"],
        "east1m": [500010, 500030],
        "north1m": [180010, 180020],
        # First postcode has a termination date, second does not.
        "doterm": ["2020-01-01", None],
        "ctry25cd": ["E92000001", "E92000001"],
    }
    pl.DataFrame(postcode_columns).write_parquet(arcgis_path)

    # The offsets half of the return value is irrelevant to this check.
    result, _ = load_uprns(uprn_path, arcgis_path)

    assert result["PCDS"].to_list() == ["AA1 1AB"]
# ---------------------------------------------------------------------------
# Bug 3: Voronoi deduplication is first-seen-wins