Improve data pipeline

This commit is contained in:
Andras Schmelczer 2026-06-01 20:10:03 +01:00
parent e8345cbdc1
commit f99bd4e5c9
36 changed files with 966 additions and 129 deletions

View file

@ -116,9 +116,9 @@ MAP_ASSETS_DEPS := pipeline/download/map_assets.py pipeline/transform/transform_
generate-postcode-boundaries generate-travel-times enrich-actual-listings
prepare: $(PRICES_STAMP) download-places tiles satellite-tiles overlay-tiles property-border-tiles generate-postcode-boundaries download-map-assets generate-travel-times | $(POSTCODES_PQ) $(PROPERTIES_PQ) $(PRICE_INDEX)
$(VALIDATE_OUTPUTS) --parquet $(POSTCODES_PQ) --parquet $(PROPERTIES_PQ) --parquet $(PRICE_INDEX) --postcode-boundary-match "$(POSTCODES_PQ)::$(PC_BOUNDARIES)"
$(VALIDATE_OUTPUTS) --parquet $(POSTCODES_PQ) --parquet $(PROPERTIES_PQ) --parquet $(PRICE_INDEX) --postcode-boundary-match "$(POSTCODES_PQ)::$(PC_BOUNDARIES)" --postcode-features $(POSTCODES_PQ) --properties-subset "$(PROPERTIES_PQ)::$(POSTCODES_PQ)" --price-index $(PRICE_INDEX)
merge: $(MERGE_STAMP) | $(POSTCODES_PQ) $(PROPERTIES_PQ)
$(VALIDATE_OUTPUTS) --parquet $(POSTCODES_PQ) --parquet $(PROPERTIES_PQ)
$(VALIDATE_OUTPUTS) --parquet $(POSTCODES_PQ) --parquet $(PROPERTIES_PQ) --postcode-features $(POSTCODES_PQ) --properties-subset "$(PROPERTIES_PQ)::$(POSTCODES_PQ)"
enrich-actual-listings: $(ACTUAL_LISTINGS_ENRICHED)
tiles: $(TILES) $(SATELLITE_TILES) $(SATELLITE_HIGHRES_TILES)
satellite-tiles: $(SATELLITE_TILES)
@ -413,13 +413,13 @@ $(MERGE_STAMP): $(EPC_PP) $(ARCGIS) $(IOD) $(POI_PROXIMITY) \
--tree-density-postcodes $(TREE_DENSITY_PC) \
--output-postcodes $(POSTCODES_PQ) \
--output-properties $(PROPERTIES_PQ)
$(VALIDATE_OUTPUTS) --parquet $(POSTCODES_PQ) --parquet $(PROPERTIES_PQ)
$(VALIDATE_OUTPUTS) --parquet $(POSTCODES_PQ) --parquet $(PROPERTIES_PQ) --postcode-features $(POSTCODES_PQ) --properties-subset "$(PROPERTIES_PQ)::$(POSTCODES_PQ)"
@touch $@
# ── Price estimation (post-merge) ───────────────────────────────────────────
$(POSTCODES_PQ) $(PROPERTIES_PQ) &: $(MERGE_STAMP)
$(VALIDATE_OUTPUTS) --parquet $(POSTCODES_PQ) --parquet $(PROPERTIES_PQ)
$(VALIDATE_OUTPUTS) --parquet $(POSTCODES_PQ) --parquet $(PROPERTIES_PQ) --postcode-features $(POSTCODES_PQ) --properties-subset "$(PROPERTIES_PQ)::$(POSTCODES_PQ)"
$(PRICE_INDEX): $(MERGE_STAMP) $(PRICE_INDEX_DEPS) | $(PROPERTIES_PQ) $(POSTCODES_PQ)
uv run python -m pipeline.transform.price_estimation.index --input $(PROPERTIES_PQ) --postcodes $(POSTCODES_PQ) --output $@
@ -428,7 +428,7 @@ $(PRICE_INDEX): $(MERGE_STAMP) $(PRICE_INDEX_DEPS) | $(PROPERTIES_PQ) $(POSTCODE
$(PRICES_STAMP): $(MERGE_STAMP) $(PRICE_INDEX) $(PRICE_ESTIMATE_DEPS) | $(PROPERTIES_PQ) $(POSTCODES_PQ)
@rm -f $@
uv run python -m pipeline.transform.price_estimation.estimate --properties $(PROPERTIES_PQ) --postcodes $(POSTCODES_PQ) --index $(PRICE_INDEX)
$(VALIDATE_OUTPUTS) --parquet $(PROPERTIES_PQ) --parquet $(POSTCODES_PQ) --parquet $(PRICE_INDEX)
$(VALIDATE_OUTPUTS) --parquet $(PROPERTIES_PQ) --parquet $(POSTCODES_PQ) --parquet $(PRICE_INDEX) --postcode-features $(POSTCODES_PQ) --properties-subset "$(PROPERTIES_PQ)::$(POSTCODES_PQ)" --price-index $(PRICE_INDEX)
@touch $@
$(ACTUAL_LISTINGS_ENRICHED): $(ACTUAL_LISTINGS_RAW) $(EPC) \

View file

@ -85,7 +85,7 @@ const descriptions: Record<string, Record<string, string>> = {
'% White': 'Part de la population s’identifiant comme blanche',
'% South Asian': 'Part de la population s’identifiant comme sud-asiatique',
'% Black': 'Part de la population s’identifiant comme noire',
'% East Asian': 'Part de la population s’identifiant comme est-asiatique',
'% East/SE Asian': 'Part de la population s’identifiant comme est/sud-est asiatique',
'% Mixed':
'Part de la population s’identifiant comme métisse ou de plusieurs groupes ethniques',
'% Other': 'Part de la population s’identifiant comme appartenant à un autre groupe ethnique',
@ -98,7 +98,7 @@ const descriptions: Record<string, Record<string, string>> = {
'% Green': 'Part des voix vertes aux élections générales de 2024',
'% Other parties': 'Part cumulée des voix de tous les autres partis et indépendants',
'Distance to nearest park (km)': 'Distance au parc ou espace vert le plus proche',
'Noise (dB)': 'Niveau maximal de bruit des transports près du code postal, en décibels (Lden)',
'Noise (dB)': 'Le plus élevé des bruits routier, ferroviaire ou aérien près du code postal, en décibels (Lden). Angleterre uniquement ; vide = hors zone cartographiée, pas forcément calme.',
'Max available download speed (Mbps)': 'Débit descendant haut débit maximal disponible au code postal',
Schools: 'Écoles primaires et secondaires notées à proximité',
'Specific crimes': 'Filtrer une seule catégorie d’infractions de rue à la fois',
@ -185,7 +185,7 @@ const descriptions: Record<string, Record<string, string>> = {
'% White': 'Anteil der Personen, die sich als weiß identifizieren',
'% South Asian': 'Anteil der Personen, die sich als südasiatisch identifizieren',
'% Black': 'Anteil der Personen, die sich als schwarz identifizieren',
'% East Asian': 'Anteil der Personen, die sich als ostasiatisch identifizieren',
'% East/SE Asian': 'Anteil der Personen, die sich als ost-/südostasiatisch identifizieren',
'% Mixed':
'Anteil der Personen, die sich als gemischt oder mehreren ethnischen Gruppen zugehörig identifizieren',
'% Other': 'Anteil der Personen, die sich einer anderen ethnischen Gruppe zuordnen',
@ -198,7 +198,7 @@ const descriptions: Record<string, Record<string, string>> = {
'% Green': 'Stimmenanteil der Grünen bei der Parlamentswahl 2024',
'% Other parties': 'Kombinierter Stimmenanteil aller anderen Parteien und Unabhängigen',
'Distance to nearest park (km)': 'Entfernung zum nächsten Park oder Grünfläche',
'Noise (dB)': 'Maximaler Verkehrslärmpegel in der Nähe des Postcodes in Dezibel (Lden)',
'Noise (dB)': 'Lautester von Straßen-, Bahn- oder Fluglärm in der Nähe des Postcodes in Dezibel (Lden). Nur England; leer = nicht kartiert, nicht unbedingt leise.',
'Max available download speed (Mbps)':
'Maximal verfügbare Breitband-Downloadgeschwindigkeit am Postcode',
Schools: 'Bewertete Grundschulen und weiterführende Schulen in der Nähe',
@ -262,7 +262,7 @@ const descriptions: Record<string, Record<string, string>> = {
'% White': '白人人口比例',
'% South Asian': '南亚裔人口比例',
'% Black': '黑人人口比例',
'% East Asian': '东亚裔人口比例',
'% East/SE Asian': '东亚/东南亚裔人口比例',
'% Mixed': '混血或多族裔人口比例',
'% Other': '其他族裔人口比例',
'Voter turnout (%)': '2024 年大选中登记选民的投票率',
@ -273,7 +273,7 @@ const descriptions: Record<string, Record<string, string>> = {
'% Green': '2024 年大选中绿党得票率',
'% Other parties': '所有其他政党和独立候选人的综合得票率',
'Distance to nearest park (km)': '到最近公园或绿地的距离',
'Noise (dB)': '该邮编附近最高交通噪音水平Lden分贝',
'Noise (dB)': '该邮编附近道路、铁路或机场中最高的噪音水平（Lden，分贝）。仅英格兰；空白表示未覆盖，不一定安静。',
'Max available download speed (Mbps)': '该邮编可用的最高宽带下载速度',
Schools: '附近有评级的小学和中学',
'Specific crimes': '一次筛选一种街面犯罪类别',
@ -347,7 +347,7 @@ const descriptions: Record<string, Record<string, string>> = {
'% White': 'श्वेत के रूप में पहचान करने वाली आबादी का प्रतिशत',
'% South Asian': 'दक्षिण एशियाई के रूप में पहचान करने वाली आबादी का प्रतिशत',
'% Black': 'अश्वेत के रूप में पहचान करने वाली आबादी का प्रतिशत',
'% East Asian': 'पूर्वी एशियाई के रूप में पहचान करने वाली आबादी का प्रतिशत',
'% East/SE Asian': 'पूर्वी/दक्षिण-पूर्वी एशियाई के रूप में पहचान करने वाली आबादी का प्रतिशत',
'% Mixed': 'मिश्रित या कई जातीय समूहों से पहचान करने वाली आबादी का प्रतिशत',
'% Other': 'अन्य जातीय समूह के रूप में पहचान करने वाली आबादी का प्रतिशत',
'Voter turnout (%)': '2024 आम चुनाव में मतदान करने वाले पंजीकृत मतदाताओं का प्रतिशत',
@ -358,7 +358,7 @@ const descriptions: Record<string, Record<string, string>> = {
'% Green': '2024 आम चुनाव में ग्रीन पार्टी का मत-प्रतिशत',
'% Other parties': 'बाकी सभी पार्टियों और निर्दलीयों का संयुक्त मत-प्रतिशत',
'Distance to nearest park (km)': 'निकटतम पार्क या हरित क्षेत्र तक दूरी',
'Noise (dB)': 'पोस्टकोड पर सड़क शोर स्तर, डेसीबल (Lden) में',
'Noise (dB)': 'पोस्टकोड के पास सड़क, रेल या हवाई अड्डे के शोर में सबसे अधिक, डेसीबल (Lden) में। केवल इंग्लैंड; खाली = मैप नहीं किया गया, जरूरी नहीं कि शांत हो।',
'Max available download speed (Mbps)': 'पोस्टकोड पर उपलब्ध अधिकतम डाउनलोड गति',
Schools: 'पास के रेटेड प्राइमरी और सेकेंडरी स्कूल',
'Specific crimes': 'एक समय में एक सड़क-स्तर अपराध श्रेणी से फिल्टर करें',
@ -438,7 +438,7 @@ const descriptions: Record<string, Record<string, string>> = {
'% White': 'A fehérként azonosított lakosság aránya',
'% South Asian': 'A dél-ázsiaiként azonosított lakosság aránya',
'% Black': 'A feketeként azonosított lakosság aránya',
'% East Asian': 'A kelet-ázsiaiként azonosított lakosság aránya',
'% East/SE Asian': 'A kelet-/délkelet-ázsiaiként azonosított lakosság aránya',
'% Mixed': 'A vegyes vagy több etnikai csoporthoz tartozóként azonosított lakosság aránya',
'% Other': 'Az egyéb etnikai csoportba tartozóként azonosított lakosság aránya',
'Voter turnout (%)':
@ -450,7 +450,7 @@ const descriptions: Record<string, Record<string, string>> = {
'% Green': 'A Zöld Párt szavazataránya a 2024-es parlamenti választáson',
'% Other parties': 'Az összes többi párt és független jelölt összesített szavazataránya',
'Distance to nearest park (km)': 'Távolság a legközelebbi parkig vagy zöldterületig',
'Noise (dB)': 'Közúti zajszint az irányítószámnál decibelben (Lden)',
'Noise (dB)': 'Az út-, vasúti vagy repülőtéri zaj közül a leghangosabb az irányítószámnál, decibelben (Lden). Csak Anglia; üres = nem térképezett, nem feltétlenül csendes.',
'Max available download speed (Mbps)':
'Az irányítószámnál elérhető maximális szélessávú letöltési sebesség',
Schools: 'Közeli minősített általános és középiskolák',

View file

@ -109,8 +109,8 @@ export const details: Record<string, Record<string, string>> = {
"Provient du Census 2021. Pourcentage de la population de l'autorité locale s'identifiant comme Indien, Pakistanais, Bangladais ou toute autre origine asiatique.",
'% Black':
"Provient du Census 2021. Pourcentage de la population de l'autorité locale s'identifiant comme Noir, Noir britannique, Caribéen ou Africain.",
'% East Asian':
"Provient du Census 2021. Pourcentage de la population de l'autorité locale s'identifiant comme Chinois.",
'% East/SE Asian':
"Provient du Census 2021. Pourcentage de la population de l'autorité locale s'identifiant comme Chinois ou d'une autre origine est/sud-est asiatique.",
'% Mixed':
"Provient du Census 2021. Pourcentage de la population de l'autorité locale s'identifiant comme Mixte ou appartenant à plusieurs groupes ethniques (Blanc et Noir caribéen, Blanc et Noir africain, Blanc et Asiatique, ou tout autre fond mixte ou multiple).",
'% Other':
@ -132,7 +132,7 @@ export const details: Record<string, Record<string, string>> = {
'Distance to nearest park (km)':
"Distance à vol d'oiseau en kilomètres entre le code postal et l'entrée de parc la plus proche. Couvre les parcs publics, jardins, terrains de sport et aires de jeux. Utilise les points d'accès du jeu de données OS Open Greenspace, afin que les biens en bordure d'un grand parc affichent bien une courte distance.",
'Noise (dB)':
"Niveau maximal de bruit routier, ferroviaire ou aérien en décibels (Lden, moyenne pondérée sur 24 heures) d'après le Strategic Noise Mapping Round 4 de Defra (2022). Modélisé à 4 m au-dessus du sol sur une grille de 10 m et échantillonné comme la cellule de 10 m la plus bruyante autour du point représentatif du code postal. Au-dessus d'environ 55 dB, le bruit est généralement perceptible ; au-dessus d'environ 70 dB, l'OMS le considère comme nocif.",
"Niveau maximal de bruit routier, ferroviaire ou aérien en décibels (Lden, moyenne pondérée sur 24 heures) d'après le Strategic Noise Mapping Round 4 de Defra (2022). Modélisé à 4 m au-dessus du sol sur une grille de 10 m et échantillonné comme la cellule de 10 m la plus bruyante autour du point représentatif du code postal. Au-dessus d'environ 55 dB, le bruit est généralement perceptible ; au-dessus d'environ 70 dB, l'OMS le considère comme nocif. Couvre l'Angleterre uniquement ; une valeur vide signifie l'absence de données cartographiées (pas forcément calme).",
'Max available download speed (Mbps)':
"Débit descendant maximal de haut débit fixe disponible auprès de n'importe quel fournisseur, d'après Ofcom Connected Nations 2025. Il s'agit du maximum théorique, pas des débits réellement obtenus. 10 Mbps = basique, 30 = superfast, 100+ = ultrafast, 1000 = gigabit.",
Schools:
@ -255,8 +255,8 @@ export const details: Record<string, Record<string, string>> = {
'Aus dem Census 2021. Prozentsatz der Bevölkerung der Gemeinde, die sich als Indisch, Pakistanisch, Bangladeschisch oder mit sonstigem asiatischen Hintergrund identifiziert.',
'% Black':
'Aus dem Census 2021. Prozentsatz der Bevölkerung der Gemeinde, die sich als Schwarz, Schwarz-Britisch, Karibisch oder Afrikanisch identifiziert.',
'% East Asian':
'Aus dem Census 2021. Prozentsatz der Bevölkerung der Gemeinde, die sich als Chinesisch identifiziert.',
'% East/SE Asian':
'Aus dem Census 2021. Prozentsatz der Bevölkerung der Gemeinde, die sich als Chinesisch oder einer anderen ost-/südostasiatischen Herkunft identifiziert.',
'% Mixed':
'Aus dem Census 2021. Prozentsatz der Bevölkerung der Gemeinde, die sich als gemischt oder mit mehreren ethnischen Zugehörigkeiten identifiziert (Weiß und Schwarzkaribisch, Weiß und Schwarzafrikanisch, Weiß und Asiatisch oder sonstiger gemischter Hintergrund).',
'% Other':
@ -278,7 +278,7 @@ export const details: Record<string, Record<string, string>> = {
'Distance to nearest park (km)':
'Luftlinienentfernung in Kilometern vom Postleitzahlenzentrum zum nächsten Parkeingang. Umfasst öffentliche Parks, Gärten, Sportplätze und Spielbereiche. Verwendet Zugangspunktstandorte aus dem OS Open Greenspace-Datensatz, sodass Immobilien an der Grenze eines großen Parks korrekt eine kurze Entfernung anzeigen.',
'Noise (dB)':
'Straßenlärmpegel in Dezibel (Lden, ein 24-Stunden-gewichteter Durchschnitt) aus Defras Strategic Noise Mapping Round 4 (2022). Modelliert in 4 m Höhe über dem Boden auf einem 10-m-Raster. Über ~55 dB ist in der Regel wahrnehmbar; über ~70 dB gilt laut WHO als gesundheitsschädlich.',
'Höchster Pegel aus Straßen-, Bahn- oder Fluglärm in Dezibel (Lden, ein 24-Stunden-gewichteter Durchschnitt) aus Defras Strategic Noise Mapping Round 4 (2022). Modelliert in 4 m Höhe über dem Boden auf einem 10-m-Raster. Über ~55 dB ist in der Regel wahrnehmbar; über ~70 dB gilt laut WHO als gesundheitsschädlich. Nur England; ein leerer Wert bedeutet keine kartierten Daten (nicht unbedingt leise).',
'Max available download speed (Mbps)':
'Maximale verfügbare Festnetz-Download-Geschwindigkeit von einem beliebigen Anbieter, aus Ofcom Connected Nations 2025. Gibt die theoretische Höchstgeschwindigkeit an, keine tatsächlich erreichten Geschwindigkeiten. 10 Mbps = Basis, 30 = schnell, 100+ = ultraschnell, 1000 = Gigabit.',
Schools:
@ -398,7 +398,7 @@ export const details: Record<string, Record<string, string>> = {
'% South Asian':
'来自2021年Census。地方政府人口中认同为印度人、巴基斯坦人、孟加拉国人或其他亚洲背景的百分比。',
'% Black': '来自2021年Census。地方政府人口中认同为黑人、英国黑人、加勒比人或非洲人的百分比。',
'% East Asian': '来自2021年Census。地方政府人口中认同为华人的百分比。',
'% East/SE Asian': '来自2021年Census。地方政府人口中认同为华人或其他东亚/东南亚裔的百分比。',
'% Mixed':
'来自2021年 Census。地方政府人口中认同为混血或多种族群体白人与黑人加勒比裔、白人与黑人非洲裔、白人与亚洲裔或其他混血或多种族背景的百分比。',
'% Other':
@ -416,7 +416,7 @@ export const details: Record<string, Record<string, string>> = {
'Distance to nearest park (km)':
'从邮政编码到最近公园入口的直线距离km。涵盖公共公园、花园、运动场和游乐场地。使用 OS Open Greenspace 数据集中的出入口位置,因此紧邻大型公园的房产可正确显示较短距离。',
'Noise (dB)':
'来自Defra战略噪声图第4轮2022年的道路噪声水平单位为分贝Lden24小时加权平均值。在地面以上4m、10m网格间距处建模。一般而言超过约55 dB可明显感知超过约70 dB被世卫组织认定为有害。',
'来自Defra战略噪声图第4轮（2022年）的道路、铁路或机场噪声中的最高水平，单位为分贝（Lden，24小时加权平均值）。在地面以上4m、10m网格间距处建模。一般而言，超过约55 dB可明显感知；超过约70 dB被世卫组织认定为有害。仅覆盖英格兰;空值表示无映射数据(不一定安静)。',
'Max available download speed (Mbps)':
'来自Ofcom Connected Nations 2025的任意运营商可提供的最大固定宽带下载速度。代表理论最大值而非实际达到的速度。10 Mbps为基础级30为超快级100+为极速级1000为千兆级。',
Schools:
@ -539,8 +539,8 @@ export const details: Record<string, Record<string, string>> = {
'Census 2021 से. स्थानीय प्राधिकरण की आबादी का प्रतिशत जो खुद को भारतीय, पाकिस्तानी, बांग्लादेशी या किसी अन्य एशियाई पृष्ठभूमि के रूप में पहचानता है.',
'% Black':
'Census 2021 से. स्थानीय प्राधिकरण की आबादी का प्रतिशत जो खुद को अश्वेत, अश्वेत ब्रिटिश, कैरिबियाई या अफ्रीकी के रूप में पहचानता है.',
'% East Asian':
'Census 2021 से. स्थानीय प्राधिकरण की आबादी का प्रतिशत जो खुद को चीनी के रूप में पहचानता है.',
'% East/SE Asian':
'Census 2021 से. स्थानीय प्राधिकरण की आबादी का प्रतिशत जो खुद को चीनी या अन्य पूर्वी/दक्षिण-पूर्वी एशियाई के रूप में पहचानता है.',
'% Mixed':
'Census 2021 से. स्थानीय प्राधिकरण की आबादी का प्रतिशत जो खुद को मिश्रित या कई जातीय समूहों (श्वेत और अश्वेत कैरिबियाई, श्वेत और अश्वेत अफ्रीकी, श्वेत और एशियाई या अन्य मिश्रित/बहुजातीय पृष्ठभूमि) के रूप में पहचानता है.',
'% Other':
@ -562,7 +562,7 @@ export const details: Record<string, Record<string, string>> = {
'Distance to nearest park (km)':
'पोस्टकोड से निकटतम पार्क प्रवेश तक सीधी रेखा में दूरी, किलोमीटर में. इसमें सार्वजनिक पार्क, बगीचे, खेल मैदान और खेल स्थान शामिल हैं. OS Open Greenspace डेटा सेट के प्रवेश-बिंदु स्थानों का उपयोग करता है, इसलिए बड़े पार्क के किनारे स्थित संपत्तियां सही कम दूरी दिखाती हैं.',
'Noise (dB)':
'Defra Strategic Noise Mapping Round 4 (2022) से सड़क-शोर स्तर, डेसीबल में (Lden, 24-घंटे भारित औसत). जमीन से 4 m ऊपर 10 m ग्रिड पर मॉडल किया गया. लगभग 55 dB से ऊपर शोर आमतौर पर महसूस होता है; लगभग 70 dB से ऊपर WHO इसे हानिकारक मानता है.',
'Defra Strategic Noise Mapping Round 4 (2022) से सड़क, रेल या हवाई अड्डे के शोर में सबसे अधिक स्तर, डेसीबल में (Lden, 24-घंटे भारित औसत). जमीन से 4 m ऊपर 10 m ग्रिड पर मॉडल किया गया. लगभग 55 dB से ऊपर शोर आमतौर पर महसूस होता है; लगभग 70 dB से ऊपर WHO इसे हानिकारक मानता है. केवल इंग्लैंड को कवर करता है; खाली मान का अर्थ है कोई मैप किया गया डेटा नहीं (जरूरी नहीं कि शांत हो).',
'Max available download speed (Mbps)':
'Ofcom Connected Nations 2025 से किसी भी प्रदाता द्वारा उपलब्ध अधिकतम स्थिर ब्रॉडबैंड डाउनलोड गति. यह सैद्धांतिक अधिकतम दिखाता है, वास्तविक प्राप्त गति नहीं. 10 Mbps = बुनियादी, 30 = तेज, 100+ = अत्यंत तेज, 1000 = गीगाबिट.',
Schools:
@ -685,8 +685,8 @@ export const details: Record<string, Record<string, string>> = {
'A 2021-es Census alapján. A helyi hatóság területén indiai, pakisztáni, bangladesi vagy bármely más ázsiai háttérként azonosított népesség százaléka.',
'% Black':
'A 2021-es Census alapján. A helyi hatóság területén fekete, brit fekete, karibi vagy afrikai háttérként azonosított népesség százaléka.',
'% East Asian':
'A 2021-es Census alapján. A helyi hatóság területén kínaiként azonosított népesség százaléka.',
'% East/SE Asian':
'A 2021-es Census alapján. A helyi hatóság területén kínai vagy más kelet-/délkelet-ázsiai származásúként azonosított népesség százaléka.',
'% Mixed':
'A 2021-es Census alapján. A helyi hatóság területén vegyes vagy többes etnikai csoportként (fehér és fekete karibi, fehér és fekete afrikai, fehér és ázsiai, vagy bármely más vegyes vagy többes háttér) azonosított népesség százaléka.',
'% Other':
@ -708,7 +708,7 @@ export const details: Record<string, Record<string, string>> = {
'Distance to nearest park (km)':
'Légvonalbeli távolság kilométerben az irányítószámtól a legközelebbi park bejáratáig. Magában foglalja a közparkokat, kerteket, játszótereket és szabadidős területeket. Az OS Open Greenspace adatkészlet hozzáférési pont helyszíneit használja, így a nagy park szomszédságában lévő ingatlanok helyesen rövid távolságot mutatnak.',
'Noise (dB)':
'Közúti zajszint decibel (Lden, 24 órás súlyozott átlag) értékben, a Defra Stratégiai Zajtérképezés 4. fordulójából (2022). 4 m magasságban, 10 m-es rácson modellezve. ~55 dB felett általában érzékelhető; ~70 dB felett a WHO károsnak minősíti.',
'A közúti, vasúti vagy repülőtéri zaj közül a legmagasabb zajszint decibel (Lden, 24 órás súlyozott átlag) értékben, a Defra Stratégiai Zajtérképezés 4. fordulójából (2022). 4 m magasságban, 10 m-es rácson modellezve. ~55 dB felett általában érzékelhető; ~70 dB felett a WHO károsnak minősíti. Csak Angliát fedi le; az üres érték nem térképezett adatot jelent (nem feltétlenül csendes).',
'Max available download speed (Mbps)':
'Bármely szolgáltatótól elérhető maximális rögzített szélessávú letöltési sebesség, az Ofcom Connected Nations 2025 adataiból. Az elméleti maximumot jelöli, nem a valós sebességet. 10 Mbps = alapszintű, 30 = szupergyors, 100+ = ultragyors, 1000 = gigabites.',
Schools:

View file

@ -1519,7 +1519,7 @@ const de: Translations = {
'% White': '% weiß',
'% South Asian': '% südasiatisch',
'% Black': '% schwarz',
'% East Asian': '% ostasiatisch',
'% East/SE Asian': '% ost-/südostasiatisch',
'% Mixed': '% gemischt',
'% Other': '% Sonstige',

View file

@ -1485,7 +1485,7 @@ const en = {
'% White': '% White',
'% South Asian': '% South Asian',
'% Black': '% Black',
'% East Asian': '% East Asian',
'% East/SE Asian': '% East/SE Asian',
'% Mixed': '% Mixed',
'% Other': '% Other',

View file

@ -1531,7 +1531,7 @@ const fr: Translations = {
'% White': '% Blancs',
'% South Asian': '% Sud-Asiatiques',
'% Black': '% Noirs',
'% East Asian': '% Est-Asiatiques',
'% East/SE Asian': '% Est/Sud-Est-Asiatiques',
'% Mixed': '% Métis',
'% Other': '% Autres',

View file

@ -1431,7 +1431,7 @@ const hi: Translations = {
'% White': '% श्वेत',
'% South Asian': '% दक्षिण एशियाई',
'% Black': '% अश्वेत',
'% East Asian': '% पूर्वी एशियाई',
'% East/SE Asian': '% पूर्वी/दक्षिण-पूर्वी एशियाई',
'% Mixed': '% मिश्रित',
'% Other': '% अन्य',
'Voter turnout (%)': 'मतदाता भागीदारी (%)',

View file

@ -1512,7 +1512,7 @@ const hu: Translations = {
'% White': '% fehér',
'% South Asian': '% dél-ázsiai',
'% Black': '% fekete',
'% East Asian': '% kelet-ázsiai',
'% East/SE Asian': '% kelet-/délkelet-ázsiai',
'% Mixed': '% vegyes',
'% Other': '% egyéb',

View file

@ -1428,7 +1428,7 @@ const zh: Translations = {
'% White': '% 白人',
'% South Asian': '% 南亚裔',
'% Black': '% 黑人',
'% East Asian': '% 东亚裔',
'% East/SE Asian': '% 东亚/东南亚裔',
'% Mixed': '% 混血',
'% Other': '% 其他',

View file

@ -267,7 +267,7 @@ export const STACKED_GROUPS: Record<
{
label: 'Ethnic composition',
unit: '%',
components: ['% White', '% South Asian', '% East Asian', '% Black', '% Mixed', '% Other'],
components: ['% White', '% South Asian', '% East/SE Asian', '% Black', '% Mixed', '% Other'],
},
{
label: 'Political vote share',
@ -384,13 +384,6 @@ export const ENUM_COLOR_OVERRIDES: Record<string, Record<string, [number, number
F: [239, 68, 68],
G: [126, 34, 206],
},
'Max available download speed (Mbps)': {
'10': [107, 114, 128],
'30': [245, 158, 11],
'100': [59, 130, 246],
'300': [20, 184, 166],
'1000': [34, 197, 94],
},
};
/**
@ -451,7 +444,7 @@ export const STACKED_SEGMENT_COLORS: Record<string, string> = {
'Other crime (avg/yr)': '#6b7280',
'% White': '#3b82f6',
'% South Asian': '#f97316',
'% East Asian': '#eab308',
'% East/SE Asian': '#eab308',
'% Black': '#8b5cf6',
'% Mixed': '#14b8a6',
'% Other': '#6b7280',

View file

@ -6,7 +6,7 @@ export const ETHNICITIES_FILTER_KEY_PREFIX = `${ETHNICITIES_FILTER_NAME}:`;
export const ETHNICITY_FEATURE_NAMES = [
'% White',
'% South Asian',
'% East Asian',
'% East/SE Asian',
'% Black',
'% Mixed',
'% Other',

View file

@ -367,7 +367,7 @@ const FEATURE_ICON_PATHS: Record<string, ReactNode> = {
<path d="M16 3.13a4 4 0 010 7.75" />
</>
),
'% East Asian': (
'% East/SE Asian': (
<>
<path d="M17 21v-2a4 4 0 00-4-4H5a4 4 0 00-4 4v2" />
<circle cx="9" cy="7" r="4" />

View file

@ -36,7 +36,8 @@ GEOGRAPHY_CODE_REPLACEMENTS = {
def _ethnicity_percentages(df: pl.DataFrame) -> pl.DataFrame:
# Use the detailed 19+1 breakdown to get sub-categories for Asian ethnicity,
# then aggregate back to the broad groups plus South Asian / East Asian split.
# then aggregate back to the broad groups plus a South Asian / East/SE Asian
# split (Indian/Pakistani/Bangladeshi vs Chinese + other East/SE Asian).
detailed = df.filter(
(pl.col("Ethnicity_type") == "ONS 2021 19+1") & (pl.col("Ethnicity") != "All")
)
@ -53,9 +54,13 @@ def _ethnicity_percentages(df: pl.DataFrame) -> pl.DataFrame:
"Indian": "South Asian",
"Pakistani": "South Asian",
"Bangladeshi": "South Asian",
"Any Other Asian Background": "South Asian",
# East Asian
"Chinese": "East Asian",
# East / Southeast Asian. The ONS "Any Other Asian Background" bucket is
# predominantly East/Southeast Asian (Filipino, Vietnamese, Thai,
# Japanese, Korean, ...) rather than South Asian, so route it here rather
# than inflating "% South Asian". The split is approximate (the ONS
# bucket also holds some South Asian groups such as Sri Lankan/Nepalese).
"Chinese": "East/SE Asian",
"Any Other Asian Background": "East/SE Asian",
# Black
"Black African": "Black",
"Black Caribbean": "Black",

View file

@ -35,3 +35,31 @@ def test_ethnicity_percentages_recombines_predecessor_lads_by_population():
assert cumberland.select("% White", "% South Asian").to_dicts() == [
{"% White": 45.0, "% South Asian": 55.0}
]
def test_ethnicity_routes_any_other_asian_to_east_se_asian():
    """Chinese and 'Any Other Asian Background' must both land in '% East/SE Asian'.

    One geography holds 30 Chinese, 20 'Any Other Asian Background' and 50
    Indian residents, so the expected split is 50% East/SE Asian vs 50% South
    Asian, and no legacy '% East Asian' column may be produced.
    """
    geography = "E06000001"
    populations = {"Chinese": 30, "Any Other Asian Background": 20, "Indian": 50}

    records = []
    for group, headcount in populations.items():
        records.append(
            {
                "Geography_code": geography,
                "Ethnicity_type": "ONS 2021 19+1",
                "Ethnicity": group,
                "Ethnic Population": headcount,
                "Value1": 0.0,
            }
        )

    percentages = _ethnicity_percentages(pl.DataFrame(records))

    # The renamed column must exist and the old name must be gone.
    produced = percentages.columns
    assert "% East/SE Asian" in produced
    assert "% East Asian" not in produced

    shares = percentages.filter(pl.col("Geography_code") == geography).select(
        "% East/SE Asian", "% South Asian"
    )
    assert shares.to_dicts() == [{"% East/SE Asian": 50.0, "% South Asian": 50.0}]

View file

@ -222,3 +222,108 @@ def test_rejects_active_english_arcgis_boundary_mismatch(tmp_path, monkeypatch,
stderr = capsys.readouterr().err
assert "active English postcodes" in stderr
assert "not active English postcodes" in stderr
def _write_postcode_features(path, rows):
    """Build a DataFrame from ``rows`` and write it to ``path`` as parquet."""
    frame = pl.DataFrame(rows)
    frame.write_parquet(path)
def test_validates_postcode_features_valid(tmp_path, monkeypatch):
    """A clean postcode-features parquet makes ``main()`` return 0."""
    clean_rows = {
        "Postcode": ["AA1 1AA", "BB1 1BB"],
        "lat": [51.5, 53.4],
        "lon": [-0.1, -2.2],
        "ctry25cd": ["E92000001", "E92000001"],
        "% White": [80.0, 55.0],
    }
    path = tmp_path / "postcode.parquet"
    _write_postcode_features(path, clean_rows)

    argv = ["validate", "--postcode-features", str(path)]
    monkeypatch.setattr("sys.argv", argv)

    status = main()
    assert status == 0
def test_rejects_contaminated_postcode_features(tmp_path, monkeypatch, capsys):
    """A postcode-features parquet with several faults makes ``main()`` return 1.

    The fixture combines a duplicated postcode, a non-English row with null
    coordinates, and a percentage above 100; stderr must mention each fault.
    """
    dirty_rows = {
        "Postcode": ["AA1 1AA", "AA1 1AA", "CF10 1AA"],  # duplicate AA1 1AA
        "lat": [51.5, 51.5, None],  # Welsh row has null coord
        "lon": [-0.1, -0.1, None],
        "ctry25cd": ["E92000001", "E92000001", "W92000004"],
        "% White": [80.0, 150.0, 90.0],  # 150 out of [0,100]
    }
    path = tmp_path / "postcode.parquet"
    _write_postcode_features(path, dirty_rows)

    argv = ["validate", "--postcode-features", str(path)]
    monkeypatch.setattr("sys.argv", argv)

    status = main()
    assert status == 1

    stderr = capsys.readouterr().err
    assert "not unique" in stderr
    assert "E92000001" in stderr  # country contamination
    # Either wording is accepted for the coordinate / country failure.
    assert any(marker in stderr for marker in ("out-of-England", "lat/lon"))
    assert "[0, 100]" in stderr
def test_validates_properties_subset(tmp_path, monkeypatch):
    """Properties whose postcodes all appear in the postcode file make ``main()`` return 0."""
    postcode = tmp_path / "postcode.parquet"
    properties = tmp_path / "properties.parquet"

    known_postcodes = pl.DataFrame({"Postcode": ["AA1 1AA", "BB1 1BB"]})
    known_postcodes.write_parquet(postcode)

    listed = pl.DataFrame({"Postcode": ["AA1 1AA"], "Last known price": [250_000]})
    listed.write_parquet(properties)

    argv = ["validate", "--properties-subset", f"{properties}::{postcode}"]
    monkeypatch.setattr("sys.argv", argv)

    status = main()
    assert status == 0
def test_rejects_orphan_properties(tmp_path, monkeypatch, capsys):
    """A property with an unknown postcode and a negative price makes ``main()`` return 1."""
    postcode = tmp_path / "postcode.parquet"
    properties = tmp_path / "properties.parquet"

    known_postcodes = pl.DataFrame({"Postcode": ["AA1 1AA"]})
    known_postcodes.write_parquet(postcode)

    # orphan + negative price
    orphaned = pl.DataFrame({"Postcode": ["CC1 1CC"], "Last known price": [-5]})
    orphaned.write_parquet(properties)

    argv = ["validate", "--properties-subset", f"{properties}::{postcode}"]
    monkeypatch.setattr("sys.argv", argv)

    status = main()
    assert status == 1

    stderr = capsys.readouterr().err
    assert "absent from" in stderr
    assert "non-positive" in stderr
def test_validates_price_index_allows_zero_n_pairs(tmp_path, monkeypatch):
    """A price index containing rows with ``n_pairs == 0`` still makes ``main()`` return 0."""
    index_rows = {
        "sector": ["A1 1", "A1 1", "B2 2"],
        "type_group": ["All", "Detached", "All"],
        "year": [2024, 2024, 2024],
        "log_index": [0.5, 0.4, 0.0],
        "n_pairs": [100, 0, 0],  # zero n_pairs is a legitimate fallback
    }
    path = tmp_path / "price_index.parquet"
    pl.DataFrame(index_rows).write_parquet(path)

    argv = ["validate", "--price-index", str(path)]
    monkeypatch.setattr("sys.argv", argv)

    status = main()
    assert status == 0
def test_rejects_price_index_nonfinite_and_duplicate(tmp_path, monkeypatch, capsys):
    """An infinite ``log_index`` plus a repeated key makes ``main()`` return 1 and names both faults."""
    index_rows = {
        "sector": ["A1 1", "A1 1"],
        "type_group": ["All", "All"],  # duplicate (sector, type_group, year)
        "year": [2024, 2024],
        "log_index": [float("inf"), 0.3],  # non-finite
        "n_pairs": [10, 10],
    }
    path = tmp_path / "price_index.parquet"
    pl.DataFrame(index_rows).write_parquet(path)

    argv = ["validate", "--price-index", str(path)]
    monkeypatch.setattr("sys.argv", argv)

    status = main()
    assert status == 1

    stderr = capsys.readouterr().err
    assert "non-finite" in stderr
    assert "not unique" in stderr

View file

@ -28,6 +28,17 @@ MINOR_CRIME_TYPES = (
"Other crime",
)
# Legacy police.uk crime-type names (pre-2014 taxonomy) mapped to their closest
# current equivalent. Without this, ~1.9M incidents from 2010-2013 ("Violent
# crime", "Public disorder and weapons") are unrecognised and silently dropped,
# which understates pre-2013 serious crime and creates an artificial 2012->2013
# step in the by-year series. Applied with `.replace` (not `.replace_strict`) so
# unmapped current types pass through unchanged.
LEGACY_CRIME_TYPE_ALIASES = {
"Violent crime": "Violence and sexual offences",
"Public disorder and weapons": "Public order",
}
def find_street_crime_csvs(crime_dir: Path) -> tuple[list[Path], int]:
csvs = sorted(crime_dir.rglob("*.csv"))
@ -96,6 +107,7 @@ def transform_crime(
& pl.col("Crime type").is_not_null()
& (pl.col("Crime type") != "")
)
.with_columns(pl.col("Crime type").replace(LEGACY_CRIME_TYPE_ALIASES))
.group_by("LSOA code", "Month", "Crime type")
.agg((pl.col("_weight").first() * pl.len()).alias("count"))
.group_by("LSOA code", "Crime type")
@ -147,7 +159,10 @@ def _write_crime_by_year(
& (pl.col("LSOA code") != "")
& pl.col("Crime type").is_not_null()
& (pl.col("Crime type") != "")
).with_columns(pl.col("Month").str.slice(0, 4).cast(pl.Int32).alias("year"))
).with_columns(
pl.col("Month").str.slice(0, 4).cast(pl.Int32).alias("year"),
pl.col("Crime type").replace(LEGACY_CRIME_TYPE_ALIASES),
)
# Months observed *anywhere* in the dataset for each year (annualisation denominator).
# Using crime-type-specific months would over-scale years where a rare type appears

View file

@ -17,7 +17,7 @@ from pathlib import Path
import polars as pl
from pipeline.local_temp import local_tmp_dir
from pipeline.transform.crime import find_street_crime_csvs
from pipeline.transform.crime import LEGACY_CRIME_TYPE_ALIASES, find_street_crime_csvs
def _latest_months(crime_dir: Path, month_count: int) -> list[str]:
@ -80,6 +80,10 @@ def _write_geojsonseq(csvs: list[Path], output_path: Path) -> tuple[int, int]:
.drop_nulls(["lon", "lat"])
.filter(pl.col("lon").is_between(-9.5, 5.0))
.filter(pl.col("lat").is_between(49.0, 57.0))
# Canonicalise any legacy pre-2014 type names so the heatmap's crime_type
# values always match the frontend's canonical filter list (a no-op for
# the recent months this overlay normally covers).
.with_columns(pl.col("crime_type").replace(LEGACY_CRIME_TYPE_ALIASES))
.group_by("lon", "lat", "month", "crime_type")
.len()
.rename({"len": "count"})

View file

@ -44,6 +44,7 @@ import shapely
from pyproj import Transformer
from pipeline.transform.crime import (
LEGACY_CRIME_TYPE_ALIASES,
MINOR_CRIME_TYPES,
SERIOUS_CRIME_TYPES,
find_street_crime_csvs,
@ -150,6 +151,11 @@ def _accumulate_counts(
& (pl.col("Crime type") != "")
& pl.col("year").is_in(years)
)
# Canonicalise legacy pre-2014 crime-type names ("Violent crime",
# "Public disorder and weapons") to their current equivalents before
# indexing, so ~1.9M historical incidents are counted instead of
# dropped. `.replace` leaves current types unchanged.
.with_columns(pl.col("Crime type").replace(LEGACY_CRIME_TYPE_ALIASES))
# Map crime types to indices with default=None so an unrecognised
# type yields a null index we can *report* rather than silently drop
# (the legacy LSOA path surfaced unknown types via its dynamic pivot).

View file

@ -18,11 +18,49 @@ from ..utils import (
normalize_postcode_key,
)
pl.Config.set_tbl_cols(-1)
RATING_RANK = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7}
MIN_PRICE = 50_000
# Plausible construction-year range; band-derived years outside it (e.g. OCR
# noise like 1012 or 2202) are nulled rather than published.
MIN_BUILD_YEAR = 1700
MAX_BUILD_YEAR = 2030
def epc_band_to_year(band: pl.Expr) -> pl.Expr:
    """Map an EPC construction age band to a single representative build year.

    EPC age bands are ranges (e.g. ``1950-1966``); the band MIDPOINT (1958) is
    used rather than the lower bound, which biases band-derived years. The
    midpoint is computed with integer floor division, so a half-year midpoint
    (``1900-1929`` -> 1914.5) always resolves downwards (1914) instead of
    depending on the float rounding mode of the installed polars version.

    Args:
        band: Expression yielding the age band. It is cast to a string first,
            so an already-numeric year (produced by an earlier call) passes
            through unchanged.

    Returns:
        A ``UInt16`` expression holding the representative year, or null when:
        the band is open-ended at the bottom (``before 1900`` - too wide to pin
        to a year), no four-digit year can be extracted, or the year falls
        outside ``[MIN_BUILD_YEAR, MAX_BUILD_YEAR]``. Single-year and
        ``... onwards`` bands yield that year.
    """
    text = (
        band.cast(pl.Utf8)
        .str.replace("England and Wales: ", "")
        .str.replace(" onwards", "")
    )
    # First four-digit run is the lower bound; a second one (after any
    # non-digit separator) is the upper bound of a ranged band.
    low = text.str.extract(r"(\d{4})", 1).cast(pl.Int32, strict=False)
    high = text.str.extract(r"(\d{4})\D+(\d{4})", 2).cast(pl.Int32, strict=False)
    year = (
        pl.when(text.str.starts_with("before "))
        .then(None)
        .when(high.is_not_null())
        # Integer midpoint: deterministic for odd-width bands, no float rounding.
        .then((low + high) // 2)
        .otherwise(low)
    )
    return (
        pl.when((year >= MIN_BUILD_YEAR) & (year <= MAX_BUILD_YEAR))
        .then(year)
        .otherwise(None)
        .cast(pl.UInt16, strict=False)
    )
EPC_SOURCE_COLUMNS = [
"address",
"postcode",
@ -410,13 +448,7 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
# For new-builds (old_new == "Y"), use the first transaction date year as
# the exact construction date; otherwise fall back to the EPC age band.
epc_band_year = (
pl.col("construction_age_band")
.str.replace("England and Wales: ", "")
.str.replace(" onwards", "")
.str.extract(r"(\d{4})", 1)
.cast(pl.UInt16, strict=False)
)
epc_band_year = epc_band_to_year(pl.col("construction_age_band"))
transfer_year = (
pl.col("first_transfer_date").dt.year().cast(pl.UInt16, strict=False)
)

View file

@ -17,7 +17,11 @@ from shapely.strtree import STRtree
from thefuzz import fuzz
from pipeline.local_temp import local_tmp_dir
from pipeline.transform.join_epc_pp import _scan_epc_certificates
from pipeline.transform.join_epc_pp import _scan_epc_certificates, epc_band_to_year
from pipeline.transform.price_estimation.knn import (
MAX_COMPARABLE_PSM,
MIN_COMPARABLE_PSM,
)
from pipeline.utils.fuzzy_join import (
normalize_address_key,
normalize_postcode_key,
@ -59,7 +63,7 @@ _AREA_COLUMNS = [
"Air Quality and Road Safety Score",
# Ethnicity
"% South Asian",
"% East Asian",
"% East/SE Asian",
"% Black",
"% Mixed",
"% White",
@ -1060,14 +1064,10 @@ def _canonical_epc_property_type_expr() -> pl.Expr:
def _construction_year_expr(column: str = "construction_age_band") -> pl.Expr:
    """Build the construction-year expression for ``column``.

    Delegates to the shared ``epc_band_to_year`` mapping so the direct-EPC /
    listings path matches join_epc_pp (band midpoint, not lower bound;
    'before 1900' and implausible years -> null). Already-numeric inputs pass
    through unchanged.

    Args:
        column: Name of the column holding the EPC construction age band.

    Returns:
        The expression produced by ``epc_band_to_year`` for that column.
    """
    # The previous inline lower-bound extraction returned first and left this
    # call unreachable; it is the only return now.
    return epc_band_to_year(pl.col(column))
def _address_score(query: str, candidate: str | None) -> int:
@ -1956,7 +1956,9 @@ def _build(
# Broadband: derive max available download speed tier per postcode from
# Ofcom availability percentages. Tiers: Gigabit ≥1000, UFBB ≥300,
# UFBB(100) ≥100, SFBB ≥30 Mbps. Stored as string enum.
# UFBB(100) ≥100, SFBB ≥30 Mbps. Stored as a numeric (UInt16) Mbps value so
# it sorts/filters correctly; null (not a fabricated 10) when no availability
# tier is present, so "no data" is distinguishable from a genuine 10 Mbps.
broadband = (
pl.scan_parquet(broadband_path)
.select(
@ -1969,13 +1971,12 @@ def _build(
.then(100)
.when(pl.col("SFBB availability (% premises)") > 0)
.then(30)
.otherwise(10)
.otherwise(None)
.cast(pl.UInt16)
.alias("max_download_speed"),
)
.group_by("bb_postcode")
.agg(pl.col("max_download_speed").max())
.with_columns(pl.col("max_download_speed").cast(pl.Utf8))
)
area_side_tables = {
"iod": iod,
@ -2052,9 +2053,20 @@ def _build(
.otherwise(pl.col("current_energy_rating"))
.alias("current_energy_rating"),
).with_columns(
# Null out implausible per-sqm values (outside the kNN comparable band):
# bulk/block transactions divided by a single unit's floor area otherwise
# produce figures up to ~£1.5M/sqm.
pl.when(
(pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
& (
(pl.col("latest_price") / pl.col("total_floor_area"))
.round(0)
.cast(pl.Int32)
.is_between(MIN_COMPARABLE_PSM, MAX_COMPARABLE_PSM)
)
)
.then(
(pl.col("latest_price") / pl.col("total_floor_area")).round(0).cast(pl.Int32)
)
.otherwise(None)
.alias("Price per sqm"),
)
wide = _finalize_merged_columns(wide)

View file

@ -5,6 +5,7 @@ from pathlib import Path
from pyproj import Transformer
from shapely import make_valid, set_precision
from shapely.errors import GEOSException
from shapely.geometry import MultiPolygon, Polygon, mapping, shape
from shapely.ops import transform as transform_geometry
from shapely.ops import unary_union
@ -43,7 +44,14 @@ def _largest_polygonal(geom) -> Polygon | None:
def to_wgs84_geojson(
geom: Polygon | MultiPolygon, tolerance: float = 1.0
) -> dict | None:
"""Simplify geometry in BNG, convert to WGS84, return GeoJSON dict."""
"""Simplify geometry in BNG, convert to WGS84, return a valid GeoJSON dict.
Validates the *serialized* GeoJSON dict (via a ``shape()`` round-trip), not
just the intermediate Shapely object: coordinate snapping during
serialization can otherwise leave a self-intersecting ring that only shows up
once the feature is read back from disk. Any such geometry is repaired with
``make_valid`` before returning so written features are always valid.
"""
geom = _largest_polygonal(geom)
if geom is None:
return None
@ -55,12 +63,28 @@ def to_wgs84_geojson(
transformer = _get_to_wgs84()
wgs84 = transform_geometry(transformer.transform, simplified)
try:
wgs84 = set_precision(wgs84, 0.000001, mode="valid_output")
except GEOSException:
# Precision snapping can fail on pathological geometries; fall back to a
# plain validity repair without coordinate snapping.
wgs84 = make_valid(wgs84)
wgs84 = _largest_polygonal(wgs84)
if wgs84 is None:
return None
return mapping(wgs84)
geojson_dict = mapping(wgs84)
# The geometry that actually reaches disk is the GeoJSON dict, so validate
# *that* (not the pre-serialization object) and repair if needed.
round_trip = shape(geojson_dict)
if round_trip.is_empty or not round_trip.is_valid:
round_trip = _largest_polygonal(make_valid(round_trip))
if round_trip is None or round_trip.is_empty:
return None
geojson_dict = mapping(round_trip)
return geojson_dict
def _fill_holes(geom):
@ -119,7 +143,11 @@ def merge_fragments(
pre_green = combined
combined = subtract_greenspace(combined, greenspace_tree, greenspace_geoms)
combined = _largest_polygon(combined)
combined = _fill_holes(combined)
# Do NOT _fill_holes here: interior holes carved by the greenspace
# subtraction (lakes, enclosed parks) are intentional, not artifacts.
# Filling them would re-add the removed area and negate the
# subtraction. Artifact holes from the INSPIRE+Voronoi+make_valid
# chain were already removed by the _fill_holes above (pre-subtraction).
# Revert if subtraction + fragment selection lost >90% of area
if pre_green.area > 0 and combined.area / pre_green.area < 0.1:
combined = pre_green

View file

@ -893,3 +893,54 @@ class TestSubtractGreenspace:
result = subtract_greenspace(postcode, tree, geoms)
# 80% < 90% cap, so subtraction should happen
assert result.area == pytest.approx(2000, rel=0.01)
class TestToWgs84GeojsonValidity:
    """Serialized output of to_wgs84_geojson must reload as a valid, non-empty geometry."""

    def test_geojson_round_trips_to_valid_geometry(self):
        from shapely.geometry import shape

        square = box(530000, 180000, 530100, 180100)
        serialized = to_wgs84_geojson(square)
        assert serialized is not None

        # Re-parse the dict exactly as a downstream reader would.
        reloaded = shape(serialized)
        assert not reloaded.is_empty
        assert reloaded.is_valid

    def test_written_district_features_are_all_valid(self, tmp_path):
        from shapely.geometry import shape

        single_part = box(530000, 180000, 530100, 180100)
        two_parts = MultiPolygon(
            [
                box(530200, 180000, 530250, 180050),
                box(530200, 180060, 530250, 180110),
            ]
        )
        written = write_district_geojson(
            {"AA1 1AA": single_part, "AA1 1AB": two_parts}, tmp_path
        )
        assert written == 1

        district_file = tmp_path / "units" / "AA1.geojson"
        collection = json.loads(district_file.read_text())
        for feature in collection["features"]:
            reloaded = shape(feature["geometry"])
            assert reloaded.is_valid
            assert not reloaded.is_empty
class TestGreenspaceHolePreserved:
    """merge_fragments must keep interior holes cut out by greenspace subtraction.

    Guards against a post-subtraction ``_fill_holes`` call being reintroduced,
    which would add the removed greenspace straight back.
    """

    def test_interior_lake_hole_survives_merge_fragments(self):
        from shapely.strtree import STRtree

        # 100 x 100 postcode (10000 sqm) with a 40 x 40 lake (1600 sqm, 16%)
        # lying entirely inside it.
        outline = box(0, 0, 100, 100)
        lake = box(30, 30, 70, 70)

        merged_by_postcode = merge_fragments(
            [("TEST1", outline)],
            greenspace_tree=STRtree([lake]),
            greenspace_geoms=[lake],
        )
        polygon = merged_by_postcode["TEST1"]

        assert len(list(polygon.interiors)) == 1
        assert polygon.area == pytest.approx(10000 - 1600, rel=0.05)

View file

@ -67,6 +67,16 @@ def extract_test_set(input_path: Path) -> pl.DataFrame:
.struct.field("price")
.alias("input_price"),
)
.with_columns(
# Date of the input (second-to-last) sale, used by the kNN leakage
# filter to exclude the target property's own prior sale from its
# comparables. Built from year+month (day defaults to the 1st).
pl.date(
pl.col("input_year").cast(pl.Int32),
pl.col("input_month").cast(pl.Int32),
1,
).alias("input_date"),
)
.with_columns(
(
pl.col("actual_year").cast(pl.Float64)

View file

@ -18,6 +18,8 @@ import polars as pl
from pipeline.transform.price_estimation.knn import (
KNN_BLEND_WEIGHT,
MAX_COMPARABLE_PSM,
MIN_COMPARABLE_PSM,
build_knn_pool,
knn_median_psm,
)
@ -31,7 +33,13 @@ from pipeline.transform.price_estimation.utils import (
MAX_KNN_TO_INDEX_RATIO = 2.0
MIN_KNN_TO_INDEX_RATIO = 0.5
MAX_ESTIMATE_TO_LAST_PRICE_RATIO = 6.0
# Cap the final estimate at this multiple of the last known price as a guard
# against data errors. Set to ~exp(MAX_LOG_ADJUSTMENT) (~20x) so it is
# consistent with the log-index clip already applied to the index move: many
# UK sectors legitimately grew >6x since the 1990s (e.g. parts of inner London
# 12-14x), so the previous 6x cap truncated genuine appreciation rather than
# only catching outliers.
MAX_ESTIMATE_TO_LAST_PRICE_RATIO = 20.0
def guarded_blend_estimates(
@ -222,11 +230,22 @@ def main():
).height
print(f" kNN blended: {n_blended:,} of {n_estimated:,} estimates")
# Derive estimated price per sqm where both estimated price and floor area exist
# Derive estimated price per sqm where both estimated price and floor area
# exist. Null out values outside the plausibility band [MIN_COMPARABLE_PSM,
# MAX_COMPARABLE_PSM] (the same band the kNN pool uses): extreme values come
# from bulk/block transactions or floor-area errors and are not meaningful
# per-unit prices.
_est_psm = pl.col("Estimated current price") / pl.col("Total floor area (sqm)")
df = df.with_columns(
(pl.col("Estimated current price") / pl.col("Total floor area (sqm)"))
.round(0)
.cast(pl.Int32, strict=False)
pl.when(
pl.col("Estimated current price").is_not_null()
& pl.col("Total floor area (sqm)").is_not_null()
& (pl.col("Total floor area (sqm)") > 0)
& (_est_psm >= MIN_COMPARABLE_PSM)
& (_est_psm <= MAX_COMPARABLE_PSM)
)
.then(_est_psm.round(0).cast(pl.Int32, strict=False))
.otherwise(None)
.alias("Est. price per sqm"),
)

View file

@ -24,6 +24,7 @@ from pipeline.transform.price_estimation.shrinkage import (
)
from pipeline.transform.price_estimation.utils import (
CURRENT_YEAR,
TEMPORAL_SMOOTHNESS_LAMBDA,
TYPE_GROUPS,
build_hedonic_features,
extract_centroids,
@ -165,12 +166,50 @@ def solve_robust_index(
cols_arr = np.concatenate([col2[mask2], col1[mask1]])
signs_arr = np.concatenate([np.ones(mask2.sum()), -np.ones(mask1.sum())])
# Temporal smoothness prior: penalise curvature in the year betas with a
# second-difference penalty lambda * (beta_t - 2*beta_{t-1} + beta_{t-2})^2,
# encoded as extra least-squares rows (sqrt(lambda) * [1, -2, 1] against a
# zero target). This damps single-year index spikes without flattening
# genuine multi-year trends. Betas are ordered by calendar year; the baseline
# year (min_year, implicit beta=0) has no column, so the penalty spans the
# non-baseline years only. For cells with <3 betas there is no curvature to
# penalise and the solve is unchanged.
n_pen = 0
pen_rows_arr = pen_cols_arr = np.empty(0, dtype=np.int64)
pen_vals_arr = pen_b = np.empty(0, dtype=np.float64)
if TEMPORAL_SMOOTHNESS_LAMBDA > 0 and n_cols >= 3:
sqrt_lambda = float(np.sqrt(TEMPORAL_SMOOTHNESS_LAMBDA))
cols_by_year = [c for _, c in sorted(year_to_col.items())]
n_pen = n_cols - 2
pen_rows = np.repeat(n + np.arange(n_pen), 3)
pen_cols = np.empty(n_pen * 3, dtype=np.int64)
for k in range(n_pen):
pen_cols[3 * k : 3 * k + 3] = (
cols_by_year[k],
cols_by_year[k + 1],
cols_by_year[k + 2],
)
pen_rows_arr = pen_rows.astype(np.int64)
pen_cols_arr = pen_cols
pen_vals_arr = np.tile(
[sqrt_lambda, -2.0 * sqrt_lambda, sqrt_lambda], n_pen
).astype(np.float64)
pen_b = np.zeros(n_pen, dtype=np.float64)
n_total_rows = n + n_pen
weights = base_weights.copy()
for _ in range(IRLS_ITERATIONS):
data = signs_arr * weights[rows_arr]
A = csc_matrix((data, (rows_arr, cols_arr)), shape=(n, n_cols))
if n_pen:
all_data = np.concatenate([data, pen_vals_arr])
all_rows = np.concatenate([rows_arr, pen_rows_arr])
all_cols = np.concatenate([cols_arr, pen_cols_arr])
b = np.concatenate([log_ratios * weights, pen_b])
else:
all_data, all_rows, all_cols = data, rows_arr, cols_arr
b = log_ratios * weights
A = csc_matrix((all_data, (all_rows, all_cols)), shape=(n_total_rows, n_cols))
betas = lsqr(A, b, atol=1e-10, btol=1e-10)[0]
# Residuals

View file

@ -96,8 +96,11 @@ def spatial_smooth(
for i, sec in enumerate(sectors_with_coords):
n = counts.get(sec, 0)
self_w = n / (n + SPATIAL_BLEND_K)
if self_w > 0.95:
continue # enough data, skip smoothing
if self_w > 0.90:
# Enough data, skip smoothing. Relaxed from 0.95 so higher-volume
# cells (n ~270-570) that still carry single-year noise get a light
# spatial blend, complementing the temporal smoothness prior.
continue
dists, idxs = tree.query(scaled_coords[i], k=SPATIAL_NEIGHBORS + 1)
# Skip self (index 0, distance ~0)

View file

@ -81,8 +81,21 @@ def test_guarded_blend_routes_unstable_knn_to_index_and_caps_uplift():
last_prices=np.array([100_000.0, 100_000.0]),
)
# Property 0: unstable kNN (>2x index) is dropped, index estimate kept.
assert blended[0] == 120_000.0
assert blended[1] == 600_000.0
# Property 1: a 10x uplift over the last price is legitimate appreciation and
# is no longer truncated (cap raised from 6x to 20x).
assert blended[1] == 1_000_000.0
def test_guarded_blend_caps_uplift_at_20x_last_price():
    """An index estimate 50x the last price is clamped to the 20x ceiling."""
    index_estimates = np.array([5_000_000.0])
    no_knn = np.array([np.nan])
    last_sale = np.array([100_000.0])

    result = guarded_blend_estimates(
        index_est=index_estimates,
        knn_est=no_knn,
        last_prices=last_sale,
    )

    assert result[0] == 2_000_000.0  # 100_000 * 20
def test_bungalow_is_not_a_dead_price_index_type_group():
@ -92,3 +105,50 @@ def test_bungalow_is_not_a_dead_price_index_type_group():
assert "Bungalow" not in TYPE_GROUPS
assert df["type_group"].to_list() == [None, None]
def test_temporal_regularization_damps_curvature_without_breaking_solve():
    """The second-difference prior lowers year-to-year curvature while the index
    stays well-formed (every year present, finite, baseline pinned to 0)."""
    from pipeline.transform.price_estimation import index as index_mod

    years = np.arange(2010, 2021)
    true = {y: 0.04 * (y - 2010) for y in years}

    # Adjacent-year pairs that follow the smooth trend, followed by one
    # spurious 2014 -> 2015 jump (a poorly identified curvature spike).
    pair_starts = list(years[:-1])
    first_years = np.array(pair_starts + [2014])
    second_years = np.array([y + 1 for y in pair_starts] + [2015])
    log_ratios = np.array([true[y + 1] - true[y] for y in pair_starts] + [0.5], float)
    pair_weights = np.array([1.0] * (len(pair_starts) + 1), float)

    def solve_with(lmbda):
        # Temporarily override the module-level penalty strength.
        saved = index_mod.TEMPORAL_SMOOTHNESS_LAMBDA
        index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = lmbda
        try:
            return index_mod.solve_robust_index(
                first_years, second_years, log_ratios, pair_weights
            )
        finally:
            index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = saved

    def largest_second_difference(index):
        ordered = np.array([index[y] for y in sorted(index)])
        return float(np.abs(np.diff(ordered, 2)).max())

    plain = solve_with(0.0)
    smoothed = solve_with(0.2)

    # The regularised index is well-formed.
    assert set(smoothed) == set(range(2010, 2021))
    assert all(np.isfinite(v) for v in smoothed.values())
    assert smoothed[2010] == 0.0  # baseline year pinned to 0

    # Curvature strictly drops, and the genuine uptrend is not flattened
    # (the index still rises end to end).
    assert largest_second_difference(smoothed) < largest_second_difference(plain)
    assert smoothed[2020] > smoothed[2010]

View file

@ -22,6 +22,13 @@ FLAT_TYPES = ["Flats/Maisonettes"]
TYPE_GROUPS = ["Detached", "Semi-Detached", "Terraced", "Flats"]
SHRINKAGE_K = 50
# Temporal regularization for the repeat-sales index: a second-difference
# (curvature) penalty lambda * sum((beta_t - 2*beta_{t-1} + beta_{t-2})^2) added
# to the IRLS solve. A mild penalty damps single-year index spikes (which would
# otherwise distort the estimate of any property whose last sale landed on a
# noisy year) without flattening genuine multi-year trends.
TEMPORAL_SMOOTHNESS_LAMBDA = 0.05
def type_group_expr():
"""Polars expression: Property type -> type_group."""

View file

@ -15,6 +15,66 @@ SCHOOL_GROUPS = {
}
def classify_good_plus_schools(ofsted: pl.DataFrame) -> pl.DataFrame:
    """Label good+/outstanding primary & secondary schools for proximity counts.

    Only rows whose "Ofsted phase" is Primary or Secondary are considered. Each
    is given a grade ("1" = outstanding, "2" = good) and a proximity
    ``category``; rows without a good+ grade are dropped. The result is a
    ``(postcode, category)`` frame.

    Grade resolution, in priority order:

    1. "Latest OEIF overall effectiveness" of "1" or "2" is used as-is (OEIF =
       the previous Ofsted Education Inspection Framework).
    2. Otherwise, and ONLY when that column is null or "Not judged", the
       "Ungraded inspection overall outcome" is consulted: values starting with
       "School remains Outstanding" map to "1" and values starting with
       "School remains Good" map to "2" (this covers the "(Concerns)" /
       "(Improving)" variants).

    Many schools were last inspected under an ungraded (Section 8) inspection
    or the post-2024 report-card framework, so the graded column is empty for
    them even when they are demonstrably good; their status lives in the
    ungraded outcome. Filtering on the graded column alone dropped ~7,000
    genuinely good/outstanding schools. Because the fallback requires the
    absence of a usable grade, a genuine grade 3/4 is never overridden.
    """
    # Cast to Utf8 so the string predicates are well-defined even if a column
    # happens to be entirely null (read back as a Null dtype).
    latest_grade = pl.col("Latest OEIF overall effectiveness").cast(
        pl.Utf8, strict=False
    )
    ungraded_outcome = pl.col("Ungraded inspection overall outcome").cast(
        pl.Utf8, strict=False
    )

    lacks_grade = latest_grade.is_null() | (latest_grade == "Not judged")
    remains_outstanding = lacks_grade & ungraded_outcome.str.starts_with(
        "School remains Outstanding"
    )
    remains_good = lacks_grade & ungraded_outcome.str.starts_with(
        "School remains Good"
    )

    resolved_grade = (
        pl.when(latest_grade.is_in(["1", "2"]))
        .then(latest_grade)
        .when(remains_outstanding)
        .then(pl.lit("1"))
        .when(remains_good)
        .then(pl.lit("2"))
        .otherwise(None)
        .alias("_ofsted_grade")
    )

    in_scope = ofsted.filter(pl.col("Ofsted phase").is_in(["Primary", "Secondary"]))
    good_plus = in_scope.with_columns(resolved_grade).filter(
        pl.col("_ofsted_grade").is_not_null()
    )

    # After the two filters the phase is Primary or Secondary and the grade is
    # "1" or "2", so a flat four-way chain covers every remaining row.
    is_primary = pl.col("Ofsted phase") == "Primary"
    is_outstanding = pl.col("_ofsted_grade") == "1"
    category = (
        pl.when(is_primary & is_outstanding)
        .then(pl.lit("outstanding_primary"))
        .when(is_primary)
        .then(pl.lit("good_primary"))
        .when(is_outstanding)
        .then(pl.lit("outstanding_secondary"))
        .otherwise(pl.lit("good_secondary"))
        .alias("category")
    )

    return good_plus.with_columns(category).select(
        pl.col("Postcode").alias("postcode"),
        "category",
    )
def main():
parser = argparse.ArgumentParser(
description="Count good+ and outstanding primary/secondary schools near each postcode"
@ -30,42 +90,14 @@ def main():
)
args = parser.parse_args()
# Load Ofsted data: filter to good+ (1, 2) primary/secondary schools.
# Post-2025 reform the single "Overall effectiveness" grade was retired;
# the legacy 1-4 scale is now carried forward under "Latest OEIF overall
# effectiveness" (OEIF = the previous Ofsted Education Inspection
# Framework). The new report-card columns use text judgements instead.
ofsted = pl.read_parquet(args.ofsted).filter(
pl.col("Ofsted phase").is_in(["Primary", "Secondary"])
& pl.col("Latest OEIF overall effectiveness").is_in(["1", "2"])
)
ofsted = classify_good_plus_schools(pl.read_parquet(args.ofsted))
if ofsted.is_empty():
raise ValueError("No good+ primary/secondary Ofsted schools found")
print(f"Good+ schools: {len(ofsted):,}")
print(
"Outstanding schools: "
f"{ofsted.filter(pl.col('Latest OEIF overall effectiveness') == '1').height:,}"
)
# Assign category based on phase and rating. Good+ groups include both
# category variants; outstanding groups count grade 1 only.
ofsted = ofsted.with_columns(
pl.when(pl.col("Ofsted phase") == "Primary")
.then(
pl.when(pl.col("Latest OEIF overall effectiveness") == "1")
.then(pl.lit("outstanding_primary"))
.otherwise(pl.lit("good_primary"))
)
.otherwise(
pl.when(pl.col("Latest OEIF overall effectiveness") == "1")
.then(pl.lit("outstanding_secondary"))
.otherwise(pl.lit("good_secondary"))
)
.alias("category")
).select(
pl.col("Postcode").alias("postcode"),
"category",
f"{ofsted.filter(pl.col('category').str.starts_with('outstanding')).height:,}"
)
# Join with arcgis to get lat/lng for each school's postcode

View file

@ -226,3 +226,44 @@ def test_transform_crime_applies_lsoa_2011_to_2021_lookup(tmp_path):
assert burglaries["E01000050"] == [{"year": 2024, "count": 12.0}]
assert burglaries["E01000051"] == [{"year": 2024, "count": 12.0}]
assert burglaries["E01000099"] == [{"year": 2024, "count": 12.0}]
def test_transform_crime_maps_legacy_crime_types(tmp_path):
    """Pre-2014 police.uk crime-type names are folded into their current
    equivalents rather than dropped."""
    crime_dir = tmp_path / "crime"
    month_dir = crime_dir / "2013-01"
    month_dir.mkdir(parents=True)

    header = "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context"
    csv_lines = [
        header,
        "1,2013-01,Test Force,Test Force,-0.1,51.5,On or near X,E01000001,L,Violent crime,Under investigation,",
        "2,2013-01,Test Force,Test Force,-0.1,51.5,On or near X,E01000001,L,Public disorder and weapons,Under investigation,",
        "3,2013-01,Test Force,Test Force,-0.1,51.5,On or near X,E01000001,L,Burglary,Under investigation,",
    ]
    csv_path = month_dir / "2013-01-test-force-street.csv"
    csv_path.write_text("\n".join(csv_lines) + "\n")

    output = tmp_path / "crime.parquet"
    by_year_output = tmp_path / "crime_by_year.parquet"
    transform_crime(crime_dir, output, by_year_output)

    averages = pl.read_parquet(output).to_dicts()[0]
    # One month of data is annualised x12; legacy names land in current columns.
    assert averages["Violence and sexual offences (avg/yr)"] == 12.0
    assert averages["Public order (avg/yr)"] == 12.0
    assert averages["Burglary (avg/yr)"] == 12.0
    # The legacy names must NOT survive as columns of their own.
    assert "Violent crime (avg/yr)" not in averages
    assert "Public disorder and weapons (avg/yr)" not in averages

    yearly = pl.read_parquet(by_year_output).row(0, named=True)
    serious_by_year = {
        point["year"]: point["count"] for point in yearly["Serious crime (by year)"]
    }
    # Serious = Violence and sexual offences (12) + Burglary (12) = 24
    assert serious_by_year[2013] == 24.0
    minor_by_year = {
        point["year"]: point["count"] for point in yearly["Minor crime (by year)"]
    }
    assert minor_by_year[2013] == 12.0  # Public order

View file

@ -279,3 +279,37 @@ def test_unknown_crime_type_is_dropped_with_warning(tmp_path, capsys):
err = capsys.readouterr().err
assert "Cyber fraud" in err
assert "WARNING" in err
def test_legacy_crime_types_are_mapped(tmp_path):
    """The spatial transform folds pre-2014 crime-type names into their current
    equivalents instead of discarding them as unknown types."""
    units = tmp_path / "units"
    boundaries = {"AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)]}
    _write_boundaries(units, boundaries)

    crime = tmp_path / "crime"
    legacy_rows = [
        _crime_row("2013-01", 1005, 1005, legacy_name)
        for legacy_name in ("Violent crime", "Public disorder and weapons")
    ]
    _write_month(crime, "2013-01", legacy_rows)

    output = tmp_path / "crime_by_postcode.parquet"
    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)

    averages = pl.read_parquet(output).to_dicts()[0]
    # Single postcode -> area-norm factor 1.0; single month/year -> x12.
    assert averages["Violence and sexual offences (avg/yr)"] == 12.0
    assert averages["Public order (avg/yr)"] == 12.0

    yearly = pl.read_parquet(by_year).row(0, named=True)
    expected_series = [{"year": 2013, "count": 12.0}]
    assert yearly["Violence and sexual offences (by year)"] == expected_series
    assert yearly["Public order (by year)"] == expected_series

View file

@ -169,7 +169,8 @@ def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
"epc_address": "1 Example Street",
"current_energy_rating": "C",
"total_floor_area": 85.0,
"construction_age_band": 1950,
# Band midpoint of 1950-1966, not the lower bound.
"construction_age_band": 1958,
"was_council_house": "Yes",
}
]
@ -256,3 +257,26 @@ def test_run_does_not_attach_epc_facts_to_low_score_address_match(tmp_path: Path
"current_energy_rating": None,
}
]
def test_epc_band_to_year_uses_midpoint_and_clamps():
    """Pairs each raw band with the year epc_band_to_year must produce."""
    import polars as pl

    from pipeline.transform.join_epc_pp import epc_band_to_year

    cases = [
        ("England and Wales: 1950-1966", 1958),  # midpoint
        ("1900-1929", 1914),  # midpoint
        ("England and Wales: before 1900", None),  # too wide -> null
        ("2012 onwards", 2012),  # single year
        ("1012", None),  # implausible -> null
        ("2202", None),  # implausible -> null
        (None, None),  # null -> null
        ("1958", 1958),  # already-numeric-as-string -> pass through
    ]
    bands = [band for band, _ in cases]
    expected = [year for _, year in cases]

    frame = pl.DataFrame({"b": bands})
    years = frame.select(epc_band_to_year(pl.col("b")).alias("y"))["y"].to_list()

    assert years == expected

View file

@ -0,0 +1,82 @@
import polars as pl
from pipeline.transform.school_proximity import classify_good_plus_schools
def _school(phase, oeif, ungraded, postcode="AA1 1AA"):
    """Build one Ofsted row with just the columns the classifier reads."""
    row = {}
    row["Postcode"] = postcode
    row["Ofsted phase"] = phase
    row["Latest OEIF overall effectiveness"] = oeif
    row["Ungraded inspection overall outcome"] = ungraded
    return row


def _classify(rows):
    """Run the classifier and return its output as a set of (postcode, category)."""
    classified = classify_good_plus_schools(pl.DataFrame(rows))
    return {(rec["postcode"], rec["category"]) for rec in classified.to_dicts()}


def test_legacy_oeif_grades_1_and_2_are_kept():
    schools = [
        _school("Primary", "1", None, "AA1 1AA"),
        _school("Primary", "2", None, "AA1 1AB"),
        _school("Secondary", "1", None, "AA1 1AC"),
        _school("Secondary", "2", None, "AA1 1AD"),
    ]
    expected = {
        ("AA1 1AA", "outstanding_primary"),
        ("AA1 1AB", "good_primary"),
        ("AA1 1AC", "outstanding_secondary"),
        ("AA1 1AD", "good_secondary"),
    }
    assert _classify(schools) == expected


def test_grades_3_and_4_are_excluded():
    schools = [_school("Primary", grade, None) for grade in ("3", "4")]
    assert _classify(schools) == set()


def test_ungraded_remains_good_is_recovered_when_no_graded_result():
    # Both a null and a "Not judged" OEIF value fall back to the ungraded
    # outcome; the "(Concerns)"/"(Improving)" variants still count as good+.
    schools = [
        _school("Primary", None, "School remains Good", "AA1 1AA"),
        _school("Secondary", "Not judged", "School remains Outstanding", "AA1 1AB"),
        _school("Primary", None, "School remains Good (Concerns) - S5 Next", "AA1 1AC"),
        _school(
            "Secondary",
            None,
            "School remains Outstanding (Concerns) - S5 Next",
            "AA1 1AD",
        ),
    ]
    expected = {
        ("AA1 1AA", "good_primary"),
        ("AA1 1AB", "outstanding_secondary"),
        ("AA1 1AC", "good_primary"),
        ("AA1 1AD", "outstanding_secondary"),
    }
    assert _classify(schools) == expected


def test_ungraded_non_good_outcomes_are_excluded():
    outcomes = ("Some aspects not as strong", "Standards maintained", None)
    schools = [_school("Primary", None, outcome) for outcome in outcomes]
    assert _classify(schools) == set()


def test_genuine_grade_3_is_not_overridden_by_stale_remains_good():
    # A real grade 3 must not be promoted by an ungraded "remains Good".
    schools = [_school("Primary", "3", "School remains Good")]
    assert _classify(schools) == set()


def test_non_primary_secondary_phases_excluded():
    schools = [
        _school("Nursery", "1", None),
        _school("Not applicable", "2", None),
    ]
    assert _classify(schools) == set()

View file

@ -352,6 +352,176 @@ def _failures_for_active_postcode_boundary_match(spec: str) -> list[str]:
return failures
def _failures_for_postcode_features(path: Path) -> list[str]:
    """Validate the postcode feature parquet at ``path``.

    Checks, in order: unique ``Postcode``; non-null ``lat``/``lon`` inside a
    generous bounding box; ``ctry25cd`` equal to ``E92000001`` (a NULL country
    code counts as a violation); and every column whose name starts with
    ``"% "`` within [0, 100] (nulls allowed). Mirrors the in-build invariant
    (merge._validate_postcode_feature_output) so a stale/contaminated file on
    disk cannot pass `make`.

    Args:
        path: Location of the postcode feature parquet.

    Returns:
        A list of human-readable failure messages; empty when the file passes.
        If the basic parquet check fails, a required column is missing, or the
        file cannot be read, only that failure is returned.
    """
    failures = _failures_for_parquet(path)
    if failures:
        return failures

    try:
        names = pl.scan_parquet(path).collect_schema().names()
        required = {"Postcode", "lat", "lon", "ctry25cd"}
        missing = sorted(required - set(names))
        if missing:
            return [f"{path}: postcode features missing required columns: {missing}"]
        pct_cols = [c for c in names if c.startswith("% ")]
        # Load only the columns the checks below need.
        df = (
            pl.scan_parquet(path)
            .select(["Postcode", "lat", "lon", "ctry25cd", *pct_cols])
            .collect()
        )
    except Exception as exc:
        return [f"{path}: postcode features validation failed: {exc}"]

    height = df.height
    unique_postcodes = df["Postcode"].n_unique()
    if unique_postcodes != height:
        failures.append(
            f"{path}: Postcode is not unique "
            f"({height - unique_postcodes:,} duplicate rows)"
        )

    # England bounding box (generous): lat 49.5-60N, lon -8 to 2.5E.
    bad_coords = df.filter(
        pl.col("lat").is_null()
        | pl.col("lon").is_null()
        | ~pl.col("lat").is_between(49.5, 60.0)
        | ~pl.col("lon").is_between(-8.0, 2.5)
    )
    if bad_coords.height:
        sample = bad_coords.get_column("Postcode").head(10).to_list()
        failures.append(
            f"{path}: {bad_coords.height:,} rows have null or out-of-England "
            f"lat/lon; sample: {_format_samples(sample)}"
        )

    # `!=` evaluates to null for a null country code and `filter` drops null
    # predicates, so nulls must be matched explicitly or they pass unnoticed.
    bad_country = df.filter(
        pl.col("ctry25cd").is_null() | (pl.col("ctry25cd") != "E92000001")
    )
    if bad_country.height:
        sample = bad_country.get_column("Postcode").head(10).to_list()
        failures.append(
            f"{path}: {bad_country.height:,} rows have ctry25cd != 'E92000001' "
            f"(non-England contamination); sample: {_format_samples(sample)}"
        )

    for col in pct_cols:
        # Nulls are tolerated; only present values are range-checked.
        out_of_range = df.filter(
            pl.col(col).is_not_null() & ~pl.col(col).is_between(0.0, 100.0)
        ).height
        if out_of_range:
            failures.append(
                f"{path}: {col!r} has {out_of_range:,} values outside [0, 100]"
            )

    return failures
def _failures_for_properties_subset(spec: str) -> list[str]:
    """Validate the properties parquet against the postcode feature table.

    Two checks are made: every properties postcode must exist in the postcode
    table (no orphan properties), and every numeric price/rent column must hold
    only positive values (nulls allowed).

    Args:
        spec: Pair of paths in the form ``PROPERTIES::POSTCODE``, split by
            ``_split_pair``.

    Returns:
        A list of human-readable failure messages; empty when both checks pass.
        If either file fails the basic parquet check, or the postcode sets
        cannot be loaded, only that failure is returned.
    """
    # Local import: nothing else in this block needs `re`.
    import re

    properties_path, postcode_path = _split_pair(spec, "properties subset")
    failures = _failures_for_parquet(properties_path) + _failures_for_parquet(
        postcode_path
    )
    if failures:
        return failures

    try:
        postcode_set = _parquet_postcodes(postcode_path)
        property_set = _parquet_postcodes(properties_path)
    except Exception as exc:
        return [f"{properties_path} / {postcode_path}: subset check failed: {exc}"]

    orphans = property_set - postcode_set
    if orphans:
        failures.append(
            f"{properties_path}: {len(orphans):,} property postcodes are absent from "
            f"{postcode_path}; sample: {_sample(orphans)}"
        )

    # "rent" must not be preceded by a letter: a plain substring test also hits
    # "current", "parent", "different", ... and would wrongly apply the
    # positivity check to unrelated numeric columns.
    rent_token = re.compile(r"(?<![a-z])rent")

    # Positivity check for genuine numeric price columns only (skip nested/list
    # columns like historical_prices, which contain "price" in the name).
    try:
        schema = pl.scan_parquet(properties_path).collect_schema()
        numeric = {
            pl.Int8, pl.Int16, pl.Int32, pl.Int64,
            pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
            pl.Float32, pl.Float64,
        }
        price_cols = [
            c
            for c, dtype in schema.items()
            if ("price" in c.lower() or rent_token.search(c.lower()))
            and dtype in numeric
        ]
        for col in price_cols:
            bad = (
                pl.scan_parquet(properties_path)
                .filter(pl.col(col).is_not_null() & (pl.col(col) <= 0))
                .select(pl.len())
                .collect()
                .item()
            )
            if bad:
                failures.append(
                    f"{properties_path}: {col!r} has {bad:,} non-positive values"
                )
    except Exception as exc:
        failures.append(f"{properties_path}: price positivity check failed: {exc}")

    return failures
def _failures_for_price_index(path: Path) -> list[str]:
    """Check the structural integrity of a price index parquet file.

    The file must pass the basic parquet validation, contain the columns
    ``sector``, ``type_group``, ``year``, ``log_index`` and ``n_pairs``, have
    no null or non-finite ``log_index`` values, and have one row per
    ``(sector, type_group, year)`` key.

    Rows with ``n_pairs == 0`` are deliberately accepted: they are legitimate
    hedonic/shrinkage fallbacks for sectors with too few repeat-sale pairs.

    Returns a list of failure messages, empty when the file is valid. Basic
    parquet failures, missing columns, or an error while computing the
    statistics each short-circuit with just that message.
    """
    problems = _failures_for_parquet(path)
    if problems:
        return problems

    required = {"sector", "type_group", "year", "log_index", "n_pairs"}
    try:
        present = set(pl.scan_parquet(path).collect_schema().names())
        missing = sorted(required - present)
        if missing:
            return [f"{path}: price index missing required columns: {missing}"]

        log_index = pl.col("log_index")
        key = pl.struct("sector", "type_group", "year")
        # All statistics come from a single aggregated pass over the file.
        stats = (
            pl.scan_parquet(path)
            .select(
                pl.len().alias("n"),
                log_index.null_count().alias("null_log"),
                (~log_index.is_finite()).sum().alias("nonfinite_log"),
                key.n_unique().alias("unique_keys"),
            )
            .collect()
            .row(0, named=True)
        )
    except Exception as exc:
        return [f"{path}: price index validation failed: {exc}"]

    null_rows = stats["null_log"]
    if null_rows:
        problems.append(f"{path}: {null_rows:,} rows have null log_index")

    nonfinite_rows = stats["nonfinite_log"]
    if nonfinite_rows:
        problems.append(
            f"{path}: {nonfinite_rows:,} rows have non-finite log_index"
        )

    # Fewer distinct keys than rows means at least one key is repeated.
    duplicate_rows = stats["n"] - stats["unique_keys"]
    if duplicate_rows != 0:
        problems.append(
            f"{path}: (sector, type_group, year) is not unique "
            f"({duplicate_rows:,} duplicate rows)"
        )
    return problems
def main() -> int:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--file", action="append", default=[], type=Path)
@ -385,6 +555,29 @@ def main() -> int:
"GeoJSON postcodes: ARCGIS_PARQUET::DIR"
),
)
parser.add_argument(
"--postcode-features",
action="append",
default=[],
type=Path,
help=(
"Validate a postcode feature parquet: unique Postcode, non-null "
"lat/lon in England, ctry25cd=E92000001, '% ' columns in [0,100]"
),
)
parser.add_argument(
"--properties-subset",
action="append",
default=[],
help="Require properties postcodes to be a subset of postcode keys: PROPERTIES::POSTCODE",
)
parser.add_argument(
"--price-index",
action="append",
default=[],
type=Path,
help="Validate price_index.parquet: finite log_index and unique (sector,type_group,year)",
)
args = parser.parse_args()
failures: list[str] = []
@ -404,6 +597,12 @@ def main() -> int:
failures.extend(_failures_for_postcode_boundary_match(spec))
for spec in args.active_postcode_boundary_match:
failures.extend(_failures_for_active_postcode_boundary_match(spec))
for path in args.postcode_features:
failures.extend(_failures_for_postcode_features(path))
for spec in args.properties_subset:
failures.extend(_failures_for_properties_subset(spec))
for path in args.price_index:
failures.extend(_failures_for_price_index(path))
if failures:
print("Output validation failed:", file=sys.stderr)

View file

@ -202,7 +202,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
},
step: 1.0,
description: "Maximum transport noise level near the postcode in decibels (Lden)",
detail: "Maximum road, rail, or airport noise level in decibels (Lden, a 24-hour weighted average) from Defra's Strategic Noise Mapping Round 4 (2022). Modelled at 4m above ground on a 10m grid and sampled as the maximum 10m cell around the postcode representative point. Above ~55 dB is typically noticeable; above ~70 dB is considered harmful by the WHO.",
detail: "Loudest of road, rail, or airport noise in decibels (Lden, a 24-hour day-evening-night weighted average) from Defra's Strategic Noise Mapping Round 4 (2022). Covers England only; rail noise dominates the value at ~120k postcodes and airport noise at ~4k. Modelled at 4m above ground on a 10m grid and sampled as the maximum 10m cell around the postcode representative point. Blank means no mapped data in the source (Wales, Scotland and areas away from major roads/railways/airports all return blank) — not necessarily quiet. Above ~55 dB is typically noticeable; above ~70 dB is considered harmful by the WHO.",
source: "noise",
prefix: "",
suffix: " dB",
@ -832,14 +832,14 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
absolute: false,
}),
Feature::Numeric(FeatureConfig {
name: "% East Asian",
name: "% East/SE Asian",
bounds: Bounds::Fixed {
min: 0.0,
max: 100.0,
},
step: 0.1,
description: "Percentage of population identifying as East Asian",
detail: "From the 2021 Census. Percentage of the local authority population identifying as Chinese.",
description: "Percentage of population identifying as East or Southeast Asian",
detail: "From the 2021 Census. Percentage of the local authority population identifying as Chinese, Vietnamese, Filipino, Thai, or any other East or Southeast Asian background.",
source: "ethnicity",
prefix: "",
suffix: "%",
@ -987,12 +987,20 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
FeatureGroup {
name: "Amenities",
features: &[
Feature::Enum(EnumFeatureConfig {
Feature::Numeric(FeatureConfig {
name: "Max available download speed (Mbps)",
order: Some(&["10", "30", "100", "300", "1000"]),
bounds: Bounds::Fixed {
min: 10.0,
max: 1000.0,
},
step: 1.0,
description: "Maximum broadband download speed available at the postcode",
detail: "Maximum fixed broadband download speed available from any provider, from Ofcom Connected Nations 2025. Represents theoretical maximum, not achieved speeds. 10 Mbps = basic, 30 = superfast, 100+ = ultrafast, 1000 = gigabit.",
detail: "Maximum fixed broadband download speed available from any provider, from Ofcom Connected Nations 2025. Represents theoretical maximum, not achieved speeds. 10 Mbps = basic, 30 = superfast, 100+ = ultrafast, 1000 = gigabit. Null where no availability data is published.",
source: "broadband",
prefix: "",
suffix: " Mbps",
raw: true,
absolute: true,
}),
],
},

View file

@ -549,7 +549,7 @@ function createRecordingStoryboard(
'Good+ primary schools within 2km': [1, 10],
'Noise (dB)': [50, 70],
'Street tree density percentile': [25, 100],
'Max available download speed (Mbps)': ['100', '300', '1000'],
'Max available download speed (Mbps)': [100, 1000],
},
// Travel-time filters returned by the AI stub. Slug matches the real
// /api/travel-destinations?mode=transit response.