Improve data

This commit is contained in:
Andras Schmelczer 2026-06-10 07:54:25 +01:00
parent b4d66a28c1
commit 85da1941aa
31 changed files with 901 additions and 319 deletions

View file

@ -32,7 +32,7 @@ PRICES_STAMP := $(DATA_DIR)/.prices_done
EPC := $(MANUAL_DATA)/domestic-csv.zip EPC := $(MANUAL_DATA)/domestic-csv.zip
ACTUAL_LISTINGS_RAW := $(FINDER_DATA)/online_listings_buy.parquet ACTUAL_LISTINGS_RAW := $(FINDER_DATA)/online_listings_buy.parquet
ACTUAL_LISTINGS_ENRICHED := $(FINDER_DATA)/online_listings_buy_enriched.parquet ACTUAL_LISTINGS_ENRICHED := $(FINDER_DATA)/online_listings_buy_enriched.parquet
ETHNICITY := $(DATA_DIR)/ethnicity_by_la.parquet ETHNICITY := $(DATA_DIR)/ethnicity_by_lsoa.parquet
CRIME_DIR := $(DATA_DIR)/crime CRIME_DIR := $(DATA_DIR)/crime
CRIME := $(DATA_DIR)/crime_by_postcode.parquet CRIME := $(DATA_DIR)/crime_by_postcode.parquet
CRIME_BY_YEAR := $(DATA_DIR)/crime_by_postcode_by_year.parquet CRIME_BY_YEAR := $(DATA_DIR)/crime_by_postcode_by_year.parquet
@ -364,8 +364,8 @@ $(CRIME) $(CRIME_BY_YEAR) &: $(CRIME_STAMP) $(PC_BOUNDARIES_STAMP) pipeline/tran
$(POI_PROXIMITY): $(ARCGIS) $(POIS_FILTERED) $(OS_GREENSPACE) $(POI_PROXIMITY_DEPS) $(POI_PROXIMITY): $(ARCGIS) $(POIS_FILTERED) $(OS_GREENSPACE) $(POI_PROXIMITY_DEPS)
uv run python -m pipeline.transform.poi_proximity --arcgis $(ARCGIS) --pois $(POIS_FILTERED) --greenspace $(OS_GREENSPACE) --output $@ uv run python -m pipeline.transform.poi_proximity --arcgis $(ARCGIS) --pois $(POIS_FILTERED) --greenspace $(OS_GREENSPACE) --output $@
$(SCHOOL_PROX): $(OFSTED) $(ARCGIS) pipeline/transform/school_proximity.py pipeline/utils/poi_counts.py $(SCHOOL_PROX): $(OFSTED) $(ARCGIS) $(GIAS) pipeline/transform/school_proximity.py pipeline/utils/poi_counts.py
uv run python -m pipeline.transform.school_proximity --ofsted $(OFSTED) --arcgis $(ARCGIS) --output $@ uv run python -m pipeline.transform.school_proximity --ofsted $(OFSTED) --arcgis $(ARCGIS) --gias $(GIAS) --output $@
$(TREE_DENSITY_PC): $(FR_TOW) $(NFI) $(ARCGIS) $(TREE_DENSITY_DEPS) $(TREE_DENSITY_PC): $(FR_TOW) $(NFI) $(ARCGIS) $(TREE_DENSITY_DEPS)
uv run python -m pipeline.transform.tree_density \ uv run python -m pipeline.transform.tree_density \

View file

@ -31,7 +31,7 @@ const descriptions: Record<string, Record<string, string>> = {
'Potential energy rating': 'Potential energy rating':
'Classe énergétique EPC possible si toutes les améliorations recommandées étaient réalisées', 'Classe énergétique EPC possible si toutes les améliorations recommandées étaient réalisées',
'Interior height (m)': 'Hauteur intérieure moyenne relevée lors du diagnostic EPC', 'Interior height (m)': 'Hauteur intérieure moyenne relevée lors du diagnostic EPC',
'Street tree density percentile': 'Tree canopy density percentile':
'Percentile estimé de couverture arborée autour du code postal', 'Percentile estimé de couverture arborée autour du code postal',
'Within conservation area': 'Within conservation area':
'Indique si le point représentatif du code postal se situe dans une zone de conservation désignée', 'Indique si le point représentatif du code postal se situe dans une zone de conservation désignée',
@ -131,8 +131,8 @@ const descriptions: Record<string, Record<string, string>> = {
'Potential energy rating': 'Potential energy rating':
'Mögliche EPC-Energieeffizienzklasse, wenn alle empfohlenen Maßnahmen umgesetzt würden', 'Mögliche EPC-Energieeffizienzklasse, wenn alle empfohlenen Maßnahmen umgesetzt würden',
'Interior height (m)': 'Durchschnittliche Raumhöhe laut EPC-Gutachten', 'Interior height (m)': 'Durchschnittliche Raumhöhe laut EPC-Gutachten',
'Street tree density percentile': 'Tree canopy density percentile':
'Geschätztes Perzentil der Straßenbaumdichte rund um den Postcode', 'Geschätztes Perzentil der Baumkronendichte rund um den Postcode',
'Within conservation area': 'Within conservation area':
'Ob der repräsentative Punkt des Postcodes in einem ausgewiesenen Denkmalschutzgebiet liegt', 'Ob der repräsentative Punkt des Postcodes in einem ausgewiesenen Denkmalschutzgebiet liegt',
'Listed building': 'Listed building':
@ -229,7 +229,7 @@ const descriptions: Record<string, Record<string, string>> = {
'Current energy rating': '当前 EPC 能源评级(A = 最佳, G = 最差)', 'Current energy rating': '当前 EPC 能源评级(A = 最佳, G = 最差)',
'Potential energy rating': '完成所有建议改进后的潜在 EPC 评级', 'Potential energy rating': '完成所有建议改进后的潜在 EPC 评级',
'Interior height (m)': 'EPC 评估记录的平均室内层高', 'Interior height (m)': 'EPC 评估记录的平均室内层高',
'Street tree density percentile': '邮编周边估计街道树木覆盖率百分位', 'Tree canopy density percentile': '邮编周边估计树冠覆盖密度百分位',
'Within conservation area': '邮编代表点是否位于指定保护区内', 'Within conservation area': '邮编代表点是否位于指定保护区内',
'Listed building': '该房产是否疑似对应 Historic England 的登录建筑记录', 'Listed building': '该房产是否疑似对应 Historic England 的登录建筑记录',
'Good+ primary schools within 2km': '2 公里内 Ofsted 评为良好或优秀的小学', 'Good+ primary schools within 2km': '2 公里内 Ofsted 评为良好或优秀的小学',
@ -304,7 +304,7 @@ const descriptions: Record<string, Record<string, string>> = {
'Current energy rating': 'मौजूदा EPC ऊर्जा रेटिंग (A = सबसे अच्छी, G = सबसे खराब)', 'Current energy rating': 'मौजूदा EPC ऊर्जा रेटिंग (A = सबसे अच्छी, G = सबसे खराब)',
'Potential energy rating': 'सभी सुझाए गए सुधार होने पर संभावित EPC रेटिंग', 'Potential energy rating': 'सभी सुझाए गए सुधार होने पर संभावित EPC रेटिंग',
'Interior height (m)': 'EPC सर्वेक्षण के अनुसार औसत अंदरूनी ऊंचाई', 'Interior height (m)': 'EPC सर्वेक्षण के अनुसार औसत अंदरूनी ऊंचाई',
'Street tree density percentile': 'संपत्ति वाली सड़क का अनुमानित वृक्ष आच्छादन प्रतिशतक', 'Tree canopy density percentile': 'पोस्टकोड के आसपास का अनुमानित वृक्ष आच्छादन घनत्व प्रतिशतक',
'Within conservation area': 'पोस्टकोड प्रतिनिधि बिंदु नामित संरक्षण क्षेत्र में है या नहीं', 'Within conservation area': 'पोस्टकोड प्रतिनिधि बिंदु नामित संरक्षण क्षेत्र में है या नहीं',
'Listed building': 'Listed building':
'यह संपत्ति Historic England के सूचीबद्ध भवन रिकॉर्ड से मिलती-जुलती है या नहीं', 'यह संपत्ति Historic England के सूचीबद्ध भवन रिकॉर्ड से मिलती-जुलती है या नहीं',
@ -391,8 +391,8 @@ const descriptions: Record<string, Record<string, string>> = {
'Potential energy rating': 'Potential energy rating':
'Potenciális EPC besorolás az összes javasolt fejlesztés elvégzése után', 'Potenciális EPC besorolás az összes javasolt fejlesztés elvégzése után',
'Interior height (m)': 'Átlagos belmagasság az EPC felmérés alapján', 'Interior height (m)': 'Átlagos belmagasság az EPC felmérés alapján',
'Street tree density percentile': 'Tree canopy density percentile':
'Az ingatlan utcájának becsült lombkorona-fedettségi percentilise', 'A postai irányítószám környékének becsült lombkorona-fedettségi percentilise',
'Within conservation area': 'Within conservation area':
'Az irányítószám reprezentatív pontja kijelölt műemléki területre esik-e', 'Az irányítószám reprezentatív pontja kijelölt műemléki területre esik-e',
'Listed building': 'Listed building':

View file

@ -35,8 +35,8 @@ export const details: Record<string, Record<string, string>> = {
"La note d'efficacité énergétique potentielle issue du certificat de performance énergétique (EPC), si toutes les améliorations rentables recommandées dans le rapport EPC étaient réalisées. Va de A (plus efficace) à G (moins efficace).", "La note d'efficacité énergétique potentielle issue du certificat de performance énergétique (EPC), si toutes les améliorations rentables recommandées dans le rapport EPC étaient réalisées. Va de A (plus efficace) à G (moins efficace).",
'Interior height (m)': 'Interior height (m)':
"Hauteur intérieure moyenne (sol au plafond) en mètres telle qu'enregistrée lors de l'évaluation du certificat de performance énergétique (EPC). Calculée en divisant le volume intérieur total par la surface habitable totale.", "Hauteur intérieure moyenne (sol au plafond) en mètres telle qu'enregistrée lors de l'évaluation du certificat de performance énergétique (EPC). Calculée en divisant le volume intérieur total par la surface habitable totale.",
'Street tree density percentile': 'Tree canopy density percentile':
"Couverture arborée approximative autour du centroïde du code postal, dérivée de la carte Trees Outside Woodland 2025 de Forest Research. Les polygones de couvert arboré des arbres isolés et groupes d'arbres sont comptés dans un rayon de 50 m de chaque centroïde de code postal, puis convertis en percentile parmi les codes postaux anglais. Il s'agit d'une approximation fondée sur le centroïde du code postal, pas d'une mesure exacte du bien ou du segment de rue.", "Couverture arborée approximative autour du centroïde du code postal, dérivée de la carte Trees Outside Woodland 2025 de Forest Research et de l'Inventaire forestier national (NFI). Les polygones de couvert arboré des arbres isolés, des groupes d'arbres et des boisements de l'Inventaire forestier national sont comptés dans un rayon de 50 m de chaque centroïde de code postal, puis convertis en percentile parmi les codes postaux anglais. Il s'agit d'une approximation fondée sur le centroïde du code postal, pas d'une mesure exacte du bien ou du segment de rue.",
'Within conservation area': 'Within conservation area':
"Limites des conservation areas dans Planning Data, rattachées au point représentatif du code postal. Le jeu de données national est encore en cours de constitution et peut contenir des doublons ou une couverture locale incomplète ; toute décision dépendant précisément d'une limite doit être vérifiée auprès de la local planning authority.", "Limites des conservation areas dans Planning Data, rattachées au point représentatif du code postal. Le jeu de données national est encore en cours de constitution et peut contenir des doublons ou une couverture locale incomplète ; toute décision dépendant précisément d'une limite doit être vérifiée auprès de la local planning authority.",
'Listed building': 'Listed building':
@ -181,8 +181,8 @@ export const details: Record<string, Record<string, string>> = {
'Die potenzielle Energieeffizienzklasse aus dem Energieausweis-Zertifikat, wenn alle im EPC-Bericht empfohlenen kosteneffizienten Verbesserungen durchgeführt würden. Reicht von A (am effizientesten) bis G (am wenigsten effizient).', 'Die potenzielle Energieeffizienzklasse aus dem Energieausweis-Zertifikat, wenn alle im EPC-Bericht empfohlenen kosteneffizienten Verbesserungen durchgeführt würden. Reicht von A (am effizientesten) bis G (am wenigsten effizient).',
'Interior height (m)': 'Interior height (m)':
'Durchschnittliche lichte Raumhöhe in Metern, wie während der Energieausweis-Begutachtung erfasst. Berechnet durch Division des gesamten Innenvolumens durch die Gesamtwohnfläche.', 'Durchschnittliche lichte Raumhöhe in Metern, wie während der Energieausweis-Begutachtung erfasst. Berechnet durch Division des gesamten Innenvolumens durch die Gesamtwohnfläche.',
'Street tree density percentile': 'Tree canopy density percentile':
'Ungefähre Baumkronenbedeckung rund um den Postleitzahlen-Zentroiden aus der Forest-Research-Karte Trees Outside Woodland 2025. Baumkronen-Polygone für Einzelbäume und Baumgruppen werden im Umkreis von 50 m um jeden Postleitzahlen-Zentroiden gezählt und dann in ein Perzentil über englische Postleitzahlen umgerechnet. Dies ist ein Näherungswert auf Basis des Postleitzahlen-Zentroids, keine exakte Messung für Immobilie oder Straßenabschnitt.', 'Ungefähre Baumkronenbedeckung rund um den Postleitzahlen-Zentroiden aus der Forest-Research-Karte Trees Outside Woodland 2025 und dem National Forest Inventory (NFI). Baumkronen-Polygone für Einzelbäume, Baumgruppen und Waldflächen des National Forest Inventory werden im Umkreis von 50 m um jeden Postleitzahlen-Zentroiden gezählt und dann in ein Perzentil über englische Postleitzahlen umgerechnet. Dies ist ein Näherungswert auf Basis des Postleitzahlen-Zentroids, keine exakte Messung für Immobilie oder Straßenabschnitt.',
'Within conservation area': 'Within conservation area':
'Planning-Data-Grenzen für Erhaltungsgebiete, dem repräsentativen Punkt der Postleitzahl zugeordnet. Der nationale Datensatz wird laufend verbessert und kann Duplikate oder unvollständige lokale Abdeckung enthalten; grenznahe Entscheidungen sollten bei der lokalen Planungsbehörde geprüft werden.', 'Planning-Data-Grenzen für Erhaltungsgebiete, dem repräsentativen Punkt der Postleitzahl zugeordnet. Der nationale Datensatz wird laufend verbessert und kann Duplikate oder unvollständige lokale Abdeckung enthalten; grenznahe Entscheidungen sollten bei der lokalen Planungsbehörde geprüft werden.',
'Listed building': 'Listed building':
@ -327,8 +327,8 @@ export const details: Record<string, Record<string, string>> = {
'若实施EPC报告中建议的所有具有成本效益的改进措施后该房产的潜在能源效率等级。从A(最高效)到G(最低效)。', '若实施EPC报告中建议的所有具有成本效益的改进措施后该房产的潜在能源效率等级。从A(最高效)到G(最低效)。',
'Interior height (m)': 'Interior height (m)':
'EPC评估期间记录的平均室内净高。通过将室内总容积除以总建筑面积计算得出。', 'EPC评估期间记录的平均室内净高。通过将室内总容积除以总建筑面积计算得出。',
'Street tree density percentile': 'Tree canopy density percentile':
'基于 Forest Research 2025 年 Trees Outside Woodland 地图估算的邮编质心周边树冠覆盖率。系统会统计每个邮编质心 50 米范围内的孤立树木和树群树冠多边形,然后转换为英格兰邮编范围内的百分位。这是邮编质心近似指标,不是精确的房产或道路路段测量。', '基于 Forest Research 2025 年 Trees Outside Woodland 地图与国家森林清查(NFI)估算的邮编质心周边树冠覆盖率。系统会统计每个邮编质心 50 米范围内的孤立树木、树群以及国家森林清查林地的树冠多边形,然后转换为英格兰邮编范围内的百分位。这是邮编质心近似指标,不是精确的房产或道路路段测量。',
'Within conservation area': 'Within conservation area':
'Planning Data 保护区边界,与邮编代表点匹配。全国数据集仍在完善中,可能包含重复记录或地方覆盖不完整;涉及边界的决策应向地方规划部门核实。', 'Planning Data 保护区边界,与邮编代表点匹配。全国数据集仍在完善中,可能包含重复记录或地方覆盖不完整;涉及边界的决策应向地方规划部门核实。',
'Listed building': 'Listed building':
@ -465,8 +465,8 @@ export const details: Record<string, Record<string, string>> = {
'EPC से संभावित ऊर्जा दक्षता रेटिंग, यदि EPC रिपोर्ट में सुझाए गए सभी किफायती सुधार कर दिए जाएं. यह A (सबसे दक्ष) से G (सबसे कम दक्ष) तक होती है.', 'EPC से संभावित ऊर्जा दक्षता रेटिंग, यदि EPC रिपोर्ट में सुझाए गए सभी किफायती सुधार कर दिए जाएं. यह A (सबसे दक्ष) से G (सबसे कम दक्ष) तक होती है.',
'Interior height (m)': 'Interior height (m)':
'EPC आकलन के दौरान दर्ज औसत अंदरूनी फर्श-से-छत ऊंचाई, मीटर में. कुल आंतरिक आयतन को कुल फर्श क्षेत्र से भाग देकर निकाली जाती है.', 'EPC आकलन के दौरान दर्ज औसत अंदरूनी फर्श-से-छत ऊंचाई, मीटर में. कुल आंतरिक आयतन को कुल फर्श क्षेत्र से भाग देकर निकाली जाती है.',
'Street tree density percentile': 'Tree canopy density percentile':
'Forest Research के 2025 Trees Outside Woodland नक्शे से निकाला गया पोस्टकोड केंद्र के आसपास का अनुमानित वृक्ष आच्छादन. अकेले पेड़ों और पेड़ों के समूहों के वृक्ष-शिखर बहुभुजों को हर पोस्टकोड केंद्र से 50m के भीतर गिना जाता है, फिर इंग्लैंड के पोस्टकोडों के मुकाबले प्रतिशतक में बदला जाता है. यह पोस्टकोड-केंद्र पर आधारित अनुमानक है, किसी संपत्ति या सड़क-खंड की सटीक माप नहीं.', 'Forest Research के 2025 Trees Outside Woodland नक्शे और राष्ट्रीय वन सूची (NFI) से निकाला गया पोस्टकोड केंद्र के आसपास का अनुमानित वृक्ष आच्छादन. अकेले पेड़ों, पेड़ों के समूहों और राष्ट्रीय वन सूची की वनभूमि के वृक्ष-शिखर बहुभुजों को हर पोस्टकोड केंद्र से 50m के भीतर गिना जाता है, फिर इंग्लैंड के पोस्टकोडों के मुकाबले प्रतिशतक में बदला जाता है. यह पोस्टकोड-केंद्र पर आधारित अनुमानक है, किसी संपत्ति या सड़क-खंड की सटीक माप नहीं.',
'Within conservation area': 'Within conservation area':
'Planning Data संरक्षण क्षेत्र सीमाएं पोस्टकोड प्रतिनिधि बिंदु से मिलाई जाती हैं. राष्ट्रीय डेटासेट अभी बेहतर किया जा रहा है और इसमें डुप्लीकेट या अधूरी स्थानीय कवरेज हो सकती है; सीमा-संवेदनशील निर्णय स्थानीय योजना प्राधिकरण से जांचे जाने चाहिए.', 'Planning Data संरक्षण क्षेत्र सीमाएं पोस्टकोड प्रतिनिधि बिंदु से मिलाई जाती हैं. राष्ट्रीय डेटासेट अभी बेहतर किया जा रहा है और इसमें डुप्लीकेट या अधूरी स्थानीय कवरेज हो सकती है; सीमा-संवेदनशील निर्णय स्थानीय योजना प्राधिकरण से जांचे जाने चाहिए.',
'Listed building': 'Listed building':
@ -611,8 +611,8 @@ export const details: Record<string, Record<string, string>> = {
'Az EPC-tanúsítvány potenciális energiahatékonysági besorolása, amennyiben az EPC-jelentésben ajánlott összes költséghatékony fejlesztést elvégeznék. A-tól (leghatékonyabb) G-ig (legkevésbé hatékony) terjed.', 'Az EPC-tanúsítvány potenciális energiahatékonysági besorolása, amennyiben az EPC-jelentésben ajánlott összes költséghatékony fejlesztést elvégeznék. A-tól (leghatékonyabb) G-ig (legkevésbé hatékony) terjed.',
'Interior height (m)': 'Interior height (m)':
'Az EPC-tanúsítvány felmérése során rögzített átlagos belső padló-mennyezet magasság méterben. A teljes belső térfogatot osztják a teljes alapterülettel.', 'Az EPC-tanúsítvány felmérése során rögzített átlagos belső padló-mennyezet magasság méterben. A teljes belső térfogatot osztják a teljes alapterülettel.',
'Street tree density percentile': 'Tree canopy density percentile':
'A Forest Research 2025-os Trees Outside Woodland térképéből származó hozzávetőleges lombkorona-fedettség az irányítószám-középpont körül. A magányos fák és facsoportok lombkorona-poligonjait minden irányítószám-középpont 50 méteres körzetében számoljuk, majd az angliai irányítószámok közötti percentilissé alakítjuk. Ez az irányítószám-középponton alapuló közelítő mutató, nem pontos ingatlan- vagy utcaszakasz-mérés.', 'A Forest Research 2025-os Trees Outside Woodland térképéből és a National Forest Inventory (NFI) adataiból származó hozzávetőleges lombkorona-fedettség az irányítószám-középpont körül. A magányos fák, facsoportok és a National Forest Inventory erdőterületeinek lombkorona-poligonjait minden irányítószám-középpont 50 méteres körzetében számoljuk, majd az angliai irányítószámok közötti percentilissé alakítjuk. Ez az irányítószám-középponton alapuló közelítő mutató, nem pontos ingatlan- vagy utcaszakasz-mérés.',
'Within conservation area': 'Within conservation area':
'A Planning Data műemléki területeinek határai az irányítószám reprezentatív pontjához rendelve. Az országos adatállomány fejlesztés alatt áll, és tartalmazhat duplikátumokat vagy hiányos helyi lefedettséget; határérzékeny döntéseknél a helyi tervezési hatóság adatait kell ellenőrizni.', 'A Planning Data műemléki területeinek határai az irányítószám reprezentatív pontjához rendelve. Az országos adatállomány fejlesztés alatt áll, és tartalmazhat duplikátumokat vagy hiányos helyi lefedettséget; határérzékeny döntéseknél a helyi tervezési hatóság adatait kell ellenőrizni.',
'Listed building': 'Listed building':

View file

@ -1459,7 +1459,7 @@ const de: Translations = {
'Current energy rating': 'Aktuelle Energieeffizienzklasse', 'Current energy rating': 'Aktuelle Energieeffizienzklasse',
'Potential energy rating': 'Mögliche Energieeffizienzklasse', 'Potential energy rating': 'Mögliche Energieeffizienzklasse',
'Interior height (m)': 'Raumhöhe (m)', 'Interior height (m)': 'Raumhöhe (m)',
'Street tree density percentile': 'Perzentil der Straßenbaumdichte', 'Tree canopy density percentile': 'Perzentil der Baumkronendichte',
'Within conservation area': 'In Denkmalschutzgebiet', 'Within conservation area': 'In Denkmalschutzgebiet',
'Listed building': 'Denkmalgeschütztes Gebäude', 'Listed building': 'Denkmalgeschütztes Gebäude',

View file

@ -1436,7 +1436,7 @@ const en = {
'Current energy rating': 'Current energy rating', 'Current energy rating': 'Current energy rating',
'Potential energy rating': 'Potential energy rating', 'Potential energy rating': 'Potential energy rating',
'Interior height (m)': 'Interior height (m)', 'Interior height (m)': 'Interior height (m)',
'Street tree density percentile': 'Street tree density percentile', 'Tree canopy density percentile': 'Tree canopy density percentile',
'Within conservation area': 'Within conservation area', 'Within conservation area': 'Within conservation area',
'Listed building': 'Listed building', 'Listed building': 'Listed building',

View file

@ -1479,7 +1479,7 @@ const fr: Translations = {
'Current energy rating': 'Classe énergétique actuelle', 'Current energy rating': 'Classe énergétique actuelle',
'Potential energy rating': 'Classe énergétique potentielle', 'Potential energy rating': 'Classe énergétique potentielle',
'Interior height (m)': 'Hauteur intérieure (m)', 'Interior height (m)': 'Hauteur intérieure (m)',
'Street tree density percentile': 'Percentile de densité arborée de la rue', 'Tree canopy density percentile': 'Percentile de densité de couvert arboré',
'Within conservation area': 'Dans une zone de conservation', 'Within conservation area': 'Dans une zone de conservation',
'Listed building': 'Bâtiment classé', 'Listed building': 'Bâtiment classé',

View file

@ -1390,7 +1390,7 @@ const hi: Translations = {
'Current energy rating': 'मौजूदा ऊर्जा रेटिंग', 'Current energy rating': 'मौजूदा ऊर्जा रेटिंग',
'Potential energy rating': 'संभावित ऊर्जा रेटिंग', 'Potential energy rating': 'संभावित ऊर्जा रेटिंग',
'Interior height (m)': 'भीतरी ऊँचाई (मी)', 'Interior height (m)': 'भीतरी ऊँचाई (मी)',
'Street tree density percentile': 'सड़क वृक्ष घनत्व प्रतिशतक', 'Tree canopy density percentile': 'वृक्ष आच्छादन घनत्व प्रतिशतक',
'Within conservation area': 'संरक्षण क्षेत्र में', 'Within conservation area': 'संरक्षण क्षेत्र में',
'Listed building': 'सूचीबद्ध भवन', 'Listed building': 'सूचीबद्ध भवन',
'Travel time to nearest train or tube station (min)': 'Travel time to nearest train or tube station (min)':

View file

@ -1462,7 +1462,7 @@ const hu: Translations = {
'Current energy rating': 'Jelenlegi energetikai besorolás', 'Current energy rating': 'Jelenlegi energetikai besorolás',
'Potential energy rating': 'Lehetséges energetikai besorolás', 'Potential energy rating': 'Lehetséges energetikai besorolás',
'Interior height (m)': 'Belmagasság (m)', 'Interior height (m)': 'Belmagasság (m)',
'Street tree density percentile': 'Utcai fasűrűségi percentilis', 'Tree canopy density percentile': 'Lombkorona-sűrűségi percentilis',
'Within conservation area': 'Műemlékvédelmi területen', 'Within conservation area': 'Műemlékvédelmi területen',
'Listed building': 'Műemlék épület', 'Listed building': 'Műemlék épület',

View file

@ -1382,7 +1382,7 @@ const zh: Translations = {
'Current energy rating': '当前能源评级', 'Current energy rating': '当前能源评级',
'Potential energy rating': '潜在能源评级', 'Potential energy rating': '潜在能源评级',
'Interior height (m)': '室内层高(米)', 'Interior height (m)': '室内层高(米)',
'Street tree density percentile': '街道树木覆盖率百分位', 'Tree canopy density percentile': '树冠覆盖密度百分位',
'Within conservation area': '位于保护区内', 'Within conservation area': '位于保护区内',
'Listed building': '登录建筑', 'Listed building': '登录建筑',

View file

@ -102,7 +102,7 @@ const FEATURE_ICON_PATHS: Record<string, ReactNode> = {
<polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2" /> <polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2" />
</> </>
), ),
'Street tree density percentile': ( 'Tree canopy density percentile': (
<> <>
<path d="M12 22V12" /> <path d="M12 22V12" />
<path d="M6 22h12" /> <path d="M6 22h12" />

View file

@ -1,4 +1,22 @@
"""Download Census 2021 ethnic group (TS021) by LSOA.
Downloads the 20-category ethnic-group breakdown (TS021, classification
C2021_ETH_20) from the NOMIS API at LSOA 2021 granularity, folds the 19 detailed
leaf categories into our 6 output buckets, and emits one row per LSOA with the
percentage in each bucket.
Sourcing at LSOA (~33,755 England areas) rather than Local Authority (~319) is a
~100x granularity gain with no change to the 6-bucket output schema: two very
different neighbourhoods in one borough no longer share an identical ethnicity
profile. The join key downstream (merge.py) is `lsoa21`, the same key already
used for median age and IoD.
Source: NOMIS (ONS Census 2021 TS021 dataset, NM_2041_1)
License: Open Government Licence v3.0
"""
import argparse import argparse
from io import BytesIO
from pathlib import Path from pathlib import Path
import httpx import httpx
@ -6,143 +24,168 @@ import polars as pl
pl.Config.set_tbl_cols(-1) pl.Config.set_tbl_cols(-1)
# NOMIS API: Census 2021 TS021 (ethnic group, 20 categories) by LSOA 2021
# (TYPE151). c2021_eth_20=1..19 selects the 19 detailed leaf categories
# (excluding the 5 broad aggregates 1001-1005 and the 0 = Total, which we
# re-derive ourselves). measures=20100 selects the absolute count.
BASE_URL = (
"https://www.nomisweb.co.uk/api/v01/dataset/NM_2041_1.data.csv"
"?geography=TYPE151"
"&c2021_eth_20=1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19"
"&measures=20100"
"&select=GEOGRAPHY_CODE,C2021_ETH_20_NAME,OBS_VALUE"
)
PAGE_SIZE = 25000
URL = "https://www.ethnicity-facts-figures.service.gov.uk/uk-population-by-ethnicity/national-and-regional-populations/regional-ethnic-diversity/latest/downloads/population-by-ethnicity-and-local-authority-2021.csv" # Map the 19 detailed NOMIS C2021_ETH_20 leaf categories to our 6 output groups.
# The split mirrors the previous Local-Authority source exactly:
GEOGRAPHY_CODE_REPLACEMENTS = { # * "Other Asian" routes to East/SE Asian (not South Asian). The ONS "Other
# 2023 Cumberland unitary authority # Asian" bucket is predominantly East/Southeast Asian (Filipino, Vietnamese,
"E07000026": "E06000063", # Allerdale # Thai, Japanese, Korean, ...) rather than South Asian, so routing it here
"E07000028": "E06000063", # Carlisle # avoids inflating "% South Asian". The split is approximate (the bucket also
"E07000029": "E06000063", # Copeland # holds some South Asian groups such as Sri Lankan/Nepalese).
# 2023 Westmorland and Furness unitary authority GROUP_MAP = {
"E07000027": "E06000064", # Barrow-in-Furness # White
"E07000030": "E06000064", # Eden "White: English, Welsh, Scottish, Northern Irish or British": "White",
"E07000031": "E06000064", # South Lakeland "White: Irish": "White",
# 2023 North Yorkshire unitary authority "White: Gypsy or Irish Traveller": "White",
"E07000163": "E06000065", # Craven "White: Roma": "White",
"E07000164": "E06000065", # Hambleton "White: Other White": "White",
"E07000165": "E06000065", # Harrogate # South Asian
"E07000166": "E06000065", # Richmondshire "Asian, Asian British or Asian Welsh: Indian": "South Asian",
"E07000167": "E06000065", # Ryedale "Asian, Asian British or Asian Welsh: Pakistani": "South Asian",
"E07000168": "E06000065", # Scarborough "Asian, Asian British or Asian Welsh: Bangladeshi": "South Asian",
"E07000169": "E06000065", # Selby # East / Southeast Asian
# 2023 Somerset unitary authority "Asian, Asian British or Asian Welsh: Chinese": "East/SE Asian",
"E07000187": "E06000066", # Mendip "Asian, Asian British or Asian Welsh: Other Asian": "East/SE Asian",
"E07000188": "E06000066", # Sedgemoor # Black
"E07000189": "E06000066", # South Somerset "Black, Black British, Black Welsh, Caribbean or African: African": "Black",
"E07000246": "E06000066", # Somerset West and Taunton "Black, Black British, Black Welsh, Caribbean or African: Caribbean": "Black",
"Black, Black British, Black Welsh, Caribbean or African: Other Black": "Black",
# Mixed
"Mixed or Multiple ethnic groups: White and Asian": "Mixed",
"Mixed or Multiple ethnic groups: White and Black African": "Mixed",
"Mixed or Multiple ethnic groups: White and Black Caribbean": "Mixed",
"Mixed or Multiple ethnic groups: Other Mixed or Multiple ethnic groups": "Mixed",
# Other
"Other ethnic group: Arab": "Other",
"Other ethnic group: Any other ethnic group": "Other",
} }
# The 6 output groups, in a fixed order so the largest-remainder rounding below
# is deterministic regardless of pivot column ordering.
OUTPUT_GROUPS = ["White", "South Asian", "East/SE Asian", "Black", "Mixed", "Other"]
assert set(GROUP_MAP.values()) == set(OUTPUT_GROUPS), (
"GROUP_MAP values must be exactly the OUTPUT_GROUPS"
)
def _ethnicity_percentages(df: pl.DataFrame) -> pl.DataFrame: def _ethnicity_percentages(df: pl.DataFrame) -> pl.DataFrame:
# Use the detailed 19+1 breakdown to get sub-categories for Asian ethnicity, """Fold the 19 NOMIS leaf categories into 6-bucket percentages per LSOA.
# then aggregate back to the broad groups plus a South Asian / East/SE Asian
# split (Indian/Pakistani/Bangladeshi vs Chinese + other East/SE Asian). `df` is the long-format NOMIS download with columns GEOGRAPHY_CODE,
detailed = df.filter( C2021_ETH_20_NAME (the detailed leaf label) and OBS_VALUE (a count). A
(pl.col("Ethnicity_type") == "ONS 2021 19+1") & (pl.col("Ethnicity") != "All") missing/extra/relabelled leaf category would silently drop people from the
denominator, so we validate the category set against GROUP_MAP first and
fail loudly otherwise.
"""
found = set(df["C2021_ETH_20_NAME"].unique().to_list())
expected = set(GROUP_MAP)
if found != expected:
missing = sorted(expected - found)
unexpected = sorted(found - expected)
raise ValueError(
"Census ethnic-group categories do not match the expected NOMIS "
"TS021 C2021_ETH_20 leaf set.\n"
f" expected {len(expected)} categories, found {len(found)}\n"
f" missing: {missing}\n"
f" unexpected: {unexpected}\n"
"Refusing to compute percentages against an unrecognised breakdown."
)
# Map each leaf to its output group and sum counts per (LSOA, group). Summing
# counts (not rounded percentages) keeps the denominator exact.
grouped = (
df.with_columns(
pl.col("C2021_ETH_20_NAME").replace_strict(GROUP_MAP).alias("group"),
pl.col("OBS_VALUE").cast(pl.Float64, strict=False).alias("_count"),
)
.group_by("GEOGRAPHY_CODE", "group")
.agg(pl.col("_count").sum())
)
wide = grouped.pivot(on="group", index="GEOGRAPHY_CODE", values="_count").rename(
{"GEOGRAPHY_CODE": "lsoa21"}
) )
# Map detailed categories to our output groups # A group with no people in an LSOA is absent from the long rows, so the pivot
group_map = { # leaves a null; treat it as 0 before normalising.
# White wide = wide.with_columns(pl.col(OUTPUT_GROUPS).fill_null(0.0))
"White British": "White",
"White Irish": "White",
"Gypsy Or Irish Traveller": "White",
"Roma": "White",
"Any Other White Background": "White",
# South Asian
"Indian": "South Asian",
"Pakistani": "South Asian",
"Bangladeshi": "South Asian",
# East / Southeast Asian. The ONS "Any Other Asian Background" bucket is
# predominantly East/Southeast Asian (Filipino, Vietnamese, Thai,
# Japanese, Korean, ...) rather than South Asian, so route it here rather
# than inflating "% South Asian". The split is approximate (the ONS
# bucket also holds some South Asian groups such as Sri Lankan/Nepalese).
"Chinese": "East/SE Asian",
"Any Other Asian Background": "East/SE Asian",
# Black
"Black African": "Black",
"Black Caribbean": "Black",
"Any Other Black Background": "Black",
# Mixed
"Mixed White And Asian": "Mixed",
"Mixed White And Black African": "Mixed",
"Mixed White And Black Caribbean": "Mixed",
"Any Other Mixed/Multiple Ethnic Background": "Mixed",
# Other
"Arab": "Other",
"Any Other Ethnic Background": "Other",
}
detailed = detailed.with_columns( # Normalize so each row sums to exactly 100%, then round with the
pl.col("Ethnicity").replace_strict(group_map).alias("group"), # largest-remainder method to preserve the sum. Independent rounding of 6
pl.col("Geography_code") # values can drift +/-0.3.
.replace(GEOGRAPHY_CODE_REPLACEMENTS) row_total = sum(pl.col(c) for c in OUTPUT_GROUPS)
.alias("output_geography_code"),
pl.col("Ethnic Population").cast(pl.Float64, strict=False).alias("_population"),
)
# Sum counts, not rounded percentages, so old districts can be safely
# recombined into their current unitary authorities.
grouped = detailed.group_by("output_geography_code", "group").agg(
pl.col("_population").sum()
)
wide = grouped.pivot(
on="group", index="output_geography_code", values="_population"
).rename({"output_geography_code": "Geography_code"})
# Normalize so each row sums to exactly 100%, then round using largest-remainder
# method to preserve the sum. Independent rounding of 6 values can drift ±0.3.
group_cols = [c for c in wide.columns if c != "Geography_code"]
row_total = sum(pl.col(c) for c in group_cols)
# Scale each group so they sum to exactly 100
wide = wide.with_columns( wide = wide.with_columns(
[(pl.col(c) / row_total * 100.0).alias(c) for c in group_cols] [(pl.col(c) / row_total * 100.0).alias(c) for c in OUTPUT_GROUPS]
) )
# Round to 1 decimal, then adjust the largest group to absorb residual # Round to 1 decimal, then adjust the largest group to absorb the residual.
rounded_cols = [pl.col(c).round(1).alias(c) for c in group_cols] wide = wide.with_columns([pl.col(c).round(1).alias(c) for c in OUTPUT_GROUPS])
wide = wide.with_columns(rounded_cols) rounded_sum = sum(pl.col(c) for c in OUTPUT_GROUPS)
rounded_sum = sum(pl.col(c) for c in group_cols)
residual = (100.0 - rounded_sum).round(1) residual = (100.0 - rounded_sum).round(1)
# Find which group is largest per row and add the residual there largest_col = pl.concat_list(OUTPUT_GROUPS).list.arg_max()
largest_col = pl.concat_list(group_cols).list.arg_max()
wide = wide.with_columns( wide = wide.with_columns(
[ [
pl.when(largest_col == i) pl.when(largest_col == i)
.then(pl.col(c) + residual) .then(pl.col(c) + residual)
.otherwise(pl.col(c)) .otherwise(pl.col(c))
.alias(c) .alias(c)
for i, c in enumerate(group_cols) for i, c in enumerate(OUTPUT_GROUPS)
] ]
) )
# Rename columns to be descriptive rename_map = {col: f"% {col}" for col in OUTPUT_GROUPS}
rename_map = {col: f"% {col}" for col in wide.columns if col != "Geography_code"} return wide.rename(rename_map)
wide = wide.rename(rename_map)
return wide
def download_and_convert(output_path: Path) -> None:
    """Download Census 2021 ethnic group (TS021) by LSOA from NOMIS and save it as parquet.

    The NOMIS CSV endpoint is paged: ``BASE_URL`` is requested repeatedly with
    an increasing ``recordoffset`` (in steps of ``PAGE_SIZE``). Paging stops at
    the first empty response body, zero-row CSV, or page shorter than
    ``PAGE_SIZE``. The combined rows are restricted to geography codes starting
    with "E", converted by ``_ethnicity_percentages`` and written as a
    zstd-compressed parquet file.

    Args:
        output_path: Destination parquet path; missing parent directories are
            created.

    Raises:
        httpx.HTTPStatusError: if any page request returns an error status.
        ValueError: if no rows were downloaded at all.
    """
    print("Downloading Census 2021 ethnic group (TS021) by LSOA from NOMIS...")
    frames = []
    offset = 0
    while True:
        url = f"{BASE_URL}&recordoffset={offset}"
        response = httpx.get(url, follow_redirects=True, timeout=120)
        response.raise_for_status()
        if len(response.content) == 0:
            break
        chunk = pl.read_csv(BytesIO(response.content))
        if chunk.height == 0:
            break
        frames.append(chunk)
        print(f" Fetched {chunk.height} rows (offset={offset})")
        # A short page is the last page; no need for another round trip.
        if chunk.height < PAGE_SIZE:
            break
        offset += PAGE_SIZE

    # Fail with a clear message instead of letting pl.concat choke on an empty
    # list when the very first page came back empty.
    if not frames:
        raise ValueError(
            "NOMIS returned no TS021 rows; nothing to convert "
            f"(first request: {BASE_URL}&recordoffset=0)"
        )

    df = pl.concat(frames)
    print(f"Total rows: {df.height}")

    # Filter to England only (E-prefixed LSOA codes); the merge joins on the
    # English postcode universe and the IoD coverage check is England-wide.
    df = df.filter(pl.col("GEOGRAPHY_CODE").str.starts_with("E"))

    wide = _ethnicity_percentages(df)
    print(f"England LSOAs: {wide.height}")
    print(f"Columns: {wide.columns}")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    wide.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")
def main() -> None: def main() -> None:
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Download and convert ethnicity by local authority data" description="Download Census 2021 ethnic group (TS021) by LSOA"
) )
parser.add_argument( parser.add_argument(
"--output", type=Path, required=True, help="Output parquet file path" "--output", type=Path, required=True, help="Output parquet file path"

View file

@ -192,6 +192,10 @@ def _read_csv_from_zip(zip_bytes: bytes) -> pl.DataFrame:
infer_schema_length=20000, infer_schema_length=20000,
null_values=_NULL_VALUES, null_values=_NULL_VALUES,
truncate_ragged_lines=True, truncate_ragged_lines=True,
# Force the phone number to stay a string: schema inference reads it as
# an integer and strips the leading 0 (e.g. 020 8427 7222 -> 2084277222),
# making nearly every school phone number un-diallable.
schema_overrides={"TelephoneNum": pl.String},
) )

View file

@ -1,65 +1,118 @@
import polars as pl import polars as pl
import pytest
from pipeline.download.ethnicity import _ethnicity_percentages from pipeline.download.ethnicity import GROUP_MAP, OUTPUT_GROUPS, _ethnicity_percentages
def test_ethnicity_percentages_recombines_predecessor_lads_by_population(): def _long_rows(geo: str, counts: dict[str, int]) -> list[dict]:
rows = [] """Build NOMIS-shaped long rows for one LSOA from {leaf_label: count}.
for code, white, indian in [
("E07000026", 80, 20),
("E07000028", 10, 90),
]:
total = white + indian
rows.extend(
[
{
"Geography_code": code,
"Ethnicity_type": "ONS 2021 19+1",
"Ethnicity": "White British",
"Ethnic Population": white,
"Value1": white / total * 100,
},
{
"Geography_code": code,
"Ethnicity_type": "ONS 2021 19+1",
"Ethnicity": "Indian",
"Ethnic Population": indian,
"Value1": indian / total * 100,
},
]
)
result = _ethnicity_percentages(pl.DataFrame(rows)) Every one of the 19 leaf categories must be present in the download (NOMIS
emits a 0-count row when an LSOA has none), so categories not given default
cumberland = result.filter(pl.col("Geography_code") == "E06000063") to 0 to mirror that.
assert cumberland.select("% White", "% South Asian").to_dicts() == [ """
{"% White": 45.0, "% South Asian": 55.0} return [
]
def test_ethnicity_routes_any_other_asian_to_east_se_asian():
"""'Any Other Asian Background' and 'Chinese' both fold into '% East/SE Asian'
(not '% South Asian'), fixing the East/SE Asian undercount."""
rows = [
{ {
"Geography_code": "E06000001", "GEOGRAPHY_CODE": geo,
"Ethnicity_type": "ONS 2021 19+1", "C2021_ETH_20_NAME": label,
"Ethnicity": ethnicity, "OBS_VALUE": counts.get(label, 0),
"Ethnic Population": pop,
"Value1": 0.0,
} }
for ethnicity, pop in [ for label in GROUP_MAP
("Chinese", 30),
("Any Other Asian Background", 20),
("Indian", 50),
]
] ]
result = _ethnicity_percentages(pl.DataFrame(rows))
area = result.filter(pl.col("Geography_code") == "E06000001") def test_ethnicity_percentages_keyed_by_lsoa_with_six_buckets():
df = pl.DataFrame(
_long_rows(
"E01000001",
{
"White: English, Welsh, Scottish, Northern Irish or British": 60,
"White: Other White": 10,
"Asian, Asian British or Asian Welsh: Indian": 20,
"Black, Black British, Black Welsh, Caribbean or African: African": 10,
},
)
)
result = _ethnicity_percentages(df)
assert result.columns[0] == "lsoa21"
assert set(result.columns) == {"lsoa21", *(f"% {g}" for g in OUTPUT_GROUPS)}
row = result.filter(pl.col("lsoa21") == "E01000001").to_dicts()[0]
assert row["% White"] == 70.0
assert row["% South Asian"] == 20.0
assert row["% Black"] == 10.0
# Percentages always sum to exactly 100 (largest-remainder rounding).
assert round(sum(row[f"% {g}"] for g in OUTPUT_GROUPS), 1) == 100.0
def test_ethnicity_routes_other_asian_to_east_se_asian():
"""'Other Asian' and 'Chinese' both fold into '% East/SE Asian' (not
'% South Asian'), preserving the East/SE Asian split from the LAD source."""
df = pl.DataFrame(
_long_rows(
"E01000002",
{
"Asian, Asian British or Asian Welsh: Chinese": 30,
"Asian, Asian British or Asian Welsh: Other Asian": 20,
"Asian, Asian British or Asian Welsh: Indian": 50,
},
)
)
result = _ethnicity_percentages(df)
area = result.filter(pl.col("lsoa21") == "E01000002")
assert "% East/SE Asian" in result.columns assert "% East/SE Asian" in result.columns
assert "% East Asian" not in result.columns assert "% East Asian" not in result.columns
assert area.select("% East/SE Asian", "% South Asian").to_dicts() == [ assert area.select("% East/SE Asian", "% South Asian").to_dicts() == [
{"% East/SE Asian": 50.0, "% South Asian": 50.0} {"% East/SE Asian": 50.0, "% South Asian": 50.0}
] ]
def test_ethnicity_percentages_independent_per_lsoa():
    """Each LSOA is profiled on its own: two LSOAs in the same frame must come
    out with separate percentage rows — the LSOA granularity is the point."""
    counts_by_lsoa = {
        "E01000010": {"White: Other White": 100},
        "E01000011": {"Asian, Asian British or Asian Welsh: Pakistani": 100},
    }
    df = pl.concat(
        [
            pl.DataFrame(_long_rows(geo, counts))
            for geo, counts in counts_by_lsoa.items()
        ]
    )

    # Sort so the row order matches the LSOA codes above.
    profiles = _ethnicity_percentages(df).sort("lsoa21")
    white_share = profiles["% White"].to_list()
    south_asian_share = profiles["% South Asian"].to_list()

    assert white_share == [100.0, 0.0]
    assert south_asian_share == [0.0, 100.0]
def test_ethnicity_percentages_rejects_unexpected_category():
    """A category label outside the expected set must raise ValueError."""
    unexpected_row = {
        "GEOGRAPHY_CODE": "E01000003",
        "C2021_ETH_20_NAME": "White: A Brand New Census Category",
        "OBS_VALUE": 5,
    }
    # Full set of expected leaf rows plus one extra, unknown label.
    rows = [*_long_rows("E01000003", {"White: Other White": 10}), unexpected_row]

    with pytest.raises(ValueError, match="do not match the expected"):
        _ethnicity_percentages(pl.DataFrame(rows))
def test_ethnicity_percentages_rejects_missing_category():
    """Omitting one leaf category from the input must raise ValueError."""
    # Drop one leaf entirely: its people would vanish from the denominator.
    dropped_label = "Other ethnic group: Arab"
    rows = []
    for row in _long_rows("E01000004", {"White: Other White": 10}):
        if row["C2021_ETH_20_NAME"] == dropped_label:
            continue
        rows.append(row)

    with pytest.raises(ValueError, match="missing"):
        _ethnicity_percentages(pl.DataFrame(rows))

View file

@ -1011,11 +1011,6 @@ def main() -> None:
action="store_true", action="store_true",
help="Skip TfL TransXChange download and conversion", help="Skip TfL TransXChange download and conversion",
) )
parser.add_argument(
"--skip-national-rail",
action="store_true",
help="Skip National Rail CIF download and conversion",
)
args = parser.parse_args() args = parser.parse_args()
output_dir: Path = args.output output_dir: Path = args.output
@ -1039,13 +1034,20 @@ def main() -> None:
download_tfl_transxchange(raw_dir) download_tfl_transxchange(raw_dir)
convert_tfl_to_gtfs(raw_dir, output_dir) convert_tfl_to_gtfs(raw_dir, output_dir)
# 3. National Rail CIF → GTFS # 3. National Rail CIF → GTFS. Heavy rail is mandatory: trains are how people
if args.skip_national_rail: # reach the ~2,725 railway-station destinations, so a bus/TfL-only network
print("Skipping National Rail (--skip-national-rail)") # silently overstates every train commute. Missing credentials are a HARD
else: # error, so a rail-less network can never ship.
cif = download_national_rail_cif(raw_dir) cif = download_national_rail_cif(raw_dir)
if cif is not None: if cif is None:
convert_national_rail_to_gtfs(raw_dir, output_dir) raise RuntimeError(
"National Rail timetable was not downloaded — set "
"NATIONAL_RAIL_EMAIL / NATIONAL_RAIL_PASSWORD (register free at "
"https://opendata.nationalrail.co.uk/). National Rail heavy rail is "
"required; without it the transit network models every train journey "
"as bus-only and overstates commute times."
)
convert_national_rail_to_gtfs(raw_dir, output_dir)
# Summary # Summary
print() print()

View file

@ -273,27 +273,24 @@ def _write_avg_yr(
for type_idx, name in enumerate(ALL_CRIME_TYPES): for type_idx, name in enumerate(ALL_CRIME_TYPES):
data[f"{name} (avg/yr)"] = avg[:, type_idx] data[f"{name} (avg/yr)"] = avg[:, type_idx]
# Serious/Minor rollup headlines, computed the SAME way as the by-year rollup # Serious/Minor rollup headlines = the exact SUM of their component (avg/yr)
# bars (_write_by_year/_rollup_long): sum the rollup's types per year, then # columns, so each rollup always equals the sum of the parts shown beside it
# average over the years in which ANY of those types occurred. This keeps the # and can never fall below one of its own components. (Previously the rollup
# headline equal to the mean of the "Serious/Minor crime (by year)" bars. # re-derived a union-years-present mean: it divided the summed counts by the
# Summing the per-type avg/yr values instead (as the merge previously did) # number of years in which ANY component type occurred, whereas each
# divides each type by its OWN years-present and overstates the rollup when a # component divides by its OWN years-present. When a postcode's serious/minor
# postcode's serious/minor types occur in disjoint years. # types occurred in disjoint years the union denominator was larger, so the
# rollup came out smaller than the sum of its parts.) The by-year rollup
# series in _write_by_year is likewise the per-year sum of the component
# bars, so headline and chart both present the rollup as the sum of its parts.
for rollup_name, rollup_types in ( for rollup_name, rollup_types in (
("Serious crime", SERIOUS_CRIME_TYPES), ("Serious crime", SERIOUS_CRIME_TYPES),
("Minor crime", MINOR_CRIME_TYPES), ("Minor crime", MINOR_CRIME_TYPES),
): ):
rollup_idx = [ALL_CRIME_TYPES.index(name) for name in rollup_types] rollup_idx = [ALL_CRIME_TYPES.index(name) for name in rollup_types]
rollup_counts = counts[:, rollup_idx, :].sum(axis=1) # (n_postcodes, n_years) data[f"{rollup_name} (avg/yr)"] = np.round(
rollup_per_year = per_year[:, rollup_idx, :].sum(axis=1) avg[:, rollup_idx].sum(axis=1), 1
rollup_years_present = np.clip( ).astype(np.float32)
(rollup_counts > 0).sum(axis=1), 1, None
).astype(np.float64)
rollup_avg = rollup_per_year.sum(axis=1) / rollup_years_present
data[f"{rollup_name} (avg/yr)"] = np.round(rollup_avg * norm, 1).astype(
np.float32
)
output_path.parent.mkdir(parents=True, exist_ok=True) output_path.parent.mkdir(parents=True, exist_ok=True)
pl.DataFrame(data).write_parquet(output_path, compression="zstd") pl.DataFrame(data).write_parquet(output_path, compression="zstd")

View file

@ -36,6 +36,16 @@ MIN_PRICE = 10_000
MIN_BUILD_YEAR = 1700 MIN_BUILD_YEAR = 1700
MAX_BUILD_YEAR = 2030 MAX_BUILD_YEAR = 2030
# Plausibility bounds for raw EPC dimensions. EPC lodgements contain data-entry
# errors (0 m storey heights, 116 m "interior height", 9,210 m² floor areas, 99
# habitable rooms) that otherwise propagate verbatim into the published per-
# property columns. Values outside these bands are nulled (treated as unknown)
# rather than shown. Bounds are deliberately wide so only clear errors are cut.
MIN_FLOOR_HEIGHT_M = 1.5 # below this a storey is not habitable
MAX_FLOOR_HEIGHT_M = 6.0 # above this is a data error, not a normal storey
MAX_TOTAL_FLOOR_AREA_M2 = 2000.0 # ~21,500 sqft; larger is a bulk/garbage record
MAX_HABITABLE_ROOMS = 20 # dwellings above this are data errors
def epc_band_to_year(band: pl.Expr) -> pl.Expr: def epc_band_to_year(band: pl.Expr) -> pl.Expr:
"""Map an EPC construction age band to a single representative build year. """Map an EPC construction age band to a single representative build year.
@ -132,10 +142,28 @@ def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame:
) )
.filter(pl.col("epc_address").is_not_null()) .filter(pl.col("epc_address").is_not_null())
.with_columns( .with_columns(
pl.when(pl.col("number_habitable_rooms") == 0) # Null implausible EPC dimensions so data-entry errors don't reach
.then(None) # the published per-property columns (Interior height, Total floor
.otherwise(pl.col("number_habitable_rooms")) # area, Number of bedrooms & living rooms). Treated as unknown.
pl.when(
(pl.col("number_habitable_rooms") >= 1)
& (pl.col("number_habitable_rooms") <= MAX_HABITABLE_ROOMS)
)
.then(pl.col("number_habitable_rooms"))
.otherwise(None)
.alias("number_habitable_rooms"), .alias("number_habitable_rooms"),
pl.when(
pl.col("floor_height").is_between(
MIN_FLOOR_HEIGHT_M, MAX_FLOOR_HEIGHT_M
)
)
.then(pl.col("floor_height"))
.otherwise(None)
.alias("floor_height"),
pl.when(pl.col("total_floor_area") <= MAX_TOTAL_FLOOR_AREA_M2)
.then(pl.col("total_floor_area"))
.otherwise(None)
.alias("total_floor_area"),
) )
) )

View file

@ -2,6 +2,7 @@ import argparse
import re import re
import tempfile import tempfile
from dataclasses import dataclass from dataclasses import dataclass
from datetime import date
from typing import Literal from typing import Literal
import numpy as np import numpy as np
@ -30,7 +31,10 @@ from pipeline.utils.postcode_mapping import build_postcode_mapping
MIN_FLOOR_AREA_M2 = 10 MIN_FLOOR_AREA_M2 = 10
CONSERVATION_AREA_FEATURE = "Within conservation area" CONSERVATION_AREA_FEATURE = "Within conservation area"
TREE_DENSITY_FEATURE = "Street tree density percentile" # Named "Tree canopy" (not "Street tree") because the underlying density unions
# Forest Research TOW lone-tree/group crowns AND NFI woodland canopy, so a
# woodland-edge postcode's score reflects forest canopy, not only street trees.
TREE_DENSITY_FEATURE = "Tree canopy density percentile"
LISTED_BUILDING_FEATURE = "Listed building" LISTED_BUILDING_FEATURE = "Listed building"
LISTED_BUILDING_MATCH_RADIUS_M = 250.0 LISTED_BUILDING_MATCH_RADIUS_M = 250.0
LISTED_BUILDING_NEAREST_POSTCODES = 3 LISTED_BUILDING_NEAREST_POSTCODES = 3
@ -528,10 +532,22 @@ def _is_planning_conservation_area_record(dataset: object) -> bool:
def _is_current_planning_record(end_date: object) -> bool: def _is_current_planning_record(end_date: object) -> bool:
"""A planning record is current when it has no end-date OR its end-date is
still in the future. The planning.data.gov.uk `end-date` field marks when a
designation is RETIRED, so a future date (e.g. 2029-12-31) is a still-current
area and must NOT be dropped the previous "any non-empty date = ended"
logic wrongly excluded those (e.g. 22 current Gateshead conservation areas)."""
if end_date is None: if end_date is None:
return True return True
if isinstance(end_date, str): if isinstance(end_date, str):
return end_date.strip() == "" text = end_date.strip()
if text == "":
return True
try:
return date.fromisoformat(text[:10]) > date.today()
except ValueError:
# Unparseable end-date: keep the record rather than silently drop it.
return True
return False return False
@ -706,8 +722,32 @@ def _tree_density_by_postcode(tree_density_postcodes_path: Path) -> pl.LazyFrame
) )
def _validate_lsoa_source_coverage(iod_path: Path, ethnicity_path: Path) -> None:
    """Raise if any LSOA listed in the IoD table has no ethnicity row.

    Ethnicity is keyed by `lsoa21` and joined on that key, so an IoD LSOA that
    is absent from the ethnicity table would silently null the ethnicity
    columns for its postcodes. Full coverage is required instead.

    Args:
        iod_path: Parquet file with an "LSOA code (2021)" column.
        ethnicity_path: Parquet file with an "lsoa21" column.

    Raises:
        ValueError: if at least one IoD LSOA is missing from the ethnicity
            table; the message carries the count and up to ten examples.
    """
    lsoa_key = "lsoa21"
    iod_codes = pl.read_parquet(iod_path, columns=["LSOA code (2021)"]).rename(
        {"LSOA code (2021)": lsoa_key}
    )
    covered_codes = pl.read_parquet(ethnicity_path, columns=[lsoa_key])

    # Anti-join keeps only the IoD codes with no match in the ethnicity table.
    uncovered = iod_codes.join(covered_codes, on=lsoa_key, how="anti").sort(lsoa_key)
    if uncovered.height == 0:
        return

    examples = uncovered.head(10).to_dicts()
    raise ValueError(
        "Ethnicity data is missing LSOA coverage: "
        f"{uncovered.height} LSOAs, e.g. "
        f"{examples}"
    )
def _validate_lad_source_coverage( def _validate_lad_source_coverage(
iod_path: Path, ethnicity_path: Path, rental_prices_path: Path iod_path: Path, rental_prices_path: Path
) -> None: ) -> None:
iod_lads = ( iod_lads = (
pl.read_parquet( pl.read_parquet(
@ -726,16 +766,6 @@ def _validate_lad_source_coverage(
.unique(["lad"]) .unique(["lad"])
) )
ethnicity_lads = pl.read_parquet(ethnicity_path, columns=["Geography_code"]).rename(
{"Geography_code": "lad"}
)
missing_ethnicity = iod_lads.join(ethnicity_lads, on="lad", how="anti").sort("lad")
if missing_ethnicity.height > 0:
raise ValueError(
"Ethnicity data is missing 2024 LAD coverage: "
f"{missing_ethnicity.to_dicts()}"
)
rental_lads = pl.read_parquet(rental_prices_path, columns=["area_code"]).rename( rental_lads = pl.read_parquet(rental_prices_path, columns=["area_code"]).rename(
{"area_code": "lad"} {"area_code": "lad"}
) )
@ -849,12 +879,10 @@ def _join_area_side_tables(
broadband: pl.LazyFrame, broadband: pl.LazyFrame,
) -> pl.LazyFrame: ) -> pl.LazyFrame:
base = base.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left") base = base.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
base = base.join( # Ethnicity is Census 2021 TS021 at LSOA (~33,755 areas), joined on the same
ethnicity, # `lsoa21` key as median age and IoD — a ~100x granularity gain over the old
left_on="Local Authority District code (2024)", # Local-Authority broadcast, with no change to the 6-bucket output schema.
right_on="Geography_code", base = base.join(ethnicity, on="lsoa21", how="left")
how="left",
)
# Crime is counted spatially per postcode (incidents within 50m of the # Crime is counted spatially per postcode (incidents within 50m of the
# postcode boundary), so it joins on postcode rather than LSOA. crime_spatial # postcode boundary), so it joins on postcode rather than LSOA. crime_spatial
@ -1966,7 +1994,8 @@ def _build(
""" """
if mode == "listings" and actual_listings_path is None: if mode == "listings" and actual_listings_path is None:
raise ValueError("listings mode requires actual_listings_path") raise ValueError("listings mode requires actual_listings_path")
_validate_lad_source_coverage(iod_path, ethnicity_path, rental_prices_path) _validate_lsoa_source_coverage(iod_path, ethnicity_path)
_validate_lad_source_coverage(iod_path, rental_prices_path)
wide = pl.scan_parquet(epc_pp_path).filter( wide = pl.scan_parquet(epc_pp_path).filter(
pl.col("total_floor_area").is_null() pl.col("total_floor_area").is_null()
@ -2225,7 +2254,7 @@ def main():
"--ethnicity", "--ethnicity",
type=Path, type=Path,
required=True, required=True,
help="Ethnicity by local authority parquet file (optional)", help="Census 2021 ethnic group (TS021) by LSOA parquet file",
) )
parser.add_argument( parser.add_argument(
"--crime", "--crime",

View file

@ -53,6 +53,18 @@ _OUTPUT_PRECISION_DEG = 0.000001
# tolerance), we fatten it just enough to survive snapping rather than drop it. # tolerance), we fatten it just enough to survive snapping rather than drop it.
_MIN_FOOTPRINT_BUFFER_M = 0.5 _MIN_FOOTPRINT_BUFFER_M = 0.5
# Building-scale buffer for POINTLIKE inputs that carry no real extent. Multi-
# dwelling (tower-block) postcodes have every UPRN geocoded to a single shared
# coordinate, so the boundary collapses to a point; a 0.5 m buffer then yields an
# invisible ~0.8 m² dot covering hundreds of homes. Such inputs get a ~200 m²
# building-scale footprint instead. (Genuine thin slivers, which still carry
# length, keep the minimal buffer.) _resolve_overlaps runs afterwards, so any
# overlap this introduces is trimmed; a postcode shaved back to sub-grid still
# falls through to the tiny _grid_footprint, so this can only improve the result.
_POINT_RESCUE_BUFFER_M = 8.0
_POINTLIKE_AREA_M2 = 1.0
_POINTLIKE_PERIMETER_M = 4.0
def _snap_to_wgs84_geojson(geom_bng: Polygon | MultiPolygon) -> dict | None: def _snap_to_wgs84_geojson(geom_bng: Polygon | MultiPolygon) -> dict | None:
"""Transform a BNG polygon to WGS84, snap to output precision, validate. """Transform a BNG polygon to WGS84, snap to output precision, validate.
@ -90,8 +102,23 @@ def _snap_to_wgs84_geojson(geom_bng: Polygon | MultiPolygon) -> dict | None:
def _rescue_footprint(geom_bng) -> dict | None:
    """Fatten a degenerate BNG geometry into a representable footprint and snap.

    A POINTLIKE input (area below _POINTLIKE_AREA_M2 and length below
    _POINTLIKE_PERIMETER_M — the signature of a tower-block postcode whose
    UPRNs all share one coordinate) is buffered by _POINT_RESCUE_BUFFER_M so it
    is not reduced to an invisible sub-metre dot. Anything else, including thin
    slivers that still carry length, keeps _MIN_FOOTPRINT_BUFFER_M.

    Returns the snapped footprint, or None when buffering leaves no polygonal
    part.
    """
    try:
        is_pointlike = (
            geom_bng.area < _POINTLIKE_AREA_M2
            and geom_bng.length < _POINTLIKE_PERIMETER_M
        )
    except GEOSException:
        # Best effort: if the geometry cannot be measured, fall back to the
        # minimal buffer rather than failing the rescue.
        is_pointlike = False

    buffer_m = _POINT_RESCUE_BUFFER_M if is_pointlike else _MIN_FOOTPRINT_BUFFER_M
    footprint = _largest_polygonal(geom_bng.buffer(buffer_m))
    return None if footprint is None else _snap_to_wgs84_geojson(footprint)

View file

@ -906,6 +906,37 @@ class TestToWgs84Geojson:
assert result is not None assert result is not None
assert result["type"] == "Polygon" assert result["type"] == "Polygon"
def test_pointlike_input_gets_building_scale_footprint(self):
    """A tower-block postcode (all UPRNs at one point) must not collapse to a
    sub-metre dot; a bare Point input has to come back covering > 100 m^2."""
    import pyproj
    from shapely.geometry import Point, shape
    from shapely.ops import transform as transform_geometry

    wgs84_to_bng = pyproj.Transformer.from_crs(
        "EPSG:4326", "EPSG:27700", always_xy=True
    )

    geojson = to_wgs84_geojson(Point(360000, 170000))
    assert geojson is not None

    # Measure the area in metres by projecting the result back to BNG.
    footprint_bng = transform_geometry(wgs84_to_bng.transform, shape(geojson))
    area_m2 = footprint_bng.area
    assert area_m2 > 100, f"point footprint only {area_m2:.1f} m^2"
def test_thin_sliver_keeps_minimal_buffer(self):
    """A genuine elongated sliver still carries length, so it is NOT inflated
    to building scale — only truly pointlike inputs are."""
    import pyproj
    from shapely.geometry import LineString, shape
    from shapely.ops import transform as transform_geometry

    wgs84_to_bng = pyproj.Transformer.from_crs(
        "EPSG:4326", "EPSG:27700", always_xy=True
    )

    # 40 m long, 0.1 m wide strip: tiny area but a long perimeter.
    centre_line = LineString([(360000, 170000), (360040, 170000)])
    sliver = centre_line.buffer(0.05)

    geojson = to_wgs84_geojson(sliver)
    assert geojson is not None

    # Measure the area in metres by projecting the result back to BNG.
    footprint_bng = transform_geometry(wgs84_to_bng.transform, shape(geojson))
    area_m2 = footprint_bng.area
    assert area_m2 < 100, f"sliver inflated to {area_m2:.1f} m^2"
def test_coordinates_have_limited_precision(self): def test_coordinates_have_limited_precision(self):
"""GeoJSON coordinates should be rounded to 6 decimal places.""" """GeoJSON coordinates should be rounded to 6 decimal places."""
import json import json

View file

@ -230,11 +230,28 @@ def main():
).height ).height
print(f" kNN blended: {n_blended:,} of {n_estimated:,} estimates") print(f" kNN blended: {n_blended:,} of {n_estimated:,} estimates")
# Null the absolute "Estimated current price" itself when its implied
# per-sqm is implausible (outside [MIN_COMPARABLE_PSM, MAX_COMPARABLE_PSM])
# AND the floor area is known: these come from bulk/block transfers or
# garbage source prices (e.g. a £207.5M "sale" on a 93 m² terrace -> a £197M
# estimate) and are not meaningful single-dwelling values. Previously only
# the derived per-sqm was nulled, leaving the absurd headline price visible.
_raw_est_psm = pl.col("Estimated current price") / pl.col("Total floor area (sqm)")
df = df.with_columns(
pl.when(
pl.col("Estimated current price").is_not_null()
& pl.col("Total floor area (sqm)").is_not_null()
& (pl.col("Total floor area (sqm)") > 0)
& ((_raw_est_psm < MIN_COMPARABLE_PSM) | (_raw_est_psm > MAX_COMPARABLE_PSM))
)
.then(None)
.otherwise(pl.col("Estimated current price"))
.alias("Estimated current price"),
)
# Derive estimated price per sqm where both estimated price and floor area # Derive estimated price per sqm where both estimated price and floor area
# exist. Null out values outside the plausibility band [MIN_COMPARABLE_PSM, # exist. Now that the implausible-psm estimates are nulled above, the band
# MAX_COMPARABLE_PSM] (the same band the kNN pool uses): extreme values come # filter here mainly guards the floor-area>0 case.
# from bulk/block transactions or floor-area errors and are not meaningful
# per-unit prices.
_est_psm = pl.col("Estimated current price") / pl.col("Total floor area (sqm)") _est_psm = pl.col("Estimated current price") / pl.col("Total floor area (sqm)")
df = df.with_columns( df = df.with_columns(
pl.when( pl.when(

View file

@ -25,6 +25,7 @@ from pipeline.transform.price_estimation.shrinkage import (
) )
from pipeline.transform.price_estimation.utils import ( from pipeline.transform.price_estimation.utils import (
CURRENT_YEAR, CURRENT_YEAR,
LATEST_COMPLETE_YEAR,
TEMPORAL_SMOOTHNESS_LAMBDA, TEMPORAL_SMOOTHNESS_LAMBDA,
TYPE_GROUPS, TYPE_GROUPS,
build_hedonic_features, build_hedonic_features,
@ -395,14 +396,22 @@ def build_index(
The index is still forward-filled to CURRENT_YEAR. The index is still forward-filled to CURRENT_YEAR.
postcodes_path: if provided, lat/lon are read from this file instead of input_path. postcodes_path: if provided, lat/lon are read from this file instead of input_path.
""" """
pairs = extract_pairs(input_path, max_year2=max_pair_year) # Solve the index only on COMPLETE calendar years: exclude the partial
# current year, whose thin repeat-sale set yields wild betas. The index is
# still forward-filled/trend-extrapolated to CURRENT_YEAR below, so 2026
# follows the established trend rather than a partial-year spike. Backtest
# passes a stricter max_pair_year, which is honoured.
estimation_cap = (
max_pair_year if max_pair_year is not None else LATEST_COMPLETE_YEAR + 1
)
pairs = extract_pairs(input_path, max_year2=estimation_cap)
centroids = extract_centroids(postcodes_path or input_path) centroids = extract_centroids(postcodes_path or input_path)
min_year = int(pairs["year1"].min()) min_year = int(pairs["year1"].min())
max_year = CURRENT_YEAR max_year = CURRENT_YEAR
hedonic_idx = compute_hedonic_index( hedonic_idx = compute_hedonic_index(
input_path, min_year, max_year, max_sale_year=max_pair_year input_path, min_year, max_year, max_sale_year=estimation_cap
) )
# Precompute hierarchy # Precompute hierarchy

View file

@ -6,6 +6,13 @@ import numpy as np
import polars as pl import polars as pl
CURRENT_YEAR = 2026 CURRENT_YEAR = 2026
# Latest COMPLETE calendar year. The current year's transactions are only
# partially reported (Land Registry lags ~2-3 months), so a sector's thin
# partial-year repeat-sale set produces wild index betas (e.g. +334% in a
# single sector). The index is SOLVED only on complete years (<= this) and
# forward-filled/trend-extrapolated to CURRENT_YEAR, so current-value
# projections follow the established trend instead of a partial-year spike.
LATEST_COMPLETE_YEAR = CURRENT_YEAR - 1
_today = date.today() _today = date.today()
CURRENT_FRAC_YEAR = _today.year + (_today.month - 1) / 12 CURRENT_FRAC_YEAR = _today.year + (_today.month - 1) / 12

View file

@ -15,11 +15,24 @@ SCHOOL_GROUPS = {
} }
def classify_good_plus_schools(ofsted: pl.DataFrame) -> pl.DataFrame: # Age thresholds for deciding which phase(s) a school serves. A school serves
# PRIMARY-age children if its statutory lowest age is <= 10, and SECONDARY-age
# children if its statutory highest age is >= 12. All-through (e.g. 3-18) and
# middle-deemed-secondary (e.g. 9-13) schools satisfy BOTH and so are counted in
# both the primary and the secondary proximity metrics — Ofsted's coarse "Ofsted
# phase" labels such schools as just "Secondary", which previously hid them from
# every postcode's primary-school count.
PRIMARY_MAX_AGE = 10
SECONDARY_MIN_AGE = 12
def classify_good_plus_schools(
ofsted: pl.DataFrame, open_urns: set[int] | None = None
) -> pl.DataFrame:
"""Label good+/outstanding primary & secondary schools for proximity counts. """Label good+/outstanding primary & secondary schools for proximity counts.
Derives a grade ("1" = outstanding, "2" = good) and a proximity ``category``, Derives a grade ("1" = outstanding, "2" = good) and one or two proximity
returning a ``(postcode, category)`` frame. ``category`` rows per school, returning a ``(postcode, category)`` frame.
Schools with a recent GRADED inspection carry a 1-4 grade in "Latest OEIF Schools with a recent GRADED inspection carry a 1-4 grade in "Latest OEIF
overall effectiveness" (OEIF = the previous Ofsted Education Inspection overall effectiveness" (OEIF = the previous Ofsted Education Inspection
@ -27,49 +40,89 @@ def classify_good_plus_schools(ofsted: pl.DataFrame) -> pl.DataFrame:
UNGRADED (Section 8) inspection or the post-2024 report-card framework, so UNGRADED (Section 8) inspection or the post-2024 report-card framework, so
that column is null/"Not judged" for them even when they are demonstrably that column is null/"Not judged" for them even when they are demonstrably
good their status lives in "Ungraded inspection overall outcome" ("School good their status lives in "Ungraded inspection overall outcome" ("School
remains Good"/"School remains Outstanding", incl. "(Concerns)"/"(Improving)" remains Good"/"School remains Outstanding"). Filtering on the graded column
variants). Filtering on the graded column alone dropped ~7,000 genuinely alone dropped ~7,000 genuinely good/outstanding schools. We fall back to the
good/outstanding schools. We fall back to the ungraded outcome, but ONLY when ungraded outcome, but ONLY when there is no usable graded result
there is no usable graded result (null/"Not judged"), so a genuine grade 3/4 (null/"Not judged"), so a genuine grade 3/4 is never overridden.
is never overridden.
Outcomes flagged "(Concerns)" are NOT treated as good+: a "remains Good
(Concerns)" outcome signals inspectors found issues warranting an earlier
graded re-inspection, so marketing it as a good+ school is misleading.
Phase assignment uses the statutory age range when available (so all-through
and middle schools count toward BOTH primary and secondary), falling back to
the coarse "Ofsted phase" label when age columns are absent. When
``open_urns`` is given, schools whose URN is not in the current GIAS open
register are dropped so closed/merged schools are not counted.
""" """
# Cast to Utf8 so the string predicates below are well-defined even if a # Cast to Utf8 so the string predicates below are well-defined even if a
# column happens to be entirely null (read back as a Null dtype). # column happens to be entirely null (read back as a Null dtype).
oeif = pl.col("Latest OEIF overall effectiveness").cast(pl.Utf8, strict=False) oeif = pl.col("Latest OEIF overall effectiveness").cast(pl.Utf8, strict=False)
ungraded = pl.col("Ungraded inspection overall outcome").cast(pl.Utf8, strict=False) ungraded = pl.col("Ungraded inspection overall outcome").cast(pl.Utf8, strict=False)
no_usable_grade = oeif.is_null() | (oeif == "Not judged") no_usable_grade = oeif.is_null() | (oeif == "Not judged")
has_concern = ungraded.str.contains(r"\(Concerns\)")
remains_outstanding = (
ungraded.str.starts_with("School remains Outstanding") & ~has_concern
)
remains_good = ungraded.str.starts_with("School remains Good") & ~has_concern
graded = ( graded = (
ofsted.filter(pl.col("Ofsted phase").is_in(["Primary", "Secondary"])) ofsted.filter(pl.col("Ofsted phase").is_in(["Primary", "Secondary"]))
.with_columns( .with_columns(
pl.when(oeif.is_in(["1", "2"])) pl.when(oeif.is_in(["1", "2"]))
.then(oeif) .then(oeif)
.when( .when(no_usable_grade & remains_outstanding)
no_usable_grade
& ungraded.str.starts_with("School remains Outstanding")
)
.then(pl.lit("1")) .then(pl.lit("1"))
.when(no_usable_grade & ungraded.str.starts_with("School remains Good")) .when(no_usable_grade & remains_good)
.then(pl.lit("2")) .then(pl.lit("2"))
.otherwise(None) .otherwise(None)
.alias("_ofsted_grade") .alias("_ofsted_grade")
) )
.filter(pl.col("_ofsted_grade").is_not_null()) .filter(pl.col("_ofsted_grade").is_not_null())
) )
# Drop schools no longer open (closed/merged) when the GIAS open register is
# provided, so stale Ofsted "latest inspection" rows are not counted.
if open_urns is not None and "URN" in graded.columns:
graded = graded.filter(pl.col("URN").is_in(list(open_urns)))
# Decide which phase(s) each school serves.
if {"Statutory lowest age", "Statutory highest age"} <= set(graded.columns):
low = pl.col("Statutory lowest age").cast(pl.Int64, strict=False)
high = pl.col("Statutory highest age").cast(pl.Int64, strict=False)
serves_primary = (
pl.when(low.is_not_null())
.then(low <= PRIMARY_MAX_AGE)
.otherwise(pl.col("Ofsted phase") == "Primary")
)
serves_secondary = (
pl.when(high.is_not_null())
.then(high >= SECONDARY_MIN_AGE)
.otherwise(pl.col("Ofsted phase") == "Secondary")
)
else:
serves_primary = pl.col("Ofsted phase") == "Primary"
serves_secondary = pl.col("Ofsted phase") == "Secondary"
graded = graded.with_columns(
serves_primary.alias("_serves_primary"),
serves_secondary.alias("_serves_secondary"),
)
# Good+ groups include both grade variants; outstanding groups count grade 1. # Good+ groups include both grade variants; outstanding groups count grade 1.
return graded.with_columns( # A school can yield up to two rows (primary and secondary).
pl.when(pl.col("Ofsted phase") == "Primary") primary = graded.filter(pl.col("_serves_primary")).with_columns(
.then( pl.when(pl.col("_ofsted_grade") == "1")
pl.when(pl.col("_ofsted_grade") == "1") .then(pl.lit("outstanding_primary"))
.then(pl.lit("outstanding_primary")) .otherwise(pl.lit("good_primary"))
.otherwise(pl.lit("good_primary"))
)
.otherwise(
pl.when(pl.col("_ofsted_grade") == "1")
.then(pl.lit("outstanding_secondary"))
.otherwise(pl.lit("good_secondary"))
)
.alias("category") .alias("category")
).select( )
secondary = graded.filter(pl.col("_serves_secondary")).with_columns(
pl.when(pl.col("_ofsted_grade") == "1")
.then(pl.lit("outstanding_secondary"))
.otherwise(pl.lit("good_secondary"))
.alias("category")
)
return pl.concat([primary, secondary]).select(
pl.col("Postcode").alias("postcode"), pl.col("Postcode").alias("postcode"),
"category", "category",
) )
@ -85,12 +138,24 @@ def main():
parser.add_argument( parser.add_argument(
"--arcgis", type=Path, required=True, help="ArcGIS postcode parquet" "--arcgis", type=Path, required=True, help="ArcGIS postcode parquet"
) )
parser.add_argument(
"--gias",
type=Path,
default=None,
help="GIAS open-school parquet; if given, only currently-open schools are counted",
)
parser.add_argument( parser.add_argument(
"--output", type=Path, required=True, help="Output parquet path" "--output", type=Path, required=True, help="Output parquet path"
) )
args = parser.parse_args() args = parser.parse_args()
ofsted = classify_good_plus_schools(pl.read_parquet(args.ofsted)) open_urns: set[int] | None = None
if args.gias is not None:
gias_urns = pl.read_parquet(args.gias).select("urn").to_series().drop_nulls()
open_urns = set(gias_urns.cast(pl.Int64, strict=False).to_list())
print(f"GIAS open register: {len(open_urns):,} open school URNs")
ofsted = classify_good_plus_schools(pl.read_parquet(args.ofsted), open_urns=open_urns)
if ofsted.is_empty(): if ofsted.is_empty():
raise ValueError("No good+ primary/secondary Ofsted schools found") raise ValueError("No good+ primary/secondary Ofsted schools found")

View file

@ -252,14 +252,15 @@ def test_avg_yr_is_simple_mean_of_year_bars(tmp_path):
assert bars == {2023: pytest.approx(24.0, abs=0.05), 2024: pytest.approx(12.0, abs=0.05)} assert bars == {2023: pytest.approx(24.0, abs=0.05), 2024: pytest.approx(12.0, abs=0.05)}
def test_serious_rollup_avg_yr_equals_mean_of_rollup_bars(tmp_path): def test_serious_rollup_avg_yr_equals_sum_of_components(tmp_path):
# Two SERIOUS types occur in DISJOINT years for one postcode: Burglary only in # Two SERIOUS types occur in DISJOINT years for one postcode: Burglary only in
# 2014, Robbery only in 2024 (each a single full month -> 12/yr). The headline # 2014, Robbery only in 2024 (each a single full month -> 12/yr). The headline
# "Serious crime (avg/yr)" must equal the mean of the "Serious crime (by year)" # "Serious crime (avg/yr)" must equal the SUM of its component (avg/yr) columns
# bars (which span the UNION of years any serious type occurred), NOT the sum # (Burglary 12 + Robbery 12 = 24), so the rollup is always the sum of the parts
# of the per-type means. Summing per-type means divides each type by its OWN # shown beside it and can never fall below a single component. (The previous
# years-present (1 each) -> 12 + 12 = 24; the consistent rollup divides the # union-years-present mean would have divided the per-year serious total by the
# per-year serious total by the years any serious type occurred (2) -> 12. # 2 years any serious type occurred, giving a misleading 12 that sits below
# both the burglary and robbery rollup contributions.)
units = tmp_path / "units" units = tmp_path / "units"
_write_boundaries( _write_boundaries(
units, {"AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)]} units, {"AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)]}
@ -274,13 +275,16 @@ def test_serious_rollup_avg_yr_equals_mean_of_rollup_bars(tmp_path):
transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0) transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)
avg = pl.read_parquet(output).row(0, named=True) avg = pl.read_parquet(output).row(0, named=True)
# The precomputed rollup headline exists and equals the mean of the bars (12),
# not the sum of the per-type avg/yr values (Burglary 12 + Robbery 12 = 24).
assert "Serious crime (avg/yr)" in avg assert "Serious crime (avg/yr)" in avg
assert avg["Burglary (avg/yr)"] == pytest.approx(12.0, abs=0.05) assert avg["Burglary (avg/yr)"] == pytest.approx(12.0, abs=0.05)
assert avg["Robbery (avg/yr)"] == pytest.approx(12.0, abs=0.05) assert avg["Robbery (avg/yr)"] == pytest.approx(12.0, abs=0.05)
assert avg["Serious crime (avg/yr)"] == pytest.approx(12.0, abs=0.05) # Rollup == sum of its component (avg/yr) columns.
assert avg["Serious crime (avg/yr)"] == pytest.approx(24.0, abs=0.05)
assert avg["Serious crime (avg/yr)"] == pytest.approx(
avg["Burglary (avg/yr)"] + avg["Robbery (avg/yr)"], abs=0.05
)
# The by-year rollup series remains the per-year sum of the component bars.
serious_bars = { serious_bars = {
p["year"]: p["count"] p["year"]: p["count"]
for p in pl.read_parquet(by_year).row(0, named=True)["Serious crime (by year)"] for p in pl.read_parquet(by_year).row(0, named=True)["Serious crime (by year)"]
@ -289,8 +293,6 @@ def test_serious_rollup_avg_yr_equals_mean_of_rollup_bars(tmp_path):
2014: pytest.approx(12.0, abs=0.05), 2014: pytest.approx(12.0, abs=0.05),
2024: pytest.approx(12.0, abs=0.05), 2024: pytest.approx(12.0, abs=0.05),
} }
mean_of_bars = sum(serious_bars.values()) / len(serious_bars)
assert avg["Serious crime (avg/yr)"] == pytest.approx(mean_of_bars, abs=0.05)
def test_avg_yr_denominator_is_per_postcode_not_global(tmp_path): def test_avg_yr_denominator_is_per_postcode_not_global(tmp_path):

View file

@ -34,6 +34,7 @@ from pipeline.transform.merge import (
_split_normal_outputs, _split_normal_outputs,
_tree_density_by_postcode, _tree_density_by_postcode,
_validate_lad_source_coverage, _validate_lad_source_coverage,
_validate_lsoa_source_coverage,
_validate_postcode_feature_output, _validate_postcode_feature_output,
_validate_property_postcodes, _validate_property_postcodes,
) )
@ -297,7 +298,7 @@ def test_join_area_side_tables_does_not_fan_out_on_unique_keys() -> None:
joined = _join_area_side_tables( joined = _join_area_side_tables(
base, base,
iod=pl.LazyFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}), iod=pl.LazyFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}),
ethnicity=pl.LazyFrame({"Geography_code": ["E09000001", "E09000002"]}), ethnicity=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
crime=crime, crime=crime,
median_age=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}), median_age=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}), election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
@ -355,7 +356,7 @@ def test_join_area_side_tables_normalizes_broadband_postcode_key() -> None:
joined = _join_area_side_tables( joined = _join_area_side_tables(
base, base,
iod=pl.LazyFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}), iod=pl.LazyFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}),
ethnicity=pl.LazyFrame({"Geography_code": ["E09000001", "E09000002"]}), ethnicity=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
crime=crime, crime=crime,
median_age=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}), median_age=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}), election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
@ -531,7 +532,6 @@ def test_validate_lad_source_coverage_allows_only_known_rent_no_data_lads(
tmp_path, tmp_path,
) -> None: ) -> None:
iod_path = tmp_path / "iod.parquet" iod_path = tmp_path / "iod.parquet"
ethnicity_path = tmp_path / "ethnicity.parquet"
rental_path = tmp_path / "rental.parquet" rental_path = tmp_path / "rental.parquet"
pl.DataFrame( pl.DataFrame(
{ {
@ -547,19 +547,15 @@ def test_validate_lad_source_coverage_allows_only_known_rent_no_data_lads(
], ],
} }
).write_parquet(iod_path) ).write_parquet(iod_path)
pl.DataFrame(
{"Geography_code": ["E08000016", "E06000053", "E09000001"]}
).write_parquet(ethnicity_path)
pl.DataFrame({"area_code": ["E08000016"], "bedrooms": [1]}).write_parquet( pl.DataFrame({"area_code": ["E08000016"], "bedrooms": [1]}).write_parquet(
rental_path rental_path
) )
_validate_lad_source_coverage(iod_path, ethnicity_path, rental_path) _validate_lad_source_coverage(iod_path, rental_path)
def test_validate_lad_source_coverage_rejects_unexpected_rent_holes(tmp_path) -> None: def test_validate_lad_source_coverage_rejects_unexpected_rent_holes(tmp_path) -> None:
iod_path = tmp_path / "iod.parquet" iod_path = tmp_path / "iod.parquet"
ethnicity_path = tmp_path / "ethnicity.parquet"
rental_path = tmp_path / "rental.parquet" rental_path = tmp_path / "rental.parquet"
pl.DataFrame( pl.DataFrame(
{ {
@ -567,13 +563,41 @@ def test_validate_lad_source_coverage_rejects_unexpected_rent_holes(tmp_path) ->
"Local Authority District name (2024)": ["Barnsley"], "Local Authority District name (2024)": ["Barnsley"],
} }
).write_parquet(iod_path) ).write_parquet(iod_path)
pl.DataFrame({"Geography_code": ["E08000016"]}).write_parquet(ethnicity_path)
pl.DataFrame({"area_code": ["E08000019"], "bedrooms": [1]}).write_parquet( pl.DataFrame({"area_code": ["E08000019"], "bedrooms": [1]}).write_parquet(
rental_path rental_path
) )
with pytest.raises(ValueError, match="Rental data is missing"): with pytest.raises(ValueError, match="Rental data is missing"):
_validate_lad_source_coverage(iod_path, ethnicity_path, rental_path) _validate_lad_source_coverage(iod_path, rental_path)
def test_validate_lsoa_source_coverage_allows_full_ethnicity_coverage(
    tmp_path,
) -> None:
    """Validation is expected to pass when ethnicity covers every IoD LSOA.

    The ethnicity table deliberately carries one extra LSOA ("E01000003",
    e.g. a property-less one) that IoD does not list; the call below must
    still complete without raising.
    """
    iod_frame = pl.DataFrame({"LSOA code (2021)": ["E01000001", "E01000002"]})
    ethnicity_frame = pl.DataFrame(
        {"lsoa21": ["E01000001", "E01000002", "E01000003"]}
    )

    iod_path = tmp_path / "iod.parquet"
    ethnicity_path = tmp_path / "ethnicity.parquet"
    iod_frame.write_parquet(iod_path)
    ethnicity_frame.write_parquet(ethnicity_path)

    _validate_lsoa_source_coverage(iod_path, ethnicity_path)
def test_validate_lsoa_source_coverage_rejects_missing_lsoa(tmp_path) -> None:
    """An IoD LSOA absent from the ethnicity table must raise ValueError.

    IoD lists two LSOAs while ethnicity only carries the first, so the
    validator is expected to fail with the "missing LSOA coverage" message.
    """
    iod_frame = pl.DataFrame({"LSOA code (2021)": ["E01000001", "E01000002"]})
    ethnicity_frame = pl.DataFrame({"lsoa21": ["E01000001"]})

    iod_path = tmp_path / "iod.parquet"
    ethnicity_path = tmp_path / "ethnicity.parquet"
    iod_frame.write_parquet(iod_path)
    ethnicity_frame.write_parquet(ethnicity_path)

    with pytest.raises(ValueError, match="Ethnicity data is missing LSOA coverage"):
        _validate_lsoa_source_coverage(iod_path, ethnicity_path)
def test_tree_density_by_postcode_aliases_radius_percentile(tmp_path) -> None: def test_tree_density_by_postcode_aliases_radius_percentile(tmp_path) -> None:
@ -1027,7 +1051,7 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
joined = _join_area_side_tables( joined = _join_area_side_tables(
base, base,
iod=pl.LazyFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}), iod=pl.LazyFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}),
ethnicity=pl.LazyFrame({"Geography_code": ["E09000001", "E09000002"]}), ethnicity=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
crime=crime, crime=crime,
median_age=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}), median_age=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}), election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
@ -1427,7 +1451,7 @@ def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows(
"Property type": ["Terraced", None], "Property type": ["Terraced", None],
"Leasehold/Freehold": ["Leasehold", None], "Leasehold/Freehold": ["Leasehold", None],
"Last known price": [500_000, None], "Last known price": [500_000, None],
"Street tree density percentile": [42.0, 42.0], "Tree canopy density percentile": [42.0, 42.0],
# Overlay columns: row 0 is a matched listing, row 1 is unmatched, row none. # Overlay columns: row 0 is a matched listing, row 1 is unmatched, row none.
"_actual_listing_url": ["url0", "url1"], "_actual_listing_url": ["url0", "url1"],
"_actual_asking_price": [600_000, 700_000], "_actual_asking_price": [600_000, 700_000],
@ -1458,7 +1482,7 @@ def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows(
"Property type": pl.Utf8, "Property type": pl.Utf8,
"Leasehold/Freehold": pl.Utf8, "Leasehold/Freehold": pl.Utf8,
"Last known price": pl.Int64, "Last known price": pl.Int64,
"Street tree density percentile": pl.Float32, "Tree canopy density percentile": pl.Float32,
"_actual_listing_url": pl.Utf8, "_actual_listing_url": pl.Utf8,
"_actual_asking_price": pl.Int64, "_actual_asking_price": pl.Int64,
"_actual_asking_price_per_sqm": pl.Int32, "_actual_asking_price_per_sqm": pl.Int32,
@ -1496,7 +1520,7 @@ def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows(
assert finalized["Property type"].to_list() == ["Terraced", "Flats/Maisonettes"] assert finalized["Property type"].to_list() == ["Terraced", "Flats/Maisonettes"]
assert finalized["Leasehold/Freehold"].to_list() == ["Freehold", "Leasehold"] assert finalized["Leasehold/Freehold"].to_list() == ["Freehold", "Leasehold"]
# Postcode-level feature carried through to both matched and unmatched rows. # Postcode-level feature carried through to both matched and unmatched rows.
assert finalized["Street tree density percentile"].to_list() == [42.0, 42.0] assert finalized["Tree canopy density percentile"].to_list() == [42.0, 42.0]
# Match status reflects historical context availability. # Match status reflects historical context availability.
assert finalized["Historical property match status"].to_list() == [ assert finalized["Historical property match status"].to_list() == [
"matched", "matched",
@ -1524,7 +1548,7 @@ def test_finalize_listings_dedupes_fanned_out_listing_rows() -> None:
"Property type": ["Terraced", "Terraced"], "Property type": ["Terraced", "Terraced"],
"Leasehold/Freehold": ["Leasehold", "Leasehold"], "Leasehold/Freehold": ["Leasehold", "Leasehold"],
"Last known price": [500_000, 480_000], "Last known price": [500_000, 480_000],
"Street tree density percentile": [42.0, 42.0], "Tree canopy density percentile": [42.0, 42.0],
# Same listing URL on both collapsed rows — the fan-out to fix. # Same listing URL on both collapsed rows — the fan-out to fix.
"_actual_listing_url": ["url0", "url0"], "_actual_listing_url": ["url0", "url0"],
"_actual_asking_price": [600_000, 600_000], "_actual_asking_price": [600_000, 600_000],
@ -1555,7 +1579,7 @@ def test_finalize_listings_dedupes_fanned_out_listing_rows() -> None:
"Property type": pl.Utf8, "Property type": pl.Utf8,
"Leasehold/Freehold": pl.Utf8, "Leasehold/Freehold": pl.Utf8,
"Last known price": pl.Int64, "Last known price": pl.Int64,
"Street tree density percentile": pl.Float32, "Tree canopy density percentile": pl.Float32,
"_actual_listing_url": pl.Utf8, "_actual_listing_url": pl.Utf8,
"_actual_asking_price": pl.Int64, "_actual_asking_price": pl.Int64,
"_actual_asking_price_per_sqm": pl.Int32, "_actual_asking_price_per_sqm": pl.Int32,

View file

@ -42,7 +42,20 @@ def test_ungraded_remains_good_is_recovered_when_no_graded_result():
rows = [ rows = [
_school("Primary", None, "School remains Good", "AA1 1AA"), _school("Primary", None, "School remains Good", "AA1 1AA"),
_school("Secondary", "Not judged", "School remains Outstanding", "AA1 1AB"), _school("Secondary", "Not judged", "School remains Outstanding", "AA1 1AB"),
# "(Concerns)"/"(Improving)" variants are still good+. # "(Improving)" is still good+ ...
_school("Primary", None, "School remains Good (Improving) - S5 Next", "AA1 1AE"),
]
assert _classify(rows) == {
("AA1 1AA", "good_primary"),
("AA1 1AB", "outstanding_secondary"),
("AA1 1AE", "good_primary"),
}
def test_ungraded_concerns_are_not_good_plus():
# "(Concerns)" outcomes signal issues warranting earlier re-inspection and
# must NOT be counted as good+ schools.
rows = [
_school("Primary", None, "School remains Good (Concerns) - S5 Next", "AA1 1AC"), _school("Primary", None, "School remains Good (Concerns) - S5 Next", "AA1 1AC"),
_school( _school(
"Secondary", "Secondary",
@ -51,12 +64,7 @@ def test_ungraded_remains_good_is_recovered_when_no_graded_result():
"AA1 1AD", "AA1 1AD",
), ),
] ]
assert _classify(rows) == { assert _classify(rows) == set()
("AA1 1AA", "good_primary"),
("AA1 1AB", "outstanding_secondary"),
("AA1 1AC", "good_primary"),
("AA1 1AD", "outstanding_secondary"),
}
def test_ungraded_non_good_outcomes_are_excluded(): def test_ungraded_non_good_outcomes_are_excluded():
@ -80,3 +88,52 @@ def test_non_primary_secondary_phases_excluded():
_school("Not applicable", "2", None), _school("Not applicable", "2", None),
] ]
assert _classify(rows) == set() assert _classify(rows) == set()
def _aged_school(phase, oeif, low, high, postcode="AA1 1AA"):
return {
"Postcode": postcode,
"Ofsted phase": phase,
"Latest OEIF overall effectiveness": oeif,
"Ungraded inspection overall outcome": None,
"URN": 100000,
"Statutory lowest age": low,
"Statutory highest age": high,
}
def test_all_through_school_counts_toward_both_primary_and_secondary():
    """An all-through school must appear in both phase metrics.

    The row is labelled "Secondary" by Ofsted phase but its age range is
    3-18, so it also serves primary-age children and is expected to yield
    one primary and one secondary category row.
    """
    all_through = _aged_school("Secondary", "2", 3, 18, "AA1 1AA")
    expected = {
        ("AA1 1AA", category) for category in ("good_primary", "good_secondary")
    }
    assert _classify([all_through]) == expected
def test_age_ranges_assign_single_phase_for_standard_schools():
    """Standard age ranges map to one phase; a middle school maps to both."""
    primary_only = _aged_school("Primary", "1", 4, 11, "AA1 1AA")
    secondary_only = _aged_school("Secondary", "2", 11, 16, "AA1 1AB")
    # Age 9-13 spans both thresholds, so two category rows are expected.
    middle = _aged_school("Secondary", "1", 9, 13, "AA1 1AC")

    classified = _classify([primary_only, secondary_only, middle])

    expected = set()
    expected.add(("AA1 1AA", "outstanding_primary"))
    expected.add(("AA1 1AB", "good_secondary"))
    expected.add(("AA1 1AC", "outstanding_primary"))
    expected.add(("AA1 1AC", "outstanding_secondary"))
    assert classified == expected
def test_closed_schools_excluded_when_open_register_given():
    """Schools whose URN is not in ``open_urns`` must be dropped."""
    open_school = _aged_school("Primary", "1", 4, 11, "AA1 1AA")
    open_school["URN"] = 111
    closed_school = _aged_school("Secondary", "2", 11, 16, "AA1 1AB")
    closed_school["URN"] = 222

    frame = pl.DataFrame([open_school, closed_school])
    result = classify_good_plus_schools(frame, open_urns={111})

    pairs = set()
    for record in result.to_dicts():
        pairs.add((record["postcode"], record["category"]))
    # Only URN 111 is in the open register; URN 222 must not be counted.
    assert pairs == {("AA1 1AA", "outstanding_primary")}

View file

@ -33,6 +33,14 @@ DROP_CATEGORIES = {
"emergency/water_tank", "emergency/water_tank",
"leisure/bleachers", "leisure/bleachers",
"leisure/schoolyard", "leisure/schoolyard",
# Park "furniture" / incidental features — not parks; they massively
# inflated the Park count (picnic_table ~15k, outdoor_seating ~5.8k).
"leisure/bandstand",
"leisure/bird_hide",
"leisure/firepit",
"leisure/outdoor_seating",
"leisure/picnic_table",
"leisure/wildlife_hide",
"public_transport/pay_scale_area", "public_transport/pay_scale_area",
"shop/taxi", "shop/taxi",
"amenity/feeding_place", "amenity/feeding_place",
@ -182,9 +190,13 @@ DROP_CATEGORIES = {
"tourism/village_sign", "tourism/village_sign",
"tourism/wilderness_hut", "tourism/wilderness_hut",
"tourism/yes", "tourism/yes",
# Public transport (from NaPTAN instead) # Public transport (from NaPTAN instead). public_transport/platform is the
# EXCEPTION: it is mapped to "Bus stop" (see _CATEGORIES) to fill NaPTAN's
# authority-level bus-stop gaps (e.g. West Cumbria, North Norfolk, where
# NaPTAN has zero stops), then deduped against NaPTAN so covered areas keep
# a single stop. stop_position is left dropped to avoid double-counting the
# same stop (platform + stop_position).
"public_transport/entrance", "public_transport/entrance",
"public_transport/platform",
"public_transport/station", "public_transport/station",
"public_transport/stop_position", "public_transport/stop_position",
# Education amenities — schools come from GIAS instead. OSM coverage for # Education amenities — schools come from GIAS instead. OSM coverage for
@ -301,16 +313,13 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"🌳", "🌳",
[ [
"leisure/park", "leisure/park",
# leisure/garden is dominated by private residential gardens (98%+
# unnamed); it is name-gated in transform() via REQUIRE_NAME_CATEGORIES
# so only named (public/notable) gardens count as a Park.
"leisure/garden", "leisure/garden",
"leisure/common", "leisure/common",
"leisure/nature_reserve", "leisure/nature_reserve",
"leisure/dog_park", "leisure/dog_park",
"leisure/bandstand",
"leisure/bird_hide",
"leisure/firepit",
"leisure/outdoor_seating",
"leisure/picnic_table",
"leisure/wildlife_hide",
], ],
), ),
( (
@ -329,6 +338,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
[ [
"leisure/sports_centre", "leisure/sports_centre",
"leisure/sports_hall", "leisure/sports_hall",
# leisure/pitch (73% of the old bucket) and leisure/swimming_pool
# (98% unnamed = private/garden pools) are name-gated in transform()
# via REQUIRE_NAME_CATEGORIES so only named public facilities count.
"leisure/pitch", "leisure/pitch",
"leisure/track", "leisure/track",
"leisure/golf_course", "leisure/golf_course",
@ -1123,8 +1135,36 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"amenity/townhall", "amenity/townhall",
], ],
), ),
# ── Public transport (OSM supplement to NaPTAN) ──────────
# OSM bus platforms fill NaPTAN's authority-level coverage gaps. Same group
# / friendly name / emoji as the NaPTAN "Bus stop" rows so they merge into
# one metric; OSM platforms that duplicate a NaPTAN stop are deduped in
# transform() (osm_stops_near_naptan).
(
"Public Transport",
"Bus stop",
"🚏",
[
"public_transport/platform",
],
),
] ]
# Raw OSM tags whose UNNAMED instances are dropped before category mapping.
# These tags are overwhelmingly private/incidental when unnamed: a nameless
# `leisure/garden` is a private residential garden (not a public park), and a
# nameless `leisure/pitch`/`swimming_pool` is a school cage or back-garden pool.
# Keeping only named instances stops them inflating Park / Sports Centre counts
# while preserving genuinely public, notable facilities (which carry a name).
REQUIRE_NAME_CATEGORIES = {
"leisure/garden",
"leisure/pitch",
"leisure/practice_pitch",
"leisure/swimming_pool",
"leisure/paddling_pool",
}
# Build flat lookup: OSM category → (group, friendly_name, emoji) # Build flat lookup: OSM category → (group, friendly_name, emoji)
CATEGORY_MAP: dict[str, tuple[str, str, str]] = { CATEGORY_MAP: dict[str, tuple[str, str, str]] = {
osm_key: (group, name, emoji) osm_key: (group, name, emoji)
@ -1431,18 +1471,25 @@ def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
) )
def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame: def transform_gias_schools(
gias_path: Path, ofsted_path: Path, boundary_path: Path
) -> pl.LazyFrame:
"""Convert the GIAS register parquet into POI rows with school metadata. """Convert the GIAS register parquet into POI rows with school metadata.
Ofsted ratings are joined by URN so each school carries its latest OEIF Ofsted ratings are joined by URN so each school carries its latest OEIF
overall effectiveness grade (Outstanding/Good/Requires improvement/ overall effectiveness grade (Outstanding/Good/Requires improvement/
Inadequate/Not judged), surfaced in the map popup.""" Inadequate/Not judged), surfaced in the map popup.
Clipped to England (like NaPTAN/GEOLYTIX) because the GIAS register is
GB-wide, so ~1,400 Welsh/Scottish/IoM schools would otherwise leak into the
England-only Education layer (and depress apparent Ofsted coverage, since
Wales is inspected by Estyn, not Ofsted)."""
icon_category_expr = _school_icon_category_expr() icon_category_expr = _school_icon_category_expr()
emoji_expr = icon_category_expr.replace_strict(SCHOOL_ICON_CATEGORIES) emoji_expr = icon_category_expr.replace_strict(SCHOOL_ICON_CATEGORIES)
ofsted = _load_ofsted_ratings(ofsted_path) ofsted = _load_ofsted_ratings(ofsted_path)
# category mirrors icon_category so the dashboard renders one toggle per # category mirrors icon_category so the dashboard renders one toggle per
# school type (Nursery / Primary / Secondary / Sixth form / University /…) # school type (Nursery / Primary / Secondary / Sixth form / University /…)
# instead of bundling every GIAS row under a single "School" pill. # instead of bundling every GIAS row under a single "School" pill.
return ( schools = (
pl.scan_parquet(gias_path) pl.scan_parquet(gias_path)
.join(ofsted, on="urn", how="left") .join(ofsted, on="urn", how="left")
.select( .select(
@ -1477,7 +1524,14 @@ def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame:
pl.col("head_name").alias("school_head_name"), pl.col("head_name").alias("school_head_name"),
pl.col("ofsted_rating").alias("school_ofsted_rating"), pl.col("ofsted_rating").alias("school_ofsted_rating"),
) )
.collect()
) )
mask = in_england_mask(
boundary_path,
schools["lat"].to_numpy(),
schools["lng"].to_numpy(),
)
return schools.filter(pl.Series(mask)).lazy()
# OSM convenience-format stores that GEOLYTIX also covers (Tesco Express, # OSM convenience-format stores that GEOLYTIX also covers (Tesco Express,
@ -1511,6 +1565,45 @@ def _significant_tokens(name: str | None) -> set[str]:
return tokens return tokens
# OSM bus platforms are added to "Bus stop" to fill NaPTAN's authority-level
# gaps. Where NaPTAN already has a stop within this radius the area is covered,
# so the colocated OSM platform is dropped to avoid double-counting; OSM
# platforms with no nearby NaPTAN stop (the gaps) are kept.
BUS_STOP_DEDUP_RADIUS_M = 50.0


def osm_stops_near_naptan(
    osm_stops: pl.DataFrame,
    naptan_stops: pl.DataFrame,
    radius_m: float = BUS_STOP_DEDUP_RADIUS_M,
) -> list[str]:
    """Return OSM bus-stop ids within ``radius_m`` of any NaPTAN bus stop.

    Purely spatial (no name match): an OSM platform with a NaPTAN stop inside
    the radius is reported as a duplicate; OSM platforms with no NaPTAN stop
    within the radius are not returned, so the caller keeps them.

    Args:
        osm_stops: Frame with ``id``, ``lat`` and ``lng`` columns.
        naptan_stops: Frame with ``lat`` and ``lng`` columns.
        radius_m: Match radius, compared against distances in the projected
            plane built below.

    Returns:
        The ``id`` values of ``osm_stops`` rows whose nearest NaPTAN stop is
        at most ``radius_m`` away, in input order. Empty when either frame is
        empty or has no row with finite coordinates.
    """
    if osm_stops.is_empty() or naptan_stops.is_empty():
        return []

    from scipy.spatial import cKDTree

    n_lat = naptan_stops["lat"].to_numpy().astype(float)
    n_lng = naptan_stops["lng"].to_numpy().astype(float)
    o_lat = osm_stops["lat"].to_numpy().astype(float)
    o_lng = osm_stops["lng"].to_numpy().astype(float)
    o_ids = osm_stops["id"].to_list()

    # cKDTree rejects non-finite input, so a single missing coordinate would
    # abort the whole dedup. Rows without a usable position cannot be matched
    # spatially anyway: skip them on both sides (such OSM rows are never
    # reported as duplicates).
    n_ok = np.isfinite(n_lat) & np.isfinite(n_lng)
    o_ok = np.isfinite(o_lat) & np.isfinite(o_lng)
    if not n_ok.any() or not o_ok.any():
        return []
    n_lat, n_lng = n_lat[n_ok], n_lng[n_ok]
    o_lat, o_lng = o_lat[o_ok], o_lng[o_ok]
    o_rows = np.flatnonzero(o_ok)

    # Local equirectangular projection: longitude is shrunk by cos(mean
    # latitude) and both axes are scaled by approximate metres-per-degree
    # factors, so Euclidean distances are comparable with ``radius_m``.
    mean_lat = float(np.mean(np.concatenate([n_lat, o_lat])))
    cos_lat = float(np.cos(np.radians(mean_lat)))
    n_xy = np.column_stack([n_lng * cos_lat * 111_320.0, n_lat * 110_540.0])
    o_xy = np.column_stack([o_lng * cos_lat * 111_320.0, o_lat * 110_540.0])

    # Nearest NaPTAN stop for every OSM platform, thresholded in one pass.
    dist, _ = cKDTree(n_xy).query(o_xy, k=1)
    return [o_ids[row] for row in o_rows[dist <= radius_m]]
def osm_groceries_colocated_with_geolytix( def osm_groceries_colocated_with_geolytix(
osm_groceries: pl.DataFrame, osm_groceries: pl.DataFrame,
geolytix: pl.DataFrame, geolytix: pl.DataFrame,
@ -1601,6 +1694,19 @@ def transform(
# Drop unwanted categories # Drop unwanted categories
lf = lf.filter(~pl.col("category").is_in(list(DROP_CATEGORIES))) lf = lf.filter(~pl.col("category").is_in(list(DROP_CATEGORIES)))
# Drop UNNAMED instances of private-dominated tags (gardens, pitches,
# pools) so they don't inflate Park / Sports Centre proximity counts. Done
# while `category` still holds the raw OSM key, before the friendly mapping.
lf = lf.filter(
~(
pl.col("category").is_in(list(REQUIRE_NAME_CATEGORIES))
& (
pl.col("name").is_null()
| (pl.col("name").cast(pl.String).str.strip_chars() == "")
)
)
)
# Build lookup expressions from the 3-tuple mapping # Build lookup expressions from the 3-tuple mapping
group_mapping = {k: v[0] for k, v in CATEGORY_MAP.items()} group_mapping = {k: v[0] for k, v in CATEGORY_MAP.items()}
name_mapping = {k: v[1] for k, v in CATEGORY_MAP.items()} name_mapping = {k: v[1] for k, v in CATEGORY_MAP.items()}
@ -1665,11 +1771,37 @@ def transform(
~((pl.col("group") == "Groceries") & pl.col("id").is_in(duplicate_ids)) ~((pl.col("group") == "Groceries") & pl.col("id").is_in(duplicate_ids))
) )
# Drop OSM bus platforms that duplicate a NaPTAN bus stop, so the OSM
# supplement only adds stops in NaPTAN's coverage gaps (no double-count in
# covered areas). OSM bus stops carry id-prefix "n"/"a" so they never clash
# with NaPTAN ATCO ids.
osm_bus_stops = (
lf.filter((pl.col("group") == "Public Transport") & (pl.col("category") == "Bus stop"))
.select("id", "lat", "lng")
.collect(engine="streaming")
)
naptan_bus_stops = naptan_df.filter(pl.col("category") == "Bus stop")
covered_bus_ids = osm_stops_near_naptan(osm_bus_stops, naptan_bus_stops)
kept_osm = osm_bus_stops.height - len(covered_bus_ids)
print(
f"OSM bus platforms: {osm_bus_stops.height:,} total, dropping "
f"{len(covered_bus_ids):,} that duplicate a NaPTAN stop, keeping "
f"{kept_osm:,} to fill NaPTAN gaps"
)
if covered_bus_ids:
lf = lf.filter(
~(
(pl.col("group") == "Public Transport")
& (pl.col("category") == "Bus stop")
& pl.col("id").is_in(covered_bus_ids)
)
)
frames = [ frames = [
lf, lf,
naptan, naptan,
grocery_pois.lazy(), grocery_pois.lazy(),
transform_gias_schools(gias_path, ofsted_path), transform_gias_schools(gias_path, ofsted_path, boundary_path),
] ]
return pl.concat(frames, how="diagonal_relaxed") return pl.concat(frames, how="diagonal_relaxed")

View file

@ -10,6 +10,26 @@ EARTH_RADIUS_KM = 6371.0088
KM_PER_DEGREE_LAT = 111.32 KM_PER_DEGREE_LAT = 111.32
DEFAULT_GRID_SIZE_DEGREES = 0.02 DEFAULT_GRID_SIZE_DEGREES = 0.02
# Generous GB/UK bounding box. The ArcGIS postcode source stores grid-less
# postcodes with a placeholder coordinate (lat=99.999999, lon=0.0); these are
# finite, so an isfinite() check alone lets them through and produces absurd
# ~5,000 km "nearest amenity" distances. Reject anything outside this box so
# such postcodes get NaN distance / zero counts instead of a fabricated value.
UK_LAT_MIN, UK_LAT_MAX = 49.0, 61.5
UK_LON_MIN, UK_LON_MAX = -9.0, 2.5


def valid_uk_coords_mask(lats: np.ndarray, lons: np.ndarray) -> np.ndarray:
    """Boolean mask of coordinates that are finite AND within the UK bbox.

    Args:
        lats: latitudes in degrees; any array-like (ndarray, list, scalar).
        lons: longitudes in degrees, same shape as ``lats``.

    Returns:
        Element-wise boolean mask, True where both values are finite and fall
        inside ``[UK_LAT_MIN, UK_LAT_MAX]`` x ``[UK_LON_MIN, UK_LON_MAX]``
        (bounds inclusive).

    Inputs are coerced to float arrays first so that lists, scalars and
    object arrays containing ``None`` are accepted (``None`` becomes NaN and
    is masked out) instead of making ``np.isfinite`` raise.
    """
    lat_arr = np.asarray(lats, dtype=float)
    lon_arr = np.asarray(lons, dtype=float)
    finite = np.isfinite(lat_arr) & np.isfinite(lon_arr)
    in_lat_band = (lat_arr >= UK_LAT_MIN) & (lat_arr <= UK_LAT_MAX)
    in_lon_band = (lon_arr >= UK_LON_MIN) & (lon_arr <= UK_LON_MAX)
    return finite & in_lat_band & in_lon_band
def _build_poi_grid( def _build_poi_grid(
pois: pl.DataFrame, grid_size: float = 0.05 pois: pl.DataFrame, grid_size: float = 0.05
@ -43,7 +63,12 @@ def _get_nearby_indices(
grid_size: float = DEFAULT_GRID_SIZE_DEGREES, grid_size: float = DEFAULT_GRID_SIZE_DEGREES,
) -> np.ndarray | None: ) -> np.ndarray | None:
"""Get POI indices from all grid cells intersecting the radius bounding box.""" """Get POI indices from all grid cells intersecting the radius bounding box."""
if not np.isfinite(pc_lat) or not np.isfinite(pc_lon): if (
not np.isfinite(pc_lat)
or not np.isfinite(pc_lon)
or not (UK_LAT_MIN <= pc_lat <= UK_LAT_MAX)
or not (UK_LON_MIN <= pc_lon <= UK_LON_MAX)
):
return None return None
lat_delta = radius_km / KM_PER_DEGREE_LAT lat_delta = radius_km / KM_PER_DEGREE_LAT
@ -182,7 +207,7 @@ def min_distance_per_postcode(
pc_lats = postcodes_df["lat"].to_numpy() pc_lats = postcodes_df["lat"].to_numpy()
pc_lons = postcodes_df["lon"].to_numpy() pc_lons = postcodes_df["lon"].to_numpy()
pc_codes = postcodes_df["postcode"].to_list() pc_codes = postcodes_df["postcode"].to_list()
valid_pc_mask = np.isfinite(pc_lats) & np.isfinite(pc_lons) valid_pc_mask = valid_uk_coords_mask(pc_lats, pc_lons)
valid_pc_indices = np.flatnonzero(valid_pc_mask) valid_pc_indices = np.flatnonzero(valid_pc_mask)
result_min_dist = { result_min_dist = {

View file

@ -10,7 +10,9 @@ import java.nio.file.DirectoryStream;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.nio.file.Paths; import java.nio.file.Paths;
import java.time.DayOfWeek;
import java.time.LocalDate; import java.time.LocalDate;
import java.time.temporal.TemporalAdjusters;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Comparator; import java.util.Comparator;
import java.util.HashSet; import java.util.HashSet;
@ -124,9 +126,15 @@ public class App {
Path outDir = Paths.get(outputDirStr); Path outDir = Paths.get(outputDirStr);
Files.createDirectories(outDir); Files.createDirectories(outDir);
LocalDate today = LocalDate.now(); // Always route on a representative WEEKDAY (Monday), not the day the job
// happens to start: a commute search must reflect a typical weekday
// service pattern, and the 07:45-08:15 peak window paired with a weekend
// calendar (e.g. a Saturday run) understated frequencies and overstated
// transit times. --date=YYYY-MM-DD overrides for testing/holidays.
LocalDate routingDate = resolveRoutingDate(args);
System.err.printf("Routing date: %s (%s)%n", routingDate, routingDate.getDayOfWeek());
TransportNetwork network = Router.loadNetwork(requiredEnv("DATA_DIR"), requiredEnv("NETWORK_CACHE_DIR")); TransportNetwork network = Router.loadNetwork(requiredEnv("DATA_DIR"), requiredEnv("NETWORK_CACHE_DIR"));
Router.validateTransitServices(network, today); Router.validateTransitServices(network, routingDate);
System.err.println("Loading postcodes (England only)..."); System.err.println("Loading postcodes (England only)...");
Parquet.Postcodes postcodes = Parquet.loadEnglandPostcodes( Parquet.Postcodes postcodes = Parquet.loadEnglandPostcodes(
@ -224,7 +232,7 @@ public class App {
int modeThreads = threadsForMode(mode, threads); int modeThreads = threadsForMode(mode, threads);
processMode(network, postcodeIndex, transitTiles, processMode(network, postcodeIndex, transitTiles,
postcodes.codes(), postcodes.lats(), postcodes.lons(), postcodes.codes(), postcodes.lats(), postcodes.lons(),
originNames, originLats, originLons, outDir, mode, today, originNames, originLats, originLons, outDir, mode, routingDate,
modeThreads, writeQueue, enablePaths, originIndices, skipCompleted); modeThreads, writeQueue, enablePaths, originIndices, skipCompleted);
} }
} finally { } finally {
@ -544,6 +552,28 @@ public class App {
return defaultValue; return defaultValue;
} }
/**
 * Pick the calendar date that routing runs against.
 *
 * <p>The arguments are scanned left to right for an explicit override, given
 * either as {@code --date=YYYY-MM-DD} or as {@code --date YYYY-MM-DD}. The
 * first override whose value is non-blank wins; it is trimmed and parsed with
 * {@link LocalDate#parse(CharSequence)}. Blank override values are skipped
 * and the scan continues.
 *
 * <p>Without a usable override the result is the Monday on or after today,
 * so transit times reflect a weekday service pattern rather than whatever
 * day the batch job happened to start on.
 *
 * @param args raw command-line arguments
 * @return the override date if one was supplied, otherwise the next-or-same Monday
 */
static LocalDate resolveRoutingDate(String[] args) {
    final String inlinePrefix = "--date=";
    int index = 0;
    while (index < args.length) {
        String arg = args[index];
        String candidate;
        if (arg.startsWith(inlinePrefix)) {
            // Inline form: the value is everything after the '='.
            candidate = arg.substring(inlinePrefix.length());
        } else if (arg.equals("--date") && index + 1 < args.length) {
            // Separated form: the value is the following argument.
            candidate = args[index + 1];
        } else {
            candidate = null;
        }
        if (candidate != null && !candidate.isBlank()) {
            return LocalDate.parse(candidate.trim());
        }
        index++;
    }
    LocalDate today = LocalDate.now();
    return today.with(TemporalAdjusters.nextOrSame(DayOfWeek.MONDAY));
}
/** /**
* Filter place indices to those near at least one England postcode. * Filter place indices to those near at least one England postcode.
* Uses a 0.1° grid (~11km cells) built from postcode locations a place is kept * Uses a 0.1° grid (~11km cells) built from postcode locations a place is kept

View file

@ -166,7 +166,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
name: "Defining characteristics", name: "Defining characteristics",
features: &[ features: &[
Feature::Numeric(FeatureConfig { Feature::Numeric(FeatureConfig {
name: "Street tree density percentile", name: "Tree canopy density percentile",
bounds: Bounds::Fixed { bounds: Bounds::Fixed {
min: 0.0, min: 0.0,
max: 100.0, max: 100.0,

View file

@ -672,7 +672,7 @@ mod tests {
assert!(fields_specified); assert!(fields_specified);
assert!(field_set.contains("Property type")); assert!(field_set.contains("Property type"));
assert!(field_set.contains("Street tree density percentile")); assert!(field_set.contains("Tree canopy density percentile"));
assert!(field_set.contains("Noise (dB)")); assert!(field_set.contains("Noise (dB)"));
assert!(!field_set.contains("Max available download speed (Mbps)")); assert!(!field_set.contains("Max available download speed (Mbps)"));
assert!(!field_set.contains("Distance to nearest amenity (Cafe) (km)")); assert!(!field_set.contains("Distance to nearest amenity (Cafe) (km)"));